1 diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt
2 new file mode 100644
3 index 000000000000..cb61516483d3
4 --- /dev/null
5 +++ b/Documentation/hwlat_detector.txt
6 @@ -0,0 +1,64 @@
7 +Introduction:
8 +-------------
9 +
10 +The module hwlat_detector is a special purpose kernel module that is used to
11 +detect large system latencies induced by the behavior of certain underlying
12 +hardware or firmware, independent of Linux itself. The code was developed
13 +originally to detect SMIs (System Management Interrupts) on x86 systems,
14 +however, there is nothing x86-specific about this patchset. It was
15 +originally written for use by the "RT" patch since the Real Time
16 +kernel is highly latency sensitive.
17 +
18 +SMIs are usually not serviced by the Linux kernel, which typically does not
19 +even know that they are occurring. SMIs are instead set up by BIOS code
20 +and are serviced by BIOS code, usually for "critical" events such as
21 +management of thermal sensors and fans. Sometimes though, SMIs are used for
22 +other tasks and those tasks can spend an inordinate amount of time in the
23 +handler (sometimes measured in milliseconds). Obviously this is a problem if
24 +you are trying to keep event service latencies down in the microsecond range.
25 +
26 +The hardware latency detector works by hogging all of the cpus for configurable
27 +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter
28 +for some period, then looking for gaps in the TSC data. Any gap indicates a
29 +time when the polling was interrupted; since the machine is stopped and
30 +interrupts are turned off, the only thing that could cause such a gap is an SMI.
31 +
32 +Note that the SMI detector should *NEVER* be used in a production environment.
33 +It is intended to be run manually to determine if the hardware platform has a
34 +problem with long system firmware service routines.
35 +
36 +Usage:
37 +------
38 +
39 +Loading the module hwlat_detector with the parameter "enabled=1" (or by
40 +toggling the "enable" entry in the "hwlat_detector" debugfs directory) is the only
41 +step required to start the hwlat_detector. It is possible to redefine the
42 +threshold in microseconds (us) above which latency spikes will be taken
43 +into account (parameter "threshold=").
44 +
45 +Example:
46 +
47 +       # modprobe hwlat_detector enabled=1 threshold=100
48 +
49 +After the module is loaded, it creates a directory named "hwlat_detector" under
50 +the debugfs mountpoint; this text refers to it as "/debug/hwlat_detector". It is
51 +necessary to have debugfs mounted, which might be at /sys/debug on your system.
52 +
53 +The /debug/hwlat_detector interface contains the following files:
54 +
55 +count                  - number of latency spikes observed since last reset
56 +enable                 - a global enable/disable toggle (0/1), resets count
57 +max                    - maximum hardware latency actually observed (usecs)
58 +sample                 - a pipe from which to read current raw sample data
59 +                         in the format <timestamp> <latency observed usecs>
60 +                         (can be opened O_NONBLOCK for a single sample)
61 +threshold              - minimum latency value to be considered (usecs)
62 +width                  - time period to sample with CPUs held (usecs)
63 +                         must be less than the total window size (enforced)
64 +window                 - total period of sampling, width being inside (usecs)
65 +
66 +By default we will set width to 500,000 and window to 1,000,000, meaning that
67 +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we
68 +observe any latencies that exceed the threshold (initially 100 usecs),
69 +then we write to a global sample ring buffer of 8K samples, which is
70 +consumed by reading from the "sample" (pipe) debugfs file interface.
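As a rough illustration of the interface documented above, a typical manual
session might look like the following sketch. It assumes debugfs is mounted at
/sys/kernel/debug; substitute whatever mountpoint your system actually uses.

       # mount -t debugfs none /sys/kernel/debug
       # modprobe hwlat_detector enabled=1 threshold=100
       # cat /sys/kernel/debug/hwlat_detector/count
       # cat /sys/kernel/debug/hwlat_detector/max
       # cat /sys/kernel/debug/hwlat_detector/sample
       # echo 0 >/sys/kernel/debug/hwlat_detector/enable

Here "count" and "max" report the number of spikes and the worst latency seen
since the last reset, and reading "sample" blocks until data arrives unless the
file is opened O_NONBLOCK, as described in the file list above.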
71 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
72 index c360f80c3473..5489dea355a2 100644
73 --- a/Documentation/kernel-parameters.txt
74 +++ b/Documentation/kernel-parameters.txt
75 @@ -1636,6 +1636,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
76         ip=             [IP_PNP]
77                         See Documentation/filesystems/nfs/nfsroot.txt.
78  
79 +       irqaffinity=    [SMP] Set the default irq affinity mask
80 +                       Format:
81 +                       <cpu number>,...,<cpu number>
82 +                       or
83 +                       <cpu number>-<cpu number>
84 +                       (must be a positive range in ascending order)
85 +                       or a mixture
86 +                       <cpu number>,...,<cpu number>-<cpu number>
87 +
88         irqfixup        [HW]
89                         When an interrupt is not handled search all handlers
90                         for it. Intended to get systems with badly broken
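To make the irqaffinity= parameter added above concrete, a hypothetical boot
command line pinning the default IRQ affinity to CPUs 0-2 and 4 (arbitrary
example values) could look like:

       irqaffinity=0-2,4

CPUs left out of the mask can then be kept largely free of interrupt handling,
which is the usual motivation on latency-sensitive systems.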
91 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
92 index 13f5619b2203..f64d075ba647 100644
93 --- a/Documentation/sysrq.txt
94 +++ b/Documentation/sysrq.txt
95 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
96  On other - If you know of the key combos for other architectures, please
97             let me know so I can add them to this section.
98  
99 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
100 -
101 +On all -  write a character to /proc/sysrq-trigger, e.g.:
102                 echo t > /proc/sysrq-trigger
103  
104 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
105 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
106 +        Send an ICMP echo request with this pattern plus the particular
107 +        SysRq command key. Example:
108 +               # ping -c1 -s57 -p0102030468
109 +        will trigger the SysRq-H (help) command.
110 +
111 +
112  *  What are the 'command' keys?
113  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
114  'b'     - Will immediately reboot the system without syncing or unmounting
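Following the scheme documented above, the ICMP payload is the icmp_echo_sysrq
cookie followed by the ASCII code of the desired command key. For example,
remotely requesting a task list dump with SysRq-t ('t' is 0x74) might look
like this sketch, where <target> is the host to be triggered:

       # ping -c1 -s57 -p0102030474 <target>

This assumes the target kernel has the cookie 0x01020304 written to
/proc/sys/net/ipv4/icmp_echo_sysrq as shown in the example above.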
115 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
116 new file mode 100644
117 index 000000000000..6f2aeabf7faa
118 --- /dev/null
119 +++ b/Documentation/trace/histograms.txt
120 @@ -0,0 +1,186 @@
121 +               Using the Linux Kernel Latency Histograms
122 +
123 +
124 +This document gives a short explanation of how to enable, configure and use
125 +latency histograms. Latency histograms are primarily relevant in the
126 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
127 +and are used in the quality management of the Linux real-time
128 +capabilities.
129 +
130 +
131 +* Purpose of latency histograms
132 +
133 +A latency histogram continuously accumulates the frequencies of latency
134 +data. There are two types of histograms:
135 +- potential sources of latencies
136 +- effective latencies
137 +
138 +
139 +* Potential sources of latencies
140 +
141 +Potential sources of latencies are code segments where interrupts,
142 +preemption or both are disabled (aka critical sections). To create
143 +histograms of potential sources of latency, the kernel stores the time
144 +stamp at the start of a critical section, determines the time elapsed
145 +when the end of the section is reached, and increments the frequency
146 +counter of that latency value - irrespective of whether any concurrently
147 +running process is affected by latency or not.
148 +- Configuration items (in the Kernel hacking/Tracers submenu)
149 +  CONFIG_INTERRUPT_OFF_LATENCY
150 +  CONFIG_PREEMPT_OFF_LATENCY
151 +
152 +
153 +* Effective latencies
154 +
155 +Effective latencies are those actually occurring during wakeup of a process. To
156 +determine effective latencies, the kernel stores the time stamp when a
157 +process is scheduled to be woken up, and determines the duration of the
158 +wakeup time shortly before control is passed over to this process. Note
159 +that the apparent latency in user space may be somewhat longer, since the
160 +process may be interrupted after control is passed over to it but before
161 +the execution in user space takes place. Simply measuring the interval
162 +between enqueuing and wakeup may also not be appropriate in cases when a
163 +process is scheduled as a result of a timer expiration. The timer may have
164 +missed its deadline, e.g. due to disabled interrupts, but this latency
165 +would not be registered. Therefore, the offsets of missed timers are
166 +recorded in a separate histogram. If both wakeup latency and missed timer
167 +offsets are configured and enabled, a third histogram may be enabled that
168 +records the overall latency as a sum of the timer latency, if any, and the
169 +wakeup latency. This histogram is called "timerandwakeup".
170 +- Configuration items (in the Kernel hacking/Tracers submenu)
171 +  CONFIG_WAKEUP_LATENCY
172 +  CONFIG_MISSED_TIMER_OFFSETS
173 +
174 +
175 +* Usage
176 +
177 +The interface to the administration of the latency histograms is located
178 +in the debugfs file system. To mount it, either enter
179 +
180 +mount -t sysfs nodev /sys
181 +mount -t debugfs nodev /sys/kernel/debug
182 +
183 +from shell command line level, or add
184 +
185 +nodev  /sys                    sysfs   defaults        0 0
186 +nodev  /sys/kernel/debug       debugfs defaults        0 0
187 +
188 +to the file /etc/fstab. All latency histogram related files are then
189 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
190 +particular histogram type is enabled by writing non-zero to the related
191 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
192 +Select "preemptirqsoff" for the histograms of potential sources of
193 +latencies and "wakeup" for histograms of effective latencies etc. The
194 +histogram data - one per CPU - are available in the files
195 +
196 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
197 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
198 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
199 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
200 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
201 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
202 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
203 +
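For example, enabling the wakeup histograms and inspecting the data collected
on CPU 0 might look like the following sketch, using the paths listed above:

       # echo 1 >/sys/kernel/debug/tracing/latency_hist/enable/wakeup
       # cat /sys/kernel/debug/tracing/latency_hist/wakeup/CPU0
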
204 +The histograms are reset by writing non-zero to the file "reset" in a
205 +particular latency directory. To reset all latency data, use
206 +
207 +#!/bin/sh
208 +
209 +TRACINGDIR=/sys/kernel/debug/tracing
210 +HISTDIR=$TRACINGDIR/latency_hist
211 +
212 +if test -d $HISTDIR
213 +then
214 +  cd $HISTDIR
215 +  for i in `find . | grep /reset$`
216 +  do
217 +    echo 1 >$i
218 +  done
219 +fi
220 +
221 +
222 +* Data format
223 +
224 +Latency data are stored with a resolution of one microsecond. The
225 +maximum latency is 10,240 microseconds. The data are only valid if the
226 +overflow register is empty. Every output line contains the latency in
227 +microseconds in the first column and the number of samples in the second
228 +column. To display only lines with a positive latency count, use, for
229 +example,
230 +
231 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
232 +
233 +#Minimum latency: 0 microseconds.
234 +#Average latency: 0 microseconds.
235 +#Maximum latency: 25 microseconds.
236 +#Total samples: 3104770694
237 +#There are 0 samples greater or equal than 10240 microseconds
238 +#usecs          samples
239 +    0        2984486876
240 +    1          49843506
241 +    2          58219047
242 +    3           5348126
243 +    4           2187960
244 +    5           3388262
245 +    6            959289
246 +    7            208294
247 +    8             40420
248 +    9              4485
249 +   10             14918
250 +   11             18340
251 +   12             25052
252 +   13             19455
253 +   14              5602
254 +   15               969
255 +   16                47
256 +   17                18
257 +   18                14
258 +   19                 1
259 +   20                 3
260 +   21                 2
261 +   22                 5
262 +   23                 2
263 +   25                 1
264 +
265 +
266 +* Wakeup latency of a selected process
267 +
268 +To only collect wakeup latency data of a particular process, write the
269 +PID of the requested process to
270 +
271 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
272 +
273 +PIDs are not considered if this variable is set to 0.
274 +
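As a sketch, limiting the wakeup histograms to a single task and later removing
the filter again could be done as follows (PID 1234 is an arbitrary
placeholder):

       # echo 1234 >/sys/kernel/debug/tracing/latency_hist/wakeup/pid
       # echo 0 >/sys/kernel/debug/tracing/latency_hist/wakeup/pid
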
275 +
276 +* Details of the process with the highest wakeup latency so far
277 +
278 +Selected data of the process that suffered from the highest wakeup
279 +latency that occurred in a particular CPU are available in the file
280 +
281 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
282 +
283 +In addition, other relevant system data at the time when the
284 +latency occurred are given.
285 +
286 +The format of the data is (all in one line):
287 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
288 +<- <PID> <Priority> <Command> <Timestamp>
289 +
290 +The value of <Timeroffset> is only relevant in the combined timer
291 +and wakeup latency recording. In the wakeup recording, it is
292 +always 0; in the missed_timer_offsets recording, it is the same
293 +as <Latency>.
294 +
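A hypothetical line in this format, with all values invented purely for
illustration, could read:

       47 99 18 (0) cyclictest <- 1301 120 kworker/0:1 1042.337841

i.e. the victim cyclictest (PID 47, priority 99) saw an 18 usec wakeup latency,
the possible culprit was kworker/0:1 (PID 1301, priority 120), and the latency
occurred 1042.337841 s after boot.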
295 +When retrospectively searching for the origin of a latency while
296 +tracing was not enabled, it may be helpful to know the name and
297 +some basic data of the task that (finally) switched to the
298 +late real-time task. In addition to the victim's data, the
299 +data of the possible culprit are therefore also displayed after the
300 +"<-" symbol.
301 +
302 +Finally, the timestamp of the time when the latency occurred
303 +in <seconds>.<microseconds> after the most recent system boot
304 +is provided.
305 +
306 +These data are also reset when the wakeup histogram is reset.
307 diff --git a/Makefile b/Makefile
308 index 98239d56924c..5ed3edefebde 100644
309 --- a/Makefile
310 +++ b/Makefile
311 @@ -394,7 +394,7 @@ KBUILD_CPPFLAGS := -D__KERNEL__
312  KBUILD_CFLAGS   := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
313                    -fno-strict-aliasing -fno-common \
314                    -Werror-implicit-function-declaration \
315 -                  -Wno-format-security \
316 +                  -Wno-format-security -fno-PIE \
317                    -std=gnu89
318  
319  KBUILD_AFLAGS_KERNEL :=
320 @@ -783,6 +783,9 @@ KBUILD_CFLAGS   += $(call cc-option,-Werror=strict-prototypes)
321  # Prohibit date/time macros, which would make the build non-deterministic
322  KBUILD_CFLAGS   += $(call cc-option,-Werror=date-time)
323  
324 +# enforce correct pointer usage
325 +KBUILD_CFLAGS   += $(call cc-option,-Werror=incompatible-pointer-types)
326 +
327  # use the deterministic mode of AR if available
328  KBUILD_ARFLAGS := $(call ar-option,D)
329  
330 diff --git a/arch/Kconfig b/arch/Kconfig
331 index 4e949e58b192..3b26d76933fb 100644
332 --- a/arch/Kconfig
333 +++ b/arch/Kconfig
334 @@ -9,6 +9,7 @@ config OPROFILE
335         tristate "OProfile system profiling"
336         depends on PROFILING
337         depends on HAVE_OPROFILE
338 +       depends on !PREEMPT_RT_FULL
339         select RING_BUFFER
340         select RING_BUFFER_ALLOW_SWAP
341         help
342 @@ -52,6 +53,7 @@ config KPROBES
343  config JUMP_LABEL
344         bool "Optimize very unlikely/likely branches"
345         depends on HAVE_ARCH_JUMP_LABEL
346 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
347         help
348           This option enables a transparent branch optimization that
349          makes certain almost-always-true or almost-always-false branch
350 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
351 index 34e1569a11ee..79c4603e9453 100644
352 --- a/arch/arm/Kconfig
353 +++ b/arch/arm/Kconfig
354 @@ -33,7 +33,7 @@ config ARM
355         select HARDIRQS_SW_RESEND
356         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
357         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
358 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32
359 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !PREEMPT_RT_BASE
360         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32
361         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
362         select HAVE_ARCH_TRACEHOOK
363 @@ -68,6 +68,7 @@ config ARM
364         select HAVE_PERF_EVENTS
365         select HAVE_PERF_REGS
366         select HAVE_PERF_USER_STACK_DUMP
367 +       select HAVE_PREEMPT_LAZY
368         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
369         select HAVE_REGS_AND_STACK_ACCESS_API
370         select HAVE_SYSCALL_TRACEPOINTS
371 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
372 index 12ebfcc1d539..c962084605bc 100644
373 --- a/arch/arm/include/asm/switch_to.h
374 +++ b/arch/arm/include/asm/switch_to.h
375 @@ -3,6 +3,13 @@
376  
377  #include <linux/thread_info.h>
378  
379 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
380 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
381 +#else
382 +static inline void
383 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
384 +#endif
385 +
386  /*
387   * For v7 SMP cores running a preemptible kernel we may be pre-empted
388   * during a TLB maintenance operation, so execute an inner-shareable dsb
389 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
390  #define switch_to(prev,next,last)                                      \
391  do {                                                                   \
392         __complete_pending_tlbi();                                      \
393 +       switch_kmaps(prev, next);                                       \
394         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
395  } while (0)
396  
397 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
398 index 776757d1604a..1f36a4eccc72 100644
399 --- a/arch/arm/include/asm/thread_info.h
400 +++ b/arch/arm/include/asm/thread_info.h
401 @@ -49,6 +49,7 @@ struct cpu_context_save {
402  struct thread_info {
403         unsigned long           flags;          /* low level flags */
404         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
405 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
406         mm_segment_t            addr_limit;     /* address limit */
407         struct task_struct      *task;          /* main task structure */
408         __u32                   cpu;            /* cpu */
409 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
410  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
411  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
412  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
413 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
414 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
415 +#define TIF_NEED_RESCHED_LAZY  7
416  
417  #define TIF_NOHZ               12      /* in adaptive nohz mode */
418  #define TIF_USING_IWMMXT       17
419 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
420  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
421  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
422  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
423 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
424  #define _TIF_UPROBE            (1 << TIF_UPROBE)
425  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
426  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
427 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
428   * Change these and you break ASM code in entry-common.S
429   */
430  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
431 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
432 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
433 +                                _TIF_NEED_RESCHED_LAZY)
434  
435  #endif /* __KERNEL__ */
436  #endif /* __ASM_ARM_THREAD_INFO_H */
437 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
438 index 871b8267d211..4dbe70de7318 100644
439 --- a/arch/arm/kernel/asm-offsets.c
440 +++ b/arch/arm/kernel/asm-offsets.c
441 @@ -65,6 +65,7 @@ int main(void)
442    BLANK();
443    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
444    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
445 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
446    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
447    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
448    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
449 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
450 index 3ce377f7251f..d044cea59f54 100644
451 --- a/arch/arm/kernel/entry-armv.S
452 +++ b/arch/arm/kernel/entry-armv.S
453 @@ -215,11 +215,18 @@ __irq_svc:
454  #ifdef CONFIG_PREEMPT
455         get_thread_info tsk
456         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
457 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
458         teq     r8, #0                          @ if preempt count != 0
459 +       bne     1f                              @ return from exception
460 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
461 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
462 +       blne    svc_preempt                     @ preempt!
463 +
464 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
465 +       teq     r8, #0                          @ if preempt lazy count != 0
466         movne   r0, #0                          @ force flags to 0
467 -       tst     r0, #_TIF_NEED_RESCHED
468 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
469         blne    svc_preempt
470 +1:
471  #endif
472  
473         svc_exit r5, irq = 1                    @ return from exception
474 @@ -234,8 +241,14 @@ svc_preempt:
475  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
476         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
477         tst     r0, #_TIF_NEED_RESCHED
478 +       bne     1b
479 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
480         reteq   r8                              @ go again
481 -       b       1b
482 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
483 +       teq     r0, #0                          @ if preempt lazy count != 0
484 +       beq     1b
485 +       ret     r8                              @ go again
486 +
487  #endif
488  
489  __und_fault:
490 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
491 index 30a7228eaceb..c3bd6cbfce4b 100644
492 --- a/arch/arm/kernel/entry-common.S
493 +++ b/arch/arm/kernel/entry-common.S
494 @@ -36,7 +36,9 @@ ret_fast_syscall:
495   UNWIND(.cantunwind    )
496         disable_irq_notrace                     @ disable interrupts
497         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
498 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
499 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
500 +       bne     fast_work_pending
501 +       tst     r1, #_TIF_SECCOMP
502         bne     fast_work_pending
503  
504         /* perform architecture specific actions before user return */
505 @@ -62,8 +64,11 @@ ret_fast_syscall:
506         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
507         disable_irq_notrace                     @ disable interrupts
508         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
509 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
510 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
511 +       bne     do_slower_path
512 +       tst     r1, #_TIF_SECCOMP
513         beq     no_work_pending
514 +do_slower_path:
515   UNWIND(.fnend         )
516  ENDPROC(ret_fast_syscall)
517  
518 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
519 index 4adfb46e3ee9..15f1d94b47c5 100644
520 --- a/arch/arm/kernel/process.c
521 +++ b/arch/arm/kernel/process.c
522 @@ -319,6 +319,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
523  }
524  
525  #ifdef CONFIG_MMU
526 +/*
527 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
528 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
529 + * fail.
530 + */
531 +static int __init vectors_user_mapping_init_page(void)
532 +{
533 +       struct page *page;
534 +       unsigned long addr = 0xffff0000;
535 +       pgd_t *pgd;
536 +       pud_t *pud;
537 +       pmd_t *pmd;
538 +
539 +       pgd = pgd_offset_k(addr);
540 +       pud = pud_offset(pgd, addr);
541 +       pmd = pmd_offset(pud, addr);
542 +       page = pmd_page(*(pmd));
543 +
544 +       pgtable_page_ctor(page);
545 +
546 +       return 0;
547 +}
548 +late_initcall(vectors_user_mapping_init_page);
549 +
550  #ifdef CONFIG_KUSER_HELPERS
551  /*
552   * The vectors page is always readable from user space for the
553 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
554 index 7b8f2141427b..96541e00b74a 100644
555 --- a/arch/arm/kernel/signal.c
556 +++ b/arch/arm/kernel/signal.c
557 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
558          */
559         trace_hardirqs_off();
560         do {
561 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
562 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
563 +                                          _TIF_NEED_RESCHED_LAZY))) {
564                         schedule();
565                 } else {
566                         if (unlikely(!user_mode(regs)))
567 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
568 index b26361355dae..e5754e3b03c4 100644
569 --- a/arch/arm/kernel/smp.c
570 +++ b/arch/arm/kernel/smp.c
571 @@ -230,8 +230,6 @@ int __cpu_disable(void)
572         flush_cache_louis();
573         local_flush_tlb_all();
574  
575 -       clear_tasks_mm_cpumask(cpu);
576 -
577         return 0;
578  }
579  
580 @@ -247,6 +245,9 @@ void __cpu_die(unsigned int cpu)
581                 pr_err("CPU%u: cpu didn't die\n", cpu);
582                 return;
583         }
584 +
585 +       clear_tasks_mm_cpumask(cpu);
586 +
587         pr_notice("CPU%u: shutdown\n", cpu);
588  
589         /*
590 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
591 index 0bee233fef9a..314cfb232a63 100644
592 --- a/arch/arm/kernel/unwind.c
593 +++ b/arch/arm/kernel/unwind.c
594 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
595  static const struct unwind_idx *__origin_unwind_idx;
596  extern const struct unwind_idx __stop_unwind_idx[];
597  
598 -static DEFINE_SPINLOCK(unwind_lock);
599 +static DEFINE_RAW_SPINLOCK(unwind_lock);
600  static LIST_HEAD(unwind_tables);
601  
602  /* Convert a prel31 symbol to an absolute address */
603 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
604                 /* module unwind tables */
605                 struct unwind_table *table;
606  
607 -               spin_lock_irqsave(&unwind_lock, flags);
608 +               raw_spin_lock_irqsave(&unwind_lock, flags);
609                 list_for_each_entry(table, &unwind_tables, list) {
610                         if (addr >= table->begin_addr &&
611                             addr < table->end_addr) {
612 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
613                                 break;
614                         }
615                 }
616 -               spin_unlock_irqrestore(&unwind_lock, flags);
617 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
618         }
619  
620         pr_debug("%s: idx = %p\n", __func__, idx);
621 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
622         tab->begin_addr = text_addr;
623         tab->end_addr = text_addr + text_size;
624  
625 -       spin_lock_irqsave(&unwind_lock, flags);
626 +       raw_spin_lock_irqsave(&unwind_lock, flags);
627         list_add_tail(&tab->list, &unwind_tables);
628 -       spin_unlock_irqrestore(&unwind_lock, flags);
629 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
630  
631         return tab;
632  }
633 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
634         if (!tab)
635                 return;
636  
637 -       spin_lock_irqsave(&unwind_lock, flags);
638 +       raw_spin_lock_irqsave(&unwind_lock, flags);
639         list_del(&tab->list);
640 -       spin_unlock_irqrestore(&unwind_lock, flags);
641 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
642  
643         kfree(tab);
644  }
645 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
646 index d7bef2144760..36a3e51492f7 100644
647 --- a/arch/arm/kvm/arm.c
648 +++ b/arch/arm/kvm/arm.c
649 @@ -496,18 +496,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
650         struct kvm_vcpu *vcpu;
651  
652         kvm_for_each_vcpu(i, vcpu, kvm) {
653 -               wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
654 +               struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
655  
656                 vcpu->arch.pause = false;
657 -               wake_up_interruptible(wq);
658 +               swake_up(wq);
659         }
660  }
661  
662  static void vcpu_sleep(struct kvm_vcpu *vcpu)
663  {
664 -       wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
665 +       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
666  
667 -       wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
668 +       swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
669                                        (!vcpu->arch.pause)));
670  }
671  
672 @@ -566,7 +566,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
673                  * involves poking the GIC, which must be done in a
674                  * non-preemptible context.
675                  */
676 -               preempt_disable();
677 +               migrate_disable();
678                 kvm_timer_flush_hwstate(vcpu);
679                 kvm_vgic_flush_hwstate(vcpu);
680  
681 @@ -585,7 +585,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
682                         local_irq_enable();
683                         kvm_timer_sync_hwstate(vcpu);
684                         kvm_vgic_sync_hwstate(vcpu);
685 -                       preempt_enable();
686 +                       migrate_enable();
687                         continue;
688                 }
689  
690 @@ -639,7 +639,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
691  
692                 kvm_vgic_sync_hwstate(vcpu);
693  
694 -               preempt_enable();
695 +               migrate_enable();
696  
697                 ret = handle_exit(vcpu, run, ret);
698         }
699 diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
700 index a9b3b905e661..c2b131527a64 100644
701 --- a/arch/arm/kvm/psci.c
702 +++ b/arch/arm/kvm/psci.c
703 @@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
704  {
705         struct kvm *kvm = source_vcpu->kvm;
706         struct kvm_vcpu *vcpu = NULL;
707 -       wait_queue_head_t *wq;
708 +       struct swait_queue_head *wq;
709         unsigned long cpu_id;
710         unsigned long context_id;
711         phys_addr_t target_pc;
712 @@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
713         smp_mb();               /* Make sure the above is visible */
714  
715         wq = kvm_arch_vcpu_wq(vcpu);
716 -       wake_up_interruptible(wq);
717 +       swake_up(wq);
718  
719         return PSCI_RET_SUCCESS;
720  }
721 diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig
722 index 28656c2b54a0..3f501305ca26 100644
723 --- a/arch/arm/mach-at91/Kconfig
724 +++ b/arch/arm/mach-at91/Kconfig
725 @@ -99,6 +99,7 @@ config HAVE_AT91_USB_CLK
726  config COMMON_CLK_AT91
727         bool
728         select COMMON_CLK
729 +       select MFD_SYSCON
730  
731  config HAVE_AT91_SMD
732         bool
733 diff --git a/arch/arm/mach-at91/at91rm9200.c b/arch/arm/mach-at91/at91rm9200.c
734 index c1a7c6cc00e1..63b4fa25b48a 100644
735 --- a/arch/arm/mach-at91/at91rm9200.c
736 +++ b/arch/arm/mach-at91/at91rm9200.c
737 @@ -12,7 +12,6 @@
738  #include <linux/of_platform.h>
739  
740  #include <asm/mach/arch.h>
741 -#include <asm/system_misc.h>
742  
743  #include "generic.h"
744  #include "soc.h"
745 @@ -33,7 +32,6 @@ static void __init at91rm9200_dt_device_init(void)
746  
747         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
748  
749 -       arm_pm_idle = at91rm9200_idle;
750         at91rm9200_pm_init();
751  }
752  
753 diff --git a/arch/arm/mach-at91/at91sam9.c b/arch/arm/mach-at91/at91sam9.c
754 index 7eb64f763034..cada2a6412b3 100644
755 --- a/arch/arm/mach-at91/at91sam9.c
756 +++ b/arch/arm/mach-at91/at91sam9.c
757 @@ -62,8 +62,6 @@ static void __init at91sam9_common_init(void)
758                 soc_dev = soc_device_to_device(soc);
759  
760         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
761 -
762 -       arm_pm_idle = at91sam9_idle;
763  }
764  
765  static void __init at91sam9_dt_device_init(void)
766 diff --git a/arch/arm/mach-at91/generic.h b/arch/arm/mach-at91/generic.h
767 index b0fa7dc7286d..28ca57a2060f 100644
768 --- a/arch/arm/mach-at91/generic.h
769 +++ b/arch/arm/mach-at91/generic.h
770 @@ -11,27 +11,18 @@
771  #ifndef _AT91_GENERIC_H
772  #define _AT91_GENERIC_H
773  
774 -#include <linux/of.h>
775 -#include <linux/reboot.h>
776 -
777 - /* Map io */
778 -extern void __init at91_map_io(void);
779 -extern void __init at91_alt_map_io(void);
780 -
781 -/* idle */
782 -extern void at91rm9200_idle(void);
783 -extern void at91sam9_idle(void);
784 -
785  #ifdef CONFIG_PM
786  extern void __init at91rm9200_pm_init(void);
787  extern void __init at91sam9260_pm_init(void);
788  extern void __init at91sam9g45_pm_init(void);
789  extern void __init at91sam9x5_pm_init(void);
790 +extern void __init sama5_pm_init(void);
791  #else
792  static inline void __init at91rm9200_pm_init(void) { }
793  static inline void __init at91sam9260_pm_init(void) { }
794  static inline void __init at91sam9g45_pm_init(void) { }
795  static inline void __init at91sam9x5_pm_init(void) { }
796 +static inline void __init sama5_pm_init(void) { }
797  #endif
798  
799  #endif /* _AT91_GENERIC_H */
800 diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
801 index 23726fb31741..f06270198bf1 100644
802 --- a/arch/arm/mach-at91/pm.c
803 +++ b/arch/arm/mach-at91/pm.c
804 @@ -31,10 +31,13 @@
805  #include <asm/mach/irq.h>
806  #include <asm/fncpy.h>
807  #include <asm/cacheflush.h>
808 +#include <asm/system_misc.h>
809  
810  #include "generic.h"
811  #include "pm.h"
812  
813 +static void __iomem *pmc;
814 +
815  /*
816   * FIXME: this is needed to communicate between the pinctrl driver and
817   * the PM implementation in the machine. Possibly part of the PM
818 @@ -87,7 +90,7 @@ static int at91_pm_verify_clocks(void)
819         unsigned long scsr;
820         int i;
821  
822 -       scsr = at91_pmc_read(AT91_PMC_SCSR);
823 +       scsr = readl(pmc + AT91_PMC_SCSR);
824  
825         /* USB must not be using PLLB */
826         if ((scsr & at91_pm_data.uhp_udp_mask) != 0) {
827 @@ -101,8 +104,7 @@ static int at91_pm_verify_clocks(void)
828  
829                 if ((scsr & (AT91_PMC_PCK0 << i)) == 0)
830                         continue;
831 -
832 -               css = at91_pmc_read(AT91_PMC_PCKR(i)) & AT91_PMC_CSS;
833 +               css = readl(pmc + AT91_PMC_PCKR(i)) & AT91_PMC_CSS;
834                 if (css != AT91_PMC_CSS_SLOW) {
835                         pr_err("AT91: PM - Suspend-to-RAM with PCK%d src %d\n", i, css);
836                         return 0;
837 @@ -145,8 +147,8 @@ static void at91_pm_suspend(suspend_state_t state)
838         flush_cache_all();
839         outer_disable();
840  
841 -       at91_suspend_sram_fn(at91_pmc_base, at91_ramc_base[0],
842 -                               at91_ramc_base[1], pm_data);
843 +       at91_suspend_sram_fn(pmc, at91_ramc_base[0],
844 +                            at91_ramc_base[1], pm_data);
845  
846         outer_resume();
847  }
848 @@ -353,6 +355,21 @@ static __init void at91_dt_ramc(void)
849         at91_pm_set_standby(standby);
850  }
851  
852 +void at91rm9200_idle(void)
853 +{
854 +       /*
855 +        * Disable the processor clock.  The processor will be automatically
856 +        * re-enabled by an interrupt or by a reset.
857 +        */
858 +       writel(AT91_PMC_PCK, pmc + AT91_PMC_SCDR);
859 +}
860 +
861 +void at91sam9_idle(void)
862 +{
863 +       writel(AT91_PMC_PCK, pmc + AT91_PMC_SCDR);
864 +       cpu_do_idle();
865 +}
866 +
867  static void __init at91_pm_sram_init(void)
868  {
869         struct gen_pool *sram_pool;
870 @@ -399,13 +416,36 @@ static void __init at91_pm_sram_init(void)
871                         &at91_pm_suspend_in_sram, at91_pm_suspend_in_sram_sz);
872  }
873  
874 -static void __init at91_pm_init(void)
875 +static const struct of_device_id atmel_pmc_ids[] __initconst = {
876 +       { .compatible = "atmel,at91rm9200-pmc"  },
877 +       { .compatible = "atmel,at91sam9260-pmc" },
878 +       { .compatible = "atmel,at91sam9g45-pmc" },
879 +       { .compatible = "atmel,at91sam9n12-pmc" },
880 +       { .compatible = "atmel,at91sam9x5-pmc" },
881 +       { .compatible = "atmel,sama5d3-pmc" },
882 +       { .compatible = "atmel,sama5d2-pmc" },
883 +       { /* sentinel */ },
884 +};
885 +
886 +static void __init at91_pm_init(void (*pm_idle)(void))
887  {
888 -       at91_pm_sram_init();
889 +       struct device_node *pmc_np;
890  
891         if (at91_cpuidle_device.dev.platform_data)
892                 platform_device_register(&at91_cpuidle_device);
893  
894 +       pmc_np = of_find_matching_node(NULL, atmel_pmc_ids);
895 +       pmc = of_iomap(pmc_np, 0);
896 +       if (!pmc) {
897 +               pr_err("AT91: PM not supported, PMC not found\n");
898 +               return;
899 +       }
900 +
901 +       if (pm_idle)
902 +               arm_pm_idle = pm_idle;
903 +
904 +       at91_pm_sram_init();
905 +
906         if (at91_suspend_sram_fn)
907                 suspend_set_ops(&at91_pm_ops);
908         else
909 @@ -424,7 +464,7 @@ void __init at91rm9200_pm_init(void)
910         at91_pm_data.uhp_udp_mask = AT91RM9200_PMC_UHP | AT91RM9200_PMC_UDP;
911         at91_pm_data.memctrl = AT91_MEMCTRL_MC;
912  
913 -       at91_pm_init();
914 +       at91_pm_init(at91rm9200_idle);
915  }
916  
917  void __init at91sam9260_pm_init(void)
918 @@ -432,7 +472,7 @@ void __init at91sam9260_pm_init(void)
919         at91_dt_ramc();
920         at91_pm_data.memctrl = AT91_MEMCTRL_SDRAMC;
921         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
922 -       return at91_pm_init();
923 +       at91_pm_init(at91sam9_idle);
924  }
925  
926  void __init at91sam9g45_pm_init(void)
927 @@ -440,7 +480,7 @@ void __init at91sam9g45_pm_init(void)
928         at91_dt_ramc();
929         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP;
930         at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
931 -       return at91_pm_init();
932 +       at91_pm_init(at91sam9_idle);
933  }
934  
935  void __init at91sam9x5_pm_init(void)
936 @@ -448,5 +488,13 @@ void __init at91sam9x5_pm_init(void)
937         at91_dt_ramc();
938         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
939         at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
940 -       return at91_pm_init();
941 +       at91_pm_init(at91sam9_idle);
942 +}
943 +
944 +void __init sama5_pm_init(void)
945 +{
946 +       at91_dt_ramc();
947 +       at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
948 +       at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
949 +       at91_pm_init(NULL);
950  }
951 diff --git a/arch/arm/mach-at91/sama5.c b/arch/arm/mach-at91/sama5.c
952 index d9cf6799aec0..df8fdf1cf66d 100644
953 --- a/arch/arm/mach-at91/sama5.c
954 +++ b/arch/arm/mach-at91/sama5.c
955 @@ -51,7 +51,7 @@ static void __init sama5_dt_device_init(void)
956                 soc_dev = soc_device_to_device(soc);
957  
958         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
959 -       at91sam9x5_pm_init();
960 +       sama5_pm_init();
961  }
962  
963  static const char *const sama5_dt_board_compat[] __initconst = {
964 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
965 index 98a2c0cbb833..310dce500d3e 100644
966 --- a/arch/arm/mach-exynos/platsmp.c
967 +++ b/arch/arm/mach-exynos/platsmp.c
968 @@ -230,7 +230,7 @@ static void __iomem *scu_base_addr(void)
969         return (void __iomem *)(S5P_VA_SCU);
970  }
971  
972 -static DEFINE_SPINLOCK(boot_lock);
973 +static DEFINE_RAW_SPINLOCK(boot_lock);
974  
975  static void exynos_secondary_init(unsigned int cpu)
976  {
977 @@ -243,8 +243,8 @@ static void exynos_secondary_init(unsigned int cpu)
978         /*
979          * Synchronise with the boot thread.
980          */
981 -       spin_lock(&boot_lock);
982 -       spin_unlock(&boot_lock);
983 +       raw_spin_lock(&boot_lock);
984 +       raw_spin_unlock(&boot_lock);
985  }
986  
987  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
988 @@ -308,7 +308,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
989          * Set synchronisation state between this boot processor
990          * and the secondary one
991          */
992 -       spin_lock(&boot_lock);
993 +       raw_spin_lock(&boot_lock);
994  
995         /*
996          * The secondary processor is waiting to be released from
997 @@ -335,7 +335,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
998  
999                 if (timeout == 0) {
1000                         printk(KERN_ERR "cpu1 power enable failed");
1001 -                       spin_unlock(&boot_lock);
1002 +                       raw_spin_unlock(&boot_lock);
1003                         return -ETIMEDOUT;
1004                 }
1005         }
1006 @@ -381,7 +381,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
1007          * calibrations, then wait for it to finish
1008          */
1009  fail:
1010 -       spin_unlock(&boot_lock);
1011 +       raw_spin_unlock(&boot_lock);
1012  
1013         return pen_release != -1 ? ret : 0;
1014  }
1015 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
1016 index b5f8f5ffda79..9753a84df9c4 100644
1017 --- a/arch/arm/mach-hisi/platmcpm.c
1018 +++ b/arch/arm/mach-hisi/platmcpm.c
1019 @@ -61,7 +61,7 @@
1020  
1021  static void __iomem *sysctrl, *fabric;
1022  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
1023 -static DEFINE_SPINLOCK(boot_lock);
1024 +static DEFINE_RAW_SPINLOCK(boot_lock);
1025  static u32 fabric_phys_addr;
1026  /*
1027   * [0]: bootwrapper physical address
1028 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
1029         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
1030                 return -EINVAL;
1031  
1032 -       spin_lock_irq(&boot_lock);
1033 +       raw_spin_lock_irq(&boot_lock);
1034  
1035         if (hip04_cpu_table[cluster][cpu])
1036                 goto out;
1037 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
1038  
1039  out:
1040         hip04_cpu_table[cluster][cpu]++;
1041 -       spin_unlock_irq(&boot_lock);
1042 +       raw_spin_unlock_irq(&boot_lock);
1043  
1044         return 0;
1045  }
1046 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
1047         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
1048         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
1049  
1050 -       spin_lock(&boot_lock);
1051 +       raw_spin_lock(&boot_lock);
1052         hip04_cpu_table[cluster][cpu]--;
1053         if (hip04_cpu_table[cluster][cpu] == 1) {
1054                 /* A power_up request went ahead of us. */
1055 -               spin_unlock(&boot_lock);
1056 +               raw_spin_unlock(&boot_lock);
1057                 return;
1058         } else if (hip04_cpu_table[cluster][cpu] > 1) {
1059                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
1060 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
1061         }
1062  
1063         last_man = hip04_cluster_is_down(cluster);
1064 -       spin_unlock(&boot_lock);
1065 +       raw_spin_unlock(&boot_lock);
1066         if (last_man) {
1067                 /* Since it's Cortex A15, disable L2 prefetching. */
1068                 asm volatile(
1069 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1070                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
1071  
1072         count = TIMEOUT_MSEC / POLL_MSEC;
1073 -       spin_lock_irq(&boot_lock);
1074 +       raw_spin_lock_irq(&boot_lock);
1075         for (tries = 0; tries < count; tries++) {
1076                 if (hip04_cpu_table[cluster][cpu])
1077                         goto err;
1078 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1079                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
1080                 if (data & CORE_WFI_STATUS(cpu))
1081                         break;
1082 -               spin_unlock_irq(&boot_lock);
1083 +               raw_spin_unlock_irq(&boot_lock);
1084                 /* Wait for clean L2 when the whole cluster is down. */
1085                 msleep(POLL_MSEC);
1086 -               spin_lock_irq(&boot_lock);
1087 +               raw_spin_lock_irq(&boot_lock);
1088         }
1089         if (tries >= count)
1090                 goto err;
1091 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1092                 goto err;
1093         if (hip04_cluster_is_down(cluster))
1094                 hip04_set_snoop_filter(cluster, 0);
1095 -       spin_unlock_irq(&boot_lock);
1096 +       raw_spin_unlock_irq(&boot_lock);
1097         return 1;
1098  err:
1099 -       spin_unlock_irq(&boot_lock);
1100 +       raw_spin_unlock_irq(&boot_lock);
1101         return 0;
1102  }
1103  #endif
1104 diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig
1105 index 8ceda2844c4f..08bcf8fb76f2 100644
1106 --- a/arch/arm/mach-imx/Kconfig
1107 +++ b/arch/arm/mach-imx/Kconfig
1108 @@ -524,7 +524,7 @@ config SOC_IMX6Q
1109         bool "i.MX6 Quad/DualLite support"
1110         select ARM_ERRATA_764369 if SMP
1111         select HAVE_ARM_SCU if SMP
1112 -       select HAVE_ARM_TWD if SMP
1113 +       select HAVE_ARM_TWD
1114         select PCI_DOMAINS if PCI
1115         select PINCTRL_IMX6Q
1116         select SOC_IMX6
1117 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
1118 index 79e1f876d1c9..7e625c17f78e 100644
1119 --- a/arch/arm/mach-omap2/omap-smp.c
1120 +++ b/arch/arm/mach-omap2/omap-smp.c
1121 @@ -43,7 +43,7 @@
1122  /* SCU base address */
1123  static void __iomem *scu_base;
1124  
1125 -static DEFINE_SPINLOCK(boot_lock);
1126 +static DEFINE_RAW_SPINLOCK(boot_lock);
1127  
1128  void __iomem *omap4_get_scu_base(void)
1129  {
1130 @@ -74,8 +74,8 @@ static void omap4_secondary_init(unsigned int cpu)
1131         /*
1132          * Synchronise with the boot thread.
1133          */
1134 -       spin_lock(&boot_lock);
1135 -       spin_unlock(&boot_lock);
1136 +       raw_spin_lock(&boot_lock);
1137 +       raw_spin_unlock(&boot_lock);
1138  }
1139  
1140  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1141 @@ -89,7 +89,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1142          * Set synchronisation state between this boot processor
1143          * and the secondary one
1144          */
1145 -       spin_lock(&boot_lock);
1146 +       raw_spin_lock(&boot_lock);
1147  
1148         /*
1149          * Update the AuxCoreBoot0 with boot state for secondary core.
1150 @@ -166,7 +166,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1151          * Now the secondary core is starting up let it run its
1152          * calibrations, then wait for it to finish
1153          */
1154 -       spin_unlock(&boot_lock);
1155 +       raw_spin_unlock(&boot_lock);
1156  
1157         return 0;
1158  }
1159 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
1160 index e46c91094dde..dcb3ed0c26da 100644
1161 --- a/arch/arm/mach-prima2/platsmp.c
1162 +++ b/arch/arm/mach-prima2/platsmp.c
1163 @@ -22,7 +22,7 @@
1164  
1165  static void __iomem *clk_base;
1166  
1167 -static DEFINE_SPINLOCK(boot_lock);
1168 +static DEFINE_RAW_SPINLOCK(boot_lock);
1169  
1170  static void sirfsoc_secondary_init(unsigned int cpu)
1171  {
1172 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
1173         /*
1174          * Synchronise with the boot thread.
1175          */
1176 -       spin_lock(&boot_lock);
1177 -       spin_unlock(&boot_lock);
1178 +       raw_spin_lock(&boot_lock);
1179 +       raw_spin_unlock(&boot_lock);
1180  }
1181  
1182  static const struct of_device_id clk_ids[]  = {
1183 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
1184         /* make sure write buffer is drained */
1185         mb();
1186  
1187 -       spin_lock(&boot_lock);
1188 +       raw_spin_lock(&boot_lock);
1189  
1190         /*
1191          * The secondary processor is waiting to be released from
1192 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
1193          * now the secondary core is starting up let it run its
1194          * calibrations, then wait for it to finish
1195          */
1196 -       spin_unlock(&boot_lock);
1197 +       raw_spin_unlock(&boot_lock);
1198  
1199         return pen_release != -1 ? -ENOSYS : 0;
1200  }
1201 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
1202 index 9b00123a315d..0a49fe1bc8cf 100644
1203 --- a/arch/arm/mach-qcom/platsmp.c
1204 +++ b/arch/arm/mach-qcom/platsmp.c
1205 @@ -46,7 +46,7 @@
1206  
1207  extern void secondary_startup_arm(void);
1208  
1209 -static DEFINE_SPINLOCK(boot_lock);
1210 +static DEFINE_RAW_SPINLOCK(boot_lock);
1211  
1212  #ifdef CONFIG_HOTPLUG_CPU
1213  static void qcom_cpu_die(unsigned int cpu)
1214 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
1215         /*
1216          * Synchronise with the boot thread.
1217          */
1218 -       spin_lock(&boot_lock);
1219 -       spin_unlock(&boot_lock);
1220 +       raw_spin_lock(&boot_lock);
1221 +       raw_spin_unlock(&boot_lock);
1222  }
1223  
1224  static int scss_release_secondary(unsigned int cpu)
1225 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
1226          * set synchronisation state between this boot processor
1227          * and the secondary one
1228          */
1229 -       spin_lock(&boot_lock);
1230 +       raw_spin_lock(&boot_lock);
1231  
1232         /*
1233          * Send the secondary CPU a soft interrupt, thereby causing
1234 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
1235          * now the secondary core is starting up let it run its
1236          * calibrations, then wait for it to finish
1237          */
1238 -       spin_unlock(&boot_lock);
1239 +       raw_spin_unlock(&boot_lock);
1240  
1241         return ret;
1242  }
1243 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
1244 index fd4297713d67..b0553b2c2d53 100644
1245 --- a/arch/arm/mach-spear/platsmp.c
1246 +++ b/arch/arm/mach-spear/platsmp.c
1247 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1248         sync_cache_w(&pen_release);
1249  }
1250  
1251 -static DEFINE_SPINLOCK(boot_lock);
1252 +static DEFINE_RAW_SPINLOCK(boot_lock);
1253  
1254  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
1255  
1256 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
1257         /*
1258          * Synchronise with the boot thread.
1259          */
1260 -       spin_lock(&boot_lock);
1261 -       spin_unlock(&boot_lock);
1262 +       raw_spin_lock(&boot_lock);
1263 +       raw_spin_unlock(&boot_lock);
1264  }
1265  
1266  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1267 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1268          * set synchronisation state between this boot processor
1269          * and the secondary one
1270          */
1271 -       spin_lock(&boot_lock);
1272 +       raw_spin_lock(&boot_lock);
1273  
1274         /*
1275          * The secondary processor is waiting to be released from
1276 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1277          * now the secondary core is starting up let it run its
1278          * calibrations, then wait for it to finish
1279          */
1280 -       spin_unlock(&boot_lock);
1281 +       raw_spin_unlock(&boot_lock);
1282  
1283         return pen_release != -1 ? -ENOSYS : 0;
1284  }
1285 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
1286 index c4ad6eae67fa..e830b20b212f 100644
1287 --- a/arch/arm/mach-sti/platsmp.c
1288 +++ b/arch/arm/mach-sti/platsmp.c
1289 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
1290         sync_cache_w(&pen_release);
1291  }
1292  
1293 -static DEFINE_SPINLOCK(boot_lock);
1294 +static DEFINE_RAW_SPINLOCK(boot_lock);
1295  
1296  static void sti_secondary_init(unsigned int cpu)
1297  {
1298 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
1299         /*
1300          * Synchronise with the boot thread.
1301          */
1302 -       spin_lock(&boot_lock);
1303 -       spin_unlock(&boot_lock);
1304 +       raw_spin_lock(&boot_lock);
1305 +       raw_spin_unlock(&boot_lock);
1306  }
1307  
1308  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1309 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1310          * set synchronisation state between this boot processor
1311          * and the secondary one
1312          */
1313 -       spin_lock(&boot_lock);
1314 +       raw_spin_lock(&boot_lock);
1315  
1316         /*
1317          * The secondary processor is waiting to be released from
1318 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1319          * now the secondary core is starting up let it run its
1320          * calibrations, then wait for it to finish
1321          */
1322 -       spin_unlock(&boot_lock);
1323 +       raw_spin_unlock(&boot_lock);
1324  
1325         return pen_release != -1 ? -ENOSYS : 0;
1326  }
1327 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
1328 index daafcf121ce0..b8aa1e9ee8ee 100644
1329 --- a/arch/arm/mm/fault.c
1330 +++ b/arch/arm/mm/fault.c
1331 @@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1332         if (addr < TASK_SIZE)
1333                 return do_page_fault(addr, fsr, regs);
1334  
1335 +       if (interrupts_enabled(regs))
1336 +               local_irq_enable();
1337 +
1338         if (user_mode(regs))
1339                 goto bad_area;
1340  
1341 @@ -497,6 +500,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1342  static int
1343  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
1344  {
1345 +       if (interrupts_enabled(regs))
1346 +               local_irq_enable();
1347 +
1348         do_bad_area(addr, fsr, regs);
1349         return 0;
1350  }
1351 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
1352 index d02f8187b1cc..542692dbd40a 100644
1353 --- a/arch/arm/mm/highmem.c
1354 +++ b/arch/arm/mm/highmem.c
1355 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
1356         return *ptep;
1357  }
1358  
1359 +static unsigned int fixmap_idx(int type)
1360 +{
1361 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1362 +}
1363 +
1364  void *kmap(struct page *page)
1365  {
1366         might_sleep();
1367 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
1368  
1369  void *kmap_atomic(struct page *page)
1370  {
1371 +       pte_t pte = mk_pte(page, kmap_prot);
1372         unsigned int idx;
1373         unsigned long vaddr;
1374         void *kmap;
1375         int type;
1376  
1377 -       preempt_disable();
1378 +       preempt_disable_nort();
1379         pagefault_disable();
1380         if (!PageHighMem(page))
1381                 return page_address(page);
1382 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1383  
1384         type = kmap_atomic_idx_push();
1385  
1386 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1387 +       idx = fixmap_idx(type);
1388         vaddr = __fix_to_virt(idx);
1389  #ifdef CONFIG_DEBUG_HIGHMEM
1390         /*
1391 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1392          * in place, so the contained TLB flush ensures the TLB is updated
1393          * with the new mapping.
1394          */
1395 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1396 +#ifdef CONFIG_PREEMPT_RT_FULL
1397 +       current->kmap_pte[type] = pte;
1398 +#endif
1399 +       set_fixmap_pte(idx, pte);
1400  
1401         return (void *)vaddr;
1402  }
1403 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1404  
1405         if (kvaddr >= (void *)FIXADDR_START) {
1406                 type = kmap_atomic_idx();
1407 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1408 +               idx = fixmap_idx(type);
1409  
1410                 if (cache_is_vivt())
1411                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1412 +#ifdef CONFIG_PREEMPT_RT_FULL
1413 +               current->kmap_pte[type] = __pte(0);
1414 +#endif
1415  #ifdef CONFIG_DEBUG_HIGHMEM
1416                 BUG_ON(vaddr != __fix_to_virt(idx));
1417 -               set_fixmap_pte(idx, __pte(0));
1418  #else
1419                 (void) idx;  /* to kill a warning */
1420  #endif
1421 +               set_fixmap_pte(idx, __pte(0));
1422                 kmap_atomic_idx_pop();
1423         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1424                 /* this address was obtained through kmap_high_get() */
1425                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1426         }
1427         pagefault_enable();
1428 -       preempt_enable();
1429 +       preempt_enable_nort();
1430  }
1431  EXPORT_SYMBOL(__kunmap_atomic);
1432  
1433  void *kmap_atomic_pfn(unsigned long pfn)
1434  {
1435 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1436         unsigned long vaddr;
1437         int idx, type;
1438         struct page *page = pfn_to_page(pfn);
1439  
1440 -       preempt_disable();
1441 +       preempt_disable_nort();
1442         pagefault_disable();
1443         if (!PageHighMem(page))
1444                 return page_address(page);
1445  
1446         type = kmap_atomic_idx_push();
1447 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1448 +       idx = fixmap_idx(type);
1449         vaddr = __fix_to_virt(idx);
1450  #ifdef CONFIG_DEBUG_HIGHMEM
1451         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1452  #endif
1453 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1454 +#ifdef CONFIG_PREEMPT_RT_FULL
1455 +       current->kmap_pte[type] = pte;
1456 +#endif
1457 +       set_fixmap_pte(idx, pte);
1458  
1459         return (void *)vaddr;
1460  }
1461 +#if defined CONFIG_PREEMPT_RT_FULL
1462 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1463 +{
1464 +       int i;
1465 +
1466 +       /*
1467 +        * Clear @prev_p's kmap_atomic mappings
1468 +        */
1469 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1470 +               int idx = fixmap_idx(i);
1471 +
1472 +               set_fixmap_pte(idx, __pte(0));
1473 +       }
1474 +       /*
1475 +        * Restore @next_p's kmap_atomic mappings
1476 +        */
1477 +       for (i = 0; i < next_p->kmap_idx; i++) {
1478 +               int idx = fixmap_idx(i);
1479 +
1480 +               if (!pte_none(next_p->kmap_pte[i]))
1481 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1482 +       }
1483 +}
1484 +#endif
1485 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1486 index 53feb90c840c..b4a8d54fc3f3 100644
1487 --- a/arch/arm/plat-versatile/platsmp.c
1488 +++ b/arch/arm/plat-versatile/platsmp.c
1489 @@ -30,7 +30,7 @@ static void write_pen_release(int val)
1490         sync_cache_w(&pen_release);
1491  }
1492  
1493 -static DEFINE_SPINLOCK(boot_lock);
1494 +static DEFINE_RAW_SPINLOCK(boot_lock);
1495  
1496  void versatile_secondary_init(unsigned int cpu)
1497  {
1498 @@ -43,8 +43,8 @@ void versatile_secondary_init(unsigned int cpu)
1499         /*
1500          * Synchronise with the boot thread.
1501          */
1502 -       spin_lock(&boot_lock);
1503 -       spin_unlock(&boot_lock);
1504 +       raw_spin_lock(&boot_lock);
1505 +       raw_spin_unlock(&boot_lock);
1506  }
1507  
1508  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1509 @@ -55,7 +55,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1510          * Set synchronisation state between this boot processor
1511          * and the secondary one
1512          */
1513 -       spin_lock(&boot_lock);
1514 +       raw_spin_lock(&boot_lock);
1515  
1516         /*
1517          * This is really belt and braces; we hold unintended secondary
1518 @@ -85,7 +85,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1519          * now the secondary core is starting up let it run its
1520          * calibrations, then wait for it to finish
1521          */
1522 -       spin_unlock(&boot_lock);
1523 +       raw_spin_unlock(&boot_lock);
1524  
1525         return pen_release != -1 ? -ENOSYS : 0;
1526  }
1527 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1528 index 14cdc6dea493..9196cf82f7be 100644
1529 --- a/arch/arm64/Kconfig
1530 +++ b/arch/arm64/Kconfig
1531 @@ -76,6 +76,7 @@ config ARM64
1532         select HAVE_PERF_REGS
1533         select HAVE_PERF_USER_STACK_DUMP
1534         select HAVE_RCU_TABLE_FREE
1535 +       select HAVE_PREEMPT_LAZY
1536         select HAVE_SYSCALL_TRACEPOINTS
1537         select IOMMU_DMA if IOMMU_SUPPORT
1538         select IRQ_DOMAIN
1539 @@ -582,7 +583,7 @@ config XEN_DOM0
1540  
1541  config XEN
1542         bool "Xen guest support on ARM64"
1543 -       depends on ARM64 && OF
1544 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1545         select SWIOTLB_XEN
1546         help
1547           Say Y if you want to run Linux in a Virtual Machine on Xen on ARM64.
1548 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1549 index 90c7ff233735..5f4e89fbc290 100644
1550 --- a/arch/arm64/include/asm/thread_info.h
1551 +++ b/arch/arm64/include/asm/thread_info.h
1552 @@ -49,6 +49,7 @@ struct thread_info {
1553         mm_segment_t            addr_limit;     /* address limit */
1554         struct task_struct      *task;          /* main task structure */
1555         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1556 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1557         int                     cpu;            /* cpu */
1558  };
1559  
1560 @@ -103,6 +104,7 @@ static inline struct thread_info *current_thread_info(void)
1561  #define TIF_NEED_RESCHED       1
1562  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1563  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1564 +#define TIF_NEED_RESCHED_LAZY  4
1565  #define TIF_NOHZ               7
1566  #define TIF_SYSCALL_TRACE      8
1567  #define TIF_SYSCALL_AUDIT      9
1568 @@ -118,6 +120,7 @@ static inline struct thread_info *current_thread_info(void)
1569  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1570  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1571  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1572 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1573  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1574  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1575  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1576 @@ -126,7 +129,8 @@ static inline struct thread_info *current_thread_info(void)
1577  #define _TIF_32BIT             (1 << TIF_32BIT)
1578  
1579  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1580 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1581 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1582 +                                _TIF_NEED_RESCHED_LAZY)
1583  
1584  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1585                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1586 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1587 index 087cf9a65359..d74475928399 100644
1588 --- a/arch/arm64/kernel/asm-offsets.c
1589 +++ b/arch/arm64/kernel/asm-offsets.c
1590 @@ -35,6 +35,7 @@ int main(void)
1591    BLANK();
1592    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1593    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1594 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1595    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1596    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1597    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1598 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1599 index 5a3753d09e20..05d73c4c03f6 100644
1600 --- a/arch/arm64/kernel/entry.S
1601 +++ b/arch/arm64/kernel/entry.S
1602 @@ -376,11 +376,16 @@ el1_irq:
1603  #ifdef CONFIG_PREEMPT
1604         get_thread_info tsk
1605         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1606 -       cbnz    w24, 1f                         // preempt count != 0
1607 +       cbnz    w24, 2f                         // preempt count != 0
1608         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1609 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1610 -       bl      el1_preempt
1611 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1612 +
1613 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1614 +       cbnz    w24, 2f                         // preempt lazy count != 0
1615 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1616  1:
1617 +       bl      el1_preempt
1618 +2:
1619  #endif
1620  #ifdef CONFIG_TRACE_IRQFLAGS
1621         bl      trace_hardirqs_on
1622 @@ -394,6 +399,7 @@ el1_preempt:
1623  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1624         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1625         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1626 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1627         ret     x24
1628  #endif
1629  
1630 @@ -638,6 +644,7 @@ ret_fast_syscall_trace:
1631   */
1632  work_pending:
1633         tbnz    x1, #TIF_NEED_RESCHED, work_resched
1634 +       tbnz    x1, #TIF_NEED_RESCHED_LAZY, work_resched
1635         /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
1636         ldr     x2, [sp, #S_PSTATE]
1637         mov     x0, sp                          // 'regs'
1638 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1639 index db459612de44..bd8be6a0e745 100644
1640 --- a/arch/mips/Kconfig
1641 +++ b/arch/mips/Kconfig
1642 @@ -2410,7 +2410,7 @@ config CPU_R4400_WORKAROUNDS
1643  #
1644  config HIGHMEM
1645         bool "High Memory Support"
1646 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1647 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1648  
1649  config CPU_SUPPORTS_HIGHMEM
1650         bool
1651 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
1652 index e86b7499921a..b2a2f678c5dc 100644
1653 --- a/arch/mips/kvm/mips.c
1654 +++ b/arch/mips/kvm/mips.c
1655 @@ -454,8 +454,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1656  
1657         dvcpu->arch.wait = 0;
1658  
1659 -       if (waitqueue_active(&dvcpu->wq))
1660 -               wake_up_interruptible(&dvcpu->wq);
1661 +       if (swait_active(&dvcpu->wq))
1662 +               swake_up(&dvcpu->wq);
1663  
1664         return 0;
1665  }
1666 @@ -1183,8 +1183,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
1667         kvm_mips_callbacks->queue_timer_int(vcpu);
1668  
1669         vcpu->arch.wait = 0;
1670 -       if (waitqueue_active(&vcpu->wq))
1671 -               wake_up_interruptible(&vcpu->wq);
1672 +       if (swait_active(&vcpu->wq))
1673 +               swake_up(&vcpu->wq);
1674  }
1675  
1676  /* low level hrtimer wake routine */
1677 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1678 index db49e0d796b1..1d2be228661c 100644
1679 --- a/arch/powerpc/Kconfig
1680 +++ b/arch/powerpc/Kconfig
1681 @@ -60,10 +60,11 @@ config LOCKDEP_SUPPORT
1682  
1683  config RWSEM_GENERIC_SPINLOCK
1684         bool
1685 +       default y if PREEMPT_RT_FULL
1686  
1687  config RWSEM_XCHGADD_ALGORITHM
1688         bool
1689 -       default y
1690 +       default y if !PREEMPT_RT_FULL
1691  
1692  config GENERIC_LOCKBREAK
1693         bool
1694 @@ -141,6 +142,7 @@ config PPC
1695         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1696         select GENERIC_STRNCPY_FROM_USER
1697         select GENERIC_STRNLEN_USER
1698 +       select HAVE_PREEMPT_LAZY
1699         select HAVE_MOD_ARCH_SPECIFIC
1700         select MODULES_USE_ELF_RELA
1701         select CLONE_BACKWARDS
1702 @@ -319,7 +321,7 @@ menu "Kernel options"
1703  
1704  config HIGHMEM
1705         bool "High memory support"
1706 -       depends on PPC32
1707 +       depends on PPC32 && !PREEMPT_RT_FULL
1708  
1709  source kernel/Kconfig.hz
1710  source kernel/Kconfig.preempt
1711 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
1712 index cfa758c6b4f6..f8673ff84b31 100644
1713 --- a/arch/powerpc/include/asm/kvm_host.h
1714 +++ b/arch/powerpc/include/asm/kvm_host.h
1715 @@ -286,7 +286,7 @@ struct kvmppc_vcore {
1716         struct list_head runnable_threads;
1717         struct list_head preempt_list;
1718         spinlock_t lock;
1719 -       wait_queue_head_t wq;
1720 +       struct swait_queue_head wq;
1721         spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
1722         u64 stolen_tb;
1723         u64 preempt_tb;
1724 @@ -626,7 +626,7 @@ struct kvm_vcpu_arch {
1725         u8 prodded;
1726         u32 last_inst;
1727  
1728 -       wait_queue_head_t *wqp;
1729 +       struct swait_queue_head *wqp;
1730         struct kvmppc_vcore *vcore;
1731         int ret;
1732         int trap;
1733 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1734 index 7efee4a3240b..40e6fa1b85b2 100644
1735 --- a/arch/powerpc/include/asm/thread_info.h
1736 +++ b/arch/powerpc/include/asm/thread_info.h
1737 @@ -42,6 +42,8 @@ struct thread_info {
1738         int             cpu;                    /* cpu we're on */
1739         int             preempt_count;          /* 0 => preemptable,
1740                                                    <0 => BUG */
1741 +       int             preempt_lazy_count;      /* 0 => preemptable,
1742 +                                                  <0 => BUG */
1743         unsigned long   local_flags;            /* private flags for thread */
1744  
1745         /* low level flags - has atomic operations done on it */
1746 @@ -82,8 +84,7 @@ static inline struct thread_info *current_thread_info(void)
1747  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1748  #define TIF_SIGPENDING         1       /* signal pending */
1749  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1750 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1751 -                                          TIF_NEED_RESCHED */
1752 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1753  #define TIF_32BIT              4       /* 32 bit binary */
1754  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1755  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1756 @@ -101,6 +102,8 @@ static inline struct thread_info *current_thread_info(void)
1757  #if defined(CONFIG_PPC64)
1758  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1759  #endif
1760 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1761 +                                          TIF_NEED_RESCHED */
1762  
1763  /* as above, but as bit values */
1764  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1765 @@ -119,14 +122,16 @@ static inline struct thread_info *current_thread_info(void)
1766  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1767  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1768  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1769 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1770  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1771                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1772                                  _TIF_NOHZ)
1773  
1774  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1775                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1776 -                                _TIF_RESTORE_TM)
1777 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1778  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1779 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1780  
1781  /* Bits in local_flags */
1782  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1783 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1784 index 221d584d089f..d6d0c59ef8ae 100644
1785 --- a/arch/powerpc/kernel/asm-offsets.c
1786 +++ b/arch/powerpc/kernel/asm-offsets.c
1787 @@ -160,6 +160,7 @@ int main(void)
1788         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1789         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1790         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1791 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1792         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1793         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1794  
1795 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1796 index 2405631e91a2..c21b4b42eaa0 100644
1797 --- a/arch/powerpc/kernel/entry_32.S
1798 +++ b/arch/powerpc/kernel/entry_32.S
1799 @@ -818,7 +818,14 @@ resume_kernel:
1800         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1801         bne     restore
1802         andi.   r8,r8,_TIF_NEED_RESCHED
1803 +       bne+    1f
1804 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1805 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1806 +       bne     restore
1807 +       lwz     r0,TI_FLAGS(r9)
1808 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1809         beq+    restore
1810 +1:
1811         lwz     r3,_MSR(r1)
1812         andi.   r0,r3,MSR_EE    /* interrupts off? */
1813         beq     restore         /* don't schedule if so */
1814 @@ -829,11 +836,11 @@ resume_kernel:
1815          */
1816         bl      trace_hardirqs_off
1817  #endif
1818 -1:     bl      preempt_schedule_irq
1819 +2:     bl      preempt_schedule_irq
1820         CURRENT_THREAD_INFO(r9, r1)
1821         lwz     r3,TI_FLAGS(r9)
1822 -       andi.   r0,r3,_TIF_NEED_RESCHED
1823 -       bne-    1b
1824 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1825 +       bne-    2b
1826  #ifdef CONFIG_TRACE_IRQFLAGS
1827         /* And now, to properly rebalance the above, we tell lockdep they
1828          * are being turned back on, which will happen when we return
1829 @@ -1154,7 +1161,7 @@ global_dbcr0:
1830  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1831  
1832  do_work:                       /* r10 contains MSR_KERNEL here */
1833 -       andi.   r0,r9,_TIF_NEED_RESCHED
1834 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1835         beq     do_user_signal
1836  
1837  do_resched:                    /* r10 contains MSR_KERNEL here */
1838 @@ -1175,7 +1182,7 @@ recheck:
1839         MTMSRD(r10)             /* disable interrupts */
1840         CURRENT_THREAD_INFO(r9, r1)
1841         lwz     r9,TI_FLAGS(r9)
1842 -       andi.   r0,r9,_TIF_NEED_RESCHED
1843 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1844         bne-    do_resched
1845         andi.   r0,r9,_TIF_USER_WORK_MASK
1846         beq     restore_user
1847 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1848 index edba294620db..1aae3fdb0c2a 100644
1849 --- a/arch/powerpc/kernel/entry_64.S
1850 +++ b/arch/powerpc/kernel/entry_64.S
1851 @@ -683,7 +683,7 @@ _GLOBAL(ret_from_except_lite)
1852  #else
1853         beq     restore
1854  #endif
1855 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1856 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1857         beq     2f
1858         bl      restore_interrupts
1859         SCHEDULE_USER
1860 @@ -745,10 +745,18 @@ resume_kernel:
1861  
1862  #ifdef CONFIG_PREEMPT
1863         /* Check if we need to preempt */
1864 +       lwz     r8,TI_PREEMPT(r9)
1865 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1866 +       bne     restore
1867         andi.   r0,r4,_TIF_NEED_RESCHED
1868 +       bne+    check_count
1869 +
1870 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1871         beq+    restore
1872 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1873 +
1874         /* Check that preempt_count() == 0 and interrupts are enabled */
1875 -       lwz     r8,TI_PREEMPT(r9)
1876 +check_count:
1877         cmpwi   cr1,r8,0
1878         ld      r0,SOFTE(r1)
1879         cmpdi   r0,0
1880 @@ -765,7 +773,7 @@ resume_kernel:
1881         /* Re-test flags and eventually loop */
1882         CURRENT_THREAD_INFO(r9, r1)
1883         ld      r4,TI_FLAGS(r9)
1884 -       andi.   r0,r4,_TIF_NEED_RESCHED
1885 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1886         bne     1b
1887  
1888         /*
1889 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1890 index 290559df1e8b..070afa6da35d 100644
1891 --- a/arch/powerpc/kernel/irq.c
1892 +++ b/arch/powerpc/kernel/irq.c
1893 @@ -614,6 +614,7 @@ void irq_ctx_init(void)
1894         }
1895  }
1896  
1897 +#ifndef CONFIG_PREEMPT_RT_FULL
1898  void do_softirq_own_stack(void)
1899  {
1900         struct thread_info *curtp, *irqtp;
1901 @@ -631,6 +632,7 @@ void do_softirq_own_stack(void)
1902         if (irqtp->flags)
1903                 set_bits(irqtp->flags, &curtp->flags);
1904  }
1905 +#endif
1906  
1907  irq_hw_number_t virq_to_hw(unsigned int virq)
1908  {
1909 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1910 index ed3ab509faca..8b261416c070 100644
1911 --- a/arch/powerpc/kernel/misc_32.S
1912 +++ b/arch/powerpc/kernel/misc_32.S
1913 @@ -40,6 +40,7 @@
1914   * We store the saved ksp_limit in the unused part
1915   * of the STACK_FRAME_OVERHEAD
1916   */
1917 +#ifndef CONFIG_PREEMPT_RT_FULL
1918  _GLOBAL(call_do_softirq)
1919         mflr    r0
1920         stw     r0,4(r1)
1921 @@ -56,6 +57,7 @@ _GLOBAL(call_do_softirq)
1922         stw     r10,THREAD+KSP_LIMIT(r2)
1923         mtlr    r0
1924         blr
1925 +#endif
1926  
1927  /*
1928   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1929 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1930 index db475d41b57a..96b7ef80e05d 100644
1931 --- a/arch/powerpc/kernel/misc_64.S
1932 +++ b/arch/powerpc/kernel/misc_64.S
1933 @@ -30,6 +30,7 @@
1934  
1935         .text
1936  
1937 +#ifndef CONFIG_PREEMPT_RT_FULL
1938  _GLOBAL(call_do_softirq)
1939         mflr    r0
1940         std     r0,16(r1)
1941 @@ -40,6 +41,7 @@ _GLOBAL(call_do_softirq)
1942         ld      r0,16(r1)
1943         mtlr    r0
1944         blr
1945 +#endif
1946  
1947  _GLOBAL(call_do_irq)
1948         mflr    r0
1949 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1950 index c2024ac9d4e8..2303788da7e1 100644
1951 --- a/arch/powerpc/kvm/Kconfig
1952 +++ b/arch/powerpc/kvm/Kconfig
1953 @@ -172,6 +172,7 @@ config KVM_E500MC
1954  config KVM_MPIC
1955         bool "KVM in-kernel MPIC emulation"
1956         depends on KVM && E500
1957 +       depends on !PREEMPT_RT_FULL
1958         select HAVE_KVM_IRQCHIP
1959         select HAVE_KVM_IRQFD
1960         select HAVE_KVM_IRQ_ROUTING
1961 diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
1962 index a7352b59e6f9..df34a6432873 100644
1963 --- a/arch/powerpc/kvm/book3s_hv.c
1964 +++ b/arch/powerpc/kvm/book3s_hv.c
1965 @@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
1966  static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
1967  {
1968         int cpu;
1969 -       wait_queue_head_t *wqp;
1970 +       struct swait_queue_head *wqp;
1971  
1972         wqp = kvm_arch_vcpu_wq(vcpu);
1973 -       if (waitqueue_active(wqp)) {
1974 -               wake_up_interruptible(wqp);
1975 +       if (swait_active(wqp)) {
1976 +               swake_up(wqp);
1977                 ++vcpu->stat.halt_wakeup;
1978         }
1979  
1980 @@ -707,8 +707,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
1981                 tvcpu->arch.prodded = 1;
1982                 smp_mb();
1983                 if (vcpu->arch.ceded) {
1984 -                       if (waitqueue_active(&vcpu->wq)) {
1985 -                               wake_up_interruptible(&vcpu->wq);
1986 +                       if (swait_active(&vcpu->wq)) {
1987 +                               swake_up(&vcpu->wq);
1988                                 vcpu->stat.halt_wakeup++;
1989                         }
1990                 }
1991 @@ -1447,7 +1447,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
1992         INIT_LIST_HEAD(&vcore->runnable_threads);
1993         spin_lock_init(&vcore->lock);
1994         spin_lock_init(&vcore->stoltb_lock);
1995 -       init_waitqueue_head(&vcore->wq);
1996 +       init_swait_queue_head(&vcore->wq);
1997         vcore->preempt_tb = TB_NIL;
1998         vcore->lpcr = kvm->arch.lpcr;
1999         vcore->first_vcpuid = core * threads_per_subcore;
2000 @@ -2519,10 +2519,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2001  {
2002         struct kvm_vcpu *vcpu;
2003         int do_sleep = 1;
2004 +       DECLARE_SWAITQUEUE(wait);
2005  
2006 -       DEFINE_WAIT(wait);
2007 -
2008 -       prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2009 +       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2010  
2011         /*
2012          * Check one last time for pending exceptions and ceded state after
2013 @@ -2536,7 +2535,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2014         }
2015  
2016         if (!do_sleep) {
2017 -               finish_wait(&vc->wq, &wait);
2018 +               finish_swait(&vc->wq, &wait);
2019                 return;
2020         }
2021  
2022 @@ -2544,7 +2543,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2023         trace_kvmppc_vcore_blocked(vc, 0);
2024         spin_unlock(&vc->lock);
2025         schedule();
2026 -       finish_wait(&vc->wq, &wait);
2027 +       finish_swait(&vc->wq, &wait);
2028         spin_lock(&vc->lock);
2029         vc->vcore_state = VCORE_INACTIVE;
2030         trace_kvmppc_vcore_blocked(vc, 1);
2031 @@ -2600,7 +2599,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2032                         kvmppc_start_thread(vcpu, vc);
2033                         trace_kvm_guest_enter(vcpu);
2034                 } else if (vc->vcore_state == VCORE_SLEEPING) {
2035 -                       wake_up(&vc->wq);
2036 +                       swake_up(&vc->wq);
2037                 }
2038  
2039         }
2040 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
2041 index 3f175e8aedb4..c4c02f91904c 100644
2042 --- a/arch/powerpc/platforms/ps3/device-init.c
2043 +++ b/arch/powerpc/platforms/ps3/device-init.c
2044 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
2045         }
2046         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
2047  
2048 -       res = wait_event_interruptible(dev->done.wait,
2049 +       res = swait_event_interruptible(dev->done.wait,
2050                                        dev->done.done || kthread_should_stop());
2051         if (kthread_should_stop())
2052                 res = -EINTR;
2053 diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
2054 index e9a983f40a24..bbdc539fb3c6 100644
2055 --- a/arch/s390/include/asm/kvm_host.h
2056 +++ b/arch/s390/include/asm/kvm_host.h
2057 @@ -427,7 +427,7 @@ struct kvm_s390_irq_payload {
2058  struct kvm_s390_local_interrupt {
2059         spinlock_t lock;
2060         struct kvm_s390_float_interrupt *float_int;
2061 -       wait_queue_head_t *wq;
2062 +       struct swait_queue_head *wq;
2063         atomic_t *cpuflags;
2064         DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
2065         struct kvm_s390_irq_payload irq;
2066 diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
2067 index 6a75352f453c..cc862c486002 100644
2068 --- a/arch/s390/kvm/interrupt.c
2069 +++ b/arch/s390/kvm/interrupt.c
2070 @@ -868,13 +868,13 @@ no_timer:
2071  
2072  void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
2073  {
2074 -       if (waitqueue_active(&vcpu->wq)) {
2075 +       if (swait_active(&vcpu->wq)) {
2076                 /*
2077                  * The vcpu gave up the cpu voluntarily, mark it as a good
2078                  * yield-candidate.
2079                  */
2080                 vcpu->preempted = true;
2081 -               wake_up_interruptible(&vcpu->wq);
2082 +               swake_up(&vcpu->wq);
2083                 vcpu->stat.halt_wakeup++;
2084         }
2085  }
2086 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
2087 index 6c0378c0b8b5..abd58b4dff97 100644
2088 --- a/arch/sh/kernel/irq.c
2089 +++ b/arch/sh/kernel/irq.c
2090 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
2091         hardirq_ctx[cpu] = NULL;
2092  }
2093  
2094 +#ifndef CONFIG_PREEMPT_RT_FULL
2095  void do_softirq_own_stack(void)
2096  {
2097         struct thread_info *curctx;
2098 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
2099                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
2100         );
2101  }
2102 +#endif
2103  #else
2104  static inline void handle_one_irq(unsigned int irq)
2105  {
2106 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
2107 index 56442d2d7bbc..8c9598f534c9 100644
2108 --- a/arch/sparc/Kconfig
2109 +++ b/arch/sparc/Kconfig
2110 @@ -189,12 +189,10 @@ config NR_CPUS
2111  source kernel/Kconfig.hz
2112  
2113  config RWSEM_GENERIC_SPINLOCK
2114 -       bool
2115 -       default y if SPARC32
2116 +       def_bool PREEMPT_RT_FULL
2117  
2118  config RWSEM_XCHGADD_ALGORITHM
2119 -       bool
2120 -       default y if SPARC64
2121 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2122  
2123  config GENERIC_HWEIGHT
2124         bool
2125 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
2126 index e22416ce56ea..d359de71153a 100644
2127 --- a/arch/sparc/kernel/irq_64.c
2128 +++ b/arch/sparc/kernel/irq_64.c
2129 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
2130         set_irq_regs(old_regs);
2131  }
2132  
2133 +#ifndef CONFIG_PREEMPT_RT_FULL
2134  void do_softirq_own_stack(void)
2135  {
2136         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
2137 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
2138         __asm__ __volatile__("mov %0, %%sp"
2139                              : : "r" (orig_sp));
2140  }
2141 +#endif
2142  
2143  #ifdef CONFIG_HOTPLUG_CPU
2144  void fixup_irqs(void)
2145 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
2146 index 436639a31624..6ee1dd0deadc 100644
2147 --- a/arch/x86/Kconfig
2148 +++ b/arch/x86/Kconfig
2149 @@ -17,6 +17,7 @@ config X86_64
2150  ### Arch settings
2151  config X86
2152         def_bool y
2153 +       select HAVE_PREEMPT_LAZY
2154         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
2155         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
2156         select ANON_INODES
2157 @@ -212,8 +213,11 @@ config ARCH_MAY_HAVE_PC_FDC
2158         def_bool y
2159         depends on ISA_DMA_API
2160  
2161 +config RWSEM_GENERIC_SPINLOCK
2162 +       def_bool PREEMPT_RT_FULL
2163 +
2164  config RWSEM_XCHGADD_ALGORITHM
2165 -       def_bool y
2166 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2167  
2168  config GENERIC_CALIBRATE_DELAY
2169         def_bool y
2170 @@ -848,7 +852,7 @@ config IOMMU_HELPER
2171  config MAXSMP
2172         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
2173         depends on X86_64 && SMP && DEBUG_KERNEL
2174 -       select CPUMASK_OFFSTACK
2175 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
2176         ---help---
2177           Enable maximum number of CPUS and NUMA Nodes for this architecture.
2178           If unsure, say N.
2179 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
2180 index 3633ad6145c5..c6d5458ee7f9 100644
2181 --- a/arch/x86/crypto/aesni-intel_glue.c
2182 +++ b/arch/x86/crypto/aesni-intel_glue.c
2183 @@ -383,14 +383,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
2184         err = blkcipher_walk_virt(desc, &walk);
2185         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2186  
2187 -       kernel_fpu_begin();
2188         while ((nbytes = walk.nbytes)) {
2189 +               kernel_fpu_begin();
2190                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2191 -                             nbytes & AES_BLOCK_MASK);
2192 +                               nbytes & AES_BLOCK_MASK);
2193 +               kernel_fpu_end();
2194                 nbytes &= AES_BLOCK_SIZE - 1;
2195                 err = blkcipher_walk_done(desc, &walk, nbytes);
2196         }
2197 -       kernel_fpu_end();
2198  
2199         return err;
2200  }
2201 @@ -407,14 +407,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
2202         err = blkcipher_walk_virt(desc, &walk);
2203         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2204  
2205 -       kernel_fpu_begin();
2206         while ((nbytes = walk.nbytes)) {
2207 +               kernel_fpu_begin();
2208                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2209                               nbytes & AES_BLOCK_MASK);
2210 +               kernel_fpu_end();
2211                 nbytes &= AES_BLOCK_SIZE - 1;
2212                 err = blkcipher_walk_done(desc, &walk, nbytes);
2213         }
2214 -       kernel_fpu_end();
2215  
2216         return err;
2217  }
2218 @@ -431,14 +431,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
2219         err = blkcipher_walk_virt(desc, &walk);
2220         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2221  
2222 -       kernel_fpu_begin();
2223         while ((nbytes = walk.nbytes)) {
2224 +               kernel_fpu_begin();
2225                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2226                               nbytes & AES_BLOCK_MASK, walk.iv);
2227 +               kernel_fpu_end();
2228                 nbytes &= AES_BLOCK_SIZE - 1;
2229                 err = blkcipher_walk_done(desc, &walk, nbytes);
2230         }
2231 -       kernel_fpu_end();
2232  
2233         return err;
2234  }
2235 @@ -455,14 +455,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
2236         err = blkcipher_walk_virt(desc, &walk);
2237         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2238  
2239 -       kernel_fpu_begin();
2240         while ((nbytes = walk.nbytes)) {
2241 +               kernel_fpu_begin();
2242                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2243                               nbytes & AES_BLOCK_MASK, walk.iv);
2244 +               kernel_fpu_end();
2245                 nbytes &= AES_BLOCK_SIZE - 1;
2246                 err = blkcipher_walk_done(desc, &walk, nbytes);
2247         }
2248 -       kernel_fpu_end();
2249  
2250         return err;
2251  }
2252 @@ -514,18 +514,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
2253         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
2254         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2255  
2256 -       kernel_fpu_begin();
2257         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
2258 +               kernel_fpu_begin();
2259                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2260                                       nbytes & AES_BLOCK_MASK, walk.iv);
2261 +               kernel_fpu_end();
2262                 nbytes &= AES_BLOCK_SIZE - 1;
2263                 err = blkcipher_walk_done(desc, &walk, nbytes);
2264         }
2265         if (walk.nbytes) {
2266 +               kernel_fpu_begin();
2267                 ctr_crypt_final(ctx, &walk);
2268 +               kernel_fpu_end();
2269                 err = blkcipher_walk_done(desc, &walk, 0);
2270         }
2271 -       kernel_fpu_end();
2272  
2273         return err;
2274  }
2275 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
2276 index 8648158f3916..d7699130ee36 100644
2277 --- a/arch/x86/crypto/cast5_avx_glue.c
2278 +++ b/arch/x86/crypto/cast5_avx_glue.c
2279 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
2280  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2281                      bool enc)
2282  {
2283 -       bool fpu_enabled = false;
2284 +       bool fpu_enabled;
2285         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
2286         const unsigned int bsize = CAST5_BLOCK_SIZE;
2287         unsigned int nbytes;
2288 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2289                 u8 *wsrc = walk->src.virt.addr;
2290                 u8 *wdst = walk->dst.virt.addr;
2291  
2292 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2293 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2294  
2295                 /* Process multi-block batch */
2296                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
2297 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2298                 } while (nbytes >= bsize);
2299  
2300  done:
2301 +               cast5_fpu_end(fpu_enabled);
2302                 err = blkcipher_walk_done(desc, walk, nbytes);
2303         }
2304 -
2305 -       cast5_fpu_end(fpu_enabled);
2306         return err;
2307  }
2308  
2309 @@ -227,7 +226,7 @@ done:
2310  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2311                        struct scatterlist *src, unsigned int nbytes)
2312  {
2313 -       bool fpu_enabled = false;
2314 +       bool fpu_enabled;
2315         struct blkcipher_walk walk;
2316         int err;
2317  
2318 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2319         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2320  
2321         while ((nbytes = walk.nbytes)) {
2322 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2323 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2324                 nbytes = __cbc_decrypt(desc, &walk);
2325 +               cast5_fpu_end(fpu_enabled);
2326                 err = blkcipher_walk_done(desc, &walk, nbytes);
2327         }
2328 -
2329 -       cast5_fpu_end(fpu_enabled);
2330         return err;
2331  }
2332  
2333 @@ -311,7 +309,7 @@ done:
2334  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2335                      struct scatterlist *src, unsigned int nbytes)
2336  {
2337 -       bool fpu_enabled = false;
2338 +       bool fpu_enabled;
2339         struct blkcipher_walk walk;
2340         int err;
2341  
2342 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2343         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2344  
2345         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
2346 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2347 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2348                 nbytes = __ctr_crypt(desc, &walk);
2349 +               cast5_fpu_end(fpu_enabled);
2350                 err = blkcipher_walk_done(desc, &walk, nbytes);
2351         }
2352  
2353 -       cast5_fpu_end(fpu_enabled);
2354 -
2355         if (walk.nbytes) {
2356                 ctr_crypt_final(desc, &walk);
2357                 err = blkcipher_walk_done(desc, &walk, 0);
2358 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
2359 index 6a85598931b5..3a506ce7ed93 100644
2360 --- a/arch/x86/crypto/glue_helper.c
2361 +++ b/arch/x86/crypto/glue_helper.c
2362 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2363         void *ctx = crypto_blkcipher_ctx(desc->tfm);
2364         const unsigned int bsize = 128 / 8;
2365         unsigned int nbytes, i, func_bytes;
2366 -       bool fpu_enabled = false;
2367 +       bool fpu_enabled;
2368         int err;
2369  
2370         err = blkcipher_walk_virt(desc, walk);
2371 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2372                 u8 *wdst = walk->dst.virt.addr;
2373  
2374                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2375 -                                            desc, fpu_enabled, nbytes);
2376 +                                            desc, false, nbytes);
2377  
2378                 for (i = 0; i < gctx->num_funcs; i++) {
2379                         func_bytes = bsize * gctx->funcs[i].num_blocks;
2380 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2381                 }
2382  
2383  done:
2384 +               glue_fpu_end(fpu_enabled);
2385                 err = blkcipher_walk_done(desc, walk, nbytes);
2386         }
2387  
2388 -       glue_fpu_end(fpu_enabled);
2389         return err;
2390  }
2391  
2392 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
2393                             struct scatterlist *src, unsigned int nbytes)
2394  {
2395         const unsigned int bsize = 128 / 8;
2396 -       bool fpu_enabled = false;
2397 +       bool fpu_enabled;
2398         struct blkcipher_walk walk;
2399         int err;
2400  
2401 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
2402  
2403         while ((nbytes = walk.nbytes)) {
2404                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2405 -                                            desc, fpu_enabled, nbytes);
2406 +                                            desc, false, nbytes);
2407                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
2408 +               glue_fpu_end(fpu_enabled);
2409                 err = blkcipher_walk_done(desc, &walk, nbytes);
2410         }
2411  
2412 -       glue_fpu_end(fpu_enabled);
2413         return err;
2414  }
2415  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
2416 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
2417                           struct scatterlist *src, unsigned int nbytes)
2418  {
2419         const unsigned int bsize = 128 / 8;
2420 -       bool fpu_enabled = false;
2421 +       bool fpu_enabled;
2422         struct blkcipher_walk walk;
2423         int err;
2424  
2425 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
2426  
2427         while ((nbytes = walk.nbytes) >= bsize) {
2428                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2429 -                                            desc, fpu_enabled, nbytes);
2430 +                                            desc, false, nbytes);
2431                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
2432 +               glue_fpu_end(fpu_enabled);
2433                 err = blkcipher_walk_done(desc, &walk, nbytes);
2434         }
2435  
2436 -       glue_fpu_end(fpu_enabled);
2437 -
2438         if (walk.nbytes) {
2439                 glue_ctr_crypt_final_128bit(
2440                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
2441 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
2442                           void *tweak_ctx, void *crypt_ctx)
2443  {
2444         const unsigned int bsize = 128 / 8;
2445 -       bool fpu_enabled = false;
2446 +       bool fpu_enabled;
2447         struct blkcipher_walk walk;
2448         int err;
2449  
2450 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
2451  
2452         /* set minimum length to bsize, for tweak_fn */
2453         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2454 -                                    desc, fpu_enabled,
2455 +                                    desc, false,
2456                                      nbytes < bsize ? bsize : nbytes);
2457 -
2458         /* calculate first value of T */
2459         tweak_fn(tweak_ctx, walk.iv, walk.iv);
2460 +       glue_fpu_end(fpu_enabled);
2461  
2462         while (nbytes) {
2463 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2464 +                               desc, false, nbytes);
2465                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
2466  
2467 +               glue_fpu_end(fpu_enabled);
2468                 err = blkcipher_walk_done(desc, &walk, nbytes);
2469                 nbytes = walk.nbytes;
2470         }
2471 -
2472 -       glue_fpu_end(fpu_enabled);
2473 -
2474         return err;
2475  }
2476  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
2477 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
2478 index 1a4477cedc49..75a301b6a5b6 100644
2479 --- a/arch/x86/entry/common.c
2480 +++ b/arch/x86/entry/common.c
2481 @@ -220,7 +220,7 @@ long syscall_trace_enter(struct pt_regs *regs)
2482  
2483  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
2484         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
2485 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
2486 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
2487  
2488  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2489  {
2490 @@ -236,9 +236,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2491                 /* We have work to do. */
2492                 local_irq_enable();
2493  
2494 -               if (cached_flags & _TIF_NEED_RESCHED)
2495 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
2496                         schedule();
2497  
2498 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2499 +               if (unlikely(current->forced_info.si_signo)) {
2500 +                       struct task_struct *t = current;
2501 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2502 +                       t->forced_info.si_signo = 0;
2503 +               }
2504 +#endif
2505                 if (cached_flags & _TIF_UPROBE)
2506                         uprobe_notify_resume(regs);
2507  
2508 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
2509 index f3b6d54e0042..2d722ee01fc2 100644
2510 --- a/arch/x86/entry/entry_32.S
2511 +++ b/arch/x86/entry/entry_32.S
2512 @@ -278,8 +278,24 @@ END(ret_from_exception)
2513  ENTRY(resume_kernel)
2514         DISABLE_INTERRUPTS(CLBR_ANY)
2515  need_resched:
2516 +       # preempt count == 0 + NEED_RS set?
2517         cmpl    $0, PER_CPU_VAR(__preempt_count)
2518 +#ifndef CONFIG_PREEMPT_LAZY
2519         jnz     restore_all
2520 +#else
2521 +       jz test_int_off
2522 +
2523 +       # at least preempt count == 0 ?
2524 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2525 +       jne restore_all
2526 +
2527 +       cmpl $0,TI_preempt_lazy_count(%ebp)     # non-zero preempt_lazy_count ?
2528 +       jnz restore_all
2529 +
2530 +       testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp)
2531 +       jz restore_all
2532 +test_int_off:
2533 +#endif
2534         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2535         jz      restore_all
2536         call    preempt_schedule_irq
2537 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2538 index a55697d19824..316081a2ca85 100644
2539 --- a/arch/x86/entry/entry_64.S
2540 +++ b/arch/x86/entry/entry_64.S
2541 @@ -579,7 +579,23 @@ retint_kernel:
2542         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2543         jnc     1f
2544  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2545 +#ifndef CONFIG_PREEMPT_LAZY
2546         jnz     1f
2547 +#else
2548 +       jz      do_preempt_schedule_irq
2549 +
2550 +       # at least preempt count == 0 ?
2551 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2552 +       jnz     1f
2553 +
2554 +       GET_THREAD_INFO(%rcx)
2555 +       cmpl    $0, TI_preempt_lazy_count(%rcx)
2556 +       jnz     1f
2557 +
2558 +       bt      $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
2559 +       jnc     1f
2560 +do_preempt_schedule_irq:
2561 +#endif
2562         call    preempt_schedule_irq
2563         jmp     0b
2564  1:
2565 @@ -867,6 +883,7 @@ bad_gs:
2566         jmp     2b
2567         .previous
2568  
2569 +#ifndef CONFIG_PREEMPT_RT_FULL
2570  /* Call softirq on interrupt stack. Interrupts are off. */
2571  ENTRY(do_softirq_own_stack)
2572         pushq   %rbp
2573 @@ -879,6 +896,7 @@ ENTRY(do_softirq_own_stack)
2574         decl    PER_CPU_VAR(irq_count)
2575         ret
2576  END(do_softirq_own_stack)
2577 +#endif
2578  
2579  #ifdef CONFIG_XEN
2580  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2581 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2582 index 01bcde84d3e4..6f432adc55cd 100644
2583 --- a/arch/x86/include/asm/preempt.h
2584 +++ b/arch/x86/include/asm/preempt.h
2585 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2586   * a decrement which hits zero means we have no preempt_count and should
2587   * reschedule.
2588   */
2589 -static __always_inline bool __preempt_count_dec_and_test(void)
2590 +static __always_inline bool ____preempt_count_dec_and_test(void)
2591  {
2592         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
2593  }
2594  
2595 +static __always_inline bool __preempt_count_dec_and_test(void)
2596 +{
2597 +       if (____preempt_count_dec_and_test())
2598 +               return true;
2599 +#ifdef CONFIG_PREEMPT_LAZY
2600 +       if (current_thread_info()->preempt_lazy_count)
2601 +               return false;
2602 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2603 +#else
2604 +       return false;
2605 +#endif
2606 +}
2607 +
2608  /*
2609   * Returns true when we need to resched and can (barring IRQ state).
2610   */
2611  static __always_inline bool should_resched(int preempt_offset)
2612  {
2613 +#ifdef CONFIG_PREEMPT_LAZY
2614 +       u32 tmp;
2615 +
2616 +       tmp = raw_cpu_read_4(__preempt_count);
2617 +       if (tmp == preempt_offset)
2618 +               return true;
2619 +
2620 +       /* preempt count == 0 ? */
2621 +       tmp &= ~PREEMPT_NEED_RESCHED;
2622 +       if (tmp)
2623 +               return false;
2624 +       if (current_thread_info()->preempt_lazy_count)
2625 +               return false;
2626 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2627 +#else
2628         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2629 +#endif
2630  }
2631  
2632  #ifdef CONFIG_PREEMPT
2633 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2634 index 2138c9ae19ee..3f5b4ee2e2c1 100644
2635 --- a/arch/x86/include/asm/signal.h
2636 +++ b/arch/x86/include/asm/signal.h
2637 @@ -23,6 +23,19 @@ typedef struct {
2638         unsigned long sig[_NSIG_WORDS];
2639  } sigset_t;
2640  
2641 +/*
2642 + * Because some traps use the IST stack, we must keep preemption
2643 + * disabled while calling do_trap(), but do_trap() may call
2644 + * force_sig_info() which will grab the signal spin_locks for the
2645 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2646 + * ARCH_RT_DELAYS_SIGNAL_SEND, force_sig_info() will set
2647 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2648 + * trap.
2649 + */
2650 +#if defined(CONFIG_PREEMPT_RT_FULL)
2651 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2652 +#endif
2653 +
2654  #ifndef CONFIG_COMPAT
2655  typedef sigset_t compat_sigset_t;
2656  #endif
2657 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2658 index 58505f01962f..02fa39652cd6 100644
2659 --- a/arch/x86/include/asm/stackprotector.h
2660 +++ b/arch/x86/include/asm/stackprotector.h
2661 @@ -59,7 +59,7 @@
2662   */
2663  static __always_inline void boot_init_stack_canary(void)
2664  {
2665 -       u64 canary;
2666 +       u64 uninitialized_var(canary);
2667         u64 tsc;
2668  
2669  #ifdef CONFIG_X86_64
2670 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2671          * of randomness. The TSC only matters for very early init,
2672          * there it already has some randomness on most systems. Later
2673          * on during the bootup the random pool has true entropy too.
2674 +        *
2675 +        * For preempt-rt we need to weaken the randomness a bit, as
2676 +        * we can't call into the random generator from atomic context
2677 +        * due to locking constraints. We just leave the canary
2678 +        * uninitialized and use the TSC-based randomness on top of it.
2679          */
2680 +#ifndef CONFIG_PREEMPT_RT_FULL
2681         get_random_bytes(&canary, sizeof(canary));
2682 +#endif
2683         tsc = rdtsc();
2684         canary += tsc + (tsc << 32UL);
2685  
2686 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2687 index c7b551028740..ddb63bd90e3c 100644
2688 --- a/arch/x86/include/asm/thread_info.h
2689 +++ b/arch/x86/include/asm/thread_info.h
2690 @@ -58,6 +58,8 @@ struct thread_info {
2691         __u32                   status;         /* thread synchronous flags */
2692         __u32                   cpu;            /* current CPU */
2693         mm_segment_t            addr_limit;
2694 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2695 +                                                         <0 => BUG */
2696         unsigned int            sig_on_uaccess_error:1;
2697         unsigned int            uaccess_err:1;  /* uaccess failed */
2698  };
2699 @@ -95,6 +97,7 @@ struct thread_info {
2700  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2701  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2702  #define TIF_SECCOMP            8       /* secure computing */
2703 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2704  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2705  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2706  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2707 @@ -119,6 +122,7 @@ struct thread_info {
2708  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2709  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2710  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2711 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2712  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2713  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2714  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2715 @@ -152,6 +156,8 @@ struct thread_info {
2716  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2717  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2718  
2719 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2720 +
2721  #define STACK_WARN             (THREAD_SIZE/8)
2722  
2723  /*
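TIF_NEED_RESCHED_LAZY is the x86 half of the RT "lazy preemption" scheme: SCHED_OTHER wakeups only set the lazy bit, while real-time wakeups still set TIF_NEED_RESCHED, and the exit paths test the combined _TIF_NEED_RESCHED_MASK. The decision the entry code ends up making is roughly the following; should_preempt_now() is a made-up name for illustration (the real test lives in the entry assembly, which is why asm-offsets.c below exports TI_preempt_lazy_count):

       static inline bool should_preempt_now(struct thread_info *ti)
       {
               if (preempt_count() != 0)
                       return false;                   /* preemption disabled */
               if (ti->flags & _TIF_NEED_RESCHED)
                       return true;                    /* hard request: always */
               /* lazy request: honoured only when the lazy count is zero too */
               return (ti->flags & _TIF_NEED_RESCHED_LAZY) &&
                      ti->preempt_lazy_count == 0;
       }
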
2724 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2725 index fc808b83fccb..ebb40118abf5 100644
2726 --- a/arch/x86/include/asm/uv/uv_bau.h
2727 +++ b/arch/x86/include/asm/uv/uv_bau.h
2728 @@ -615,9 +615,9 @@ struct bau_control {
2729         cycles_t                send_message;
2730         cycles_t                period_end;
2731         cycles_t                period_time;
2732 -       spinlock_t              uvhub_lock;
2733 -       spinlock_t              queue_lock;
2734 -       spinlock_t              disable_lock;
2735 +       raw_spinlock_t          uvhub_lock;
2736 +       raw_spinlock_t          queue_lock;
2737 +       raw_spinlock_t          disable_lock;
2738         /* tunables */
2739         int                     max_concurr;
2740         int                     max_concurr_const;
2741 @@ -776,15 +776,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2742   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2743   * on equal.
2744   */
2745 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2746 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2747  {
2748 -       spin_lock(lock);
2749 +       raw_spin_lock(lock);
2750         if (atomic_read(v) >= u) {
2751 -               spin_unlock(lock);
2752 +               raw_spin_unlock(lock);
2753                 return 0;
2754         }
2755         atomic_inc(v);
2756 -       spin_unlock(lock);
2757 +       raw_spin_unlock(lock);
2758         return 1;
2759  }
2760  
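Under PREEMPT_RT_FULL a spinlock_t becomes a sleeping lock (an rtmutex), so locks taken from contexts that must not sleep -- here the BAU paths that run with interrupts disabled -- are converted to raw_spinlock_t, which remains a true spinning lock on RT. The calling convention is unchanged apart from the raw_ prefix; a minimal usage sketch with an illustrative lock:

       static DEFINE_RAW_SPINLOCK(example_lock);

       static void example_atomic_section(void)
       {
               unsigned long flags;

               raw_spin_lock_irqsave(&example_lock, flags);
               /* short, bounded critical section; never sleeps */
               raw_spin_unlock_irqrestore(&example_lock, flags);
       }

The same conversion is applied to uv_hub.h, the RAPL PMU and the UV RTC code further down.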
2761 diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
2762 index ea7074784cc4..01ec643ce66e 100644
2763 --- a/arch/x86/include/asm/uv/uv_hub.h
2764 +++ b/arch/x86/include/asm/uv/uv_hub.h
2765 @@ -492,7 +492,7 @@ struct uv_blade_info {
2766         unsigned short  nr_online_cpus;
2767         unsigned short  pnode;
2768         short           memory_nid;
2769 -       spinlock_t      nmi_lock;       /* obsolete, see uv_hub_nmi */
2770 +       raw_spinlock_t  nmi_lock;       /* obsolete, see uv_hub_nmi */
2771         unsigned long   nmi_count;      /* obsolete, see uv_hub_nmi */
2772  };
2773  extern struct uv_blade_info *uv_blade_info;
2774 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
2775 index e75907601a41..a29fc4f84fc4 100644
2776 --- a/arch/x86/kernel/acpi/boot.c
2777 +++ b/arch/x86/kernel/acpi/boot.c
2778 @@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
2779   *             ->ioapic_mutex
2780   *                     ->ioapic_lock
2781   */
2782 +#ifdef CONFIG_X86_IO_APIC
2783  static DEFINE_MUTEX(acpi_ioapic_lock);
2784 +#endif
2785  
2786  /* --------------------------------------------------------------------------
2787                                Boot-time Configuration
2788 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2789 index fdb0fbfb1197..678c711e2a16 100644
2790 --- a/arch/x86/kernel/apic/io_apic.c
2791 +++ b/arch/x86/kernel/apic/io_apic.c
2792 @@ -1711,7 +1711,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2793  static inline bool ioapic_irqd_mask(struct irq_data *data)
2794  {
2795         /* If we are moving the irq we need to mask it */
2796 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2797 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2798 +                    !irqd_irq_inprogress(data))) {
2799                 mask_ioapic_irq(data);
2800                 return true;
2801         }
2802 diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
2803 index 4a139465f1d4..ad2afff02b36 100644
2804 --- a/arch/x86/kernel/apic/x2apic_uv_x.c
2805 +++ b/arch/x86/kernel/apic/x2apic_uv_x.c
2806 @@ -947,7 +947,7 @@ void __init uv_system_init(void)
2807                         uv_blade_info[blade].pnode = pnode;
2808                         uv_blade_info[blade].nr_possible_cpus = 0;
2809                         uv_blade_info[blade].nr_online_cpus = 0;
2810 -                       spin_lock_init(&uv_blade_info[blade].nmi_lock);
2811 +                       raw_spin_lock_init(&uv_blade_info[blade].nmi_lock);
2812                         min_pnode = min(pnode, min_pnode);
2813                         max_pnode = max(pnode, max_pnode);
2814                         blade++;
2815 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2816 index 439df975bc7a..b7954ddd6a0a 100644
2817 --- a/arch/x86/kernel/asm-offsets.c
2818 +++ b/arch/x86/kernel/asm-offsets.c
2819 @@ -32,6 +32,7 @@ void common(void) {
2820         OFFSET(TI_flags, thread_info, flags);
2821         OFFSET(TI_status, thread_info, status);
2822         OFFSET(TI_addr_limit, thread_info, addr_limit);
2823 +       OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count);
2824  
2825         BLANK();
2826         OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
2827 @@ -89,4 +90,5 @@ void common(void) {
2828  
2829         BLANK();
2830         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2831 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2832  }
2833 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2834 index 7e8a736d09db..430a4ec07811 100644
2835 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2836 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2837 @@ -41,6 +41,8 @@
2838  #include <linux/debugfs.h>
2839  #include <linux/irq_work.h>
2840  #include <linux/export.h>
2841 +#include <linux/jiffies.h>
2842 +#include <linux/swork.h>
2843  
2844  #include <asm/processor.h>
2845  #include <asm/traps.h>
2846 @@ -1236,7 +1238,7 @@ void mce_log_therm_throt_event(__u64 status)
2847  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2848  
2849  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2850 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2851 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2852  
2853  static unsigned long mce_adjust_timer_default(unsigned long interval)
2854  {
2855 @@ -1245,32 +1247,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2856  
2857  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2858  
2859 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2860 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2861  {
2862 -       unsigned long when = jiffies + interval;
2863 -       unsigned long flags;
2864 -
2865 -       local_irq_save(flags);
2866 -
2867 -       if (timer_pending(t)) {
2868 -               if (time_before(when, t->expires))
2869 -                       mod_timer_pinned(t, when);
2870 -       } else {
2871 -               t->expires = round_jiffies(when);
2872 -               add_timer_on(t, smp_processor_id());
2873 -       }
2874 -
2875 -       local_irq_restore(flags);
2876 +       if (!interval)
2877 +               return HRTIMER_NORESTART;
2878 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2879 +       return HRTIMER_RESTART;
2880  }
2881  
2882 -static void mce_timer_fn(unsigned long data)
2883 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2884  {
2885 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2886 -       int cpu = smp_processor_id();
2887         unsigned long iv;
2888  
2889 -       WARN_ON(cpu != data);
2890 -
2891         iv = __this_cpu_read(mce_next_interval);
2892  
2893         if (mce_available(this_cpu_ptr(&cpu_info))) {
2894 @@ -1293,7 +1281,7 @@ static void mce_timer_fn(unsigned long data)
2895  
2896  done:
2897         __this_cpu_write(mce_next_interval, iv);
2898 -       __restart_timer(t, iv);
2899 +       return __restart_timer(timer, iv);
2900  }
2901  
2902  /*
2903 @@ -1301,7 +1289,7 @@ done:
2904   */
2905  void mce_timer_kick(unsigned long interval)
2906  {
2907 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2908 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2909         unsigned long iv = __this_cpu_read(mce_next_interval);
2910  
2911         __restart_timer(t, interval);
2912 @@ -1316,7 +1304,7 @@ static void mce_timer_delete_all(void)
2913         int cpu;
2914  
2915         for_each_online_cpu(cpu)
2916 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2917 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2918  }
2919  
2920  static void mce_do_trigger(struct work_struct *work)
2921 @@ -1326,6 +1314,56 @@ static void mce_do_trigger(struct work_struct *work)
2922  
2923  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2924  
2925 +static void __mce_notify_work(struct swork_event *event)
2926 +{
2927 +       /* Not more than two messages every minute */
2928 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2929 +
2930 +       /* wake processes polling /dev/mcelog */
2931 +       wake_up_interruptible(&mce_chrdev_wait);
2932 +
2933 +       /*
2934 +        * There is no risk of missing notifications because
2935 +        * work_pending is always cleared before the function is
2936 +        * executed.
2937 +        */
2938 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2939 +               schedule_work(&mce_trigger_work);
2940 +
2941 +       if (__ratelimit(&ratelimit))
2942 +               pr_info(HW_ERR "Machine check events logged\n");
2943 +}
2944 +
2945 +#ifdef CONFIG_PREEMPT_RT_FULL
2946 +static bool notify_work_ready __read_mostly;
2947 +static struct swork_event notify_work;
2948 +
2949 +static int mce_notify_work_init(void)
2950 +{
2951 +       int err;
2952 +
2953 +       err = swork_get();
2954 +       if (err)
2955 +               return err;
2956 +
2957 +       INIT_SWORK(&notify_work, __mce_notify_work);
2958 +       notify_work_ready = true;
2959 +       return 0;
2960 +}
2961 +
2962 +static void mce_notify_work(void)
2963 +{
2964 +       if (notify_work_ready)
2965 +               swork_queue(&notify_work);
2966 +}
2967 +#else
2968 +static void mce_notify_work(void)
2969 +{
2970 +       __mce_notify_work(NULL);
2971 +}
2972 +static inline int mce_notify_work_init(void) { return 0; }
2973 +#endif
2974 +
2975  /*
2976   * Notify the user(s) about new machine check events.
2977   * Can be called from interrupt context, but not from machine check/NMI
2978 @@ -1333,19 +1371,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2979   */
2980  int mce_notify_irq(void)
2981  {
2982 -       /* Not more than two messages every minute */
2983 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2984 -
2985         if (test_and_clear_bit(0, &mce_need_notify)) {
2986 -               /* wake processes polling /dev/mcelog */
2987 -               wake_up_interruptible(&mce_chrdev_wait);
2988 -
2989 -               if (mce_helper[0])
2990 -                       schedule_work(&mce_trigger_work);
2991 -
2992 -               if (__ratelimit(&ratelimit))
2993 -                       pr_info(HW_ERR "Machine check events logged\n");
2994 -
2995 +               mce_notify_work();
2996                 return 1;
2997         }
2998         return 0;
2999 @@ -1639,7 +1666,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
3000         }
3001  }
3002  
3003 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
3004 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
3005  {
3006         unsigned long iv = check_interval * HZ;
3007  
3008 @@ -1648,16 +1675,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
3009  
3010         per_cpu(mce_next_interval, cpu) = iv;
3011  
3012 -       t->expires = round_jiffies(jiffies + iv);
3013 -       add_timer_on(t, cpu);
3014 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
3015 +                       0, HRTIMER_MODE_REL_PINNED);
3016  }
3017  
3018  static void __mcheck_cpu_init_timer(void)
3019  {
3020 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
3021 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
3022         unsigned int cpu = smp_processor_id();
3023  
3024 -       setup_timer(t, mce_timer_fn, cpu);
3025 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3026 +       t->function = mce_timer_fn;
3027         mce_start_timer(cpu, t);
3028  }
3029  
3030 @@ -2376,6 +2404,8 @@ static void mce_disable_cpu(void *h)
3031         if (!mce_available(raw_cpu_ptr(&cpu_info)))
3032                 return;
3033  
3034 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
3035 +
3036         if (!(action & CPU_TASKS_FROZEN))
3037                 cmci_clear();
3038  
3039 @@ -2398,6 +2428,7 @@ static void mce_reenable_cpu(void *h)
3040                 if (b->init)
3041                         wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
3042         }
3043 +       __mcheck_cpu_init_timer();
3044  }
3045  
3046  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
3047 @@ -2405,7 +2436,6 @@ static int
3048  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
3049  {
3050         unsigned int cpu = (unsigned long)hcpu;
3051 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
3052  
3053         switch (action & ~CPU_TASKS_FROZEN) {
3054         case CPU_ONLINE:
3055 @@ -2425,11 +2455,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
3056                 break;
3057         case CPU_DOWN_PREPARE:
3058                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
3059 -               del_timer_sync(t);
3060                 break;
3061         case CPU_DOWN_FAILED:
3062                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
3063 -               mce_start_timer(cpu, t);
3064                 break;
3065         }
3066  
3067 @@ -2468,6 +2496,10 @@ static __init int mcheck_init_device(void)
3068                 goto err_out;
3069         }
3070  
3071 +       err = mce_notify_work_init();
3072 +       if (err)
3073 +               goto err_out;
3074 +
3075         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
3076                 err = -ENOMEM;
3077                 goto err_out;
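Two RT-related changes are folded into the MCE hunk above: the per-CPU polling timer becomes an hrtimer (plain timer_list callbacks fire from contexts that are awkward on RT), and the wakeup of /dev/mcelog readers is pushed out of the interrupt path into the swork infrastructure added elsewhere in this patch. The hrtimer idiom used -- re-arm by returning HRTIMER_RESTART after hrtimer_forward_now() -- looks like this in isolation (names and the 500 ms period are illustrative):

       static struct hrtimer poll_timer;

       static enum hrtimer_restart poll_fn(struct hrtimer *t)
       {
               /* ... do the periodic work ... */
               hrtimer_forward_now(t, ms_to_ktime(500));
               return HRTIMER_RESTART;                 /* keep it running */
       }

       static void poll_start(void)
       {
               hrtimer_init(&poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
               poll_timer.function = poll_fn;
               hrtimer_start(&poll_timer, ms_to_ktime(500), HRTIMER_MODE_REL);
       }
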
3078 diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3079 index ed446bdcbf31..d2ac364e2118 100644
3080 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3081 +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3082 @@ -117,7 +117,7 @@ static struct perf_pmu_events_attr event_attr_##v = {                       \
3083  };
3084  
3085  struct rapl_pmu {
3086 -       spinlock_t       lock;
3087 +       raw_spinlock_t   lock;
3088         int              n_active; /* number of active events */
3089         struct list_head active_list;
3090         struct pmu       *pmu; /* pointer to rapl_pmu_class */
3091 @@ -220,13 +220,13 @@ static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
3092         if (!pmu->n_active)
3093                 return HRTIMER_NORESTART;
3094  
3095 -       spin_lock_irqsave(&pmu->lock, flags);
3096 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3097  
3098         list_for_each_entry(event, &pmu->active_list, active_entry) {
3099                 rapl_event_update(event);
3100         }
3101  
3102 -       spin_unlock_irqrestore(&pmu->lock, flags);
3103 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3104  
3105         hrtimer_forward_now(hrtimer, pmu->timer_interval);
3106  
3107 @@ -263,9 +263,9 @@ static void rapl_pmu_event_start(struct perf_event *event, int mode)
3108         struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
3109         unsigned long flags;
3110  
3111 -       spin_lock_irqsave(&pmu->lock, flags);
3112 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3113         __rapl_pmu_event_start(pmu, event);
3114 -       spin_unlock_irqrestore(&pmu->lock, flags);
3115 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3116  }
3117  
3118  static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3119 @@ -274,7 +274,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3120         struct hw_perf_event *hwc = &event->hw;
3121         unsigned long flags;
3122  
3123 -       spin_lock_irqsave(&pmu->lock, flags);
3124 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3125  
3126         /* mark event as deactivated and stopped */
3127         if (!(hwc->state & PERF_HES_STOPPED)) {
3128 @@ -299,7 +299,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3129                 hwc->state |= PERF_HES_UPTODATE;
3130         }
3131  
3132 -       spin_unlock_irqrestore(&pmu->lock, flags);
3133 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3134  }
3135  
3136  static int rapl_pmu_event_add(struct perf_event *event, int mode)
3137 @@ -308,14 +308,14 @@ static int rapl_pmu_event_add(struct perf_event *event, int mode)
3138         struct hw_perf_event *hwc = &event->hw;
3139         unsigned long flags;
3140  
3141 -       spin_lock_irqsave(&pmu->lock, flags);
3142 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3143  
3144         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
3145  
3146         if (mode & PERF_EF_START)
3147                 __rapl_pmu_event_start(pmu, event);
3148  
3149 -       spin_unlock_irqrestore(&pmu->lock, flags);
3150 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3151  
3152         return 0;
3153  }
3154 @@ -603,7 +603,7 @@ static int rapl_cpu_prepare(int cpu)
3155         pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
3156         if (!pmu)
3157                 return -1;
3158 -       spin_lock_init(&pmu->lock);
3159 +       raw_spin_lock_init(&pmu->lock);
3160  
3161         INIT_LIST_HEAD(&pmu->active_list);
3162  
3163 diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
3164 index 464ffd69b92e..00db1aad1548 100644
3165 --- a/arch/x86/kernel/dumpstack_32.c
3166 +++ b/arch/x86/kernel/dumpstack_32.c
3167 @@ -42,7 +42,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3168                 unsigned long *stack, unsigned long bp,
3169                 const struct stacktrace_ops *ops, void *data)
3170  {
3171 -       const unsigned cpu = get_cpu();
3172 +       const unsigned cpu = get_cpu_light();
3173         int graph = 0;
3174         u32 *prev_esp;
3175  
3176 @@ -86,7 +86,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3177                         break;
3178                 touch_nmi_watchdog();
3179         }
3180 -       put_cpu();
3181 +       put_cpu_light();
3182  }
3183  EXPORT_SYMBOL(dump_trace);
3184  
3185 diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
3186 index 5f1c6266eb30..c331e3fef465 100644
3187 --- a/arch/x86/kernel/dumpstack_64.c
3188 +++ b/arch/x86/kernel/dumpstack_64.c
3189 @@ -152,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3190                 unsigned long *stack, unsigned long bp,
3191                 const struct stacktrace_ops *ops, void *data)
3192  {
3193 -       const unsigned cpu = get_cpu();
3194 +       const unsigned cpu = get_cpu_light();
3195         struct thread_info *tinfo;
3196         unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
3197         unsigned long dummy;
3198 @@ -241,7 +241,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3199          * This handles the process stack:
3200          */
3201         bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
3202 -       put_cpu();
3203 +       put_cpu_light();
3204  }
3205  EXPORT_SYMBOL(dump_trace);
3206  
3207 @@ -255,7 +255,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
3208         int cpu;
3209         int i;
3210  
3211 -       preempt_disable();
3212 +       migrate_disable();
3213         cpu = smp_processor_id();
3214  
3215         irq_stack_end   = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
3216 @@ -291,7 +291,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
3217                         pr_cont(" %016lx", *stack++);
3218                 touch_nmi_watchdog();
3219         }
3220 -       preempt_enable();
3221 +       migrate_enable();
3222  
3223         pr_cont("\n");
3224         show_trace_log_lvl(task, regs, sp, bp, log_lvl);
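get_cpu_light()/put_cpu_light() and migrate_disable()/migrate_enable() are RT-patch primitives defined in the core hunks of this patch: they pin the task to its current CPU without disabling preemption, which is all the stack dumpers need here. On !RT they fall back to the usual preempt-disabling variants; their shape is roughly:

       #ifdef CONFIG_PREEMPT_RT_FULL
       # define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
       # define put_cpu_light()        migrate_enable()
       #else
       # define get_cpu_light()        get_cpu()
       # define put_cpu_light()        put_cpu()
       #endif
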
3225 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
3226 index 38da8f29a9c8..ce71f7098f15 100644
3227 --- a/arch/x86/kernel/irq_32.c
3228 +++ b/arch/x86/kernel/irq_32.c
3229 @@ -128,6 +128,7 @@ void irq_ctx_init(int cpu)
3230                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
3231  }
3232  
3233 +#ifndef CONFIG_PREEMPT_RT_FULL
3234  void do_softirq_own_stack(void)
3235  {
3236         struct thread_info *curstk;
3237 @@ -146,6 +147,7 @@ void do_softirq_own_stack(void)
3238  
3239         call_on_stack(__do_softirq, isp);
3240  }
3241 +#endif
3242  
3243  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
3244  {
3245 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
3246 index 47190bd399e7..807950860fb7 100644
3247 --- a/arch/x86/kernel/kvm.c
3248 +++ b/arch/x86/kernel/kvm.c
3249 @@ -36,6 +36,7 @@
3250  #include <linux/kprobes.h>
3251  #include <linux/debugfs.h>
3252  #include <linux/nmi.h>
3253 +#include <linux/swait.h>
3254  #include <asm/timer.h>
3255  #include <asm/cpu.h>
3256  #include <asm/traps.h>
3257 @@ -91,14 +92,14 @@ static void kvm_io_delay(void)
3258  
3259  struct kvm_task_sleep_node {
3260         struct hlist_node link;
3261 -       wait_queue_head_t wq;
3262 +       struct swait_queue_head wq;
3263         u32 token;
3264         int cpu;
3265         bool halted;
3266  };
3267  
3268  static struct kvm_task_sleep_head {
3269 -       spinlock_t lock;
3270 +       raw_spinlock_t lock;
3271         struct hlist_head list;
3272  } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
3273  
3274 @@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token)
3275         u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
3276         struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
3277         struct kvm_task_sleep_node n, *e;
3278 -       DEFINE_WAIT(wait);
3279 +       DECLARE_SWAITQUEUE(wait);
3280  
3281         rcu_irq_enter();
3282  
3283 -       spin_lock(&b->lock);
3284 +       raw_spin_lock(&b->lock);
3285         e = _find_apf_task(b, token);
3286         if (e) {
3287                 /* dummy entry exist -> wake up was delivered ahead of PF */
3288                 hlist_del(&e->link);
3289                 kfree(e);
3290 -               spin_unlock(&b->lock);
3291 +               raw_spin_unlock(&b->lock);
3292  
3293                 rcu_irq_exit();
3294                 return;
3295 @@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token)
3296         n.token = token;
3297         n.cpu = smp_processor_id();
3298         n.halted = is_idle_task(current) || preempt_count() > 1;
3299 -       init_waitqueue_head(&n.wq);
3300 +       init_swait_queue_head(&n.wq);
3301         hlist_add_head(&n.link, &b->list);
3302 -       spin_unlock(&b->lock);
3303 +       raw_spin_unlock(&b->lock);
3304  
3305         for (;;) {
3306                 if (!n.halted)
3307 -                       prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
3308 +                       prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
3309                 if (hlist_unhashed(&n.link))
3310                         break;
3311  
3312 @@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token)
3313                 }
3314         }
3315         if (!n.halted)
3316 -               finish_wait(&n.wq, &wait);
3317 +               finish_swait(&n.wq, &wait);
3318  
3319         rcu_irq_exit();
3320         return;
3321 @@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
3322         hlist_del_init(&n->link);
3323         if (n->halted)
3324                 smp_send_reschedule(n->cpu);
3325 -       else if (waitqueue_active(&n->wq))
3326 -               wake_up(&n->wq);
3327 +       else if (swait_active(&n->wq))
3328 +               swake_up(&n->wq);
3329  }
3330  
3331  static void apf_task_wake_all(void)
3332 @@ -189,14 +190,14 @@ static void apf_task_wake_all(void)
3333         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
3334                 struct hlist_node *p, *next;
3335                 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
3336 -               spin_lock(&b->lock);
3337 +               raw_spin_lock(&b->lock);
3338                 hlist_for_each_safe(p, next, &b->list) {
3339                         struct kvm_task_sleep_node *n =
3340                                 hlist_entry(p, typeof(*n), link);
3341                         if (n->cpu == smp_processor_id())
3342                                 apf_task_wake_one(n);
3343                 }
3344 -               spin_unlock(&b->lock);
3345 +               raw_spin_unlock(&b->lock);
3346         }
3347  }
3348  
3349 @@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token)
3350         }
3351  
3352  again:
3353 -       spin_lock(&b->lock);
3354 +       raw_spin_lock(&b->lock);
3355         n = _find_apf_task(b, token);
3356         if (!n) {
3357                 /*
3358 @@ -225,17 +226,17 @@ again:
3359                          * Allocation failed! Busy wait while other cpu
3360                          * handles async PF.
3361                          */
3362 -                       spin_unlock(&b->lock);
3363 +                       raw_spin_unlock(&b->lock);
3364                         cpu_relax();
3365                         goto again;
3366                 }
3367                 n->token = token;
3368                 n->cpu = smp_processor_id();
3369 -               init_waitqueue_head(&n->wq);
3370 +               init_swait_queue_head(&n->wq);
3371                 hlist_add_head(&n->link, &b->list);
3372         } else
3373                 apf_task_wake_one(n);
3374 -       spin_unlock(&b->lock);
3375 +       raw_spin_unlock(&b->lock);
3376         return;
3377  }
3378  EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
3379 @@ -486,7 +487,7 @@ void __init kvm_guest_init(void)
3380         paravirt_ops_setup();
3381         register_reboot_notifier(&kvm_pv_reboot_nb);
3382         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
3383 -               spin_lock_init(&async_pf_sleepers[i].lock);
3384 +               raw_spin_lock_init(&async_pf_sleepers[i].lock);
3385         if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
3386                 x86_init.irqs.trap_init = kvm_apf_trap_init;
3387  
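The async-PF sleepers move from the full waitqueue API to the simple wait queue (swait) API, which uses a raw spinlock internally and keeps the work done under it bounded, so it stays usable on RT where a regular waitqueue lock would be a sleeping lock. The wait/wake pairing mirrors the classic API; a condensed, self-contained sketch with illustrative names:

       static DECLARE_SWAIT_QUEUE_HEAD(example_wq);
       static bool example_done;

       static void example_wait(void)
       {
               DECLARE_SWAITQUEUE(wait);

               for (;;) {
                       prepare_to_swait(&example_wq, &wait, TASK_UNINTERRUPTIBLE);
                       if (example_done)
                               break;
                       schedule();
               }
               finish_swait(&example_wq, &wait);
       }

       static void example_wake(void)
       {
               example_done = true;
               if (swait_active(&example_wq))
                       swake_up(&example_wq);
       }
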
3388 diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
3389 index 697f90db0e37..424aec4a4c71 100644
3390 --- a/arch/x86/kernel/nmi.c
3391 +++ b/arch/x86/kernel/nmi.c
3392 @@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
3393  #endif
3394  
3395         if (panic_on_unrecovered_nmi)
3396 -               panic("NMI: Not continuing");
3397 +               nmi_panic(regs, "NMI: Not continuing");
3398  
3399         pr_emerg("Dazed and confused, but trying to continue\n");
3400  
3401 @@ -255,8 +255,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
3402                  reason, smp_processor_id());
3403         show_regs(regs);
3404  
3405 -       if (panic_on_io_nmi)
3406 -               panic("NMI IOCK error: Not continuing");
3407 +       if (panic_on_io_nmi) {
3408 +               nmi_panic(regs, "NMI IOCK error: Not continuing");
3409 +
3410 +               /*
3411 +                * If we end up here, it means we have received an NMI while
3412 +                * processing panic(). Simply return without delaying and
3413 +                * re-enabling NMIs.
3414 +                */
3415 +               return;
3416 +       }
3417  
3418         /* Re-enable the IOCK line, wait for a few seconds */
3419         reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
3420 @@ -297,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
3421  
3422         pr_emerg("Do you have a strange power saving mode enabled?\n");
3423         if (unknown_nmi_panic || panic_on_unrecovered_nmi)
3424 -               panic("NMI: Not continuing");
3425 +               nmi_panic(regs, "NMI: Not continuing");
3426  
3427         pr_emerg("Dazed and confused, but trying to continue\n");
3428  }
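Calling panic() straight from NMI context can deadlock when another CPU is already panicking: the NMI interrupts the panic code, which then waits forever for this CPU. nmi_panic(), added elsewhere in this patch series, lets only the first CPU through actually panic and parks any other NMI'd CPU in nmi_panic_self_stop(), which the reboot.c hunk below overrides so a parked CPU can still run the crash-dump callback. Its form is approximately:

       #define nmi_panic(regs, fmt, ...)                                 \
       do {                                                              \
               int old_cpu, this_cpu;                                    \
                                                                         \
               this_cpu = raw_smp_processor_id();                        \
               old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID,   \
                                        this_cpu);                       \
                                                                         \
               if (old_cpu == PANIC_CPU_INVALID)                         \
                       panic(fmt, ##__VA_ARGS__);                        \
               else if (old_cpu != this_cpu)                             \
                       nmi_panic_self_stop(regs);                        \
       } while (0)
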
3429 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
3430 index 9f950917528b..4dd4beae917a 100644
3431 --- a/arch/x86/kernel/process_32.c
3432 +++ b/arch/x86/kernel/process_32.c
3433 @@ -35,6 +35,7 @@
3434  #include <linux/uaccess.h>
3435  #include <linux/io.h>
3436  #include <linux/kdebug.h>
3437 +#include <linux/highmem.h>
3438  
3439  #include <asm/pgtable.h>
3440  #include <asm/ldt.h>
3441 @@ -210,6 +211,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
3442  }
3443  EXPORT_SYMBOL_GPL(start_thread);
3444  
3445 +#ifdef CONFIG_PREEMPT_RT_FULL
3446 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
3447 +{
3448 +       int i;
3449 +
3450 +       /*
3451 +        * Clear @prev's kmap_atomic mappings
3452 +        */
3453 +       for (i = 0; i < prev_p->kmap_idx; i++) {
3454 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3455 +               pte_t *ptep = kmap_pte - idx;
3456 +
3457 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
3458 +       }
3459 +       /*
3460 +        * Restore @next_p's kmap_atomic mappings
3461 +        */
3462 +       for (i = 0; i < next_p->kmap_idx; i++) {
3463 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3464 +
3465 +               if (!pte_none(next_p->kmap_pte[i]))
3466 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
3467 +       }
3468 +}
3469 +#else
3470 +static inline void
3471 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
3472 +#endif
3473 +
3474  
3475  /*
3476   *     switch_to(x,y) should switch tasks from x to y.
3477 @@ -286,6 +316,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
3478                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
3479                 __switch_to_xtra(prev_p, next_p, tss);
3480  
3481 +       switch_kmaps(prev_p, next_p);
3482 +
3483         /*
3484          * Leave lazy mode, flushing any hypercalls made here.
3485          * This must be done before restoring TLS segments so
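On RT a task holding kmap_atomic() mappings may be preempted, so the mapping state can no longer live in per-CPU data alone: the pte and slot index are mirrored in task_struct (kmap_pte[] and kmap_idx, added by the core hunks of this patch), and switch_kmaps() above tears down the previous task's fixmap slots and re-installs the next task's on every context switch. The slot accounting becomes per-task as well; a sketch of what the RT variant of the index push looks like (the !RT branch is the stock per-CPU counter):

       static inline int kmap_atomic_idx_push(void)
       {
       #ifdef CONFIG_PREEMPT_RT_FULL
               current->kmap_idx++;
               BUG_ON(current->kmap_idx > KM_TYPE_NR);
               return current->kmap_idx - 1;
       #else
               return __this_cpu_inc_return(__kmap_atomic_idx) - 1;
       #endif
       }

The highmem_32.c and iomap_32.c hunks further down store the new pte into current->kmap_pte[type] at map time, which is what switch_kmaps() restores here.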
3486 diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
3487 index f660d63f40fe..8384207adde2 100644
3488 --- a/arch/x86/kernel/reboot.c
3489 +++ b/arch/x86/kernel/reboot.c
3490 @@ -726,6 +726,7 @@ static int crashing_cpu;
3491  static nmi_shootdown_cb shootdown_callback;
3492  
3493  static atomic_t waiting_for_crash_ipi;
3494 +static int crash_ipi_issued;
3495  
3496  static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
3497  {
3498 @@ -788,6 +789,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3499  
3500         smp_send_nmi_allbutself();
3501  
3502 +       /* Kick CPUs looping in NMI context. */
3503 +       WRITE_ONCE(crash_ipi_issued, 1);
3504 +
3505         msecs = 1000; /* Wait at most a second for the other cpus to stop */
3506         while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
3507                 mdelay(1);
3508 @@ -796,6 +800,22 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3509  
3510         /* Leave the nmi callback set */
3511  }
3512 +
3513 +/* Override the weak function in kernel/panic.c */
3514 +void nmi_panic_self_stop(struct pt_regs *regs)
3515 +{
3516 +       while (1) {
3517 +               /*
3518 +                * Wait for the crash dumping IPI to be issued, and then
3519 +                * call its callback directly.
3520 +                */
3521 +               if (READ_ONCE(crash_ipi_issued))
3522 +                       crash_nmi_callback(0, regs); /* Don't return */
3523 +
3524 +               cpu_relax();
3525 +       }
3526 +}
3527 +
3528  #else /* !CONFIG_SMP */
3529  void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3530  {
3531 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
3532 index 4d30b865be30..20d9e9fb3b74 100644
3533 --- a/arch/x86/kvm/lapic.c
3534 +++ b/arch/x86/kvm/lapic.c
3535 @@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
3536  static void apic_timer_expired(struct kvm_lapic *apic)
3537  {
3538         struct kvm_vcpu *vcpu = apic->vcpu;
3539 -       wait_queue_head_t *q = &vcpu->wq;
3540 +       struct swait_queue_head *q = &vcpu->wq;
3541         struct kvm_timer *ktimer = &apic->lapic_timer;
3542  
3543         if (atomic_read(&apic->lapic_timer.pending))
3544 @@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
3545         atomic_inc(&apic->lapic_timer.pending);
3546         kvm_set_pending_timer(vcpu);
3547  
3548 -       if (waitqueue_active(q))
3549 -               wake_up_interruptible(q);
3550 +       if (swait_active(q))
3551 +               swake_up(q);
3552  
3553         if (apic_lvtt_tscdeadline(apic))
3554                 ktimer->expired_tscdeadline = ktimer->tscdeadline;
3555 @@ -1801,6 +1801,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
3556         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
3557                      HRTIMER_MODE_ABS);
3558         apic->lapic_timer.timer.function = apic_timer_fn;
3559 +       apic->lapic_timer.timer.irqsafe = 1;
3560  
3561         /*
3562          * APIC is created enabled. This will prevent kvm_lapic_set_base from
3563 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
3564 index d7cb9577fa31..77c1bdd802df 100644
3565 --- a/arch/x86/kvm/x86.c
3566 +++ b/arch/x86/kvm/x86.c
3567 @@ -5792,6 +5792,13 @@ int kvm_arch_init(void *opaque)
3568                 goto out;
3569         }
3570  
3571 +#ifdef CONFIG_PREEMPT_RT_FULL
3572 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3573 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
3574 +               return -EOPNOTSUPP;
3575 +       }
3576 +#endif
3577 +
3578         r = kvm_mmu_module_init();
3579         if (r)
3580                 goto out_free_percpu;
3581 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
3582 index a6d739258137..bd24ba1c4a86 100644
3583 --- a/arch/x86/mm/highmem_32.c
3584 +++ b/arch/x86/mm/highmem_32.c
3585 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
3586   */
3587  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3588  {
3589 +       pte_t pte = mk_pte(page, prot);
3590         unsigned long vaddr;
3591         int idx, type;
3592  
3593 -       preempt_disable();
3594 +       preempt_disable_nort();
3595         pagefault_disable();
3596  
3597         if (!PageHighMem(page))
3598 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3599         idx = type + KM_TYPE_NR*smp_processor_id();
3600         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3601         BUG_ON(!pte_none(*(kmap_pte-idx)));
3602 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
3603 +#ifdef CONFIG_PREEMPT_RT_FULL
3604 +       current->kmap_pte[type] = pte;
3605 +#endif
3606 +       set_pte(kmap_pte-idx, pte);
3607         arch_flush_lazy_mmu_mode();
3608  
3609         return (void *)vaddr;
3610 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
3611                  * is a bad idea also, in case the page changes cacheability
3612                  * attributes or becomes a protected page in a hypervisor.
3613                  */
3614 +#ifdef CONFIG_PREEMPT_RT_FULL
3615 +               current->kmap_pte[type] = __pte(0);
3616 +#endif
3617                 kpte_clear_flush(kmap_pte-idx, vaddr);
3618                 kmap_atomic_idx_pop();
3619                 arch_flush_lazy_mmu_mode();
3620 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
3621  #endif
3622  
3623         pagefault_enable();
3624 -       preempt_enable();
3625 +       preempt_enable_nort();
3626  }
3627  EXPORT_SYMBOL(__kunmap_atomic);
3628  
3629 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
3630 index 9c0ff045fdd4..dd25dd1671b6 100644
3631 --- a/arch/x86/mm/iomap_32.c
3632 +++ b/arch/x86/mm/iomap_32.c
3633 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
3634  
3635  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3636  {
3637 +       pte_t pte = pfn_pte(pfn, prot);
3638         unsigned long vaddr;
3639         int idx, type;
3640  
3641 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3642         type = kmap_atomic_idx_push();
3643         idx = type + KM_TYPE_NR * smp_processor_id();
3644         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3645 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
3646 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
3647 +
3648 +#ifdef CONFIG_PREEMPT_RT_FULL
3649 +       current->kmap_pte[type] = pte;
3650 +#endif
3651 +       set_pte(kmap_pte - idx, pte);
3652         arch_flush_lazy_mmu_mode();
3653  
3654         return (void *)vaddr;
3655 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
3656                  * is a bad idea also, in case the page changes cacheability
3657                  * attributes or becomes a protected page in a hypervisor.
3658                  */
3659 +#ifdef CONFIG_PREEMPT_RT_FULL
3660 +               current->kmap_pte[type] = __pte(0);
3661 +#endif
3662                 kpte_clear_flush(kmap_pte-idx, vaddr);
3663                 kmap_atomic_idx_pop();
3664         }
3665 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
3666 index 3b6ec42718e4..7871083de089 100644
3667 --- a/arch/x86/platform/uv/tlb_uv.c
3668 +++ b/arch/x86/platform/uv/tlb_uv.c
3669 @@ -714,9 +714,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
3670  
3671                 quiesce_local_uvhub(hmaster);
3672  
3673 -               spin_lock(&hmaster->queue_lock);
3674 +               raw_spin_lock(&hmaster->queue_lock);
3675                 reset_with_ipi(&bau_desc->distribution, bcp);
3676 -               spin_unlock(&hmaster->queue_lock);
3677 +               raw_spin_unlock(&hmaster->queue_lock);
3678  
3679                 end_uvhub_quiesce(hmaster);
3680  
3681 @@ -736,9 +736,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
3682  
3683                 quiesce_local_uvhub(hmaster);
3684  
3685 -               spin_lock(&hmaster->queue_lock);
3686 +               raw_spin_lock(&hmaster->queue_lock);
3687                 reset_with_ipi(&bau_desc->distribution, bcp);
3688 -               spin_unlock(&hmaster->queue_lock);
3689 +               raw_spin_unlock(&hmaster->queue_lock);
3690  
3691                 end_uvhub_quiesce(hmaster);
3692  
3693 @@ -759,7 +759,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
3694         cycles_t tm1;
3695  
3696         hmaster = bcp->uvhub_master;
3697 -       spin_lock(&hmaster->disable_lock);
3698 +       raw_spin_lock(&hmaster->disable_lock);
3699         if (!bcp->baudisabled) {
3700                 stat->s_bau_disabled++;
3701                 tm1 = get_cycles();
3702 @@ -772,7 +772,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
3703                         }
3704                 }
3705         }
3706 -       spin_unlock(&hmaster->disable_lock);
3707 +       raw_spin_unlock(&hmaster->disable_lock);
3708  }
3709  
3710  static void count_max_concurr(int stat, struct bau_control *bcp,
3711 @@ -835,7 +835,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
3712   */
3713  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
3714  {
3715 -       spinlock_t *lock = &hmaster->uvhub_lock;
3716 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
3717         atomic_t *v;
3718  
3719         v = &hmaster->active_descriptor_count;
3720 @@ -968,7 +968,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
3721         struct bau_control *hmaster;
3722  
3723         hmaster = bcp->uvhub_master;
3724 -       spin_lock(&hmaster->disable_lock);
3725 +       raw_spin_lock(&hmaster->disable_lock);
3726         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
3727                 stat->s_bau_reenabled++;
3728                 for_each_present_cpu(tcpu) {
3729 @@ -980,10 +980,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
3730                                 tbcp->period_giveups = 0;
3731                         }
3732                 }
3733 -               spin_unlock(&hmaster->disable_lock);
3734 +               raw_spin_unlock(&hmaster->disable_lock);
3735                 return 0;
3736         }
3737 -       spin_unlock(&hmaster->disable_lock);
3738 +       raw_spin_unlock(&hmaster->disable_lock);
3739         return -1;
3740  }
3741  
3742 @@ -1901,9 +1901,9 @@ static void __init init_per_cpu_tunables(void)
3743                 bcp->cong_reps                  = congested_reps;
3744                 bcp->disabled_period =          sec_2_cycles(disabled_period);
3745                 bcp->giveup_limit =             giveup_limit;
3746 -               spin_lock_init(&bcp->queue_lock);
3747 -               spin_lock_init(&bcp->uvhub_lock);
3748 -               spin_lock_init(&bcp->disable_lock);
3749 +               raw_spin_lock_init(&bcp->queue_lock);
3750 +               raw_spin_lock_init(&bcp->uvhub_lock);
3751 +               raw_spin_lock_init(&bcp->disable_lock);
3752         }
3753  }
3754  
3755 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
3756 index 2b158a9fa1d7..5e0b122620cb 100644
3757 --- a/arch/x86/platform/uv/uv_time.c
3758 +++ b/arch/x86/platform/uv/uv_time.c
3759 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
3760  
3761  /* There is one of these allocated per node */
3762  struct uv_rtc_timer_head {
3763 -       spinlock_t      lock;
3764 +       raw_spinlock_t  lock;
3765         /* next cpu waiting for timer, local node relative: */
3766         int             next_cpu;
3767         /* number of cpus on this node: */
3768 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
3769                                 uv_rtc_deallocate_timers();
3770                                 return -ENOMEM;
3771                         }
3772 -                       spin_lock_init(&head->lock);
3773 +                       raw_spin_lock_init(&head->lock);
3774                         head->ncpus = uv_blade_nr_possible_cpus(bid);
3775                         head->next_cpu = -1;
3776                         blade_info[bid] = head;
3777 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
3778         unsigned long flags;
3779         int next_cpu;
3780  
3781 -       spin_lock_irqsave(&head->lock, flags);
3782 +       raw_spin_lock_irqsave(&head->lock, flags);
3783  
3784         next_cpu = head->next_cpu;
3785         *t = expires;
3786 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
3787                 if (uv_setup_intr(cpu, expires)) {
3788                         *t = ULLONG_MAX;
3789                         uv_rtc_find_next_timer(head, pnode);
3790 -                       spin_unlock_irqrestore(&head->lock, flags);
3791 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
3792                         return -ETIME;
3793                 }
3794         }
3795  
3796 -       spin_unlock_irqrestore(&head->lock, flags);
3797 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3798         return 0;
3799  }
3800  
3801 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
3802         unsigned long flags;
3803         int rc = 0;
3804  
3805 -       spin_lock_irqsave(&head->lock, flags);
3806 +       raw_spin_lock_irqsave(&head->lock, flags);
3807  
3808         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
3809                 rc = 1;
3810 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
3811                         uv_rtc_find_next_timer(head, pnode);
3812         }
3813  
3814 -       spin_unlock_irqrestore(&head->lock, flags);
3815 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3816  
3817         return rc;
3818  }
3819 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
3820  static cycle_t uv_read_rtc(struct clocksource *cs)
3821  {
3822         unsigned long offset;
3823 +       cycle_t cycles;
3824  
3825 +       preempt_disable();
3826         if (uv_get_min_hub_revision_id() == 1)
3827                 offset = 0;
3828         else
3829                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
3830  
3831 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3832 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3833 +       preempt_enable();
3834 +
3835 +       return cycles;
3836  }
3837  
3838  /*
3839 diff --git a/block/blk-core.c b/block/blk-core.c
3840 index 4fab5d610805..52d2fe2fec8f 100644
3841 --- a/block/blk-core.c
3842 +++ b/block/blk-core.c
3843 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
3844  
3845         INIT_LIST_HEAD(&rq->queuelist);
3846         INIT_LIST_HEAD(&rq->timeout_list);
3847 +#ifdef CONFIG_PREEMPT_RT_FULL
3848 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3849 +#endif
3850         rq->cpu = -1;
3851         rq->q = q;
3852         rq->__sector = (sector_t) -1;
3853 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
3854   **/
3855  void blk_start_queue(struct request_queue *q)
3856  {
3857 -       WARN_ON(!irqs_disabled());
3858 +       WARN_ON_NONRT(!irqs_disabled());
3859  
3860         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
3861         __blk_run_queue(q);
3862 @@ -659,7 +662,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp)
3863                 if (!gfpflags_allow_blocking(gfp))
3864                         return -EBUSY;
3865  
3866 -               ret = wait_event_interruptible(q->mq_freeze_wq,
3867 +               ret = swait_event_interruptible(q->mq_freeze_wq,
3868                                 !atomic_read(&q->mq_freeze_depth) ||
3869                                 blk_queue_dying(q));
3870                 if (blk_queue_dying(q))
3871 @@ -679,7 +682,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
3872         struct request_queue *q =
3873                 container_of(ref, struct request_queue, q_usage_counter);
3874  
3875 -       wake_up_all(&q->mq_freeze_wq);
3876 +       swake_up_all(&q->mq_freeze_wq);
3877  }
3878  
3879  struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3880 @@ -741,7 +744,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3881         q->bypass_depth = 1;
3882         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3883  
3884 -       init_waitqueue_head(&q->mq_freeze_wq);
3885 +       init_swait_queue_head(&q->mq_freeze_wq);
3886  
3887         /*
3888          * Init percpu_ref in atomic mode so that it's faster to shutdown.
3889 @@ -3200,7 +3203,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
3890                 blk_run_queue_async(q);
3891         else
3892                 __blk_run_queue(q);
3893 -       spin_unlock(q->queue_lock);
3894 +       spin_unlock_irq(q->queue_lock);
3895  }
3896  
3897  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3898 @@ -3248,7 +3251,6 @@ EXPORT_SYMBOL(blk_check_plugged);
3899  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3900  {
3901         struct request_queue *q;
3902 -       unsigned long flags;
3903         struct request *rq;
3904         LIST_HEAD(list);
3905         unsigned int depth;
3906 @@ -3268,11 +3270,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3907         q = NULL;
3908         depth = 0;
3909  
3910 -       /*
3911 -        * Save and disable interrupts here, to avoid doing it for every
3912 -        * queue lock we have to take.
3913 -        */
3914 -       local_irq_save(flags);
3915         while (!list_empty(&list)) {
3916                 rq = list_entry_rq(list.next);
3917                 list_del_init(&rq->queuelist);
3918 @@ -3285,7 +3282,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3919                                 queue_unplugged(q, depth, from_schedule);
3920                         q = rq->q;
3921                         depth = 0;
3922 -                       spin_lock(q->queue_lock);
3923 +                       spin_lock_irq(q->queue_lock);
3924                 }
3925  
3926                 /*
3927 @@ -3312,8 +3309,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3928          */
3929         if (q)
3930                 queue_unplugged(q, depth, from_schedule);
3931 -
3932 -       local_irq_restore(flags);
3933  }
3934  
3935  void blk_finish_plug(struct blk_plug *plug)
3936 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3937 index 381cb50a673c..dc8785233d94 100644
3938 --- a/block/blk-ioc.c
3939 +++ b/block/blk-ioc.c
3940 @@ -7,6 +7,7 @@
3941  #include <linux/bio.h>
3942  #include <linux/blkdev.h>
3943  #include <linux/slab.h>
3944 +#include <linux/delay.h>
3945  
3946  #include "blk.h"
3947  
3948 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3949                         spin_unlock(q->queue_lock);
3950                 } else {
3951                         spin_unlock_irqrestore(&ioc->lock, flags);
3952 -                       cpu_relax();
3953 +                       cpu_chill();
3954                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3955                 }
3956         }
3957 @@ -187,7 +188,7 @@ retry:
3958                         spin_unlock(icq->q->queue_lock);
3959                 } else {
3960                         spin_unlock_irqrestore(&ioc->lock, flags);
3961 -                       cpu_relax();
3962 +                       cpu_chill();
3963                         goto retry;
3964                 }
3965         }
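On RT, spinning with cpu_relax() while retrying for a lock that a preempted (possibly lower-priority) task holds can live-lock, so such retry loops are switched to cpu_chill(), which actually sleeps briefly on RT and degrades to cpu_relax() otherwise. The helper introduced by this patch has roughly this shape:

       /* include/linux/delay.h -- shape of the RT addition */
       #ifdef CONFIG_PREEMPT_RT_FULL
       extern void cpu_chill(void);    /* sleeps briefly instead of spinning */
       #else
       # define cpu_chill()    cpu_relax()
       #endif
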
3966 diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
3967 index 0736729d6494..3e21e31d0d7e 100644
3968 --- a/block/blk-iopoll.c
3969 +++ b/block/blk-iopoll.c
3970 @@ -35,6 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop)
3971         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
3972         __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
3973         local_irq_restore(flags);
3974 +       preempt_check_resched_rt();
3975  }
3976  EXPORT_SYMBOL(blk_iopoll_sched);
3977  
3978 @@ -132,6 +133,7 @@ static void blk_iopoll_softirq(struct softirq_action *h)
3979                 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
3980  
3981         local_irq_enable();
3982 +       preempt_check_resched_rt();
3983  }
3984  
3985  /**
3986 @@ -201,6 +203,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self,
3987                                  this_cpu_ptr(&blk_cpu_iopoll));
3988                 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
3989                 local_irq_enable();
3990 +               preempt_check_resched_rt();
3991         }
3992  
3993         return NOTIFY_OK;
3994 diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
3995 index bb3ed488f7b5..628c6c13c482 100644
3996 --- a/block/blk-mq-cpu.c
3997 +++ b/block/blk-mq-cpu.c
3998 @@ -16,7 +16,7 @@
3999  #include "blk-mq.h"
4000  
4001  static LIST_HEAD(blk_mq_cpu_notify_list);
4002 -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
4003 +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
4004  
4005  static int blk_mq_main_cpu_notify(struct notifier_block *self,
4006                                   unsigned long action, void *hcpu)
4007 @@ -25,7 +25,10 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
4008         struct blk_mq_cpu_notifier *notify;
4009         int ret = NOTIFY_OK;
4010  
4011 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4012 +       if (action != CPU_POST_DEAD)
4013 +               return NOTIFY_OK;
4014 +
4015 +       spin_lock(&blk_mq_cpu_notify_lock);
4016  
4017         list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
4018                 ret = notify->notify(notify->data, action, cpu);
4019 @@ -33,7 +36,7 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
4020                         break;
4021         }
4022  
4023 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4024 +       spin_unlock(&blk_mq_cpu_notify_lock);
4025         return ret;
4026  }
4027  
4028 @@ -41,16 +44,16 @@ void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
4029  {
4030         BUG_ON(!notifier->notify);
4031  
4032 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4033 +       spin_lock(&blk_mq_cpu_notify_lock);
4034         list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
4035 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4036 +       spin_unlock(&blk_mq_cpu_notify_lock);
4037  }
4038  
4039  void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
4040  {
4041 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4042 +       spin_lock(&blk_mq_cpu_notify_lock);
4043         list_del(&notifier->list);
4044 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4045 +       spin_unlock(&blk_mq_cpu_notify_lock);
4046  }
4047  
4048  void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
4049 diff --git a/block/blk-mq.c b/block/blk-mq.c
4050 index c3e461ec40e4..03dfc2c91595 100644
4051 --- a/block/blk-mq.c
4052 +++ b/block/blk-mq.c
4053 @@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
4054  
4055  static void blk_mq_freeze_queue_wait(struct request_queue *q)
4056  {
4057 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
4058 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
4059  }
4060  
4061  /*
4062 @@ -130,7 +130,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
4063         WARN_ON_ONCE(freeze_depth < 0);
4064         if (!freeze_depth) {
4065                 percpu_ref_reinit(&q->q_usage_counter);
4066 -               wake_up_all(&q->mq_freeze_wq);
4067 +               swake_up_all(&q->mq_freeze_wq);
4068         }
4069  }
4070  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
4071 @@ -149,7 +149,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
4072          * dying, we need to ensure that processes currently waiting on
4073          * the queue are notified as well.
4074          */
4075 -       wake_up_all(&q->mq_freeze_wq);
4076 +       swake_up_all(&q->mq_freeze_wq);
4077  }
4078  
4079  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
4080 @@ -196,6 +196,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
4081         rq->resid_len = 0;
4082         rq->sense = NULL;
4083  
4084 +#ifdef CONFIG_PREEMPT_RT_FULL
4085 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
4086 +#endif
4087         INIT_LIST_HEAD(&rq->timeout_list);
4088         rq->timeout = 0;
4089  
4090 @@ -325,6 +328,17 @@ void blk_mq_end_request(struct request *rq, int error)
4091  }
4092  EXPORT_SYMBOL(blk_mq_end_request);
4093  
4094 +#ifdef CONFIG_PREEMPT_RT_FULL
4095 +
4096 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
4097 +{
4098 +       struct request *rq = container_of(work, struct request, work);
4099 +
4100 +       rq->q->softirq_done_fn(rq);
4101 +}
4102 +
4103 +#else
4104 +
4105  static void __blk_mq_complete_request_remote(void *data)
4106  {
4107         struct request *rq = data;
4108 @@ -332,6 +346,8 @@ static void __blk_mq_complete_request_remote(void *data)
4109         rq->q->softirq_done_fn(rq);
4110  }
4111  
4112 +#endif
4113 +
4114  static void blk_mq_ipi_complete_request(struct request *rq)
4115  {
4116         struct blk_mq_ctx *ctx = rq->mq_ctx;
4117 @@ -343,19 +359,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
4118                 return;
4119         }
4120  
4121 -       cpu = get_cpu();
4122 +       cpu = get_cpu_light();
4123         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
4124                 shared = cpus_share_cache(cpu, ctx->cpu);
4125  
4126         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
4127 +#ifdef CONFIG_PREEMPT_RT_FULL
4128 +               schedule_work_on(ctx->cpu, &rq->work);
4129 +#else
4130                 rq->csd.func = __blk_mq_complete_request_remote;
4131                 rq->csd.info = rq;
4132                 rq->csd.flags = 0;
4133                 smp_call_function_single_async(ctx->cpu, &rq->csd);
4134 +#endif
4135         } else {
4136                 rq->q->softirq_done_fn(rq);
4137         }
4138 -       put_cpu();
4139 +       put_cpu_light();
4140  }
4141  
4142  static void __blk_mq_complete_request(struct request *rq)
4143 @@ -864,14 +884,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
4144                 return;
4145  
4146         if (!async) {
4147 -               int cpu = get_cpu();
4148 +               int cpu = get_cpu_light();
4149                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
4150                         __blk_mq_run_hw_queue(hctx);
4151 -                       put_cpu();
4152 +                       put_cpu_light();
4153                         return;
4154                 }
4155  
4156 -               put_cpu();
4157 +               put_cpu_light();
4158         }
4159  
4160         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
4161 @@ -1619,7 +1639,7 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
4162  {
4163         struct blk_mq_hw_ctx *hctx = data;
4164  
4165 -       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
4166 +       if (action == CPU_POST_DEAD)
4167                 return blk_mq_hctx_cpu_offline(hctx, cpu);
4168  
4169         /*
4170 diff --git a/block/blk-mq.h b/block/blk-mq.h
4171 index 713820b47b31..3cb6feb4fe23 100644
4172 --- a/block/blk-mq.h
4173 +++ b/block/blk-mq.h
4174 @@ -74,7 +74,10 @@ struct blk_align_bitmap {
4175  static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
4176                                            unsigned int cpu)
4177  {
4178 -       return per_cpu_ptr(q->queue_ctx, cpu);
4179 +       struct blk_mq_ctx *ctx;
4180 +
4181 +       ctx = per_cpu_ptr(q->queue_ctx, cpu);
4182 +       return ctx;
4183  }
4184  
4185  /*
4186 @@ -85,12 +88,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
4187   */
4188  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
4189  {
4190 -       return __blk_mq_get_ctx(q, get_cpu());
4191 +       return __blk_mq_get_ctx(q, get_cpu_light());
4192  }
4193  
4194  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
4195  {
4196 -       put_cpu();
4197 +       put_cpu_light();
4198  }
4199  
4200  struct blk_mq_alloc_data {
4201 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
4202 index 53b1737e978d..81c3c0a62edf 100644
4203 --- a/block/blk-softirq.c
4204 +++ b/block/blk-softirq.c
4205 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
4206                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
4207  
4208         local_irq_restore(flags);
4209 +       preempt_check_resched_rt();
4210  }
4211  
4212  /*
4213 @@ -93,6 +94,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
4214                                  this_cpu_ptr(&blk_cpu_done));
4215                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
4216                 local_irq_enable();
4217 +               preempt_check_resched_rt();
4218         }
4219  
4220         return NOTIFY_OK;
4221 @@ -150,6 +152,7 @@ do_local:
4222                 goto do_local;
4223  
4224         local_irq_restore(flags);
4225 +       preempt_check_resched_rt();
4226  }
4227  
4228  /**
4229 diff --git a/block/bounce.c b/block/bounce.c
4230 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
4231 --- a/block/bounce.c
4232 +++ b/block/bounce.c
4233 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
4234         unsigned long flags;
4235         unsigned char *vto;
4236  
4237 -       local_irq_save(flags);
4238 +       local_irq_save_nort(flags);
4239         vto = kmap_atomic(to->bv_page);
4240         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
4241         kunmap_atomic(vto);
4242 -       local_irq_restore(flags);
4243 +       local_irq_restore_nort(flags);
4244  }
4245  
4246  #else /* CONFIG_HIGHMEM */
4247 diff --git a/crypto/algapi.c b/crypto/algapi.c
4248 index 59bf491fe3d8..f98e79c8cd77 100644
4249 --- a/crypto/algapi.c
4250 +++ b/crypto/algapi.c
4251 @@ -719,13 +719,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
4252  
4253  int crypto_register_notifier(struct notifier_block *nb)
4254  {
4255 -       return blocking_notifier_chain_register(&crypto_chain, nb);
4256 +       return srcu_notifier_chain_register(&crypto_chain, nb);
4257  }
4258  EXPORT_SYMBOL_GPL(crypto_register_notifier);
4259  
4260  int crypto_unregister_notifier(struct notifier_block *nb)
4261  {
4262 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
4263 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
4264  }
4265  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
4266  
4267 diff --git a/crypto/api.c b/crypto/api.c
4268 index bbc147cb5dec..bc1a848f02ec 100644
4269 --- a/crypto/api.c
4270 +++ b/crypto/api.c
4271 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
4272  DECLARE_RWSEM(crypto_alg_sem);
4273  EXPORT_SYMBOL_GPL(crypto_alg_sem);
4274  
4275 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
4276 +SRCU_NOTIFIER_HEAD(crypto_chain);
4277  EXPORT_SYMBOL_GPL(crypto_chain);
4278  
4279  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
4280 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
4281  {
4282         int ok;
4283  
4284 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
4285 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
4286         if (ok == NOTIFY_DONE) {
4287                 request_module("cryptomgr");
4288 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
4289 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
4290         }
4291  
4292         return ok;
4293 diff --git a/crypto/internal.h b/crypto/internal.h
4294 index 00e42a3ed814..2e85551e235f 100644
4295 --- a/crypto/internal.h
4296 +++ b/crypto/internal.h
4297 @@ -47,7 +47,7 @@ struct crypto_larval {
4298  
4299  extern struct list_head crypto_alg_list;
4300  extern struct rw_semaphore crypto_alg_sem;
4301 -extern struct blocking_notifier_head crypto_chain;
4302 +extern struct srcu_notifier_head crypto_chain;
4303  
4304  #ifdef CONFIG_PROC_FS
4305  void __init crypto_init_proc(void);
4306 @@ -143,7 +143,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
4307  
4308  static inline void crypto_notify(unsigned long val, void *v)
4309  {
4310 -       blocking_notifier_call_chain(&crypto_chain, val, v);
4311 +       srcu_notifier_call_chain(&crypto_chain, val, v);
4312  }
4313  
4314  #endif /* _CRYPTO_INTERNAL_H */
4315 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
4316 index faa97604d878..941497f31cf0 100644
4317 --- a/drivers/acpi/acpica/acglobal.h
4318 +++ b/drivers/acpi/acpica/acglobal.h
4319 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
4320   * interrupt level
4321   */
4322  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
4323 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
4324 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
4325  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
4326  
4327  /* Mutex for _OSI support */
4328 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
4329 index 3cf77afd142c..dc32e72132f1 100644
4330 --- a/drivers/acpi/acpica/hwregs.c
4331 +++ b/drivers/acpi/acpica/hwregs.c
4332 @@ -269,14 +269,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
4333                           ACPI_BITMASK_ALL_FIXED_STATUS,
4334                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
4335  
4336 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
4337 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
4338  
4339         /* Clear the fixed events in PM1 A/B */
4340  
4341         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
4342                                         ACPI_BITMASK_ALL_FIXED_STATUS);
4343  
4344 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
4345 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
4346  
4347         if (ACPI_FAILURE(status)) {
4348                 goto exit;
4349 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
4350 index b2e50d8007fe..ff007084dc48 100644
4351 --- a/drivers/acpi/acpica/hwxface.c
4352 +++ b/drivers/acpi/acpica/hwxface.c
4353 @@ -374,7 +374,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
4354                 return_ACPI_STATUS(AE_BAD_PARAMETER);
4355         }
4356  
4357 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
4358 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
4359  
4360         /*
4361          * At this point, we know that the parent register is one of the
4362 @@ -435,7 +435,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
4363  
4364  unlock_and_exit:
4365  
4366 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
4367 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
4368         return_ACPI_STATUS(status);
4369  }
4370  
4371 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
4372 index ce406e39b669..41a75eb3ae9d 100644
4373 --- a/drivers/acpi/acpica/utmutex.c
4374 +++ b/drivers/acpi/acpica/utmutex.c
4375 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
4376                 return_ACPI_STATUS (status);
4377         }
4378  
4379 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
4380 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
4381         if (ACPI_FAILURE (status)) {
4382                 return_ACPI_STATUS (status);
4383         }
4384 @@ -156,7 +156,7 @@ void acpi_ut_mutex_terminate(void)
4385         /* Delete the spinlocks */
4386  
4387         acpi_os_delete_lock(acpi_gbl_gpe_lock);
4388 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
4389 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
4390         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
4391  
4392         /* Delete the reader/writer lock */
4393 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
4394 index 7dbba387d12a..65beb7abb4e7 100644
4395 --- a/drivers/ata/libata-sff.c
4396 +++ b/drivers/ata/libata-sff.c
4397 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
4398         unsigned long flags;
4399         unsigned int consumed;
4400  
4401 -       local_irq_save(flags);
4402 +       local_irq_save_nort(flags);
4403         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
4404 -       local_irq_restore(flags);
4405 +       local_irq_restore_nort(flags);
4406  
4407         return consumed;
4408  }
4409 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
4410                 unsigned long flags;
4411  
4412                 /* FIXME: use a bounce buffer */
4413 -               local_irq_save(flags);
4414 +               local_irq_save_nort(flags);
4415                 buf = kmap_atomic(page);
4416  
4417                 /* do the actual data transfer */
4418 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
4419                                        do_write);
4420  
4421                 kunmap_atomic(buf);
4422 -               local_irq_restore(flags);
4423 +               local_irq_restore_nort(flags);
4424         } else {
4425                 buf = page_address(page);
4426                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
4427 @@ -864,7 +864,7 @@ next_sg:
4428                 unsigned long flags;
4429  
4430                 /* FIXME: use bounce buffer */
4431 -               local_irq_save(flags);
4432 +               local_irq_save_nort(flags);
4433                 buf = kmap_atomic(page);
4434  
4435                 /* do the actual data transfer */
4436 @@ -872,7 +872,7 @@ next_sg:
4437                                                                 count, rw);
4438  
4439                 kunmap_atomic(buf);
4440 -               local_irq_restore(flags);
4441 +               local_irq_restore_nort(flags);
4442         } else {
4443                 buf = page_address(page);
4444                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
4445 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
4446 index 370c2f76016d..65e0b375a291 100644
4447 --- a/drivers/block/zram/zram_drv.c
4448 +++ b/drivers/block/zram/zram_drv.c
4449 @@ -520,6 +520,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
4450                 goto out_error;
4451         }
4452  
4453 +       zram_meta_init_table_locks(meta, disksize);
4454 +
4455         return meta;
4456  
4457  out_error:
4458 @@ -568,12 +570,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
4459         unsigned long handle;
4460         size_t size;
4461  
4462 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4463 +       zram_lock_table(&meta->table[index]);
4464         handle = meta->table[index].handle;
4465         size = zram_get_obj_size(meta, index);
4466  
4467         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
4468 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4469 +               zram_unlock_table(&meta->table[index]);
4470                 clear_page(mem);
4471                 return 0;
4472         }
4473 @@ -584,7 +586,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
4474         else
4475                 ret = zcomp_decompress(zram->comp, cmem, size, mem);
4476         zs_unmap_object(meta->mem_pool, handle);
4477 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4478 +       zram_unlock_table(&meta->table[index]);
4479  
4480         /* Should NEVER happen. Return bio error if it does. */
4481         if (unlikely(ret)) {
4482 @@ -604,14 +606,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
4483         struct zram_meta *meta = zram->meta;
4484         page = bvec->bv_page;
4485  
4486 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4487 +       zram_lock_table(&meta->table[index]);
4488         if (unlikely(!meta->table[index].handle) ||
4489                         zram_test_flag(meta, index, ZRAM_ZERO)) {
4490 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4491 +               zram_unlock_table(&meta->table[index]);
4492                 handle_zero_page(bvec);
4493                 return 0;
4494         }
4495 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4496 +       zram_unlock_table(&meta->table[index]);
4497  
4498         if (is_partial_io(bvec))
4499                 /* Use  a temporary buffer to decompress the page */
4500 @@ -689,10 +691,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
4501                 if (user_mem)
4502                         kunmap_atomic(user_mem);
4503                 /* Free memory associated with this sector now. */
4504 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4505 +               zram_lock_table(&meta->table[index]);
4506                 zram_free_page(zram, index);
4507                 zram_set_flag(meta, index, ZRAM_ZERO);
4508 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4509 +               zram_unlock_table(&meta->table[index]);
4510  
4511                 atomic64_inc(&zram->stats.zero_pages);
4512                 ret = 0;
4513 @@ -752,12 +754,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
4514          * Free memory associated with this sector
4515          * before overwriting unused sectors.
4516          */
4517 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4518 +       zram_lock_table(&meta->table[index]);
4519         zram_free_page(zram, index);
4520  
4521         meta->table[index].handle = handle;
4522         zram_set_obj_size(meta, index, clen);
4523 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4524 +       zram_unlock_table(&meta->table[index]);
4525  
4526         /* Update stats */
4527         atomic64_add(clen, &zram->stats.compr_data_size);
4528 @@ -800,9 +802,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
4529         }
4530  
4531         while (n >= PAGE_SIZE) {
4532 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4533 +               zram_lock_table(&meta->table[index]);
4534                 zram_free_page(zram, index);
4535 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4536 +               zram_unlock_table(&meta->table[index]);
4537                 atomic64_inc(&zram->stats.notify_free);
4538                 index++;
4539                 n -= PAGE_SIZE;
4540 @@ -928,9 +930,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
4541         zram = bdev->bd_disk->private_data;
4542         meta = zram->meta;
4543  
4544 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4545 +       zram_lock_table(&meta->table[index]);
4546         zram_free_page(zram, index);
4547 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4548 +       zram_unlock_table(&meta->table[index]);
4549         atomic64_inc(&zram->stats.notify_free);
4550  }
4551  
4552 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
4553 index 8e92339686d7..9e3e953d680e 100644
4554 --- a/drivers/block/zram/zram_drv.h
4555 +++ b/drivers/block/zram/zram_drv.h
4556 @@ -72,6 +72,9 @@ enum zram_pageflags {
4557  struct zram_table_entry {
4558         unsigned long handle;
4559         unsigned long value;
4560 +#ifdef CONFIG_PREEMPT_RT_BASE
4561 +       spinlock_t lock;
4562 +#endif
4563  };
4564  
4565  struct zram_stats {
4566 @@ -119,4 +122,42 @@ struct zram {
4567          */
4568         bool claim; /* Protected by bdev->bd_mutex */
4569  };
4570 +
4571 +#ifndef CONFIG_PREEMPT_RT_BASE
4572 +static inline void zram_lock_table(struct zram_table_entry *table)
4573 +{
4574 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
4575 +}
4576 +
4577 +static inline void zram_unlock_table(struct zram_table_entry *table)
4578 +{
4579 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
4580 +}
4581 +
4582 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
4583 +#else /* CONFIG_PREEMPT_RT_BASE */
4584 +static inline void zram_lock_table(struct zram_table_entry *table)
4585 +{
4586 +       spin_lock(&table->lock);
4587 +       __set_bit(ZRAM_ACCESS, &table->value);
4588 +}
4589 +
4590 +static inline void zram_unlock_table(struct zram_table_entry *table)
4591 +{
4592 +       __clear_bit(ZRAM_ACCESS, &table->value);
4593 +       spin_unlock(&table->lock);
4594 +}
4595 +
4596 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
4597 +{
4598 +        size_t num_pages = disksize >> PAGE_SHIFT;
4599 +        size_t index;
4600 +
4601 +        for (index = 0; index < num_pages; index++) {
4602 +               spinlock_t *lock = &meta->table[index].lock;
4603 +               spin_lock_init(lock);
4604 +        }
4605 +}
4606 +#endif /* CONFIG_PREEMPT_RT_BASE */
4607 +
4608  #endif
4609 diff --git a/drivers/char/random.c b/drivers/char/random.c
4610 index 491a4dce13fe..cf69b6b42208 100644
4611 --- a/drivers/char/random.c
4612 +++ b/drivers/char/random.c
4613 @@ -799,8 +799,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
4614         } sample;
4615         long delta, delta2, delta3;
4616  
4617 -       preempt_disable();
4618 -
4619         sample.jiffies = jiffies;
4620         sample.cycles = random_get_entropy();
4621         sample.num = num;
4622 @@ -841,7 +839,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
4623                  */
4624                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
4625         }
4626 -       preempt_enable();
4627  }
4628  
4629  void add_input_randomness(unsigned int type, unsigned int code,
4630 @@ -894,28 +891,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
4631         return *(ptr + f->reg_idx++);
4632  }
4633  
4634 -void add_interrupt_randomness(int irq, int irq_flags)
4635 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
4636  {
4637         struct entropy_store    *r;
4638         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
4639 -       struct pt_regs          *regs = get_irq_regs();
4640         unsigned long           now = jiffies;
4641         cycles_t                cycles = random_get_entropy();
4642         __u32                   c_high, j_high;
4643 -       __u64                   ip;
4644         unsigned long           seed;
4645         int                     credit = 0;
4646  
4647         if (cycles == 0)
4648 -               cycles = get_reg(fast_pool, regs);
4649 +               cycles = get_reg(fast_pool, NULL);
4650         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
4651         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
4652         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
4653         fast_pool->pool[1] ^= now ^ c_high;
4654 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
4655 +       if (!ip)
4656 +               ip = _RET_IP_;
4657         fast_pool->pool[2] ^= ip;
4658         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
4659 -               get_reg(fast_pool, regs);
4660 +               get_reg(fast_pool, NULL);
4661  
4662         fast_mix(fast_pool);
4663         add_interrupt_bench(cycles);
4664 diff --git a/drivers/clk/at91/clk-generated.c b/drivers/clk/at91/clk-generated.c
4665 index abc80949e1dd..4ad3298eb372 100644
4666 --- a/drivers/clk/at91/clk-generated.c
4667 +++ b/drivers/clk/at91/clk-generated.c
4668 @@ -15,8 +15,8 @@
4669  #include <linux/clkdev.h>
4670  #include <linux/clk/at91_pmc.h>
4671  #include <linux/of.h>
4672 -#include <linux/of_address.h>
4673 -#include <linux/io.h>
4674 +#include <linux/mfd/syscon.h>
4675 +#include <linux/regmap.h>
4676  
4677  #include "pmc.h"
4678  
4679 @@ -28,8 +28,9 @@
4680  
4681  struct clk_generated {
4682         struct clk_hw hw;
4683 -       struct at91_pmc *pmc;
4684 +       struct regmap *regmap;
4685         struct clk_range range;
4686 +       spinlock_t *lock;
4687         u32 id;
4688         u32 gckdiv;
4689         u8 parent_id;
4690 @@ -41,49 +42,52 @@ struct clk_generated {
4691  static int clk_generated_enable(struct clk_hw *hw)
4692  {
4693         struct clk_generated *gck = to_clk_generated(hw);
4694 -       struct at91_pmc *pmc = gck->pmc;
4695 -       u32 tmp;
4696 +       unsigned long flags;
4697  
4698         pr_debug("GCLK: %s, gckdiv = %d, parent id = %d\n",
4699                  __func__, gck->gckdiv, gck->parent_id);
4700  
4701 -       pmc_lock(pmc);
4702 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4703 -       tmp = pmc_read(pmc, AT91_PMC_PCR) &
4704 -                       ~(AT91_PMC_PCR_GCKDIV_MASK | AT91_PMC_PCR_GCKCSS_MASK);
4705 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_GCKCSS(gck->parent_id)
4706 -                                        | AT91_PMC_PCR_CMD
4707 -                                        | AT91_PMC_PCR_GCKDIV(gck->gckdiv)
4708 -                                        | AT91_PMC_PCR_GCKEN);
4709 -       pmc_unlock(pmc);
4710 +       spin_lock_irqsave(gck->lock, flags);
4711 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4712 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4713 +       regmap_update_bits(gck->regmap, AT91_PMC_PCR,
4714 +                          AT91_PMC_PCR_GCKDIV_MASK | AT91_PMC_PCR_GCKCSS_MASK |
4715 +                          AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
4716 +                          AT91_PMC_PCR_GCKCSS(gck->parent_id) |
4717 +                          AT91_PMC_PCR_CMD |
4718 +                          AT91_PMC_PCR_GCKDIV(gck->gckdiv) |
4719 +                          AT91_PMC_PCR_GCKEN);
4720 +       spin_unlock_irqrestore(gck->lock, flags);
4721         return 0;
4722  }
4723  
4724  static void clk_generated_disable(struct clk_hw *hw)
4725  {
4726         struct clk_generated *gck = to_clk_generated(hw);
4727 -       struct at91_pmc *pmc = gck->pmc;
4728 -       u32 tmp;
4729 -
4730 -       pmc_lock(pmc);
4731 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4732 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_GCKEN;
4733 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_CMD);
4734 -       pmc_unlock(pmc);
4735 +       unsigned long flags;
4736 +
4737 +       spin_lock_irqsave(gck->lock, flags);
4738 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4739 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4740 +       regmap_update_bits(gck->regmap, AT91_PMC_PCR,
4741 +                          AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
4742 +                          AT91_PMC_PCR_CMD);
4743 +       spin_unlock_irqrestore(gck->lock, flags);
4744  }
4745  
4746  static int clk_generated_is_enabled(struct clk_hw *hw)
4747  {
4748         struct clk_generated *gck = to_clk_generated(hw);
4749 -       struct at91_pmc *pmc = gck->pmc;
4750 -       int ret;
4751 +       unsigned long flags;
4752 +       unsigned int status;
4753  
4754 -       pmc_lock(pmc);
4755 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4756 -       ret = !!(pmc_read(pmc, AT91_PMC_PCR) & AT91_PMC_PCR_GCKEN);
4757 -       pmc_unlock(pmc);
4758 +       spin_lock_irqsave(gck->lock, flags);
4759 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4760 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4761 +       regmap_read(gck->regmap, AT91_PMC_PCR, &status);
4762 +       spin_unlock_irqrestore(gck->lock, flags);
4763  
4764 -       return ret;
4765 +       return status & AT91_PMC_PCR_GCKEN ? 1 : 0;
4766  }
4767  
4768  static unsigned long
4769 @@ -214,13 +218,14 @@ static const struct clk_ops generated_ops = {
4770   */
4771  static void clk_generated_startup(struct clk_generated *gck)
4772  {
4773 -       struct at91_pmc *pmc = gck->pmc;
4774         u32 tmp;
4775 +       unsigned long flags;
4776  
4777 -       pmc_lock(pmc);
4778 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4779 -       tmp = pmc_read(pmc, AT91_PMC_PCR);
4780 -       pmc_unlock(pmc);
4781 +       spin_lock_irqsave(gck->lock, flags);
4782 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4783 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4784 +       regmap_read(gck->regmap, AT91_PMC_PCR, &tmp);
4785 +       spin_unlock_irqrestore(gck->lock, flags);
4786  
4787         gck->parent_id = (tmp & AT91_PMC_PCR_GCKCSS_MASK)
4788                                         >> AT91_PMC_PCR_GCKCSS_OFFSET;
4789 @@ -229,8 +234,8 @@ static void clk_generated_startup(struct clk_generated *gck)
4790  }
4791  
4792  static struct clk * __init
4793 -at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4794 -                           const char **parent_names, u8 num_parents,
4795 +at91_clk_register_generated(struct regmap *regmap,  spinlock_t *lock, const char
4796 +                           *name, const char **parent_names, u8 num_parents,
4797                             u8 id, const struct clk_range *range)
4798  {
4799         struct clk_generated *gck;
4800 @@ -249,7 +254,8 @@ at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4801  
4802         gck->id = id;
4803         gck->hw.init = &init;
4804 -       gck->pmc = pmc;
4805 +       gck->regmap = regmap;
4806 +       gck->lock = lock;
4807         gck->range = *range;
4808  
4809         clk = clk_register(NULL, &gck->hw);
4810 @@ -261,8 +267,7 @@ at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4811         return clk;
4812  }
4813  
4814 -void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4815 -                                          struct at91_pmc *pmc)
4816 +void __init of_sama5d2_clk_generated_setup(struct device_node *np)
4817  {
4818         int num;
4819         u32 id;
4820 @@ -272,6 +277,7 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4821         const char *parent_names[GENERATED_SOURCE_MAX];
4822         struct device_node *gcknp;
4823         struct clk_range range = CLK_RANGE(0, 0);
4824 +       struct regmap *regmap;
4825  
4826         num_parents = of_clk_get_parent_count(np);
4827         if (num_parents <= 0 || num_parents > GENERATED_SOURCE_MAX)
4828 @@ -283,6 +289,10 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4829         if (!num || num > PERIPHERAL_MAX)
4830                 return;
4831  
4832 +       regmap = syscon_node_to_regmap(of_get_parent(np));
4833 +       if (IS_ERR(regmap))
4834 +               return;
4835 +
4836         for_each_child_of_node(np, gcknp) {
4837                 if (of_property_read_u32(gcknp, "reg", &id))
4838                         continue;
4839 @@ -296,11 +306,14 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4840                 of_at91_get_clk_range(gcknp, "atmel,clk-output-range",
4841                                       &range);
4842  
4843 -               clk = at91_clk_register_generated(pmc, name, parent_names,
4844 -                                                 num_parents, id, &range);
4845 +               clk = at91_clk_register_generated(regmap, &pmc_pcr_lock, name,
4846 +                                                 parent_names, num_parents,
4847 +                                                 id, &range);
4848                 if (IS_ERR(clk))
4849                         continue;
4850  
4851                 of_clk_add_provider(gcknp, of_clk_src_simple_get, clk);
4852         }
4853  }
4854 +CLK_OF_DECLARE(of_sama5d2_clk_generated_setup, "atmel,sama5d2-clk-generated",
4855 +              of_sama5d2_clk_generated_setup);
4856 diff --git a/drivers/clk/at91/clk-h32mx.c b/drivers/clk/at91/clk-h32mx.c
4857 index a165230e7eda..8e20c8a76db7 100644
4858 --- a/drivers/clk/at91/clk-h32mx.c
4859 +++ b/drivers/clk/at91/clk-h32mx.c
4860 @@ -15,15 +15,9 @@
4861  #include <linux/clk-provider.h>
4862  #include <linux/clkdev.h>
4863  #include <linux/clk/at91_pmc.h>
4864 -#include <linux/delay.h>
4865  #include <linux/of.h>
4866 -#include <linux/of_address.h>
4867 -#include <linux/of_irq.h>
4868 -#include <linux/io.h>
4869 -#include <linux/interrupt.h>
4870 -#include <linux/irq.h>
4871 -#include <linux/sched.h>
4872 -#include <linux/wait.h>
4873 +#include <linux/regmap.h>
4874 +#include <linux/mfd/syscon.h>
4875  
4876  #include "pmc.h"
4877  
4878 @@ -31,7 +25,7 @@
4879  
4880  struct clk_sama5d4_h32mx {
4881         struct clk_hw hw;
4882 -       struct at91_pmc *pmc;
4883 +       struct regmap *regmap;
4884  };
4885  
4886  #define to_clk_sama5d4_h32mx(hw) container_of(hw, struct clk_sama5d4_h32mx, hw)
4887 @@ -40,8 +34,10 @@ static unsigned long clk_sama5d4_h32mx_recalc_rate(struct clk_hw *hw,
4888                                                  unsigned long parent_rate)
4889  {
4890         struct clk_sama5d4_h32mx *h32mxclk = to_clk_sama5d4_h32mx(hw);
4891 +       unsigned int mckr;
4892  
4893 -       if (pmc_read(h32mxclk->pmc, AT91_PMC_MCKR) & AT91_PMC_H32MXDIV)
4894 +       regmap_read(h32mxclk->regmap, AT91_PMC_MCKR, &mckr);
4895 +       if (mckr & AT91_PMC_H32MXDIV)
4896                 return parent_rate / 2;
4897  
4898         if (parent_rate > H32MX_MAX_FREQ)
4899 @@ -70,18 +66,16 @@ static int clk_sama5d4_h32mx_set_rate(struct clk_hw *hw, unsigned long rate,
4900                                     unsigned long parent_rate)
4901  {
4902         struct clk_sama5d4_h32mx *h32mxclk = to_clk_sama5d4_h32mx(hw);
4903 -       struct at91_pmc *pmc = h32mxclk->pmc;
4904 -       u32 tmp;
4905 +       u32 mckr = 0;
4906  
4907         if (parent_rate != rate && (parent_rate / 2) != rate)
4908                 return -EINVAL;
4909  
4910 -       pmc_lock(pmc);
4911 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & ~AT91_PMC_H32MXDIV;
4912         if ((parent_rate / 2) == rate)
4913 -               tmp |= AT91_PMC_H32MXDIV;
4914 -       pmc_write(pmc, AT91_PMC_MCKR, tmp);
4915 -       pmc_unlock(pmc);
4916 +               mckr = AT91_PMC_H32MXDIV;
4917 +
4918 +       regmap_update_bits(h32mxclk->regmap, AT91_PMC_MCKR,
4919 +                          AT91_PMC_H32MXDIV, mckr);
4920  
4921         return 0;
4922  }
4923 @@ -92,14 +86,18 @@ static const struct clk_ops h32mx_ops = {
4924         .set_rate = clk_sama5d4_h32mx_set_rate,
4925  };
4926  
4927 -void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4928 -                                    struct at91_pmc *pmc)
4929 +static void __init of_sama5d4_clk_h32mx_setup(struct device_node *np)
4930  {
4931         struct clk_sama5d4_h32mx *h32mxclk;
4932         struct clk_init_data init;
4933         const char *parent_name;
4934 +       struct regmap *regmap;
4935         struct clk *clk;
4936  
4937 +       regmap = syscon_node_to_regmap(of_get_parent(np));
4938 +       if (IS_ERR(regmap))
4939 +               return;
4940 +
4941         h32mxclk = kzalloc(sizeof(*h32mxclk), GFP_KERNEL);
4942         if (!h32mxclk)
4943                 return;
4944 @@ -113,7 +111,7 @@ void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4945         init.flags = CLK_SET_RATE_GATE;
4946  
4947         h32mxclk->hw.init = &init;
4948 -       h32mxclk->pmc = pmc;
4949 +       h32mxclk->regmap = regmap;
4950  
4951         clk = clk_register(NULL, &h32mxclk->hw);
4952         if (IS_ERR(clk)) {
4953 @@ -123,3 +121,5 @@ void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4954  
4955         of_clk_add_provider(np, of_clk_src_simple_get, clk);
4956  }
4957 +CLK_OF_DECLARE(of_sama5d4_clk_h32mx_setup, "atmel,sama5d4-clk-h32mx",
4958 +              of_sama5d4_clk_h32mx_setup);
4959 diff --git a/drivers/clk/at91/clk-main.c b/drivers/clk/at91/clk-main.c
4960 index fd7247deabdc..4bfc94d6c26e 100644
4961 --- a/drivers/clk/at91/clk-main.c
4962 +++ b/drivers/clk/at91/clk-main.c
4963 @@ -13,13 +13,8 @@
4964  #include <linux/clk/at91_pmc.h>
4965  #include <linux/delay.h>
4966  #include <linux/of.h>
4967 -#include <linux/of_address.h>
4968 -#include <linux/of_irq.h>
4969 -#include <linux/io.h>
4970 -#include <linux/interrupt.h>
4971 -#include <linux/irq.h>
4972 -#include <linux/sched.h>
4973 -#include <linux/wait.h>
4974 +#include <linux/mfd/syscon.h>
4975 +#include <linux/regmap.h>
4976  
4977  #include "pmc.h"
4978  
4979 @@ -34,18 +29,14 @@
4980  
4981  struct clk_main_osc {
4982         struct clk_hw hw;
4983 -       struct at91_pmc *pmc;
4984 -       unsigned int irq;
4985 -       wait_queue_head_t wait;
4986 +       struct regmap *regmap;
4987  };
4988  
4989  #define to_clk_main_osc(hw) container_of(hw, struct clk_main_osc, hw)
4990  
4991  struct clk_main_rc_osc {
4992         struct clk_hw hw;
4993 -       struct at91_pmc *pmc;
4994 -       unsigned int irq;
4995 -       wait_queue_head_t wait;
4996 +       struct regmap *regmap;
4997         unsigned long frequency;
4998         unsigned long accuracy;
4999  };
5000 @@ -54,51 +45,47 @@ struct clk_main_rc_osc {
5001  
5002  struct clk_rm9200_main {
5003         struct clk_hw hw;
5004 -       struct at91_pmc *pmc;
5005 +       struct regmap *regmap;
5006  };
5007  
5008  #define to_clk_rm9200_main(hw) container_of(hw, struct clk_rm9200_main, hw)
5009  
5010  struct clk_sam9x5_main {
5011         struct clk_hw hw;
5012 -       struct at91_pmc *pmc;
5013 -       unsigned int irq;
5014 -       wait_queue_head_t wait;
5015 +       struct regmap *regmap;
5016         u8 parent;
5017  };
5018  
5019  #define to_clk_sam9x5_main(hw) container_of(hw, struct clk_sam9x5_main, hw)
5020  
5021 -static irqreturn_t clk_main_osc_irq_handler(int irq, void *dev_id)
5022 +static inline bool clk_main_osc_ready(struct regmap *regmap)
5023  {
5024 -       struct clk_main_osc *osc = dev_id;
5025 +       unsigned int status;
5026  
5027 -       wake_up(&osc->wait);
5028 -       disable_irq_nosync(osc->irq);
5029 +       regmap_read(regmap, AT91_PMC_SR, &status);
5030  
5031 -       return IRQ_HANDLED;
5032 +       return status & AT91_PMC_MOSCS;
5033  }
5034  
5035  static int clk_main_osc_prepare(struct clk_hw *hw)
5036  {
5037         struct clk_main_osc *osc = to_clk_main_osc(hw);
5038 -       struct at91_pmc *pmc = osc->pmc;
5039 +       struct regmap *regmap = osc->regmap;
5040         u32 tmp;
5041  
5042 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5043 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5044 +       tmp &= ~MOR_KEY_MASK;
5045 +
5046         if (tmp & AT91_PMC_OSCBYPASS)
5047                 return 0;
5048  
5049         if (!(tmp & AT91_PMC_MOSCEN)) {
5050                 tmp |= AT91_PMC_MOSCEN | AT91_PMC_KEY;
5051 -               pmc_write(pmc, AT91_CKGR_MOR, tmp);
5052 +               regmap_write(regmap, AT91_CKGR_MOR, tmp);
5053         }
5054  
5055 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS)) {
5056 -               enable_irq(osc->irq);
5057 -               wait_event(osc->wait,
5058 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS);
5059 -       }
5060 +       while (!clk_main_osc_ready(regmap))
5061 +               cpu_relax();
5062  
5063         return 0;
5064  }
5065 @@ -106,9 +93,10 @@ static int clk_main_osc_prepare(struct clk_hw *hw)
5066  static void clk_main_osc_unprepare(struct clk_hw *hw)
5067  {
5068         struct clk_main_osc *osc = to_clk_main_osc(hw);
5069 -       struct at91_pmc *pmc = osc->pmc;
5070 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5071 +       struct regmap *regmap = osc->regmap;
5072 +       u32 tmp;
5073  
5074 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5075         if (tmp & AT91_PMC_OSCBYPASS)
5076                 return;
5077  
5078 @@ -116,20 +104,22 @@ static void clk_main_osc_unprepare(struct clk_hw *hw)
5079                 return;
5080  
5081         tmp &= ~(AT91_PMC_KEY | AT91_PMC_MOSCEN);
5082 -       pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5083 +       regmap_write(regmap, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5084  }
5085  
5086  static int clk_main_osc_is_prepared(struct clk_hw *hw)
5087  {
5088         struct clk_main_osc *osc = to_clk_main_osc(hw);
5089 -       struct at91_pmc *pmc = osc->pmc;
5090 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5091 +       struct regmap *regmap = osc->regmap;
5092 +       u32 tmp, status;
5093  
5094 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5095         if (tmp & AT91_PMC_OSCBYPASS)
5096                 return 1;
5097  
5098 -       return !!((pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS) &&
5099 -                 (pmc_read(pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCEN));
5100 +       regmap_read(regmap, AT91_PMC_SR, &status);
5101 +
5102 +       return (status & AT91_PMC_MOSCS) && (tmp & AT91_PMC_MOSCEN);
5103  }
5104  
5105  static const struct clk_ops main_osc_ops = {
5106 @@ -139,18 +129,16 @@ static const struct clk_ops main_osc_ops = {
5107  };
5108  
5109  static struct clk * __init
5110 -at91_clk_register_main_osc(struct at91_pmc *pmc,
5111 -                          unsigned int irq,
5112 +at91_clk_register_main_osc(struct regmap *regmap,
5113                            const char *name,
5114                            const char *parent_name,
5115                            bool bypass)
5116  {
5117 -       int ret;
5118         struct clk_main_osc *osc;
5119         struct clk *clk = NULL;
5120         struct clk_init_data init;
5121  
5122 -       if (!pmc || !irq || !name || !parent_name)
5123 +       if (!name || !parent_name)
5124                 return ERR_PTR(-EINVAL);
5125  
5126         osc = kzalloc(sizeof(*osc), GFP_KERNEL);
5127 @@ -164,85 +152,70 @@ at91_clk_register_main_osc(struct at91_pmc *pmc,
5128         init.flags = CLK_IGNORE_UNUSED;
5129  
5130         osc->hw.init = &init;
5131 -       osc->pmc = pmc;
5132 -       osc->irq = irq;
5133 -
5134 -       init_waitqueue_head(&osc->wait);
5135 -       irq_set_status_flags(osc->irq, IRQ_NOAUTOEN);
5136 -       ret = request_irq(osc->irq, clk_main_osc_irq_handler,
5137 -                         IRQF_TRIGGER_HIGH, name, osc);
5138 -       if (ret) {
5139 -               kfree(osc);
5140 -               return ERR_PTR(ret);
5141 -       }
5142 +       osc->regmap = regmap;
5143  
5144         if (bypass)
5145 -               pmc_write(pmc, AT91_CKGR_MOR,
5146 -                         (pmc_read(pmc, AT91_CKGR_MOR) &
5147 -                          ~(MOR_KEY_MASK | AT91_PMC_MOSCEN)) |
5148 -                         AT91_PMC_OSCBYPASS | AT91_PMC_KEY);
5149 +               regmap_update_bits(regmap,
5150 +                                  AT91_CKGR_MOR, MOR_KEY_MASK |
5151 +                                  AT91_PMC_MOSCEN,
5152 +                                  AT91_PMC_OSCBYPASS | AT91_PMC_KEY);
5153  
5154         clk = clk_register(NULL, &osc->hw);
5155 -       if (IS_ERR(clk)) {
5156 -               free_irq(irq, osc);
5157 +       if (IS_ERR(clk))
5158                 kfree(osc);
5159 -       }
5160  
5161         return clk;
5162  }
5163  
5164 -void __init of_at91rm9200_clk_main_osc_setup(struct device_node *np,
5165 -                                            struct at91_pmc *pmc)
5166 +static void __init of_at91rm9200_clk_main_osc_setup(struct device_node *np)
5167  {
5168         struct clk *clk;
5169 -       unsigned int irq;
5170         const char *name = np->name;
5171         const char *parent_name;
5172 +       struct regmap *regmap;
5173         bool bypass;
5174  
5175         of_property_read_string(np, "clock-output-names", &name);
5176         bypass = of_property_read_bool(np, "atmel,osc-bypass");
5177         parent_name = of_clk_get_parent_name(np, 0);
5178  
5179 -       irq = irq_of_parse_and_map(np, 0);
5180 -       if (!irq)
5181 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5182 +       if (IS_ERR(regmap))
5183                 return;
5184  
5185 -       clk = at91_clk_register_main_osc(pmc, irq, name, parent_name, bypass);
5186 +       clk = at91_clk_register_main_osc(regmap, name, parent_name, bypass);
5187         if (IS_ERR(clk))
5188                 return;
5189  
5190         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5191  }
5192 +CLK_OF_DECLARE(at91rm9200_clk_main_osc, "atmel,at91rm9200-clk-main-osc",
5193 +              of_at91rm9200_clk_main_osc_setup);
5194  
5195 -static irqreturn_t clk_main_rc_osc_irq_handler(int irq, void *dev_id)
5196 +static bool clk_main_rc_osc_ready(struct regmap *regmap)
5197  {
5198 -       struct clk_main_rc_osc *osc = dev_id;
5199 +       unsigned int status;
5200  
5201 -       wake_up(&osc->wait);
5202 -       disable_irq_nosync(osc->irq);
5203 +       regmap_read(regmap, AT91_PMC_SR, &status);
5204  
5205 -       return IRQ_HANDLED;
5206 +       return status & AT91_PMC_MOSCRCS;
5207  }
5208  
5209  static int clk_main_rc_osc_prepare(struct clk_hw *hw)
5210  {
5211         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5212 -       struct at91_pmc *pmc = osc->pmc;
5213 -       u32 tmp;
5214 +       struct regmap *regmap = osc->regmap;
5215 +       unsigned int mor;
5216  
5217 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5218 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5219  
5220 -       if (!(tmp & AT91_PMC_MOSCRCEN)) {
5221 -               tmp |= AT91_PMC_MOSCRCEN | AT91_PMC_KEY;
5222 -               pmc_write(pmc, AT91_CKGR_MOR, tmp);
5223 -       }
5224 +       if (!(mor & AT91_PMC_MOSCRCEN))
5225 +               regmap_update_bits(regmap, AT91_CKGR_MOR,
5226 +                                  MOR_KEY_MASK | AT91_PMC_MOSCRCEN,
5227 +                                  AT91_PMC_MOSCRCEN | AT91_PMC_KEY);
5228  
5229 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS)) {
5230 -               enable_irq(osc->irq);
5231 -               wait_event(osc->wait,
5232 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS);
5233 -       }
5234 +       while (!clk_main_rc_osc_ready(regmap))
5235 +               cpu_relax();
5236  
5237         return 0;
5238  }
5239 @@ -250,23 +223,28 @@ static int clk_main_rc_osc_prepare(struct clk_hw *hw)
5240  static void clk_main_rc_osc_unprepare(struct clk_hw *hw)
5241  {
5242         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5243 -       struct at91_pmc *pmc = osc->pmc;
5244 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5245 +       struct regmap *regmap = osc->regmap;
5246 +       unsigned int mor;
5247 +
5248 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5249  
5250 -       if (!(tmp & AT91_PMC_MOSCRCEN))
5251 +       if (!(mor & AT91_PMC_MOSCRCEN))
5252                 return;
5253  
5254 -       tmp &= ~(MOR_KEY_MASK | AT91_PMC_MOSCRCEN);
5255 -       pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5256 +       regmap_update_bits(regmap, AT91_CKGR_MOR,
5257 +                          MOR_KEY_MASK | AT91_PMC_MOSCRCEN, AT91_PMC_KEY);
5258  }
5259  
5260  static int clk_main_rc_osc_is_prepared(struct clk_hw *hw)
5261  {
5262         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5263 -       struct at91_pmc *pmc = osc->pmc;
5264 +       struct regmap *regmap = osc->regmap;
5265 +       unsigned int mor, status;
5266  
5267 -       return !!((pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS) &&
5268 -                 (pmc_read(pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCRCEN));
5269 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5270 +       regmap_read(regmap, AT91_PMC_SR, &status);
5271 +
5272 +       return (mor & AT91_PMC_MOSCRCEN) && (status & AT91_PMC_MOSCRCS);
5273  }
5274  
5275  static unsigned long clk_main_rc_osc_recalc_rate(struct clk_hw *hw,
5276 @@ -294,17 +272,15 @@ static const struct clk_ops main_rc_osc_ops = {
5277  };
5278  
5279  static struct clk * __init
5280 -at91_clk_register_main_rc_osc(struct at91_pmc *pmc,
5281 -                             unsigned int irq,
5282 +at91_clk_register_main_rc_osc(struct regmap *regmap,
5283                               const char *name,
5284                               u32 frequency, u32 accuracy)
5285  {
5286 -       int ret;
5287         struct clk_main_rc_osc *osc;
5288         struct clk *clk = NULL;
5289         struct clk_init_data init;
5290  
5291 -       if (!pmc || !irq || !name || !frequency)
5292 +       if (!name || !frequency)
5293                 return ERR_PTR(-EINVAL);
5294  
5295         osc = kzalloc(sizeof(*osc), GFP_KERNEL);
5296 @@ -318,63 +294,53 @@ at91_clk_register_main_rc_osc(struct at91_pmc *pmc,
5297         init.flags = CLK_IS_ROOT | CLK_IGNORE_UNUSED;
5298  
5299         osc->hw.init = &init;
5300 -       osc->pmc = pmc;
5301 -       osc->irq = irq;
5302 +       osc->regmap = regmap;
5303         osc->frequency = frequency;
5304         osc->accuracy = accuracy;
5305  
5306 -       init_waitqueue_head(&osc->wait);
5307 -       irq_set_status_flags(osc->irq, IRQ_NOAUTOEN);
5308 -       ret = request_irq(osc->irq, clk_main_rc_osc_irq_handler,
5309 -                         IRQF_TRIGGER_HIGH, name, osc);
5310 -       if (ret)
5311 -               return ERR_PTR(ret);
5312 -
5313         clk = clk_register(NULL, &osc->hw);
5314 -       if (IS_ERR(clk)) {
5315 -               free_irq(irq, osc);
5316 +       if (IS_ERR(clk))
5317                 kfree(osc);
5318 -       }
5319  
5320         return clk;
5321  }
5322  
5323 -void __init of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np,
5324 -                                               struct at91_pmc *pmc)
5325 +static void __init of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np)
5326  {
5327         struct clk *clk;
5328 -       unsigned int irq;
5329         u32 frequency = 0;
5330         u32 accuracy = 0;
5331         const char *name = np->name;
5332 +       struct regmap *regmap;
5333  
5334         of_property_read_string(np, "clock-output-names", &name);
5335         of_property_read_u32(np, "clock-frequency", &frequency);
5336         of_property_read_u32(np, "clock-accuracy", &accuracy);
5337  
5338 -       irq = irq_of_parse_and_map(np, 0);
5339 -       if (!irq)
5340 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5341 +       if (IS_ERR(regmap))
5342                 return;
5343  
5344 -       clk = at91_clk_register_main_rc_osc(pmc, irq, name, frequency,
5345 -                                           accuracy);
5346 +       clk = at91_clk_register_main_rc_osc(regmap, name, frequency, accuracy);
5347         if (IS_ERR(clk))
5348                 return;
5349  
5350         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5351  }
5352 +CLK_OF_DECLARE(at91sam9x5_clk_main_rc_osc, "atmel,at91sam9x5-clk-main-rc-osc",
5353 +              of_at91sam9x5_clk_main_rc_osc_setup);
5354  
5355  
5356 -static int clk_main_probe_frequency(struct at91_pmc *pmc)
5357 +static int clk_main_probe_frequency(struct regmap *regmap)
5358  {
5359         unsigned long prep_time, timeout;
5360 -       u32 tmp;
5361 +       unsigned int mcfr;
5362  
5363         timeout = jiffies + usecs_to_jiffies(MAINFRDY_TIMEOUT);
5364         do {
5365                 prep_time = jiffies;
5366 -               tmp = pmc_read(pmc, AT91_CKGR_MCFR);
5367 -               if (tmp & AT91_PMC_MAINRDY)
5368 +               regmap_read(regmap, AT91_CKGR_MCFR, &mcfr);
5369 +               if (mcfr & AT91_PMC_MAINRDY)
5370                         return 0;
5371                 usleep_range(MAINF_LOOP_MIN_WAIT, MAINF_LOOP_MAX_WAIT);
5372         } while (time_before(prep_time, timeout));
5373 @@ -382,34 +348,37 @@ static int clk_main_probe_frequency(struct at91_pmc *pmc)
5374         return -ETIMEDOUT;
5375  }
5376  
5377 -static unsigned long clk_main_recalc_rate(struct at91_pmc *pmc,
5378 +static unsigned long clk_main_recalc_rate(struct regmap *regmap,
5379                                           unsigned long parent_rate)
5380  {
5381 -       u32 tmp;
5382 +       unsigned int mcfr;
5383  
5384         if (parent_rate)
5385                 return parent_rate;
5386  
5387         pr_warn("Main crystal frequency not set, using approximate value\n");
5388 -       tmp = pmc_read(pmc, AT91_CKGR_MCFR);
5389 -       if (!(tmp & AT91_PMC_MAINRDY))
5390 +       regmap_read(regmap, AT91_CKGR_MCFR, &mcfr);
5391 +       if (!(mcfr & AT91_PMC_MAINRDY))
5392                 return 0;
5393  
5394 -       return ((tmp & AT91_PMC_MAINF) * SLOW_CLOCK_FREQ) / MAINF_DIV;
5395 +       return ((mcfr & AT91_PMC_MAINF) * SLOW_CLOCK_FREQ) / MAINF_DIV;
5396  }
5397  
5398  static int clk_rm9200_main_prepare(struct clk_hw *hw)
5399  {
5400         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5401  
5402 -       return clk_main_probe_frequency(clkmain->pmc);
5403 +       return clk_main_probe_frequency(clkmain->regmap);
5404  }
5405  
5406  static int clk_rm9200_main_is_prepared(struct clk_hw *hw)
5407  {
5408         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5409 +       unsigned int status;
5410 +
5411 +       regmap_read(clkmain->regmap, AT91_CKGR_MCFR, &status);
5412  
5413 -       return !!(pmc_read(clkmain->pmc, AT91_CKGR_MCFR) & AT91_PMC_MAINRDY);
5414 +       return status & AT91_PMC_MAINRDY ? 1 : 0;
5415  }
5416  
5417  static unsigned long clk_rm9200_main_recalc_rate(struct clk_hw *hw,
5418 @@ -417,7 +386,7 @@ static unsigned long clk_rm9200_main_recalc_rate(struct clk_hw *hw,
5419  {
5420         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5421  
5422 -       return clk_main_recalc_rate(clkmain->pmc, parent_rate);
5423 +       return clk_main_recalc_rate(clkmain->regmap, parent_rate);
5424  }
5425  
5426  static const struct clk_ops rm9200_main_ops = {
5427 @@ -427,7 +396,7 @@ static const struct clk_ops rm9200_main_ops = {
5428  };
5429  
5430  static struct clk * __init
5431 -at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5432 +at91_clk_register_rm9200_main(struct regmap *regmap,
5433                               const char *name,
5434                               const char *parent_name)
5435  {
5436 @@ -435,7 +404,7 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5437         struct clk *clk = NULL;
5438         struct clk_init_data init;
5439  
5440 -       if (!pmc || !name)
5441 +       if (!name)
5442                 return ERR_PTR(-EINVAL);
5443  
5444         if (!parent_name)
5445 @@ -452,7 +421,7 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5446         init.flags = 0;
5447  
5448         clkmain->hw.init = &init;
5449 -       clkmain->pmc = pmc;
5450 +       clkmain->regmap = regmap;
5451  
5452         clk = clk_register(NULL, &clkmain->hw);
5453         if (IS_ERR(clk))
5454 @@ -461,52 +430,54 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5455         return clk;
5456  }
5457  
5458 -void __init of_at91rm9200_clk_main_setup(struct device_node *np,
5459 -                                        struct at91_pmc *pmc)
5460 +static void __init of_at91rm9200_clk_main_setup(struct device_node *np)
5461  {
5462         struct clk *clk;
5463         const char *parent_name;
5464         const char *name = np->name;
5465 +       struct regmap *regmap;
5466  
5467         parent_name = of_clk_get_parent_name(np, 0);
5468         of_property_read_string(np, "clock-output-names", &name);
5469  
5470 -       clk = at91_clk_register_rm9200_main(pmc, name, parent_name);
5471 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5472 +       if (IS_ERR(regmap))
5473 +               return;
5474 +
5475 +       clk = at91_clk_register_rm9200_main(regmap, name, parent_name);
5476         if (IS_ERR(clk))
5477                 return;
5478  
5479         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5480  }
5481 +CLK_OF_DECLARE(at91rm9200_clk_main, "atmel,at91rm9200-clk-main",
5482 +              of_at91rm9200_clk_main_setup);
5483  
5484 -static irqreturn_t clk_sam9x5_main_irq_handler(int irq, void *dev_id)
5485 +static inline bool clk_sam9x5_main_ready(struct regmap *regmap)
5486  {
5487 -       struct clk_sam9x5_main *clkmain = dev_id;
5488 +       unsigned int status;
5489  
5490 -       wake_up(&clkmain->wait);
5491 -       disable_irq_nosync(clkmain->irq);
5492 +       regmap_read(regmap, AT91_PMC_SR, &status);
5493  
5494 -       return IRQ_HANDLED;
5495 +       return status & AT91_PMC_MOSCSELS ? 1 : 0;
5496  }
5497  
5498  static int clk_sam9x5_main_prepare(struct clk_hw *hw)
5499  {
5500         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5501 -       struct at91_pmc *pmc = clkmain->pmc;
5502 +       struct regmap *regmap = clkmain->regmap;
5503  
5504 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS)) {
5505 -               enable_irq(clkmain->irq);
5506 -               wait_event(clkmain->wait,
5507 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5508 -       }
5509 +       while (!clk_sam9x5_main_ready(regmap))
5510 +               cpu_relax();
5511  
5512 -       return clk_main_probe_frequency(pmc);
5513 +       return clk_main_probe_frequency(regmap);
5514  }
5515  
5516  static int clk_sam9x5_main_is_prepared(struct clk_hw *hw)
5517  {
5518         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5519  
5520 -       return !!(pmc_read(clkmain->pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5521 +       return clk_sam9x5_main_ready(clkmain->regmap);
5522  }
5523  
5524  static unsigned long clk_sam9x5_main_recalc_rate(struct clk_hw *hw,
5525 @@ -514,30 +485,28 @@ static unsigned long clk_sam9x5_main_recalc_rate(struct clk_hw *hw,
5526  {
5527         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5528  
5529 -       return clk_main_recalc_rate(clkmain->pmc, parent_rate);
5530 +       return clk_main_recalc_rate(clkmain->regmap, parent_rate);
5531  }
5532  
5533  static int clk_sam9x5_main_set_parent(struct clk_hw *hw, u8 index)
5534  {
5535         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5536 -       struct at91_pmc *pmc = clkmain->pmc;
5537 -       u32 tmp;
5538 +       struct regmap *regmap = clkmain->regmap;
5539 +       unsigned int tmp;
5540  
5541         if (index > 1)
5542                 return -EINVAL;
5543  
5544 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5545 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5546 +       tmp &= ~MOR_KEY_MASK;
5547  
5548         if (index && !(tmp & AT91_PMC_MOSCSEL))
5549 -               pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_MOSCSEL);
5550 +               regmap_write(regmap, AT91_CKGR_MOR, tmp | AT91_PMC_MOSCSEL);
5551         else if (!index && (tmp & AT91_PMC_MOSCSEL))
5552 -               pmc_write(pmc, AT91_CKGR_MOR, tmp & ~AT91_PMC_MOSCSEL);
5553 +               regmap_write(regmap, AT91_CKGR_MOR, tmp & ~AT91_PMC_MOSCSEL);
5554  
5555 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS)) {
5556 -               enable_irq(clkmain->irq);
5557 -               wait_event(clkmain->wait,
5558 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5559 -       }
5560 +       while (!clk_sam9x5_main_ready(regmap))
5561 +               cpu_relax();
5562  
5563         return 0;
5564  }
5565 @@ -545,8 +514,11 @@ static int clk_sam9x5_main_set_parent(struct clk_hw *hw, u8 index)
5566  static u8 clk_sam9x5_main_get_parent(struct clk_hw *hw)
5567  {
5568         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5569 +       unsigned int status;
5570 +
5571 +       regmap_read(clkmain->regmap, AT91_CKGR_MOR, &status);
5572  
5573 -       return !!(pmc_read(clkmain->pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCEN);
5574 +       return status & AT91_PMC_MOSCEN ? 1 : 0;
5575  }
5576  
5577  static const struct clk_ops sam9x5_main_ops = {
5578 @@ -558,18 +530,17 @@ static const struct clk_ops sam9x5_main_ops = {
5579  };
5580  
5581  static struct clk * __init
5582 -at91_clk_register_sam9x5_main(struct at91_pmc *pmc,
5583 -                             unsigned int irq,
5584 +at91_clk_register_sam9x5_main(struct regmap *regmap,
5585                               const char *name,
5586                               const char **parent_names,
5587                               int num_parents)
5588  {
5589 -       int ret;
5590         struct clk_sam9x5_main *clkmain;
5591         struct clk *clk = NULL;
5592         struct clk_init_data init;
5593 +       unsigned int status;
5594  
5595 -       if (!pmc || !irq || !name)
5596 +       if (!name)
5597                 return ERR_PTR(-EINVAL);
5598  
5599         if (!parent_names || !num_parents)
5600 @@ -586,51 +557,42 @@ at91_clk_register_sam9x5_main(struct at91_pmc *pmc,
5601         init.flags = CLK_SET_PARENT_GATE;
5602  
5603         clkmain->hw.init = &init;
5604 -       clkmain->pmc = pmc;
5605 -       clkmain->irq = irq;
5606 -       clkmain->parent = !!(pmc_read(clkmain->pmc, AT91_CKGR_MOR) &
5607 -                            AT91_PMC_MOSCEN);
5608 -       init_waitqueue_head(&clkmain->wait);
5609 -       irq_set_status_flags(clkmain->irq, IRQ_NOAUTOEN);
5610 -       ret = request_irq(clkmain->irq, clk_sam9x5_main_irq_handler,
5611 -                         IRQF_TRIGGER_HIGH, name, clkmain);
5612 -       if (ret)
5613 -               return ERR_PTR(ret);
5614 +       clkmain->regmap = regmap;
5615 +       regmap_read(clkmain->regmap, AT91_CKGR_MOR, &status);
5616 +       clkmain->parent = status & AT91_PMC_MOSCEN ? 1 : 0;
5617  
5618         clk = clk_register(NULL, &clkmain->hw);
5619 -       if (IS_ERR(clk)) {
5620 -               free_irq(clkmain->irq, clkmain);
5621 +       if (IS_ERR(clk))
5622                 kfree(clkmain);
5623 -       }
5624  
5625         return clk;
5626  }
5627  
5628 -void __init of_at91sam9x5_clk_main_setup(struct device_node *np,
5629 -                                        struct at91_pmc *pmc)
5630 +static void __init of_at91sam9x5_clk_main_setup(struct device_node *np)
5631  {
5632         struct clk *clk;
5633         const char *parent_names[2];
5634         int num_parents;
5635 -       unsigned int irq;
5636         const char *name = np->name;
5637 +       struct regmap *regmap;
5638  
5639         num_parents = of_clk_get_parent_count(np);
5640         if (num_parents <= 0 || num_parents > 2)
5641                 return;
5642  
5643         of_clk_parent_fill(np, parent_names, num_parents);
5644 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5645 +       if (IS_ERR(regmap))
5646 +               return;
5647  
5648         of_property_read_string(np, "clock-output-names", &name);
5649  
5650 -       irq = irq_of_parse_and_map(np, 0);
5651 -       if (!irq)
5652 -               return;
5653 -
5654 -       clk = at91_clk_register_sam9x5_main(pmc, irq, name, parent_names,
5655 +       clk = at91_clk_register_sam9x5_main(regmap, name, parent_names,
5656                                             num_parents);
5657         if (IS_ERR(clk))
5658                 return;
5659  
5660         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5661  }
5662 +CLK_OF_DECLARE(at91sam9x5_clk_main, "atmel,at91sam9x5-clk-main",
5663 +              of_at91sam9x5_clk_main_setup);
5664 diff --git a/drivers/clk/at91/clk-master.c b/drivers/clk/at91/clk-master.c
5665 index 620ea323356b..7d4a1864ea7c 100644
5666 --- a/drivers/clk/at91/clk-master.c
5667 +++ b/drivers/clk/at91/clk-master.c
5668 @@ -12,13 +12,8 @@
5669  #include <linux/clkdev.h>
5670  #include <linux/clk/at91_pmc.h>
5671  #include <linux/of.h>
5672 -#include <linux/of_address.h>
5673 -#include <linux/of_irq.h>
5674 -#include <linux/io.h>
5675 -#include <linux/wait.h>
5676 -#include <linux/sched.h>
5677 -#include <linux/interrupt.h>
5678 -#include <linux/irq.h>
5679 +#include <linux/mfd/syscon.h>
5680 +#include <linux/regmap.h>
5681  
5682  #include "pmc.h"
5683  
5684 @@ -44,32 +39,26 @@ struct clk_master_layout {
5685  
5686  struct clk_master {
5687         struct clk_hw hw;
5688 -       struct at91_pmc *pmc;
5689 -       unsigned int irq;
5690 -       wait_queue_head_t wait;
5691 +       struct regmap *regmap;
5692         const struct clk_master_layout *layout;
5693         const struct clk_master_characteristics *characteristics;
5694  };
5695  
5696 -static irqreturn_t clk_master_irq_handler(int irq, void *dev_id)
5697 +static inline bool clk_master_ready(struct regmap *regmap)
5698  {
5699 -       struct clk_master *master = (struct clk_master *)dev_id;
5700 +       unsigned int status;
5701  
5702 -       wake_up(&master->wait);
5703 -       disable_irq_nosync(master->irq);
5704 +       regmap_read(regmap, AT91_PMC_SR, &status);
5705  
5706 -       return IRQ_HANDLED;
5707 +       return status & AT91_PMC_MCKRDY ? 1 : 0;
5708  }
5709 +
5710  static int clk_master_prepare(struct clk_hw *hw)
5711  {
5712         struct clk_master *master = to_clk_master(hw);
5713 -       struct at91_pmc *pmc = master->pmc;
5714  
5715 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY)) {
5716 -               enable_irq(master->irq);
5717 -               wait_event(master->wait,
5718 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY);
5719 -       }
5720 +       while (!clk_master_ready(master->regmap))
5721 +               cpu_relax();
5722  
5723         return 0;
5724  }
5725 @@ -78,7 +67,7 @@ static int clk_master_is_prepared(struct clk_hw *hw)
5726  {
5727         struct clk_master *master = to_clk_master(hw);
5728  
5729 -       return !!(pmc_read(master->pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY);
5730 +       return clk_master_ready(master->regmap);
5731  }
5732  
5733  static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5734 @@ -88,18 +77,16 @@ static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5735         u8 div;
5736         unsigned long rate = parent_rate;
5737         struct clk_master *master = to_clk_master(hw);
5738 -       struct at91_pmc *pmc = master->pmc;
5739         const struct clk_master_layout *layout = master->layout;
5740         const struct clk_master_characteristics *characteristics =
5741                                                 master->characteristics;
5742 -       u32 tmp;
5743 +       unsigned int mckr;
5744  
5745 -       pmc_lock(pmc);
5746 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & layout->mask;
5747 -       pmc_unlock(pmc);
5748 +       regmap_read(master->regmap, AT91_PMC_MCKR, &mckr);
5749 +       mckr &= layout->mask;
5750  
5751 -       pres = (tmp >> layout->pres_shift) & MASTER_PRES_MASK;
5752 -       div = (tmp >> MASTER_DIV_SHIFT) & MASTER_DIV_MASK;
5753 +       pres = (mckr >> layout->pres_shift) & MASTER_PRES_MASK;
5754 +       div = (mckr >> MASTER_DIV_SHIFT) & MASTER_DIV_MASK;
5755  
5756         if (characteristics->have_div3_pres && pres == MASTER_PRES_MAX)
5757                 rate /= 3;
5758 @@ -119,9 +106,11 @@ static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5759  static u8 clk_master_get_parent(struct clk_hw *hw)
5760  {
5761         struct clk_master *master = to_clk_master(hw);
5762 -       struct at91_pmc *pmc = master->pmc;
5763 +       unsigned int mckr;
5764  
5765 -       return pmc_read(pmc, AT91_PMC_MCKR) & AT91_PMC_CSS;
5766 +       regmap_read(master->regmap, AT91_PMC_MCKR, &mckr);
5767 +
5768 +       return mckr & AT91_PMC_CSS;
5769  }
5770  
5771  static const struct clk_ops master_ops = {
5772 @@ -132,18 +121,17 @@ static const struct clk_ops master_ops = {
5773  };
5774  
5775  static struct clk * __init
5776 -at91_clk_register_master(struct at91_pmc *pmc, unsigned int irq,
5777 +at91_clk_register_master(struct regmap *regmap,
5778                 const char *name, int num_parents,
5779                 const char **parent_names,
5780                 const struct clk_master_layout *layout,
5781                 const struct clk_master_characteristics *characteristics)
5782  {
5783 -       int ret;
5784         struct clk_master *master;
5785         struct clk *clk = NULL;
5786         struct clk_init_data init;
5787  
5788 -       if (!pmc || !irq || !name || !num_parents || !parent_names)
5789 +       if (!name || !num_parents || !parent_names)
5790                 return ERR_PTR(-EINVAL);
5791  
5792         master = kzalloc(sizeof(*master), GFP_KERNEL);
5793 @@ -159,20 +147,10 @@ at91_clk_register_master(struct at91_pmc *pmc, unsigned int irq,
5794         master->hw.init = &init;
5795         master->layout = layout;
5796         master->characteristics = characteristics;
5797 -       master->pmc = pmc;
5798 -       master->irq = irq;
5799 -       init_waitqueue_head(&master->wait);
5800 -       irq_set_status_flags(master->irq, IRQ_NOAUTOEN);
5801 -       ret = request_irq(master->irq, clk_master_irq_handler,
5802 -                         IRQF_TRIGGER_HIGH, "clk-master", master);
5803 -       if (ret) {
5804 -               kfree(master);
5805 -               return ERR_PTR(ret);
5806 -       }
5807 +       master->regmap = regmap;
5808  
5809         clk = clk_register(NULL, &master->hw);
5810         if (IS_ERR(clk)) {
5811 -               free_irq(master->irq, master);
5812                 kfree(master);
5813         }
5814  
5815 @@ -217,15 +195,15 @@ out_free_characteristics:
5816  }
5817  
5818  static void __init
5819 -of_at91_clk_master_setup(struct device_node *np, struct at91_pmc *pmc,
5820 +of_at91_clk_master_setup(struct device_node *np,
5821                          const struct clk_master_layout *layout)
5822  {
5823         struct clk *clk;
5824         int num_parents;
5825 -       unsigned int irq;
5826         const char *parent_names[MASTER_SOURCE_MAX];
5827         const char *name = np->name;
5828         struct clk_master_characteristics *characteristics;
5829 +       struct regmap *regmap;
5830  
5831         num_parents = of_clk_get_parent_count(np);
5832         if (num_parents <= 0 || num_parents > MASTER_SOURCE_MAX)
5833 @@ -239,11 +217,11 @@ of_at91_clk_master_setup(struct device_node *np, struct at91_pmc *pmc,
5834         if (!characteristics)
5835                 return;
5836  
5837 -       irq = irq_of_parse_and_map(np, 0);
5838 -       if (!irq)
5839 -               goto out_free_characteristics;
5840 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5841 +       if (IS_ERR(regmap))
5842 +               return;
5843  
5844 -       clk = at91_clk_register_master(pmc, irq, name, num_parents,
5845 +       clk = at91_clk_register_master(regmap, name, num_parents,
5846                                        parent_names, layout,
5847                                        characteristics);
5848         if (IS_ERR(clk))
5849 @@ -256,14 +234,16 @@ out_free_characteristics:
5850         kfree(characteristics);
5851  }
5852  
5853 -void __init of_at91rm9200_clk_master_setup(struct device_node *np,
5854 -                                          struct at91_pmc *pmc)
5855 +static void __init of_at91rm9200_clk_master_setup(struct device_node *np)
5856  {
5857 -       of_at91_clk_master_setup(np, pmc, &at91rm9200_master_layout);
5858 +       of_at91_clk_master_setup(np, &at91rm9200_master_layout);
5859  }
5860 +CLK_OF_DECLARE(at91rm9200_clk_master, "atmel,at91rm9200-clk-master",
5861 +              of_at91rm9200_clk_master_setup);
5862  
5863 -void __init of_at91sam9x5_clk_master_setup(struct device_node *np,
5864 -                                          struct at91_pmc *pmc)
5865 +static void __init of_at91sam9x5_clk_master_setup(struct device_node *np)
5866  {
5867 -       of_at91_clk_master_setup(np, pmc, &at91sam9x5_master_layout);
5868 +       of_at91_clk_master_setup(np, &at91sam9x5_master_layout);
5869  }
5870 +CLK_OF_DECLARE(at91sam9x5_clk_master, "atmel,at91sam9x5-clk-master",
5871 +              of_at91sam9x5_clk_master_setup);
5872 diff --git a/drivers/clk/at91/clk-peripheral.c b/drivers/clk/at91/clk-peripheral.c
5873 index 58f3b568e9cb..d69cd2a121b1 100644
5874 --- a/drivers/clk/at91/clk-peripheral.c
5875 +++ b/drivers/clk/at91/clk-peripheral.c
5876 @@ -12,11 +12,13 @@
5877  #include <linux/clkdev.h>
5878  #include <linux/clk/at91_pmc.h>
5879  #include <linux/of.h>
5880 -#include <linux/of_address.h>
5881 -#include <linux/io.h>
5882 +#include <linux/mfd/syscon.h>
5883 +#include <linux/regmap.h>
5884  
5885  #include "pmc.h"
5886  
5887 +DEFINE_SPINLOCK(pmc_pcr_lock);
5888 +
5889  #define PERIPHERAL_MAX         64
5890  
5891  #define PERIPHERAL_AT91RM9200  0
5892 @@ -33,7 +35,7 @@
5893  
5894  struct clk_peripheral {
5895         struct clk_hw hw;
5896 -       struct at91_pmc *pmc;
5897 +       struct regmap *regmap;
5898         u32 id;
5899  };
5900  
5901 @@ -41,8 +43,9 @@ struct clk_peripheral {
5902  
5903  struct clk_sam9x5_peripheral {
5904         struct clk_hw hw;
5905 -       struct at91_pmc *pmc;
5906 +       struct regmap *regmap;
5907         struct clk_range range;
5908 +       spinlock_t *lock;
5909         u32 id;
5910         u32 div;
5911         bool auto_div;
5912 @@ -54,7 +57,6 @@ struct clk_sam9x5_peripheral {
5913  static int clk_peripheral_enable(struct clk_hw *hw)
5914  {
5915         struct clk_peripheral *periph = to_clk_peripheral(hw);
5916 -       struct at91_pmc *pmc = periph->pmc;
5917         int offset = AT91_PMC_PCER;
5918         u32 id = periph->id;
5919  
5920 @@ -62,14 +64,14 @@ static int clk_peripheral_enable(struct clk_hw *hw)
5921                 return 0;
5922         if (id > PERIPHERAL_ID_MAX)
5923                 offset = AT91_PMC_PCER1;
5924 -       pmc_write(pmc, offset, PERIPHERAL_MASK(id));
5925 +       regmap_write(periph->regmap, offset, PERIPHERAL_MASK(id));
5926 +
5927         return 0;
5928  }
5929  
5930  static void clk_peripheral_disable(struct clk_hw *hw)
5931  {
5932         struct clk_peripheral *periph = to_clk_peripheral(hw);
5933 -       struct at91_pmc *pmc = periph->pmc;
5934         int offset = AT91_PMC_PCDR;
5935         u32 id = periph->id;
5936  
5937 @@ -77,21 +79,23 @@ static void clk_peripheral_disable(struct clk_hw *hw)
5938                 return;
5939         if (id > PERIPHERAL_ID_MAX)
5940                 offset = AT91_PMC_PCDR1;
5941 -       pmc_write(pmc, offset, PERIPHERAL_MASK(id));
5942 +       regmap_write(periph->regmap, offset, PERIPHERAL_MASK(id));
5943  }
5944  
5945  static int clk_peripheral_is_enabled(struct clk_hw *hw)
5946  {
5947         struct clk_peripheral *periph = to_clk_peripheral(hw);
5948 -       struct at91_pmc *pmc = periph->pmc;
5949         int offset = AT91_PMC_PCSR;
5950 +       unsigned int status;
5951         u32 id = periph->id;
5952  
5953         if (id < PERIPHERAL_ID_MIN)
5954                 return 1;
5955         if (id > PERIPHERAL_ID_MAX)
5956                 offset = AT91_PMC_PCSR1;
5957 -       return !!(pmc_read(pmc, offset) & PERIPHERAL_MASK(id));
5958 +       regmap_read(periph->regmap, offset, &status);
5959 +
5960 +       return status & PERIPHERAL_MASK(id) ? 1 : 0;
5961  }
5962  
5963  static const struct clk_ops peripheral_ops = {
5964 @@ -101,14 +105,14 @@ static const struct clk_ops peripheral_ops = {
5965  };
5966  
5967  static struct clk * __init
5968 -at91_clk_register_peripheral(struct at91_pmc *pmc, const char *name,
5969 +at91_clk_register_peripheral(struct regmap *regmap, const char *name,
5970                              const char *parent_name, u32 id)
5971  {
5972         struct clk_peripheral *periph;
5973         struct clk *clk = NULL;
5974         struct clk_init_data init;
5975  
5976 -       if (!pmc || !name || !parent_name || id > PERIPHERAL_ID_MAX)
5977 +       if (!name || !parent_name || id > PERIPHERAL_ID_MAX)
5978                 return ERR_PTR(-EINVAL);
5979  
5980         periph = kzalloc(sizeof(*periph), GFP_KERNEL);
5981 @@ -123,7 +127,7 @@ at91_clk_register_peripheral(struct at91_pmc *pmc, const char *name,
5982  
5983         periph->id = id;
5984         periph->hw.init = &init;
5985 -       periph->pmc = pmc;
5986 +       periph->regmap = regmap;
5987  
5988         clk = clk_register(NULL, &periph->hw);
5989         if (IS_ERR(clk))
5990 @@ -160,53 +164,58 @@ static void clk_sam9x5_peripheral_autodiv(struct clk_sam9x5_peripheral *periph)
5991  static int clk_sam9x5_peripheral_enable(struct clk_hw *hw)
5992  {
5993         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
5994 -       struct at91_pmc *pmc = periph->pmc;
5995 -       u32 tmp;
5996 +       unsigned long flags;
5997  
5998         if (periph->id < PERIPHERAL_ID_MIN)
5999                 return 0;
6000  
6001 -       pmc_lock(pmc);
6002 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6003 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_DIV_MASK;
6004 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_DIV(periph->div)
6005 -                                        | AT91_PMC_PCR_CMD
6006 -                                        | AT91_PMC_PCR_EN);
6007 -       pmc_unlock(pmc);
6008 +       spin_lock_irqsave(periph->lock, flags);
6009 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6010 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6011 +       regmap_update_bits(periph->regmap, AT91_PMC_PCR,
6012 +                          AT91_PMC_PCR_DIV_MASK | AT91_PMC_PCR_CMD |
6013 +                          AT91_PMC_PCR_EN,
6014 +                          AT91_PMC_PCR_DIV(periph->div) |
6015 +                          AT91_PMC_PCR_CMD |
6016 +                          AT91_PMC_PCR_EN);
6017 +       spin_unlock_irqrestore(periph->lock, flags);
6018 +
6019         return 0;
6020  }
6021  
6022  static void clk_sam9x5_peripheral_disable(struct clk_hw *hw)
6023  {
6024         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6025 -       struct at91_pmc *pmc = periph->pmc;
6026 -       u32 tmp;
6027 +       unsigned long flags;
6028  
6029         if (periph->id < PERIPHERAL_ID_MIN)
6030                 return;
6031  
6032 -       pmc_lock(pmc);
6033 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6034 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_EN;
6035 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_CMD);
6036 -       pmc_unlock(pmc);
6037 +       spin_lock_irqsave(periph->lock, flags);
6038 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6039 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6040 +       regmap_update_bits(periph->regmap, AT91_PMC_PCR,
6041 +                          AT91_PMC_PCR_EN | AT91_PMC_PCR_CMD,
6042 +                          AT91_PMC_PCR_CMD);
6043 +       spin_unlock_irqrestore(periph->lock, flags);
6044  }
6045  
6046  static int clk_sam9x5_peripheral_is_enabled(struct clk_hw *hw)
6047  {
6048         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6049 -       struct at91_pmc *pmc = periph->pmc;
6050 -       int ret;
6051 +       unsigned long flags;
6052 +       unsigned int status;
6053  
6054         if (periph->id < PERIPHERAL_ID_MIN)
6055                 return 1;
6056  
6057 -       pmc_lock(pmc);
6058 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6059 -       ret = !!(pmc_read(pmc, AT91_PMC_PCR) & AT91_PMC_PCR_EN);
6060 -       pmc_unlock(pmc);
6061 +       spin_lock_irqsave(periph->lock, flags);
6062 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6063 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6064 +       regmap_read(periph->regmap, AT91_PMC_PCR, &status);
6065 +       spin_unlock_irqrestore(periph->lock, flags);
6066  
6067 -       return ret;
6068 +       return status & AT91_PMC_PCR_EN ? 1 : 0;
6069  }
6070  
6071  static unsigned long
6072 @@ -214,19 +223,20 @@ clk_sam9x5_peripheral_recalc_rate(struct clk_hw *hw,
6073                                   unsigned long parent_rate)
6074  {
6075         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6076 -       struct at91_pmc *pmc = periph->pmc;
6077 -       u32 tmp;
6078 +       unsigned long flags;
6079 +       unsigned int status;
6080  
6081         if (periph->id < PERIPHERAL_ID_MIN)
6082                 return parent_rate;
6083  
6084 -       pmc_lock(pmc);
6085 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6086 -       tmp = pmc_read(pmc, AT91_PMC_PCR);
6087 -       pmc_unlock(pmc);
6088 +       spin_lock_irqsave(periph->lock, flags);
6089 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6090 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6091 +       regmap_read(periph->regmap, AT91_PMC_PCR, &status);
6092 +       spin_unlock_irqrestore(periph->lock, flags);
6093  
6094 -       if (tmp & AT91_PMC_PCR_EN) {
6095 -               periph->div = PERIPHERAL_RSHIFT(tmp);
6096 +       if (status & AT91_PMC_PCR_EN) {
6097 +               periph->div = PERIPHERAL_RSHIFT(status);
6098                 periph->auto_div = false;
6099         } else {
6100                 clk_sam9x5_peripheral_autodiv(periph);
6101 @@ -318,15 +328,15 @@ static const struct clk_ops sam9x5_peripheral_ops = {
6102  };
6103  
6104  static struct clk * __init
6105 -at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6106 -                                   const char *parent_name, u32 id,
6107 -                                   const struct clk_range *range)
6108 +at91_clk_register_sam9x5_peripheral(struct regmap *regmap, spinlock_t *lock,
6109 +                                   const char *name, const char *parent_name,
6110 +                                   u32 id, const struct clk_range *range)
6111  {
6112         struct clk_sam9x5_peripheral *periph;
6113         struct clk *clk = NULL;
6114         struct clk_init_data init;
6115  
6116 -       if (!pmc || !name || !parent_name)
6117 +       if (!name || !parent_name)
6118                 return ERR_PTR(-EINVAL);
6119  
6120         periph = kzalloc(sizeof(*periph), GFP_KERNEL);
6121 @@ -342,7 +352,8 @@ at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6122         periph->id = id;
6123         periph->hw.init = &init;
6124         periph->div = 0;
6125 -       periph->pmc = pmc;
6126 +       periph->regmap = regmap;
6127 +       periph->lock = lock;
6128         periph->auto_div = true;
6129         periph->range = *range;
6130  
6131 @@ -356,7 +367,7 @@ at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6132  }
6133  
6134  static void __init
6135 -of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6136 +of_at91_clk_periph_setup(struct device_node *np, u8 type)
6137  {
6138         int num;
6139         u32 id;
6140 @@ -364,6 +375,7 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6141         const char *parent_name;
6142         const char *name;
6143         struct device_node *periphclknp;
6144 +       struct regmap *regmap;
6145  
6146         parent_name = of_clk_get_parent_name(np, 0);
6147         if (!parent_name)
6148 @@ -373,6 +385,10 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6149         if (!num || num > PERIPHERAL_MAX)
6150                 return;
6151  
6152 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6153 +       if (IS_ERR(regmap))
6154 +               return;
6155 +
6156         for_each_child_of_node(np, periphclknp) {
6157                 if (of_property_read_u32(periphclknp, "reg", &id))
6158                         continue;
6159 @@ -384,7 +400,7 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6160                         name = periphclknp->name;
6161  
6162                 if (type == PERIPHERAL_AT91RM9200) {
6163 -                       clk = at91_clk_register_peripheral(pmc, name,
6164 +                       clk = at91_clk_register_peripheral(regmap, name,
6165                                                            parent_name, id);
6166                 } else {
6167                         struct clk_range range = CLK_RANGE(0, 0);
6168 @@ -393,7 +409,9 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6169                                               "atmel,clk-output-range",
6170                                               &range);
6171  
6172 -                       clk = at91_clk_register_sam9x5_peripheral(pmc, name,
6173 +                       clk = at91_clk_register_sam9x5_peripheral(regmap,
6174 +                                                                 &pmc_pcr_lock,
6175 +                                                                 name,
6176                                                                   parent_name,
6177                                                                   id, &range);
6178                 }
6179 @@ -405,14 +423,16 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6180         }
6181  }
6182  
6183 -void __init of_at91rm9200_clk_periph_setup(struct device_node *np,
6184 -                                          struct at91_pmc *pmc)
6185 +static void __init of_at91rm9200_clk_periph_setup(struct device_node *np)
6186  {
6187 -       of_at91_clk_periph_setup(np, pmc, PERIPHERAL_AT91RM9200);
6188 +       of_at91_clk_periph_setup(np, PERIPHERAL_AT91RM9200);
6189  }
6190 +CLK_OF_DECLARE(at91rm9200_clk_periph, "atmel,at91rm9200-clk-peripheral",
6191 +              of_at91rm9200_clk_periph_setup);
6192  
6193 -void __init of_at91sam9x5_clk_periph_setup(struct device_node *np,
6194 -                                          struct at91_pmc *pmc)
6195 +static void __init of_at91sam9x5_clk_periph_setup(struct device_node *np)
6196  {
6197 -       of_at91_clk_periph_setup(np, pmc, PERIPHERAL_AT91SAM9X5);
6198 +       of_at91_clk_periph_setup(np, PERIPHERAL_AT91SAM9X5);
6199  }
6200 +CLK_OF_DECLARE(at91sam9x5_clk_periph, "atmel,at91sam9x5-clk-peripheral",
6201 +              of_at91sam9x5_clk_periph_setup);
6202 diff --git a/drivers/clk/at91/clk-pll.c b/drivers/clk/at91/clk-pll.c
6203 index 18b60f4895a6..fb2e0b56d4b7 100644
6204 --- a/drivers/clk/at91/clk-pll.c
6205 +++ b/drivers/clk/at91/clk-pll.c
6206 @@ -12,14 +12,8 @@
6207  #include <linux/clkdev.h>
6208  #include <linux/clk/at91_pmc.h>
6209  #include <linux/of.h>
6210 -#include <linux/of_address.h>
6211 -#include <linux/of_irq.h>
6212 -#include <linux/io.h>
6213 -#include <linux/kernel.h>
6214 -#include <linux/wait.h>
6215 -#include <linux/sched.h>
6216 -#include <linux/interrupt.h>
6217 -#include <linux/irq.h>
6218 +#include <linux/mfd/syscon.h>
6219 +#include <linux/regmap.h>
6220  
6221  #include "pmc.h"
6222  
6223 @@ -58,9 +52,7 @@ struct clk_pll_layout {
6224  
6225  struct clk_pll {
6226         struct clk_hw hw;
6227 -       struct at91_pmc *pmc;
6228 -       unsigned int irq;
6229 -       wait_queue_head_t wait;
6230 +       struct regmap *regmap;
6231         u8 id;
6232         u8 div;
6233         u8 range;
6234 @@ -69,20 +61,19 @@ struct clk_pll {
6235         const struct clk_pll_characteristics *characteristics;
6236  };
6237  
6238 -static irqreturn_t clk_pll_irq_handler(int irq, void *dev_id)
6239 +static inline bool clk_pll_ready(struct regmap *regmap, int id)
6240  {
6241 -       struct clk_pll *pll = (struct clk_pll *)dev_id;
6242 +       unsigned int status;
6243  
6244 -       wake_up(&pll->wait);
6245 -       disable_irq_nosync(pll->irq);
6246 +       regmap_read(regmap, AT91_PMC_SR, &status);
6247  
6248 -       return IRQ_HANDLED;
6249 +       return status & PLL_STATUS_MASK(id) ? 1 : 0;
6250  }
6251  
6252  static int clk_pll_prepare(struct clk_hw *hw)
6253  {
6254         struct clk_pll *pll = to_clk_pll(hw);
6255 -       struct at91_pmc *pmc = pll->pmc;
6256 +       struct regmap *regmap = pll->regmap;
6257         const struct clk_pll_layout *layout = pll->layout;
6258         const struct clk_pll_characteristics *characteristics =
6259                                                         pll->characteristics;
6260 @@ -90,39 +81,34 @@ static int clk_pll_prepare(struct clk_hw *hw)
6261         u32 mask = PLL_STATUS_MASK(id);
6262         int offset = PLL_REG(id);
6263         u8 out = 0;
6264 -       u32 pllr, icpr;
6265 +       unsigned int pllr;
6266 +       unsigned int status;
6267         u8 div;
6268         u16 mul;
6269  
6270 -       pllr = pmc_read(pmc, offset);
6271 +       regmap_read(regmap, offset, &pllr);
6272         div = PLL_DIV(pllr);
6273         mul = PLL_MUL(pllr, layout);
6274  
6275 -       if ((pmc_read(pmc, AT91_PMC_SR) & mask) &&
6276 +       regmap_read(regmap, AT91_PMC_SR, &status);
6277 +       if ((status & mask) &&
6278             (div == pll->div && mul == pll->mul))
6279                 return 0;
6280  
6281         if (characteristics->out)
6282                 out = characteristics->out[pll->range];
6283 -       if (characteristics->icpll) {
6284 -               icpr = pmc_read(pmc, AT91_PMC_PLLICPR) & ~PLL_ICPR_MASK(id);
6285 -               icpr |= (characteristics->icpll[pll->range] <<
6286 -                       PLL_ICPR_SHIFT(id));
6287 -               pmc_write(pmc, AT91_PMC_PLLICPR, icpr);
6288 -       }
6289  
6290 -       pllr &= ~layout->pllr_mask;
6291 -       pllr |= layout->pllr_mask &
6292 -              (pll->div | (PLL_MAX_COUNT << PLL_COUNT_SHIFT) |
6293 -               (out << PLL_OUT_SHIFT) |
6294 -               ((pll->mul & layout->mul_mask) << layout->mul_shift));
6295 -       pmc_write(pmc, offset, pllr);
6296 -
6297 -       while (!(pmc_read(pmc, AT91_PMC_SR) & mask)) {
6298 -               enable_irq(pll->irq);
6299 -               wait_event(pll->wait,
6300 -                          pmc_read(pmc, AT91_PMC_SR) & mask);
6301 -       }
6302 +       if (characteristics->icpll)
6303 +               regmap_update_bits(regmap, AT91_PMC_PLLICPR, PLL_ICPR_MASK(id),
6304 +                       characteristics->icpll[pll->range] << PLL_ICPR_SHIFT(id));
6305 +
6306 +       regmap_update_bits(regmap, offset, layout->pllr_mask,
6307 +                       pll->div | (PLL_MAX_COUNT << PLL_COUNT_SHIFT) |
6308 +                       (out << PLL_OUT_SHIFT) |
6309 +                       ((pll->mul & layout->mul_mask) << layout->mul_shift));
6310 +
6311 +       while (!clk_pll_ready(regmap, pll->id))
6312 +               cpu_relax();
6313  
6314         return 0;
6315  }
6316 @@ -130,32 +116,35 @@ static int clk_pll_prepare(struct clk_hw *hw)
6317  static int clk_pll_is_prepared(struct clk_hw *hw)
6318  {
6319         struct clk_pll *pll = to_clk_pll(hw);
6320 -       struct at91_pmc *pmc = pll->pmc;
6321  
6322 -       return !!(pmc_read(pmc, AT91_PMC_SR) &
6323 -                 PLL_STATUS_MASK(pll->id));
6324 +       return clk_pll_ready(pll->regmap, pll->id);
6325  }
6326  
6327  static void clk_pll_unprepare(struct clk_hw *hw)
6328  {
6329         struct clk_pll *pll = to_clk_pll(hw);
6330 -       struct at91_pmc *pmc = pll->pmc;
6331 -       const struct clk_pll_layout *layout = pll->layout;
6332 -       int offset = PLL_REG(pll->id);
6333 -       u32 tmp = pmc_read(pmc, offset) & ~(layout->pllr_mask);
6334 +       unsigned int mask = pll->layout->pllr_mask;
6335  
6336 -       pmc_write(pmc, offset, tmp);
6337 +       regmap_update_bits(pll->regmap, PLL_REG(pll->id), mask, ~mask);
6338  }
6339  
6340  static unsigned long clk_pll_recalc_rate(struct clk_hw *hw,
6341                                          unsigned long parent_rate)
6342  {
6343         struct clk_pll *pll = to_clk_pll(hw);
6344 +       unsigned int pllr;
6345 +       u16 mul;
6346 +       u8 div;
6347  
6348 -       if (!pll->div || !pll->mul)
6349 +       regmap_read(pll->regmap, PLL_REG(pll->id), &pllr);
6350 +
6351 +       div = PLL_DIV(pllr);
6352 +       mul = PLL_MUL(pllr, pll->layout);
6353 +
6354 +       if (!div || !mul)
6355                 return 0;
6356  
6357 -       return (parent_rate / pll->div) * (pll->mul + 1);
6358 +       return (parent_rate / div) * (mul + 1);
6359  }
6360  
6361  static long clk_pll_get_best_div_mul(struct clk_pll *pll, unsigned long rate,
6362 @@ -308,7 +297,7 @@ static const struct clk_ops pll_ops = {
6363  };
6364  
6365  static struct clk * __init
6366 -at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6367 +at91_clk_register_pll(struct regmap *regmap, const char *name,
6368                       const char *parent_name, u8 id,
6369                       const struct clk_pll_layout *layout,
6370                       const struct clk_pll_characteristics *characteristics)
6371 @@ -316,9 +305,8 @@ at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6372         struct clk_pll *pll;
6373         struct clk *clk = NULL;
6374         struct clk_init_data init;
6375 -       int ret;
6376         int offset = PLL_REG(id);
6377 -       u32 tmp;
6378 +       unsigned int pllr;
6379  
6380         if (id > PLL_MAX_ID)
6381                 return ERR_PTR(-EINVAL);
6382 @@ -337,23 +325,13 @@ at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6383         pll->hw.init = &init;
6384         pll->layout = layout;
6385         pll->characteristics = characteristics;
6386 -       pll->pmc = pmc;
6387 -       pll->irq = irq;
6388 -       tmp = pmc_read(pmc, offset) & layout->pllr_mask;
6389 -       pll->div = PLL_DIV(tmp);
6390 -       pll->mul = PLL_MUL(tmp, layout);
6391 -       init_waitqueue_head(&pll->wait);
6392 -       irq_set_status_flags(pll->irq, IRQ_NOAUTOEN);
6393 -       ret = request_irq(pll->irq, clk_pll_irq_handler, IRQF_TRIGGER_HIGH,
6394 -                         id ? "clk-pllb" : "clk-plla", pll);
6395 -       if (ret) {
6396 -               kfree(pll);
6397 -               return ERR_PTR(ret);
6398 -       }
6399 +       pll->regmap = regmap;
6400 +       regmap_read(regmap, offset, &pllr);
6401 +       pll->div = PLL_DIV(pllr);
6402 +       pll->mul = PLL_MUL(pllr, layout);
6403  
6404         clk = clk_register(NULL, &pll->hw);
6405         if (IS_ERR(clk)) {
6406 -               free_irq(pll->irq, pll);
6407                 kfree(pll);
6408         }
6409  
6410 @@ -483,12 +461,12 @@ out_free_characteristics:
6411  }
6412  
6413  static void __init
6414 -of_at91_clk_pll_setup(struct device_node *np, struct at91_pmc *pmc,
6415 +of_at91_clk_pll_setup(struct device_node *np,
6416                       const struct clk_pll_layout *layout)
6417  {
6418         u32 id;
6419 -       unsigned int irq;
6420         struct clk *clk;
6421 +       struct regmap *regmap;
6422         const char *parent_name;
6423         const char *name = np->name;
6424         struct clk_pll_characteristics *characteristics;
6425 @@ -500,15 +478,15 @@ of_at91_clk_pll_setup(struct device_node *np, struct at91_pmc *pmc,
6426  
6427         of_property_read_string(np, "clock-output-names", &name);
6428  
6429 -       characteristics = of_at91_clk_pll_get_characteristics(np);
6430 -       if (!characteristics)
6431 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6432 +       if (IS_ERR(regmap))
6433                 return;
6434  
6435 -       irq = irq_of_parse_and_map(np, 0);
6436 -       if (!irq)
6437 +       characteristics = of_at91_clk_pll_get_characteristics(np);
6438 +       if (!characteristics)
6439                 return;
6440  
6441 -       clk = at91_clk_register_pll(pmc, irq, name, parent_name, id, layout,
6442 +       clk = at91_clk_register_pll(regmap, name, parent_name, id, layout,
6443                                     characteristics);
6444         if (IS_ERR(clk))
6445                 goto out_free_characteristics;
6446 @@ -520,26 +498,30 @@ out_free_characteristics:
6447         kfree(characteristics);
6448  }
6449  
6450 -void __init of_at91rm9200_clk_pll_setup(struct device_node *np,
6451 -                                              struct at91_pmc *pmc)
6452 +static void __init of_at91rm9200_clk_pll_setup(struct device_node *np)
6453  {
6454 -       of_at91_clk_pll_setup(np, pmc, &at91rm9200_pll_layout);
6455 +       of_at91_clk_pll_setup(np, &at91rm9200_pll_layout);
6456  }
6457 +CLK_OF_DECLARE(at91rm9200_clk_pll, "atmel,at91rm9200-clk-pll",
6458 +              of_at91rm9200_clk_pll_setup);
6459  
6460 -void __init of_at91sam9g45_clk_pll_setup(struct device_node *np,
6461 -                                               struct at91_pmc *pmc)
6462 +static void __init of_at91sam9g45_clk_pll_setup(struct device_node *np)
6463  {
6464 -       of_at91_clk_pll_setup(np, pmc, &at91sam9g45_pll_layout);
6465 +       of_at91_clk_pll_setup(np, &at91sam9g45_pll_layout);
6466  }
6467 +CLK_OF_DECLARE(at91sam9g45_clk_pll, "atmel,at91sam9g45-clk-pll",
6468 +              of_at91sam9g45_clk_pll_setup);
6469  
6470 -void __init of_at91sam9g20_clk_pllb_setup(struct device_node *np,
6471 -                                                struct at91_pmc *pmc)
6472 +static void __init of_at91sam9g20_clk_pllb_setup(struct device_node *np)
6473  {
6474 -       of_at91_clk_pll_setup(np, pmc, &at91sam9g20_pllb_layout);
6475 +       of_at91_clk_pll_setup(np, &at91sam9g20_pllb_layout);
6476  }
6477 +CLK_OF_DECLARE(at91sam9g20_clk_pllb, "atmel,at91sam9g20-clk-pllb",
6478 +              of_at91sam9g20_clk_pllb_setup);
6479  
6480 -void __init of_sama5d3_clk_pll_setup(struct device_node *np,
6481 -                                           struct at91_pmc *pmc)
6482 +static void __init of_sama5d3_clk_pll_setup(struct device_node *np)
6483  {
6484 -       of_at91_clk_pll_setup(np, pmc, &sama5d3_pll_layout);
6485 +       of_at91_clk_pll_setup(np, &sama5d3_pll_layout);
6486  }
6487 +CLK_OF_DECLARE(sama5d3_clk_pll, "atmel,sama5d3-clk-pll",
6488 +              of_sama5d3_clk_pll_setup);
6489 diff --git a/drivers/clk/at91/clk-plldiv.c b/drivers/clk/at91/clk-plldiv.c
6490 index ea226562bb40..2bed26481027 100644
6491 --- a/drivers/clk/at91/clk-plldiv.c
6492 +++ b/drivers/clk/at91/clk-plldiv.c
6493 @@ -12,8 +12,8 @@
6494  #include <linux/clkdev.h>
6495  #include <linux/clk/at91_pmc.h>
6496  #include <linux/of.h>
6497 -#include <linux/of_address.h>
6498 -#include <linux/io.h>
6499 +#include <linux/mfd/syscon.h>
6500 +#include <linux/regmap.h>
6501  
6502  #include "pmc.h"
6503  
6504 @@ -21,16 +21,18 @@
6505  
6506  struct clk_plldiv {
6507         struct clk_hw hw;
6508 -       struct at91_pmc *pmc;
6509 +       struct regmap *regmap;
6510  };
6511  
6512  static unsigned long clk_plldiv_recalc_rate(struct clk_hw *hw,
6513                                             unsigned long parent_rate)
6514  {
6515         struct clk_plldiv *plldiv = to_clk_plldiv(hw);
6516 -       struct at91_pmc *pmc = plldiv->pmc;
6517 +       unsigned int mckr;
6518  
6519 -       if (pmc_read(pmc, AT91_PMC_MCKR) & AT91_PMC_PLLADIV2)
6520 +       regmap_read(plldiv->regmap, AT91_PMC_MCKR, &mckr);
6521 +
6522 +       if (mckr & AT91_PMC_PLLADIV2)
6523                 return parent_rate / 2;
6524  
6525         return parent_rate;
6526 @@ -57,18 +59,12 @@ static int clk_plldiv_set_rate(struct clk_hw *hw, unsigned long rate,
6527                                unsigned long parent_rate)
6528  {
6529         struct clk_plldiv *plldiv = to_clk_plldiv(hw);
6530 -       struct at91_pmc *pmc = plldiv->pmc;
6531 -       u32 tmp;
6532  
6533 -       if (parent_rate != rate && (parent_rate / 2) != rate)
6534 +       if ((parent_rate != rate) && (parent_rate / 2 != rate))
6535                 return -EINVAL;
6536  
6537 -       pmc_lock(pmc);
6538 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & ~AT91_PMC_PLLADIV2;
6539 -       if ((parent_rate / 2) == rate)
6540 -               tmp |= AT91_PMC_PLLADIV2;
6541 -       pmc_write(pmc, AT91_PMC_MCKR, tmp);
6542 -       pmc_unlock(pmc);
6543 +       regmap_update_bits(plldiv->regmap, AT91_PMC_MCKR, AT91_PMC_PLLADIV2,
6544 +                          parent_rate != rate ? AT91_PMC_PLLADIV2 : 0);
6545  
6546         return 0;
6547  }
6548 @@ -80,7 +76,7 @@ static const struct clk_ops plldiv_ops = {
6549  };
6550  
6551  static struct clk * __init
6552 -at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6553 +at91_clk_register_plldiv(struct regmap *regmap, const char *name,
6554                          const char *parent_name)
6555  {
6556         struct clk_plldiv *plldiv;
6557 @@ -98,7 +94,7 @@ at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6558         init.flags = CLK_SET_RATE_GATE;
6559  
6560         plldiv->hw.init = &init;
6561 -       plldiv->pmc = pmc;
6562 +       plldiv->regmap = regmap;
6563  
6564         clk = clk_register(NULL, &plldiv->hw);
6565  
6566 @@ -109,27 +105,27 @@ at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6567  }
6568  
6569  static void __init
6570 -of_at91_clk_plldiv_setup(struct device_node *np, struct at91_pmc *pmc)
6571 +of_at91sam9x5_clk_plldiv_setup(struct device_node *np)
6572  {
6573         struct clk *clk;
6574         const char *parent_name;
6575         const char *name = np->name;
6576 +       struct regmap *regmap;
6577  
6578         parent_name = of_clk_get_parent_name(np, 0);
6579  
6580         of_property_read_string(np, "clock-output-names", &name);
6581  
6582 -       clk = at91_clk_register_plldiv(pmc, name, parent_name);
6583 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6584 +       if (IS_ERR(regmap))
6585 +               return;
6586  
6587 +       clk = at91_clk_register_plldiv(regmap, name, parent_name);
6588         if (IS_ERR(clk))
6589                 return;
6590  
6591         of_clk_add_provider(np, of_clk_src_simple_get, clk);
6592         return;
6593  }
6594 -
6595 -void __init of_at91sam9x5_clk_plldiv_setup(struct device_node *np,
6596 -                                          struct at91_pmc *pmc)
6597 -{
6598 -       of_at91_clk_plldiv_setup(np, pmc);
6599 -}
6600 +CLK_OF_DECLARE(at91sam9x5_clk_plldiv, "atmel,at91sam9x5-clk-plldiv",
6601 +              of_at91sam9x5_clk_plldiv_setup);
6602 diff --git a/drivers/clk/at91/clk-programmable.c b/drivers/clk/at91/clk-programmable.c
6603 index 14b270b85fec..bc0be629671b 100644
6604 --- a/drivers/clk/at91/clk-programmable.c
6605 +++ b/drivers/clk/at91/clk-programmable.c
6606 @@ -12,10 +12,8 @@
6607  #include <linux/clkdev.h>
6608  #include <linux/clk/at91_pmc.h>
6609  #include <linux/of.h>
6610 -#include <linux/of_address.h>
6611 -#include <linux/io.h>
6612 -#include <linux/wait.h>
6613 -#include <linux/sched.h>
6614 +#include <linux/mfd/syscon.h>
6615 +#include <linux/regmap.h>
6616  
6617  #include "pmc.h"
6618  
6619 @@ -24,6 +22,7 @@
6620  
6621  #define PROG_STATUS_MASK(id)   (1 << ((id) + 8))
6622  #define PROG_PRES_MASK         0x7
6623 +#define PROG_PRES(layout, pckr)        ((pckr >> layout->pres_shift) & PROG_PRES_MASK)
6624  #define PROG_MAX_RM9200_CSS    3
6625  
6626  struct clk_programmable_layout {
6627 @@ -34,7 +33,7 @@ struct clk_programmable_layout {
6628  
6629  struct clk_programmable {
6630         struct clk_hw hw;
6631 -       struct at91_pmc *pmc;
6632 +       struct regmap *regmap;
6633         u8 id;
6634         const struct clk_programmable_layout *layout;
6635  };
6636 @@ -44,14 +43,12 @@ struct clk_programmable {
6637  static unsigned long clk_programmable_recalc_rate(struct clk_hw *hw,
6638                                                   unsigned long parent_rate)
6639  {
6640 -       u32 pres;
6641         struct clk_programmable *prog = to_clk_programmable(hw);
6642 -       struct at91_pmc *pmc = prog->pmc;
6643 -       const struct clk_programmable_layout *layout = prog->layout;
6644 +       unsigned int pckr;
6645 +
6646 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6647  
6648 -       pres = (pmc_read(pmc, AT91_PMC_PCKR(prog->id)) >> layout->pres_shift) &
6649 -              PROG_PRES_MASK;
6650 -       return parent_rate >> pres;
6651 +       return parent_rate >> PROG_PRES(prog->layout, pckr);
6652  }
6653  
6654  static int clk_programmable_determine_rate(struct clk_hw *hw,
6655 @@ -101,36 +98,36 @@ static int clk_programmable_set_parent(struct clk_hw *hw, u8 index)
6656  {
6657         struct clk_programmable *prog = to_clk_programmable(hw);
6658         const struct clk_programmable_layout *layout = prog->layout;
6659 -       struct at91_pmc *pmc = prog->pmc;
6660 -       u32 tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id)) & ~layout->css_mask;
6661 +       unsigned int mask = layout->css_mask;
6662 +       unsigned int pckr = 0;
6663  
6664         if (layout->have_slck_mck)
6665 -               tmp &= AT91_PMC_CSSMCK_MCK;
6666 +               mask |= AT91_PMC_CSSMCK_MCK;
6667  
6668         if (index > layout->css_mask) {
6669 -               if (index > PROG_MAX_RM9200_CSS && layout->have_slck_mck) {
6670 -                       tmp |= AT91_PMC_CSSMCK_MCK;
6671 -                       return 0;
6672 -               } else {
6673 +               if (index > PROG_MAX_RM9200_CSS && !layout->have_slck_mck)
6674                         return -EINVAL;
6675 -               }
6676 +
6677 +               pckr |= AT91_PMC_CSSMCK_MCK;
6678         }
6679  
6680 -       pmc_write(pmc, AT91_PMC_PCKR(prog->id), tmp | index);
6681 +       regmap_update_bits(prog->regmap, AT91_PMC_PCKR(prog->id), mask, pckr);
6682 +
6683         return 0;
6684  }
6685  
6686  static u8 clk_programmable_get_parent(struct clk_hw *hw)
6687  {
6688 -       u32 tmp;
6689 -       u8 ret;
6690         struct clk_programmable *prog = to_clk_programmable(hw);
6691 -       struct at91_pmc *pmc = prog->pmc;
6692         const struct clk_programmable_layout *layout = prog->layout;
6693 +       unsigned int pckr;
6694 +       u8 ret;
6695 +
6696 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6697 +
6698 +       ret = pckr & layout->css_mask;
6699  
6700 -       tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id));
6701 -       ret = tmp & layout->css_mask;
6702 -       if (layout->have_slck_mck && (tmp & AT91_PMC_CSSMCK_MCK) && !ret)
6703 +       if (layout->have_slck_mck && (pckr & AT91_PMC_CSSMCK_MCK) && !ret)
6704                 ret = PROG_MAX_RM9200_CSS + 1;
6705  
6706         return ret;
6707 @@ -140,26 +137,27 @@ static int clk_programmable_set_rate(struct clk_hw *hw, unsigned long rate,
6708                                      unsigned long parent_rate)
6709  {
6710         struct clk_programmable *prog = to_clk_programmable(hw);
6711 -       struct at91_pmc *pmc = prog->pmc;
6712         const struct clk_programmable_layout *layout = prog->layout;
6713         unsigned long div = parent_rate / rate;
6714 +       unsigned int pckr;
6715         int shift = 0;
6716 -       u32 tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id)) &
6717 -                 ~(PROG_PRES_MASK << layout->pres_shift);
6718 +
6719 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6720  
6721         if (!div)
6722                 return -EINVAL;
6723  
6724         shift = fls(div) - 1;
6725  
6726 -       if (div != (1<<shift))
6727 +       if (div != (1 << shift))
6728                 return -EINVAL;
6729  
6730         if (shift >= PROG_PRES_MASK)
6731                 return -EINVAL;
6732  
6733 -       pmc_write(pmc, AT91_PMC_PCKR(prog->id),
6734 -                 tmp | (shift << layout->pres_shift));
6735 +       regmap_update_bits(prog->regmap, AT91_PMC_PCKR(prog->id),
6736 +                          PROG_PRES_MASK << layout->pres_shift,
6737 +                          shift << layout->pres_shift);
6738  
6739         return 0;
6740  }
6741 @@ -173,7 +171,7 @@ static const struct clk_ops programmable_ops = {
6742  };
6743  
6744  static struct clk * __init
6745 -at91_clk_register_programmable(struct at91_pmc *pmc,
6746 +at91_clk_register_programmable(struct regmap *regmap,
6747                                const char *name, const char **parent_names,
6748                                u8 num_parents, u8 id,
6749                                const struct clk_programmable_layout *layout)
6750 @@ -198,7 +196,7 @@ at91_clk_register_programmable(struct at91_pmc *pmc,
6751         prog->id = id;
6752         prog->layout = layout;
6753         prog->hw.init = &init;
6754 -       prog->pmc = pmc;
6755 +       prog->regmap = regmap;
6756  
6757         clk = clk_register(NULL, &prog->hw);
6758         if (IS_ERR(clk))
6759 @@ -226,7 +224,7 @@ static const struct clk_programmable_layout at91sam9x5_programmable_layout = {
6760  };
6761  
6762  static void __init
6763 -of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6764 +of_at91_clk_prog_setup(struct device_node *np,
6765                        const struct clk_programmable_layout *layout)
6766  {
6767         int num;
6768 @@ -236,6 +234,7 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6769         const char *parent_names[PROG_SOURCE_MAX];
6770         const char *name;
6771         struct device_node *progclknp;
6772 +       struct regmap *regmap;
6773  
6774         num_parents = of_clk_get_parent_count(np);
6775         if (num_parents <= 0 || num_parents > PROG_SOURCE_MAX)
6776 @@ -247,6 +246,10 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6777         if (!num || num > (PROG_ID_MAX + 1))
6778                 return;
6779  
6780 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6781 +       if (IS_ERR(regmap))
6782 +               return;
6783 +
6784         for_each_child_of_node(np, progclknp) {
6785                 if (of_property_read_u32(progclknp, "reg", &id))
6786                         continue;
6787 @@ -254,7 +257,7 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6788                 if (of_property_read_string(np, "clock-output-names", &name))
6789                         name = progclknp->name;
6790  
6791 -               clk = at91_clk_register_programmable(pmc, name,
6792 +               clk = at91_clk_register_programmable(regmap, name,
6793                                                      parent_names, num_parents,
6794                                                      id, layout);
6795                 if (IS_ERR(clk))
6796 @@ -265,20 +268,23 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6797  }
6798  
6799  
6800 -void __init of_at91rm9200_clk_prog_setup(struct device_node *np,
6801 -                                        struct at91_pmc *pmc)
6802 +static void __init of_at91rm9200_clk_prog_setup(struct device_node *np)
6803  {
6804 -       of_at91_clk_prog_setup(np, pmc, &at91rm9200_programmable_layout);
6805 +       of_at91_clk_prog_setup(np, &at91rm9200_programmable_layout);
6806  }
6807 +CLK_OF_DECLARE(at91rm9200_clk_prog, "atmel,at91rm9200-clk-programmable",
6808 +              of_at91rm9200_clk_prog_setup);
6809  
6810 -void __init of_at91sam9g45_clk_prog_setup(struct device_node *np,
6811 -                                         struct at91_pmc *pmc)
6812 +static void __init of_at91sam9g45_clk_prog_setup(struct device_node *np)
6813  {
6814 -       of_at91_clk_prog_setup(np, pmc, &at91sam9g45_programmable_layout);
6815 +       of_at91_clk_prog_setup(np, &at91sam9g45_programmable_layout);
6816  }
6817 +CLK_OF_DECLARE(at91sam9g45_clk_prog, "atmel,at91sam9g45-clk-programmable",
6818 +              of_at91sam9g45_clk_prog_setup);
6819  
6820 -void __init of_at91sam9x5_clk_prog_setup(struct device_node *np,
6821 -                                        struct at91_pmc *pmc)
6822 +static void __init of_at91sam9x5_clk_prog_setup(struct device_node *np)
6823  {
6824 -       of_at91_clk_prog_setup(np, pmc, &at91sam9x5_programmable_layout);
6825 +       of_at91_clk_prog_setup(np, &at91sam9x5_programmable_layout);
6826  }
6827 +CLK_OF_DECLARE(at91sam9x5_clk_prog, "atmel,at91sam9x5-clk-programmable",
6828 +              of_at91sam9x5_clk_prog_setup);
6829 diff --git a/drivers/clk/at91/clk-slow.c b/drivers/clk/at91/clk-slow.c
6830 index d0d5076a9b94..221c09684ba3 100644
6831 --- a/drivers/clk/at91/clk-slow.c
6832 +++ b/drivers/clk/at91/clk-slow.c
6833 @@ -13,17 +13,11 @@
6834  #include <linux/clk.h>
6835  #include <linux/clk-provider.h>
6836  #include <linux/clkdev.h>
6837 -#include <linux/slab.h>
6838  #include <linux/clk/at91_pmc.h>
6839  #include <linux/delay.h>
6840  #include <linux/of.h>
6841 -#include <linux/of_address.h>
6842 -#include <linux/of_irq.h>
6843 -#include <linux/io.h>
6844 -#include <linux/interrupt.h>
6845 -#include <linux/irq.h>
6846 -#include <linux/sched.h>
6847 -#include <linux/wait.h>
6848 +#include <linux/mfd/syscon.h>
6849 +#include <linux/regmap.h>
6850  
6851  #include "pmc.h"
6852  #include "sckc.h"
6853 @@ -59,7 +53,7 @@ struct clk_slow_rc_osc {
6854  
6855  struct clk_sam9260_slow {
6856         struct clk_hw hw;
6857 -       struct at91_pmc *pmc;
6858 +       struct regmap *regmap;
6859  };
6860  
6861  #define to_clk_sam9260_slow(hw) container_of(hw, struct clk_sam9260_slow, hw)
6862 @@ -393,8 +387,11 @@ void __init of_at91sam9x5_clk_slow_setup(struct device_node *np,
6863  static u8 clk_sam9260_slow_get_parent(struct clk_hw *hw)
6864  {
6865         struct clk_sam9260_slow *slowck = to_clk_sam9260_slow(hw);
6866 +       unsigned int status;
6867  
6868 -       return !!(pmc_read(slowck->pmc, AT91_PMC_SR) & AT91_PMC_OSCSEL);
6869 +       regmap_read(slowck->regmap, AT91_PMC_SR, &status);
6870 +
6871 +       return status & AT91_PMC_OSCSEL ? 1 : 0;
6872  }
6873  
6874  static const struct clk_ops sam9260_slow_ops = {
6875 @@ -402,7 +399,7 @@ static const struct clk_ops sam9260_slow_ops = {
6876  };
6877  
6878  static struct clk * __init
6879 -at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6880 +at91_clk_register_sam9260_slow(struct regmap *regmap,
6881                                const char *name,
6882                                const char **parent_names,
6883                                int num_parents)
6884 @@ -411,7 +408,7 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6885         struct clk *clk = NULL;
6886         struct clk_init_data init;
6887  
6888 -       if (!pmc || !name)
6889 +       if (!name)
6890                 return ERR_PTR(-EINVAL);
6891  
6892         if (!parent_names || !num_parents)
6893 @@ -428,7 +425,7 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6894         init.flags = 0;
6895  
6896         slowck->hw.init = &init;
6897 -       slowck->pmc = pmc;
6898 +       slowck->regmap = regmap;
6899  
6900         clk = clk_register(NULL, &slowck->hw);
6901         if (IS_ERR(clk))
6902 @@ -439,29 +436,34 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6903         return clk;
6904  }
6905  
6906 -void __init of_at91sam9260_clk_slow_setup(struct device_node *np,
6907 -                                         struct at91_pmc *pmc)
6908 +static void __init of_at91sam9260_clk_slow_setup(struct device_node *np)
6909  {
6910         struct clk *clk;
6911         const char *parent_names[2];
6912         int num_parents;
6913         const char *name = np->name;
6914 +       struct regmap *regmap;
6915  
6916         num_parents = of_clk_get_parent_count(np);
6917         if (num_parents != 2)
6918                 return;
6919  
6920         of_clk_parent_fill(np, parent_names, num_parents);
6921 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6922 +       if (IS_ERR(regmap))
6923 +               return;
6924  
6925         of_property_read_string(np, "clock-output-names", &name);
6926  
6927 -       clk = at91_clk_register_sam9260_slow(pmc, name, parent_names,
6928 +       clk = at91_clk_register_sam9260_slow(regmap, name, parent_names,
6929                                              num_parents);
6930         if (IS_ERR(clk))
6931                 return;
6932  
6933         of_clk_add_provider(np, of_clk_src_simple_get, clk);
6934  }
6935 +CLK_OF_DECLARE(at91sam9260_clk_slow, "atmel,at91sam9260-clk-slow",
6936 +              of_at91sam9260_clk_slow_setup);
6937  
6938  /*
6939   * FIXME: All slow clk users are not properly claiming it (get + prepare +
6940 diff --git a/drivers/clk/at91/clk-smd.c b/drivers/clk/at91/clk-smd.c
6941 index a7f8501cfa05..e6948a52005a 100644
6942 --- a/drivers/clk/at91/clk-smd.c
6943 +++ b/drivers/clk/at91/clk-smd.c
6944 @@ -12,8 +12,8 @@
6945  #include <linux/clkdev.h>
6946  #include <linux/clk/at91_pmc.h>
6947  #include <linux/of.h>
6948 -#include <linux/of_address.h>
6949 -#include <linux/io.h>
6950 +#include <linux/mfd/syscon.h>
6951 +#include <linux/regmap.h>
6952  
6953  #include "pmc.h"
6954  
6955 @@ -24,7 +24,7 @@
6956  
6957  struct at91sam9x5_clk_smd {
6958         struct clk_hw hw;
6959 -       struct at91_pmc *pmc;
6960 +       struct regmap *regmap;
6961  };
6962  
6963  #define to_at91sam9x5_clk_smd(hw) \
6964 @@ -33,13 +33,13 @@ struct at91sam9x5_clk_smd {
6965  static unsigned long at91sam9x5_clk_smd_recalc_rate(struct clk_hw *hw,
6966                                                     unsigned long parent_rate)
6967  {
6968 -       u32 tmp;
6969 -       u8 smddiv;
6970         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
6971 -       struct at91_pmc *pmc = smd->pmc;
6972 +       unsigned int smdr;
6973 +       u8 smddiv;
6974 +
6975 +       regmap_read(smd->regmap, AT91_PMC_SMD, &smdr);
6976 +       smddiv = (smdr & AT91_PMC_SMD_DIV) >> SMD_DIV_SHIFT;
6977  
6978 -       tmp = pmc_read(pmc, AT91_PMC_SMD);
6979 -       smddiv = (tmp & AT91_PMC_SMD_DIV) >> SMD_DIV_SHIFT;
6980         return parent_rate / (smddiv + 1);
6981  }
6982  
6983 @@ -67,40 +67,38 @@ static long at91sam9x5_clk_smd_round_rate(struct clk_hw *hw, unsigned long rate,
6984  
6985  static int at91sam9x5_clk_smd_set_parent(struct clk_hw *hw, u8 index)
6986  {
6987 -       u32 tmp;
6988         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
6989 -       struct at91_pmc *pmc = smd->pmc;
6990  
6991         if (index > 1)
6992                 return -EINVAL;
6993 -       tmp = pmc_read(pmc, AT91_PMC_SMD) & ~AT91_PMC_SMDS;
6994 -       if (index)
6995 -               tmp |= AT91_PMC_SMDS;
6996 -       pmc_write(pmc, AT91_PMC_SMD, tmp);
6997 +
6998 +       regmap_update_bits(smd->regmap, AT91_PMC_SMD, AT91_PMC_SMDS,
6999 +                          index ? AT91_PMC_SMDS : 0);
7000 +
7001         return 0;
7002  }
7003  
7004  static u8 at91sam9x5_clk_smd_get_parent(struct clk_hw *hw)
7005  {
7006         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
7007 -       struct at91_pmc *pmc = smd->pmc;
7008 +       unsigned int smdr;
7009  
7010 -       return pmc_read(pmc, AT91_PMC_SMD) & AT91_PMC_SMDS;
7011 +       regmap_read(smd->regmap, AT91_PMC_SMD, &smdr);
7012 +
7013 +       return smdr & AT91_PMC_SMDS;
7014  }
7015  
7016  static int at91sam9x5_clk_smd_set_rate(struct clk_hw *hw, unsigned long rate,
7017                                        unsigned long parent_rate)
7018  {
7019 -       u32 tmp;
7020         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
7021 -       struct at91_pmc *pmc = smd->pmc;
7022         unsigned long div = parent_rate / rate;
7023  
7024         if (parent_rate % rate || div < 1 || div > (SMD_MAX_DIV + 1))
7025                 return -EINVAL;
7026 -       tmp = pmc_read(pmc, AT91_PMC_SMD) & ~AT91_PMC_SMD_DIV;
7027 -       tmp |= (div - 1) << SMD_DIV_SHIFT;
7028 -       pmc_write(pmc, AT91_PMC_SMD, tmp);
7029 +
7030 +       regmap_update_bits(smd->regmap, AT91_PMC_SMD, AT91_PMC_SMD_DIV,
7031 +                          (div - 1) << SMD_DIV_SHIFT);
7032  
7033         return 0;
7034  }
7035 @@ -114,7 +112,7 @@ static const struct clk_ops at91sam9x5_smd_ops = {
7036  };
7037  
7038  static struct clk * __init
7039 -at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7040 +at91sam9x5_clk_register_smd(struct regmap *regmap, const char *name,
7041                             const char **parent_names, u8 num_parents)
7042  {
7043         struct at91sam9x5_clk_smd *smd;
7044 @@ -132,7 +130,7 @@ at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7045         init.flags = CLK_SET_RATE_GATE | CLK_SET_PARENT_GATE;
7046  
7047         smd->hw.init = &init;
7048 -       smd->pmc = pmc;
7049 +       smd->regmap = regmap;
7050  
7051         clk = clk_register(NULL, &smd->hw);
7052         if (IS_ERR(clk))
7053 @@ -141,13 +139,13 @@ at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7054         return clk;
7055  }
7056  
7057 -void __init of_at91sam9x5_clk_smd_setup(struct device_node *np,
7058 -                                       struct at91_pmc *pmc)
7059 +static void __init of_at91sam9x5_clk_smd_setup(struct device_node *np)
7060  {
7061         struct clk *clk;
7062         int num_parents;
7063         const char *parent_names[SMD_SOURCE_MAX];
7064         const char *name = np->name;
7065 +       struct regmap *regmap;
7066  
7067         num_parents = of_clk_get_parent_count(np);
7068         if (num_parents <= 0 || num_parents > SMD_SOURCE_MAX)
7069 @@ -157,10 +155,16 @@ void __init of_at91sam9x5_clk_smd_setup(struct device_node *np,
7070  
7071         of_property_read_string(np, "clock-output-names", &name);
7072  
7073 -       clk = at91sam9x5_clk_register_smd(pmc, name, parent_names,
7074 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7075 +       if (IS_ERR(regmap))
7076 +               return;
7077 +
7078 +       clk = at91sam9x5_clk_register_smd(regmap, name, parent_names,
7079                                           num_parents);
7080         if (IS_ERR(clk))
7081                 return;
7082  
7083         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7084  }
7085 +CLK_OF_DECLARE(at91sam9x5_clk_smd, "atmel,at91sam9x5-clk-smd",
7086 +              of_at91sam9x5_clk_smd_setup);
7087 diff --git a/drivers/clk/at91/clk-system.c b/drivers/clk/at91/clk-system.c
7088 index 3f5314344286..8f35d8172909 100644
7089 --- a/drivers/clk/at91/clk-system.c
7090 +++ b/drivers/clk/at91/clk-system.c
7091 @@ -12,13 +12,8 @@
7092  #include <linux/clkdev.h>
7093  #include <linux/clk/at91_pmc.h>
7094  #include <linux/of.h>
7095 -#include <linux/of_address.h>
7096 -#include <linux/io.h>
7097 -#include <linux/irq.h>
7098 -#include <linux/of_irq.h>
7099 -#include <linux/interrupt.h>
7100 -#include <linux/wait.h>
7101 -#include <linux/sched.h>
7102 +#include <linux/mfd/syscon.h>
7103 +#include <linux/regmap.h>
7104  
7105  #include "pmc.h"
7106  
7107 @@ -29,9 +24,7 @@
7108  #define to_clk_system(hw) container_of(hw, struct clk_system, hw)
7109  struct clk_system {
7110         struct clk_hw hw;
7111 -       struct at91_pmc *pmc;
7112 -       unsigned int irq;
7113 -       wait_queue_head_t wait;
7114 +       struct regmap *regmap;
7115         u8 id;
7116  };
7117  
7118 @@ -39,58 +32,54 @@ static inline int is_pck(int id)
7119  {
7120         return (id >= 8) && (id <= 15);
7121  }
7122 -static irqreturn_t clk_system_irq_handler(int irq, void *dev_id)
7123 +
7124 +static inline bool clk_system_ready(struct regmap *regmap, int id)
7125  {
7126 -       struct clk_system *sys = (struct clk_system *)dev_id;
7127 +       unsigned int status;
7128  
7129 -       wake_up(&sys->wait);
7130 -       disable_irq_nosync(sys->irq);
7131 +       regmap_read(regmap, AT91_PMC_SR, &status);
7132  
7133 -       return IRQ_HANDLED;
7134 +       return status & (1 << id) ? 1 : 0;
7135  }
7136  
7137  static int clk_system_prepare(struct clk_hw *hw)
7138  {
7139         struct clk_system *sys = to_clk_system(hw);
7140 -       struct at91_pmc *pmc = sys->pmc;
7141 -       u32 mask = 1 << sys->id;
7142  
7143 -       pmc_write(pmc, AT91_PMC_SCER, mask);
7144 +       regmap_write(sys->regmap, AT91_PMC_SCER, 1 << sys->id);
7145  
7146         if (!is_pck(sys->id))
7147                 return 0;
7148  
7149 -       while (!(pmc_read(pmc, AT91_PMC_SR) & mask)) {
7150 -               if (sys->irq) {
7151 -                       enable_irq(sys->irq);
7152 -                       wait_event(sys->wait,
7153 -                                  pmc_read(pmc, AT91_PMC_SR) & mask);
7154 -               } else
7155 -                       cpu_relax();
7156 -       }
7157 +       while (!clk_system_ready(sys->regmap, sys->id))
7158 +               cpu_relax();
7159 +
7160         return 0;
7161  }
7162  
7163  static void clk_system_unprepare(struct clk_hw *hw)
7164  {
7165         struct clk_system *sys = to_clk_system(hw);
7166 -       struct at91_pmc *pmc = sys->pmc;
7167  
7168 -       pmc_write(pmc, AT91_PMC_SCDR, 1 << sys->id);
7169 +       regmap_write(sys->regmap, AT91_PMC_SCDR, 1 << sys->id);
7170  }
7171  
7172  static int clk_system_is_prepared(struct clk_hw *hw)
7173  {
7174         struct clk_system *sys = to_clk_system(hw);
7175 -       struct at91_pmc *pmc = sys->pmc;
7176 +       unsigned int status;
7177 +
7178 +       regmap_read(sys->regmap, AT91_PMC_SCSR, &status);
7179  
7180 -       if (!(pmc_read(pmc, AT91_PMC_SCSR) & (1 << sys->id)))
7181 +       if (!(status & (1 << sys->id)))
7182                 return 0;
7183  
7184         if (!is_pck(sys->id))
7185                 return 1;
7186  
7187 -       return !!(pmc_read(pmc, AT91_PMC_SR) & (1 << sys->id));
7188 +       regmap_read(sys->regmap, AT91_PMC_SR, &status);
7189 +
7190 +       return status & (1 << sys->id) ? 1 : 0;
7191  }
7192  
7193  static const struct clk_ops system_ops = {
7194 @@ -100,13 +89,12 @@ static const struct clk_ops system_ops = {
7195  };
7196  
7197  static struct clk * __init
7198 -at91_clk_register_system(struct at91_pmc *pmc, const char *name,
7199 -                        const char *parent_name, u8 id, int irq)
7200 +at91_clk_register_system(struct regmap *regmap, const char *name,
7201 +                        const char *parent_name, u8 id)
7202  {
7203         struct clk_system *sys;
7204         struct clk *clk = NULL;
7205         struct clk_init_data init;
7206 -       int ret;
7207  
7208         if (!parent_name || id > SYSTEM_MAX_ID)
7209                 return ERR_PTR(-EINVAL);
7210 @@ -123,44 +111,33 @@ at91_clk_register_system(struct at91_pmc *pmc, const char *name,
7211  
7212         sys->id = id;
7213         sys->hw.init = &init;
7214 -       sys->pmc = pmc;
7215 -       sys->irq = irq;
7216 -       if (irq) {
7217 -               init_waitqueue_head(&sys->wait);
7218 -               irq_set_status_flags(sys->irq, IRQ_NOAUTOEN);
7219 -               ret = request_irq(sys->irq, clk_system_irq_handler,
7220 -                               IRQF_TRIGGER_HIGH, name, sys);
7221 -               if (ret) {
7222 -                       kfree(sys);
7223 -                       return ERR_PTR(ret);
7224 -               }
7225 -       }
7226 +       sys->regmap = regmap;
7227  
7228         clk = clk_register(NULL, &sys->hw);
7229 -       if (IS_ERR(clk)) {
7230 -               if (irq)
7231 -                       free_irq(sys->irq, sys);
7232 +       if (IS_ERR(clk))
7233                 kfree(sys);
7234 -       }
7235  
7236         return clk;
7237  }
7238  
7239 -static void __init
7240 -of_at91_clk_sys_setup(struct device_node *np, struct at91_pmc *pmc)
7241 +static void __init of_at91rm9200_clk_sys_setup(struct device_node *np)
7242  {
7243         int num;
7244 -       int irq = 0;
7245         u32 id;
7246         struct clk *clk;
7247         const char *name;
7248         struct device_node *sysclknp;
7249         const char *parent_name;
7250 +       struct regmap *regmap;
7251  
7252         num = of_get_child_count(np);
7253         if (num > (SYSTEM_MAX_ID + 1))
7254                 return;
7255  
7256 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7257 +       if (IS_ERR(regmap))
7258 +               return;
7259 +
7260         for_each_child_of_node(np, sysclknp) {
7261                 if (of_property_read_u32(sysclknp, "reg", &id))
7262                         continue;
7263 @@ -168,21 +145,14 @@ of_at91_clk_sys_setup(struct device_node *np, struct at91_pmc *pmc)
7264                 if (of_property_read_string(np, "clock-output-names", &name))
7265                         name = sysclknp->name;
7266  
7267 -               if (is_pck(id))
7268 -                       irq = irq_of_parse_and_map(sysclknp, 0);
7269 -
7270                 parent_name = of_clk_get_parent_name(sysclknp, 0);
7271  
7272 -               clk = at91_clk_register_system(pmc, name, parent_name, id, irq);
7273 +               clk = at91_clk_register_system(regmap, name, parent_name, id);
7274                 if (IS_ERR(clk))
7275                         continue;
7276  
7277                 of_clk_add_provider(sysclknp, of_clk_src_simple_get, clk);
7278         }
7279  }
7280 -
7281 -void __init of_at91rm9200_clk_sys_setup(struct device_node *np,
7282 -                                       struct at91_pmc *pmc)
7283 -{
7284 -       of_at91_clk_sys_setup(np, pmc);
7285 -}
7286 +CLK_OF_DECLARE(at91rm9200_clk_sys, "atmel,at91rm9200-clk-system",
7287 +              of_at91rm9200_clk_sys_setup);
7288 diff --git a/drivers/clk/at91/clk-usb.c b/drivers/clk/at91/clk-usb.c
7289 index 8ab8502778a2..650ca45892c0 100644
7290 --- a/drivers/clk/at91/clk-usb.c
7291 +++ b/drivers/clk/at91/clk-usb.c
7292 @@ -12,8 +12,8 @@
7293  #include <linux/clkdev.h>
7294  #include <linux/clk/at91_pmc.h>
7295  #include <linux/of.h>
7296 -#include <linux/of_address.h>
7297 -#include <linux/io.h>
7298 +#include <linux/mfd/syscon.h>
7299 +#include <linux/regmap.h>
7300  
7301  #include "pmc.h"
7302  
7303 @@ -27,7 +27,7 @@
7304  
7305  struct at91sam9x5_clk_usb {
7306         struct clk_hw hw;
7307 -       struct at91_pmc *pmc;
7308 +       struct regmap *regmap;
7309  };
7310  
7311  #define to_at91sam9x5_clk_usb(hw) \
7312 @@ -35,7 +35,7 @@ struct at91sam9x5_clk_usb {
7313  
7314  struct at91rm9200_clk_usb {
7315         struct clk_hw hw;
7316 -       struct at91_pmc *pmc;
7317 +       struct regmap *regmap;
7318         u32 divisors[4];
7319  };
7320  
7321 @@ -45,13 +45,12 @@ struct at91rm9200_clk_usb {
7322  static unsigned long at91sam9x5_clk_usb_recalc_rate(struct clk_hw *hw,
7323                                                     unsigned long parent_rate)
7324  {
7325 -       u32 tmp;
7326 -       u8 usbdiv;
7327         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7328 -       struct at91_pmc *pmc = usb->pmc;
7329 +       unsigned int usbr;
7330 +       u8 usbdiv;
7331  
7332 -       tmp = pmc_read(pmc, AT91_PMC_USB);
7333 -       usbdiv = (tmp & AT91_PMC_OHCIUSBDIV) >> SAM9X5_USB_DIV_SHIFT;
7334 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7335 +       usbdiv = (usbr & AT91_PMC_OHCIUSBDIV) >> SAM9X5_USB_DIV_SHIFT;
7336  
7337         return DIV_ROUND_CLOSEST(parent_rate, (usbdiv + 1));
7338  }
7339 @@ -109,33 +108,31 @@ static int at91sam9x5_clk_usb_determine_rate(struct clk_hw *hw,
7340  
7341  static int at91sam9x5_clk_usb_set_parent(struct clk_hw *hw, u8 index)
7342  {
7343 -       u32 tmp;
7344         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7345 -       struct at91_pmc *pmc = usb->pmc;
7346  
7347         if (index > 1)
7348                 return -EINVAL;
7349 -       tmp = pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_USBS;
7350 -       if (index)
7351 -               tmp |= AT91_PMC_USBS;
7352 -       pmc_write(pmc, AT91_PMC_USB, tmp);
7353 +
7354 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS,
7355 +                          index ? AT91_PMC_USBS : 0);
7356 +
7357         return 0;
7358  }
7359  
7360  static u8 at91sam9x5_clk_usb_get_parent(struct clk_hw *hw)
7361  {
7362         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7363 -       struct at91_pmc *pmc = usb->pmc;
7364 +       unsigned int usbr;
7365  
7366 -       return pmc_read(pmc, AT91_PMC_USB) & AT91_PMC_USBS;
7367 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7368 +
7369 +       return usbr & AT91_PMC_USBS;
7370  }
7371  
7372  static int at91sam9x5_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7373                                        unsigned long parent_rate)
7374  {
7375 -       u32 tmp;
7376         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7377 -       struct at91_pmc *pmc = usb->pmc;
7378         unsigned long div;
7379  
7380         if (!rate)
7381 @@ -145,9 +142,8 @@ static int at91sam9x5_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7382         if (div > SAM9X5_USB_MAX_DIV + 1 || !div)
7383                 return -EINVAL;
7384  
7385 -       tmp = pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_OHCIUSBDIV;
7386 -       tmp |= (div - 1) << SAM9X5_USB_DIV_SHIFT;
7387 -       pmc_write(pmc, AT91_PMC_USB, tmp);
7388 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_OHCIUSBDIV,
7389 +                          (div - 1) << SAM9X5_USB_DIV_SHIFT);
7390  
7391         return 0;
7392  }
7393 @@ -163,28 +159,28 @@ static const struct clk_ops at91sam9x5_usb_ops = {
7394  static int at91sam9n12_clk_usb_enable(struct clk_hw *hw)
7395  {
7396         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7397 -       struct at91_pmc *pmc = usb->pmc;
7398  
7399 -       pmc_write(pmc, AT91_PMC_USB,
7400 -                 pmc_read(pmc, AT91_PMC_USB) | AT91_PMC_USBS);
7401 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS,
7402 +                          AT91_PMC_USBS);
7403 +
7404         return 0;
7405  }
7406  
7407  static void at91sam9n12_clk_usb_disable(struct clk_hw *hw)
7408  {
7409         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7410 -       struct at91_pmc *pmc = usb->pmc;
7411  
7412 -       pmc_write(pmc, AT91_PMC_USB,
7413 -                 pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_USBS);
7414 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS, 0);
7415  }
7416  
7417  static int at91sam9n12_clk_usb_is_enabled(struct clk_hw *hw)
7418  {
7419         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7420 -       struct at91_pmc *pmc = usb->pmc;
7421 +       unsigned int usbr;
7422  
7423 -       return !!(pmc_read(pmc, AT91_PMC_USB) & AT91_PMC_USBS);
7424 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7425 +
7426 +       return usbr & AT91_PMC_USBS;
7427  }
7428  
7429  static const struct clk_ops at91sam9n12_usb_ops = {
7430 @@ -197,7 +193,7 @@ static const struct clk_ops at91sam9n12_usb_ops = {
7431  };
7432  
7433  static struct clk * __init
7434 -at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7435 +at91sam9x5_clk_register_usb(struct regmap *regmap, const char *name,
7436                             const char **parent_names, u8 num_parents)
7437  {
7438         struct at91sam9x5_clk_usb *usb;
7439 @@ -216,7 +212,7 @@ at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7440                      CLK_SET_RATE_PARENT;
7441  
7442         usb->hw.init = &init;
7443 -       usb->pmc = pmc;
7444 +       usb->regmap = regmap;
7445  
7446         clk = clk_register(NULL, &usb->hw);
7447         if (IS_ERR(clk))
7448 @@ -226,7 +222,7 @@ at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7449  }
7450  
7451  static struct clk * __init
7452 -at91sam9n12_clk_register_usb(struct at91_pmc *pmc, const char *name,
7453 +at91sam9n12_clk_register_usb(struct regmap *regmap, const char *name,
7454                              const char *parent_name)
7455  {
7456         struct at91sam9x5_clk_usb *usb;
7457 @@ -244,7 +240,7 @@ at91sam9n12_clk_register_usb(struct at91_pmc *pmc, const char *name,
7458         init.flags = CLK_SET_RATE_GATE | CLK_SET_RATE_PARENT;
7459  
7460         usb->hw.init = &init;
7461 -       usb->pmc = pmc;
7462 +       usb->regmap = regmap;
7463  
7464         clk = clk_register(NULL, &usb->hw);
7465         if (IS_ERR(clk))
7466 @@ -257,12 +253,12 @@ static unsigned long at91rm9200_clk_usb_recalc_rate(struct clk_hw *hw,
7467                                                     unsigned long parent_rate)
7468  {
7469         struct at91rm9200_clk_usb *usb = to_at91rm9200_clk_usb(hw);
7470 -       struct at91_pmc *pmc = usb->pmc;
7471 -       u32 tmp;
7472 +       unsigned int pllbr;
7473         u8 usbdiv;
7474  
7475 -       tmp = pmc_read(pmc, AT91_CKGR_PLLBR);
7476 -       usbdiv = (tmp & AT91_PMC_USBDIV) >> RM9200_USB_DIV_SHIFT;
7477 +       regmap_read(usb->regmap, AT91_CKGR_PLLBR, &pllbr);
7478 +
7479 +       usbdiv = (pllbr & AT91_PMC_USBDIV) >> RM9200_USB_DIV_SHIFT;
7480         if (usb->divisors[usbdiv])
7481                 return parent_rate / usb->divisors[usbdiv];
7482  
7483 @@ -310,10 +306,8 @@ static long at91rm9200_clk_usb_round_rate(struct clk_hw *hw, unsigned long rate,
7484  static int at91rm9200_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7485                                        unsigned long parent_rate)
7486  {
7487 -       u32 tmp;
7488         int i;
7489         struct at91rm9200_clk_usb *usb = to_at91rm9200_clk_usb(hw);
7490 -       struct at91_pmc *pmc = usb->pmc;
7491         unsigned long div;
7492  
7493         if (!rate)
7494 @@ -323,10 +317,10 @@ static int at91rm9200_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7495  
7496         for (i = 0; i < RM9200_USB_DIV_TAB_SIZE; i++) {
7497                 if (usb->divisors[i] == div) {
7498 -                       tmp = pmc_read(pmc, AT91_CKGR_PLLBR) &
7499 -                             ~AT91_PMC_USBDIV;
7500 -                       tmp |= i << RM9200_USB_DIV_SHIFT;
7501 -                       pmc_write(pmc, AT91_CKGR_PLLBR, tmp);
7502 +                       regmap_update_bits(usb->regmap, AT91_CKGR_PLLBR,
7503 +                                          AT91_PMC_USBDIV,
7504 +                                          i << RM9200_USB_DIV_SHIFT);
7505 +
7506                         return 0;
7507                 }
7508         }
7509 @@ -341,7 +335,7 @@ static const struct clk_ops at91rm9200_usb_ops = {
7510  };
7511  
7512  static struct clk * __init
7513 -at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7514 +at91rm9200_clk_register_usb(struct regmap *regmap, const char *name,
7515                             const char *parent_name, const u32 *divisors)
7516  {
7517         struct at91rm9200_clk_usb *usb;
7518 @@ -359,7 +353,7 @@ at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7519         init.flags = CLK_SET_RATE_PARENT;
7520  
7521         usb->hw.init = &init;
7522 -       usb->pmc = pmc;
7523 +       usb->regmap = regmap;
7524         memcpy(usb->divisors, divisors, sizeof(usb->divisors));
7525  
7526         clk = clk_register(NULL, &usb->hw);
7527 @@ -369,13 +363,13 @@ at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7528         return clk;
7529  }
7530  
7531 -void __init of_at91sam9x5_clk_usb_setup(struct device_node *np,
7532 -                                       struct at91_pmc *pmc)
7533 +static void __init of_at91sam9x5_clk_usb_setup(struct device_node *np)
7534  {
7535         struct clk *clk;
7536         int num_parents;
7537         const char *parent_names[USB_SOURCE_MAX];
7538         const char *name = np->name;
7539 +       struct regmap *regmap;
7540  
7541         num_parents = of_clk_get_parent_count(np);
7542         if (num_parents <= 0 || num_parents > USB_SOURCE_MAX)
7543 @@ -385,19 +379,26 @@ void __init of_at91sam9x5_clk_usb_setup(struct device_node *np,
7544  
7545         of_property_read_string(np, "clock-output-names", &name);
7546  
7547 -       clk = at91sam9x5_clk_register_usb(pmc, name, parent_names, num_parents);
7548 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7549 +       if (IS_ERR(regmap))
7550 +               return;
7551 +
7552 +       clk = at91sam9x5_clk_register_usb(regmap, name, parent_names,
7553 +                                         num_parents);
7554         if (IS_ERR(clk))
7555                 return;
7556  
7557         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7558  }
7559 +CLK_OF_DECLARE(at91sam9x5_clk_usb, "atmel,at91sam9x5-clk-usb",
7560 +              of_at91sam9x5_clk_usb_setup);
7561  
7562 -void __init of_at91sam9n12_clk_usb_setup(struct device_node *np,
7563 -                                        struct at91_pmc *pmc)
7564 +static void __init of_at91sam9n12_clk_usb_setup(struct device_node *np)
7565  {
7566         struct clk *clk;
7567         const char *parent_name;
7568         const char *name = np->name;
7569 +       struct regmap *regmap;
7570  
7571         parent_name = of_clk_get_parent_name(np, 0);
7572         if (!parent_name)
7573 @@ -405,20 +406,26 @@ void __init of_at91sam9n12_clk_usb_setup(struct device_node *np,
7574  
7575         of_property_read_string(np, "clock-output-names", &name);
7576  
7577 -       clk = at91sam9n12_clk_register_usb(pmc, name, parent_name);
7578 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7579 +       if (IS_ERR(regmap))
7580 +               return;
7581 +
7582 +       clk = at91sam9n12_clk_register_usb(regmap, name, parent_name);
7583         if (IS_ERR(clk))
7584                 return;
7585  
7586         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7587  }
7588 +CLK_OF_DECLARE(at91sam9n12_clk_usb, "atmel,at91sam9n12-clk-usb",
7589 +              of_at91sam9n12_clk_usb_setup);
7590  
7591 -void __init of_at91rm9200_clk_usb_setup(struct device_node *np,
7592 -                                       struct at91_pmc *pmc)
7593 +static void __init of_at91rm9200_clk_usb_setup(struct device_node *np)
7594  {
7595         struct clk *clk;
7596         const char *parent_name;
7597         const char *name = np->name;
7598         u32 divisors[4] = {0, 0, 0, 0};
7599 +       struct regmap *regmap;
7600  
7601         parent_name = of_clk_get_parent_name(np, 0);
7602         if (!parent_name)
7603 @@ -430,9 +437,15 @@ void __init of_at91rm9200_clk_usb_setup(struct device_node *np,
7604  
7605         of_property_read_string(np, "clock-output-names", &name);
7606  
7607 -       clk = at91rm9200_clk_register_usb(pmc, name, parent_name, divisors);
7608 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7609 +       if (IS_ERR(regmap))
7610 +               return;
7611 +
7612 +       clk = at91rm9200_clk_register_usb(regmap, name, parent_name, divisors);
7613         if (IS_ERR(clk))
7614                 return;
7615  
7616         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7617  }
7618 +CLK_OF_DECLARE(at91rm9200_clk_usb, "atmel,at91rm9200-clk-usb",
7619 +              of_at91rm9200_clk_usb_setup);
7620 diff --git a/drivers/clk/at91/clk-utmi.c b/drivers/clk/at91/clk-utmi.c
7621 index ca561e90a60f..61fcf399e58c 100644
7622 --- a/drivers/clk/at91/clk-utmi.c
7623 +++ b/drivers/clk/at91/clk-utmi.c
7624 @@ -11,14 +11,9 @@
7625  #include <linux/clk-provider.h>
7626  #include <linux/clkdev.h>
7627  #include <linux/clk/at91_pmc.h>
7628 -#include <linux/interrupt.h>
7629 -#include <linux/irq.h>
7630  #include <linux/of.h>
7631 -#include <linux/of_address.h>
7632 -#include <linux/of_irq.h>
7633 -#include <linux/io.h>
7634 -#include <linux/sched.h>
7635 -#include <linux/wait.h>
7636 +#include <linux/mfd/syscon.h>
7637 +#include <linux/regmap.h>
7638  
7639  #include "pmc.h"
7640  
7641 @@ -26,37 +21,30 @@
7642  
7643  struct clk_utmi {
7644         struct clk_hw hw;
7645 -       struct at91_pmc *pmc;
7646 -       unsigned int irq;
7647 -       wait_queue_head_t wait;
7648 +       struct regmap *regmap;
7649  };
7650  
7651  #define to_clk_utmi(hw) container_of(hw, struct clk_utmi, hw)
7652  
7653 -static irqreturn_t clk_utmi_irq_handler(int irq, void *dev_id)
7654 +static inline bool clk_utmi_ready(struct regmap *regmap)
7655  {
7656 -       struct clk_utmi *utmi = (struct clk_utmi *)dev_id;
7657 +       unsigned int status;
7658  
7659 -       wake_up(&utmi->wait);
7660 -       disable_irq_nosync(utmi->irq);
7661 +       regmap_read(regmap, AT91_PMC_SR, &status);
7662  
7663 -       return IRQ_HANDLED;
7664 +       return status & AT91_PMC_LOCKU;
7665  }
7666  
7667  static int clk_utmi_prepare(struct clk_hw *hw)
7668  {
7669         struct clk_utmi *utmi = to_clk_utmi(hw);
7670 -       struct at91_pmc *pmc = utmi->pmc;
7671 -       u32 tmp = pmc_read(pmc, AT91_CKGR_UCKR) | AT91_PMC_UPLLEN |
7672 -                 AT91_PMC_UPLLCOUNT | AT91_PMC_BIASEN;
7673 +       unsigned int uckr = AT91_PMC_UPLLEN | AT91_PMC_UPLLCOUNT |
7674 +                           AT91_PMC_BIASEN;
7675  
7676 -       pmc_write(pmc, AT91_CKGR_UCKR, tmp);
7677 +       regmap_update_bits(utmi->regmap, AT91_CKGR_UCKR, uckr, uckr);
7678  
7679 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU)) {
7680 -               enable_irq(utmi->irq);
7681 -               wait_event(utmi->wait,
7682 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU);
7683 -       }
7684 +       while (!clk_utmi_ready(utmi->regmap))
7685 +               cpu_relax();
7686  
7687         return 0;
7688  }
7689 @@ -64,18 +52,15 @@ static int clk_utmi_prepare(struct clk_hw *hw)
7690  static int clk_utmi_is_prepared(struct clk_hw *hw)
7691  {
7692         struct clk_utmi *utmi = to_clk_utmi(hw);
7693 -       struct at91_pmc *pmc = utmi->pmc;
7694  
7695 -       return !!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU);
7696 +       return clk_utmi_ready(utmi->regmap);
7697  }
7698  
7699  static void clk_utmi_unprepare(struct clk_hw *hw)
7700  {
7701         struct clk_utmi *utmi = to_clk_utmi(hw);
7702 -       struct at91_pmc *pmc = utmi->pmc;
7703 -       u32 tmp = pmc_read(pmc, AT91_CKGR_UCKR) & ~AT91_PMC_UPLLEN;
7704  
7705 -       pmc_write(pmc, AT91_CKGR_UCKR, tmp);
7706 +       regmap_update_bits(utmi->regmap, AT91_CKGR_UCKR, AT91_PMC_UPLLEN, 0);
7707  }
7708  
7709  static unsigned long clk_utmi_recalc_rate(struct clk_hw *hw,
7710 @@ -93,10 +78,9 @@ static const struct clk_ops utmi_ops = {
7711  };
7712  
7713  static struct clk * __init
7714 -at91_clk_register_utmi(struct at91_pmc *pmc, unsigned int irq,
7715 +at91_clk_register_utmi(struct regmap *regmap,
7716                        const char *name, const char *parent_name)
7717  {
7718 -       int ret;
7719         struct clk_utmi *utmi;
7720         struct clk *clk = NULL;
7721         struct clk_init_data init;
7722 @@ -112,52 +96,36 @@ at91_clk_register_utmi(struct at91_pmc *pmc, unsigned int irq,
7723         init.flags = CLK_SET_RATE_GATE;
7724  
7725         utmi->hw.init = &init;
7726 -       utmi->pmc = pmc;
7727 -       utmi->irq = irq;
7728 -       init_waitqueue_head(&utmi->wait);
7729 -       irq_set_status_flags(utmi->irq, IRQ_NOAUTOEN);
7730 -       ret = request_irq(utmi->irq, clk_utmi_irq_handler,
7731 -                         IRQF_TRIGGER_HIGH, "clk-utmi", utmi);
7732 -       if (ret) {
7733 -               kfree(utmi);
7734 -               return ERR_PTR(ret);
7735 -       }
7736 +       utmi->regmap = regmap;
7737  
7738         clk = clk_register(NULL, &utmi->hw);
7739 -       if (IS_ERR(clk)) {
7740 -               free_irq(utmi->irq, utmi);
7741 +       if (IS_ERR(clk))
7742                 kfree(utmi);
7743 -       }
7744  
7745         return clk;
7746  }
7747  
7748 -static void __init
7749 -of_at91_clk_utmi_setup(struct device_node *np, struct at91_pmc *pmc)
7750 +static void __init of_at91sam9x5_clk_utmi_setup(struct device_node *np)
7751  {
7752 -       unsigned int irq;
7753         struct clk *clk;
7754         const char *parent_name;
7755         const char *name = np->name;
7756 +       struct regmap *regmap;
7757  
7758         parent_name = of_clk_get_parent_name(np, 0);
7759  
7760         of_property_read_string(np, "clock-output-names", &name);
7761  
7762 -       irq = irq_of_parse_and_map(np, 0);
7763 -       if (!irq)
7764 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7765 +       if (IS_ERR(regmap))
7766                 return;
7767  
7768 -       clk = at91_clk_register_utmi(pmc, irq, name, parent_name);
7769 +       clk = at91_clk_register_utmi(regmap, name, parent_name);
7770         if (IS_ERR(clk))
7771                 return;
7772  
7773         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7774         return;
7775  }
7776 -
7777 -void __init of_at91sam9x5_clk_utmi_setup(struct device_node *np,
7778 -                                        struct at91_pmc *pmc)
7779 -{
7780 -       of_at91_clk_utmi_setup(np, pmc);
7781 -}
7782 +CLK_OF_DECLARE(at91sam9x5_clk_utmi, "atmel,at91sam9x5-clk-utmi",
7783 +              of_at91sam9x5_clk_utmi_setup);
7784 diff --git a/drivers/clk/at91/pmc.c b/drivers/clk/at91/pmc.c
7785 index 8476b570779b..526df5ba042d 100644
7786 --- a/drivers/clk/at91/pmc.c
7787 +++ b/drivers/clk/at91/pmc.c
7788 @@ -12,36 +12,13 @@
7789  #include <linux/clkdev.h>
7790  #include <linux/clk/at91_pmc.h>
7791  #include <linux/of.h>
7792 -#include <linux/of_address.h>
7793 -#include <linux/io.h>
7794 -#include <linux/interrupt.h>
7795 -#include <linux/irq.h>
7796 -#include <linux/irqchip/chained_irq.h>
7797 -#include <linux/irqdomain.h>
7798 -#include <linux/of_irq.h>
7799 +#include <linux/mfd/syscon.h>
7800 +#include <linux/regmap.h>
7801  
7802  #include <asm/proc-fns.h>
7803  
7804  #include "pmc.h"
7805  
7806 -void __iomem *at91_pmc_base;
7807 -EXPORT_SYMBOL_GPL(at91_pmc_base);
7808 -
7809 -void at91rm9200_idle(void)
7810 -{
7811 -       /*
7812 -        * Disable the processor clock.  The processor will be automatically
7813 -        * re-enabled by an interrupt or by a reset.
7814 -        */
7815 -       at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK);
7816 -}
7817 -
7818 -void at91sam9_idle(void)
7819 -{
7820 -       at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK);
7821 -       cpu_do_idle();
7822 -}
7823 -
7824  int of_at91_get_clk_range(struct device_node *np, const char *propname,
7825                           struct clk_range *range)
7826  {
7827 @@ -64,402 +41,3 @@ int of_at91_get_clk_range(struct device_node *np, const char *propname,
7828         return 0;
7829  }
7830  EXPORT_SYMBOL_GPL(of_at91_get_clk_range);
7831 -
7832 -static void pmc_irq_mask(struct irq_data *d)
7833 -{
7834 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7835 -
7836 -       pmc_write(pmc, AT91_PMC_IDR, 1 << d->hwirq);
7837 -}
7838 -
7839 -static void pmc_irq_unmask(struct irq_data *d)
7840 -{
7841 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7842 -
7843 -       pmc_write(pmc, AT91_PMC_IER, 1 << d->hwirq);
7844 -}
7845 -
7846 -static int pmc_irq_set_type(struct irq_data *d, unsigned type)
7847 -{
7848 -       if (type != IRQ_TYPE_LEVEL_HIGH) {
7849 -               pr_warn("PMC: type not supported (support only IRQ_TYPE_LEVEL_HIGH type)\n");
7850 -               return -EINVAL;
7851 -       }
7852 -
7853 -       return 0;
7854 -}
7855 -
7856 -static void pmc_irq_suspend(struct irq_data *d)
7857 -{
7858 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7859 -
7860 -       pmc->imr = pmc_read(pmc, AT91_PMC_IMR);
7861 -       pmc_write(pmc, AT91_PMC_IDR, pmc->imr);
7862 -}
7863 -
7864 -static void pmc_irq_resume(struct irq_data *d)
7865 -{
7866 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7867 -
7868 -       pmc_write(pmc, AT91_PMC_IER, pmc->imr);
7869 -}
7870 -
7871 -static struct irq_chip pmc_irq = {
7872 -       .name = "PMC",
7873 -       .irq_disable = pmc_irq_mask,
7874 -       .irq_mask = pmc_irq_mask,
7875 -       .irq_unmask = pmc_irq_unmask,
7876 -       .irq_set_type = pmc_irq_set_type,
7877 -       .irq_suspend = pmc_irq_suspend,
7878 -       .irq_resume = pmc_irq_resume,
7879 -};
7880 -
7881 -static struct lock_class_key pmc_lock_class;
7882 -
7883 -static int pmc_irq_map(struct irq_domain *h, unsigned int virq,
7884 -                      irq_hw_number_t hw)
7885 -{
7886 -       struct at91_pmc *pmc = h->host_data;
7887 -
7888 -       irq_set_lockdep_class(virq, &pmc_lock_class);
7889 -
7890 -       irq_set_chip_and_handler(virq, &pmc_irq,
7891 -                                handle_level_irq);
7892 -       irq_set_chip_data(virq, pmc);
7893 -
7894 -       return 0;
7895 -}
7896 -
7897 -static int pmc_irq_domain_xlate(struct irq_domain *d,
7898 -                               struct device_node *ctrlr,
7899 -                               const u32 *intspec, unsigned int intsize,
7900 -                               irq_hw_number_t *out_hwirq,
7901 -                               unsigned int *out_type)
7902 -{
7903 -       struct at91_pmc *pmc = d->host_data;
7904 -       const struct at91_pmc_caps *caps = pmc->caps;
7905 -
7906 -       if (WARN_ON(intsize < 1))
7907 -               return -EINVAL;
7908 -
7909 -       *out_hwirq = intspec[0];
7910 -
7911 -       if (!(caps->available_irqs & (1 << *out_hwirq)))
7912 -               return -EINVAL;
7913 -
7914 -       *out_type = IRQ_TYPE_LEVEL_HIGH;
7915 -
7916 -       return 0;
7917 -}
7918 -
7919 -static const struct irq_domain_ops pmc_irq_ops = {
7920 -       .map    = pmc_irq_map,
7921 -       .xlate  = pmc_irq_domain_xlate,
7922 -};
7923 -
7924 -static irqreturn_t pmc_irq_handler(int irq, void *data)
7925 -{
7926 -       struct at91_pmc *pmc = (struct at91_pmc *)data;
7927 -       unsigned long sr;
7928 -       int n;
7929 -
7930 -       sr = pmc_read(pmc, AT91_PMC_SR) & pmc_read(pmc, AT91_PMC_IMR);
7931 -       if (!sr)
7932 -               return IRQ_NONE;
7933 -
7934 -       for_each_set_bit(n, &sr, BITS_PER_LONG)
7935 -               generic_handle_irq(irq_find_mapping(pmc->irqdomain, n));
7936 -
7937 -       return IRQ_HANDLED;
7938 -}
7939 -
7940 -static const struct at91_pmc_caps at91rm9200_caps = {
7941 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
7942 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
7943 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
7944 -                         AT91_PMC_PCK3RDY,
7945 -};
7946 -
7947 -static const struct at91_pmc_caps at91sam9260_caps = {
7948 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
7949 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
7950 -                         AT91_PMC_PCK1RDY,
7951 -};
7952 -
7953 -static const struct at91_pmc_caps at91sam9g45_caps = {
7954 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7955 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7956 -                         AT91_PMC_PCK1RDY,
7957 -};
7958 -
7959 -static const struct at91_pmc_caps at91sam9n12_caps = {
7960 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
7961 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
7962 -                         AT91_PMC_PCK1RDY | AT91_PMC_MOSCSELS |
7963 -                         AT91_PMC_MOSCRCS | AT91_PMC_CFDEV,
7964 -};
7965 -
7966 -static const struct at91_pmc_caps at91sam9x5_caps = {
7967 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7968 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7969 -                         AT91_PMC_PCK1RDY | AT91_PMC_MOSCSELS |
7970 -                         AT91_PMC_MOSCRCS | AT91_PMC_CFDEV,
7971 -};
7972 -
7973 -static const struct at91_pmc_caps sama5d2_caps = {
7974 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7975 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7976 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
7977 -                         AT91_PMC_MOSCSELS | AT91_PMC_MOSCRCS |
7978 -                         AT91_PMC_CFDEV | AT91_PMC_GCKRDY,
7979 -};
7980 -
7981 -static const struct at91_pmc_caps sama5d3_caps = {
7982 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7983 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7984 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
7985 -                         AT91_PMC_MOSCSELS | AT91_PMC_MOSCRCS |
7986 -                         AT91_PMC_CFDEV,
7987 -};
7988 -
7989 -static struct at91_pmc *__init at91_pmc_init(struct device_node *np,
7990 -                                            void __iomem *regbase, int virq,
7991 -                                            const struct at91_pmc_caps *caps)
7992 -{
7993 -       struct at91_pmc *pmc;
7994 -
7995 -       if (!regbase || !virq ||  !caps)
7996 -               return NULL;
7997 -
7998 -       at91_pmc_base = regbase;
7999 -
8000 -       pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
8001 -       if (!pmc)
8002 -               return NULL;
8003 -
8004 -       spin_lock_init(&pmc->lock);
8005 -       pmc->regbase = regbase;
8006 -       pmc->virq = virq;
8007 -       pmc->caps = caps;
8008 -
8009 -       pmc->irqdomain = irq_domain_add_linear(np, 32, &pmc_irq_ops, pmc);
8010 -
8011 -       if (!pmc->irqdomain)
8012 -               goto out_free_pmc;
8013 -
8014 -       pmc_write(pmc, AT91_PMC_IDR, 0xffffffff);
8015 -       if (request_irq(pmc->virq, pmc_irq_handler,
8016 -                       IRQF_SHARED | IRQF_COND_SUSPEND, "pmc", pmc))
8017 -               goto out_remove_irqdomain;
8018 -
8019 -       return pmc;
8020 -
8021 -out_remove_irqdomain:
8022 -       irq_domain_remove(pmc->irqdomain);
8023 -out_free_pmc:
8024 -       kfree(pmc);
8025 -
8026 -       return NULL;
8027 -}
8028 -
8029 -static const struct of_device_id pmc_clk_ids[] __initconst = {
8030 -       /* Slow oscillator */
8031 -       {
8032 -               .compatible = "atmel,at91sam9260-clk-slow",
8033 -               .data = of_at91sam9260_clk_slow_setup,
8034 -       },
8035 -       /* Main clock */
8036 -       {
8037 -               .compatible = "atmel,at91rm9200-clk-main-osc",
8038 -               .data = of_at91rm9200_clk_main_osc_setup,
8039 -       },
8040 -       {
8041 -               .compatible = "atmel,at91sam9x5-clk-main-rc-osc",
8042 -               .data = of_at91sam9x5_clk_main_rc_osc_setup,
8043 -       },
8044 -       {
8045 -               .compatible = "atmel,at91rm9200-clk-main",
8046 -               .data = of_at91rm9200_clk_main_setup,
8047 -       },
8048 -       {
8049 -               .compatible = "atmel,at91sam9x5-clk-main",
8050 -               .data = of_at91sam9x5_clk_main_setup,
8051 -       },
8052 -       /* PLL clocks */
8053 -       {
8054 -               .compatible = "atmel,at91rm9200-clk-pll",
8055 -               .data = of_at91rm9200_clk_pll_setup,
8056 -       },
8057 -       {
8058 -               .compatible = "atmel,at91sam9g45-clk-pll",
8059 -               .data = of_at91sam9g45_clk_pll_setup,
8060 -       },
8061 -       {
8062 -               .compatible = "atmel,at91sam9g20-clk-pllb",
8063 -               .data = of_at91sam9g20_clk_pllb_setup,
8064 -       },
8065 -       {
8066 -               .compatible = "atmel,sama5d3-clk-pll",
8067 -               .data = of_sama5d3_clk_pll_setup,
8068 -       },
8069 -       {
8070 -               .compatible = "atmel,at91sam9x5-clk-plldiv",
8071 -               .data = of_at91sam9x5_clk_plldiv_setup,
8072 -       },
8073 -       /* Master clock */
8074 -       {
8075 -               .compatible = "atmel,at91rm9200-clk-master",
8076 -               .data = of_at91rm9200_clk_master_setup,
8077 -       },
8078 -       {
8079 -               .compatible = "atmel,at91sam9x5-clk-master",
8080 -               .data = of_at91sam9x5_clk_master_setup,
8081 -       },
8082 -       /* System clocks */
8083 -       {
8084 -               .compatible = "atmel,at91rm9200-clk-system",
8085 -               .data = of_at91rm9200_clk_sys_setup,
8086 -       },
8087 -       /* Peripheral clocks */
8088 -       {
8089 -               .compatible = "atmel,at91rm9200-clk-peripheral",
8090 -               .data = of_at91rm9200_clk_periph_setup,
8091 -       },
8092 -       {
8093 -               .compatible = "atmel,at91sam9x5-clk-peripheral",
8094 -               .data = of_at91sam9x5_clk_periph_setup,
8095 -       },
8096 -       /* Programmable clocks */
8097 -       {
8098 -               .compatible = "atmel,at91rm9200-clk-programmable",
8099 -               .data = of_at91rm9200_clk_prog_setup,
8100 -       },
8101 -       {
8102 -               .compatible = "atmel,at91sam9g45-clk-programmable",
8103 -               .data = of_at91sam9g45_clk_prog_setup,
8104 -       },
8105 -       {
8106 -               .compatible = "atmel,at91sam9x5-clk-programmable",
8107 -               .data = of_at91sam9x5_clk_prog_setup,
8108 -       },
8109 -       /* UTMI clock */
8110 -#if defined(CONFIG_HAVE_AT91_UTMI)
8111 -       {
8112 -               .compatible = "atmel,at91sam9x5-clk-utmi",
8113 -               .data = of_at91sam9x5_clk_utmi_setup,
8114 -       },
8115 -#endif
8116 -       /* USB clock */
8117 -#if defined(CONFIG_HAVE_AT91_USB_CLK)
8118 -       {
8119 -               .compatible = "atmel,at91rm9200-clk-usb",
8120 -               .data = of_at91rm9200_clk_usb_setup,
8121 -       },
8122 -       {
8123 -               .compatible = "atmel,at91sam9x5-clk-usb",
8124 -               .data = of_at91sam9x5_clk_usb_setup,
8125 -       },
8126 -       {
8127 -               .compatible = "atmel,at91sam9n12-clk-usb",
8128 -               .data = of_at91sam9n12_clk_usb_setup,
8129 -       },
8130 -#endif
8131 -       /* SMD clock */
8132 -#if defined(CONFIG_HAVE_AT91_SMD)
8133 -       {
8134 -               .compatible = "atmel,at91sam9x5-clk-smd",
8135 -               .data = of_at91sam9x5_clk_smd_setup,
8136 -       },
8137 -#endif
8138 -#if defined(CONFIG_HAVE_AT91_H32MX)
8139 -       {
8140 -               .compatible = "atmel,sama5d4-clk-h32mx",
8141 -               .data = of_sama5d4_clk_h32mx_setup,
8142 -       },
8143 -#endif
8144 -#if defined(CONFIG_HAVE_AT91_GENERATED_CLK)
8145 -       {
8146 -               .compatible = "atmel,sama5d2-clk-generated",
8147 -               .data = of_sama5d2_clk_generated_setup,
8148 -       },
8149 -#endif
8150 -       { /*sentinel*/ }
8151 -};
8152 -
8153 -static void __init of_at91_pmc_setup(struct device_node *np,
8154 -                                    const struct at91_pmc_caps *caps)
8155 -{
8156 -       struct at91_pmc *pmc;
8157 -       struct device_node *childnp;
8158 -       void (*clk_setup)(struct device_node *, struct at91_pmc *);
8159 -       const struct of_device_id *clk_id;
8160 -       void __iomem *regbase = of_iomap(np, 0);
8161 -       int virq;
8162 -
8163 -       if (!regbase)
8164 -               return;
8165 -
8166 -       virq = irq_of_parse_and_map(np, 0);
8167 -       if (!virq)
8168 -               return;
8169 -
8170 -       pmc = at91_pmc_init(np, regbase, virq, caps);
8171 -       if (!pmc)
8172 -               return;
8173 -       for_each_child_of_node(np, childnp) {
8174 -               clk_id = of_match_node(pmc_clk_ids, childnp);
8175 -               if (!clk_id)
8176 -                       continue;
8177 -               clk_setup = clk_id->data;
8178 -               clk_setup(childnp, pmc);
8179 -       }
8180 -}
8181 -
8182 -static void __init of_at91rm9200_pmc_setup(struct device_node *np)
8183 -{
8184 -       of_at91_pmc_setup(np, &at91rm9200_caps);
8185 -}
8186 -CLK_OF_DECLARE(at91rm9200_clk_pmc, "atmel,at91rm9200-pmc",
8187 -              of_at91rm9200_pmc_setup);
8188 -
8189 -static void __init of_at91sam9260_pmc_setup(struct device_node *np)
8190 -{
8191 -       of_at91_pmc_setup(np, &at91sam9260_caps);
8192 -}
8193 -CLK_OF_DECLARE(at91sam9260_clk_pmc, "atmel,at91sam9260-pmc",
8194 -              of_at91sam9260_pmc_setup);
8195 -
8196 -static void __init of_at91sam9g45_pmc_setup(struct device_node *np)
8197 -{
8198 -       of_at91_pmc_setup(np, &at91sam9g45_caps);
8199 -}
8200 -CLK_OF_DECLARE(at91sam9g45_clk_pmc, "atmel,at91sam9g45-pmc",
8201 -              of_at91sam9g45_pmc_setup);
8202 -
8203 -static void __init of_at91sam9n12_pmc_setup(struct device_node *np)
8204 -{
8205 -       of_at91_pmc_setup(np, &at91sam9n12_caps);
8206 -}
8207 -CLK_OF_DECLARE(at91sam9n12_clk_pmc, "atmel,at91sam9n12-pmc",
8208 -              of_at91sam9n12_pmc_setup);
8209 -
8210 -static void __init of_at91sam9x5_pmc_setup(struct device_node *np)
8211 -{
8212 -       of_at91_pmc_setup(np, &at91sam9x5_caps);
8213 -}
8214 -CLK_OF_DECLARE(at91sam9x5_clk_pmc, "atmel,at91sam9x5-pmc",
8215 -              of_at91sam9x5_pmc_setup);
8216 -
8217 -static void __init of_sama5d2_pmc_setup(struct device_node *np)
8218 -{
8219 -       of_at91_pmc_setup(np, &sama5d2_caps);
8220 -}
8221 -CLK_OF_DECLARE(sama5d2_clk_pmc, "atmel,sama5d2-pmc",
8222 -              of_sama5d2_pmc_setup);
8223 -
8224 -static void __init of_sama5d3_pmc_setup(struct device_node *np)
8225 -{
8226 -       of_at91_pmc_setup(np, &sama5d3_caps);
8227 -}
8228 -CLK_OF_DECLARE(sama5d3_clk_pmc, "atmel,sama5d3-pmc",
8229 -              of_sama5d3_pmc_setup);
8230 diff --git a/drivers/clk/at91/pmc.h b/drivers/clk/at91/pmc.h
8231 index f65739272779..5771fff0ee3f 100644
8232 --- a/drivers/clk/at91/pmc.h
8233 +++ b/drivers/clk/at91/pmc.h
8234 @@ -14,8 +14,11 @@
8235  
8236  #include <linux/io.h>
8237  #include <linux/irqdomain.h>
8238 +#include <linux/regmap.h>
8239  #include <linux/spinlock.h>
8240  
8241 +extern spinlock_t pmc_pcr_lock;
8242 +
8243  struct clk_range {
8244         unsigned long min;
8245         unsigned long max;
8246 @@ -23,102 +26,7 @@ struct clk_range {
8247  
8248  #define CLK_RANGE(MIN, MAX) {.min = MIN, .max = MAX,}
8249  
8250 -struct at91_pmc_caps {
8251 -       u32 available_irqs;
8252 -};
8253 -
8254 -struct at91_pmc {
8255 -       void __iomem *regbase;
8256 -       int virq;
8257 -       spinlock_t lock;
8258 -       const struct at91_pmc_caps *caps;
8259 -       struct irq_domain *irqdomain;
8260 -       u32 imr;
8261 -};
8262 -
8263 -static inline void pmc_lock(struct at91_pmc *pmc)
8264 -{
8265 -       spin_lock(&pmc->lock);
8266 -}
8267 -
8268 -static inline void pmc_unlock(struct at91_pmc *pmc)
8269 -{
8270 -       spin_unlock(&pmc->lock);
8271 -}
8272 -
8273 -static inline u32 pmc_read(struct at91_pmc *pmc, int offset)
8274 -{
8275 -       return readl(pmc->regbase + offset);
8276 -}
8277 -
8278 -static inline void pmc_write(struct at91_pmc *pmc, int offset, u32 value)
8279 -{
8280 -       writel(value, pmc->regbase + offset);
8281 -}
8282 -
8283  int of_at91_get_clk_range(struct device_node *np, const char *propname,
8284                           struct clk_range *range);
8285  
8286 -void of_at91sam9260_clk_slow_setup(struct device_node *np,
8287 -                                  struct at91_pmc *pmc);
8288 -
8289 -void of_at91rm9200_clk_main_osc_setup(struct device_node *np,
8290 -                                     struct at91_pmc *pmc);
8291 -void of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np,
8292 -                                        struct at91_pmc *pmc);
8293 -void of_at91rm9200_clk_main_setup(struct device_node *np,
8294 -                                 struct at91_pmc *pmc);
8295 -void of_at91sam9x5_clk_main_setup(struct device_node *np,
8296 -                                 struct at91_pmc *pmc);
8297 -
8298 -void of_at91rm9200_clk_pll_setup(struct device_node *np,
8299 -                                struct at91_pmc *pmc);
8300 -void of_at91sam9g45_clk_pll_setup(struct device_node *np,
8301 -                                 struct at91_pmc *pmc);
8302 -void of_at91sam9g20_clk_pllb_setup(struct device_node *np,
8303 -                                  struct at91_pmc *pmc);
8304 -void of_sama5d3_clk_pll_setup(struct device_node *np,
8305 -                             struct at91_pmc *pmc);
8306 -void of_at91sam9x5_clk_plldiv_setup(struct device_node *np,
8307 -                                   struct at91_pmc *pmc);
8308 -
8309 -void of_at91rm9200_clk_master_setup(struct device_node *np,
8310 -                                   struct at91_pmc *pmc);
8311 -void of_at91sam9x5_clk_master_setup(struct device_node *np,
8312 -                                   struct at91_pmc *pmc);
8313 -
8314 -void of_at91rm9200_clk_sys_setup(struct device_node *np,
8315 -                                struct at91_pmc *pmc);
8316 -
8317 -void of_at91rm9200_clk_periph_setup(struct device_node *np,
8318 -                                   struct at91_pmc *pmc);
8319 -void of_at91sam9x5_clk_periph_setup(struct device_node *np,
8320 -                                   struct at91_pmc *pmc);
8321 -
8322 -void of_at91rm9200_clk_prog_setup(struct device_node *np,
8323 -                                 struct at91_pmc *pmc);
8324 -void of_at91sam9g45_clk_prog_setup(struct device_node *np,
8325 -                                  struct at91_pmc *pmc);
8326 -void of_at91sam9x5_clk_prog_setup(struct device_node *np,
8327 -                                 struct at91_pmc *pmc);
8328 -
8329 -void of_at91sam9x5_clk_utmi_setup(struct device_node *np,
8330 -                                 struct at91_pmc *pmc);
8331 -
8332 -void of_at91rm9200_clk_usb_setup(struct device_node *np,
8333 -                                struct at91_pmc *pmc);
8334 -void of_at91sam9x5_clk_usb_setup(struct device_node *np,
8335 -                                struct at91_pmc *pmc);
8336 -void of_at91sam9n12_clk_usb_setup(struct device_node *np,
8337 -                                 struct at91_pmc *pmc);
8338 -
8339 -void of_at91sam9x5_clk_smd_setup(struct device_node *np,
8340 -                                struct at91_pmc *pmc);
8341 -
8342 -void of_sama5d4_clk_h32mx_setup(struct device_node *np,
8343 -                               struct at91_pmc *pmc);
8344 -
8345 -void of_sama5d2_clk_generated_setup(struct device_node *np,
8346 -                                   struct at91_pmc *pmc);
8347 -
8348  #endif /* __PMC_H_ */
8349 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
8350 index 4da2af9694a2..5b6f57f500b8 100644
8351 --- a/drivers/clocksource/tcb_clksrc.c
8352 +++ b/drivers/clocksource/tcb_clksrc.c
8353 @@ -23,8 +23,7 @@
8354   *     this 32 bit free-running counter. the second channel is not used.
8355   *
8356   *   - The third channel may be used to provide a 16-bit clockevent
8357 - *     source, used in either periodic or oneshot mode.  This runs
8358 - *     at 32 KiHZ, and can handle delays of up to two seconds.
8359 + *     source, used in either periodic or oneshot mode.
8360   *
8361   * A boot clocksource and clockevent source are also currently needed,
8362   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
8363 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
8364  struct tc_clkevt_device {
8365         struct clock_event_device       clkevt;
8366         struct clk                      *clk;
8367 +       bool                            clk_enabled;
8368 +       u32                             freq;
8369         void __iomem                    *regs;
8370  };
8371  
8372 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
8373         return container_of(clkevt, struct tc_clkevt_device, clkevt);
8374  }
8375  
8376 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
8377 - * because using one of the divided clocks would usually mean the
8378 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
8379 - *
8380 - * A divided clock could be good for high resolution timers, since
8381 - * 30.5 usec resolution can seem "low".
8382 - */
8383  static u32 timer_clock;
8384  
8385 +static void tc_clk_disable(struct clock_event_device *d)
8386 +{
8387 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8388 +
8389 +       clk_disable(tcd->clk);
8390 +       tcd->clk_enabled = false;
8391 +}
8392 +
8393 +static void tc_clk_enable(struct clock_event_device *d)
8394 +{
8395 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8396 +
8397 +       if (tcd->clk_enabled)
8398 +               return;
8399 +       clk_enable(tcd->clk);
8400 +       tcd->clk_enabled = true;
8401 +}
8402 +
8403  static int tc_shutdown(struct clock_event_device *d)
8404  {
8405         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8406 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
8407  
8408         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
8409         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
8410 +       return 0;
8411 +}
8412 +
8413 +static int tc_shutdown_clk_off(struct clock_event_device *d)
8414 +{
8415 +       tc_shutdown(d);
8416         if (!clockevent_state_detached(d))
8417 -               clk_disable(tcd->clk);
8418 +               tc_clk_disable(d);
8419  
8420         return 0;
8421  }
8422 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
8423         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
8424                 tc_shutdown(d);
8425  
8426 -       clk_enable(tcd->clk);
8427 +       tc_clk_enable(d);
8428  
8429 -       /* slow clock, count up to RC, then irq and stop */
8430 +       /* count up to RC, then irq and stop */
8431         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
8432                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
8433         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8434 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
8435         /* By not making the gentime core emulate periodic mode on top
8436          * of oneshot, we get lower overhead and improved accuracy.
8437          */
8438 -       clk_enable(tcd->clk);
8439 +       tc_clk_enable(d);
8440  
8441 -       /* slow clock, count up to RC, then irq and restart */
8442 +       /* count up to RC, then irq and restart */
8443         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
8444                      regs + ATMEL_TC_REG(2, CMR));
8445 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8446 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8447  
8448         /* Enable clock and interrupts on RC compare */
8449         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8450 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
8451                 .features               = CLOCK_EVT_FEAT_PERIODIC |
8452                                           CLOCK_EVT_FEAT_ONESHOT,
8453                 /* Should be lower than at91rm9200's system timer */
8454 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8455                 .rating                 = 125,
8456 +#else
8457 +               .rating                 = 200,
8458 +#endif
8459                 .set_next_event         = tc_next_event,
8460 -               .set_state_shutdown     = tc_shutdown,
8461 +               .set_state_shutdown     = tc_shutdown_clk_off,
8462                 .set_state_periodic     = tc_set_periodic,
8463                 .set_state_oneshot      = tc_set_oneshot,
8464         },
8465 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
8466         return IRQ_NONE;
8467  }
8468  
8469 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8470 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
8471  {
8472 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
8473         int ret;
8474         struct clk *t2_clk = tc->clk[2];
8475         int irq = tc->irq[2];
8476 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8477         clkevt.regs = tc->regs;
8478         clkevt.clk = t2_clk;
8479  
8480 -       timer_clock = clk32k_divisor_idx;
8481 +       timer_clock = divisor_idx;
8482 +       if (!divisor)
8483 +               clkevt.freq = 32768;
8484 +       else
8485 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
8486  
8487         clkevt.clkevt.cpumask = cpumask_of(0);
8488  
8489 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8490                 return ret;
8491         }
8492  
8493 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
8494 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
8495  
8496         return ret;
8497  }
8498 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
8499                 goto err_disable_t1;
8500  
8501         /* channel 2:  periodic and oneshot timer support */
8502 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8503         ret = setup_clkevents(tc, clk32k_divisor_idx);
8504 +#else
8505 +       ret = setup_clkevents(tc, best_divisor_idx);
8506 +#endif
8507         if (ret)
8508                 goto err_unregister_clksrc;
8509  
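/*
 * [Editor's aside, not part of the patch] A minimal user-space sketch of the
 * reload-value arithmetic the tcb_clksrc hunks above switch to: tc_set_periodic()
 * now programs RC = (freq + HZ / 2) / HZ, i.e. the tick period rounded to the
 * nearest counter count, instead of hard-coding the 32768 Hz slow clock. The
 * clock rates and HZ value below are illustrative assumptions, not taken from
 * any particular board configuration.
 */
#include <stdio.h>

static unsigned int tc_rc_for_hz(unsigned int freq, unsigned int hz)
{
	/* same rounding as the RC write in tc_set_periodic() */
	return (freq + hz / 2) / hz;
}

int main(void)
{
	/* slow 32 KiHz clock (the !PREEMPT_RT_FULL default) */
	printf("freq=32768 Hz,   HZ=100 -> RC=%u\n", tc_rc_for_hz(32768, 100));
	/* a divided master clock of roughly 1 MHz, as when the slow clock is not selected */
	printf("freq=1000000 Hz, HZ=100 -> RC=%u\n", tc_rc_for_hz(1000000, 100));
	return 0;
}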
8510 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
8511 index d911c5dca8f1..7a40f7e88468 100644
8512 --- a/drivers/clocksource/timer-atmel-pit.c
8513 +++ b/drivers/clocksource/timer-atmel-pit.c
8514 @@ -46,6 +46,7 @@ struct pit_data {
8515         u32             cycle;
8516         u32             cnt;
8517         unsigned int    irq;
8518 +       bool            irq_requested;
8519         struct clk      *mck;
8520  };
8521  
8522 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
8523  
8524         /* disable irq, leaving the clocksource active */
8525         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
8526 +       if (data->irq_requested) {
8527 +               free_irq(data->irq, data);
8528 +               data->irq_requested = false;
8529 +       }
8530         return 0;
8531  }
8532  
8533 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
8534  /*
8535   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
8536   */
8537  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
8538  {
8539         struct pit_data *data = clkevt_to_pit_data(dev);
8540 +       int ret;
8541 +
8542 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8543 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8544 +                         "at91_tick", data);
8545 +       if (ret)
8546 +               panic(pr_fmt("Unable to setup IRQ\n"));
8547 +
8548 +       data->irq_requested = true;
8549  
8550         /* update clocksource counter */
8551         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
8552 @@ -181,7 +196,6 @@ static void __init at91sam926x_pit_common_init(struct pit_data *data)
8553  {
8554         unsigned long   pit_rate;
8555         unsigned        bits;
8556 -       int             ret;
8557  
8558         /*
8559          * Use our actual MCK to figure out how many MCK/16 ticks per
8560 @@ -206,13 +220,6 @@ static void __init at91sam926x_pit_common_init(struct pit_data *data)
8561         data->clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS;
8562         clocksource_register_hz(&data->clksrc, pit_rate);
8563  
8564 -       /* Set up irq handler */
8565 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8566 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8567 -                         "at91_tick", data);
8568 -       if (ret)
8569 -               panic(pr_fmt("Unable to setup IRQ\n"));
8570 -
8571         /* Set up and register clockevents */
8572         data->clkevt.name = "pit";
8573         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
8574 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
8575 index 29d21d68df5a..103d0fd70cc4 100644
8576 --- a/drivers/clocksource/timer-atmel-st.c
8577 +++ b/drivers/clocksource/timer-atmel-st.c
8578 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
8579         last_crtr = read_CRTR();
8580  }
8581  
8582 +static int atmel_st_irq;
8583 +
8584  static int clkevt32k_shutdown(struct clock_event_device *evt)
8585  {
8586         clkdev32k_disable_and_flush_irq();
8587         irqmask = 0;
8588         regmap_write(regmap_st, AT91_ST_IER, irqmask);
8589 +       free_irq(atmel_st_irq, regmap_st);
8590         return 0;
8591  }
8592  
8593  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8594  {
8595 +       int ret;
8596 +
8597         clkdev32k_disable_and_flush_irq();
8598  
8599 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8600 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8601 +                         "at91_tick", regmap_st);
8602 +       if (ret)
8603 +               panic(pr_fmt("Unable to setup IRQ\n"));
8604 +
8605         /*
8606          * ALM for oneshot irqs, set by next_event()
8607          * before 32 seconds have passed.
8608 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8609  
8610  static int clkevt32k_set_periodic(struct clock_event_device *dev)
8611  {
8612 +       int ret;
8613 +
8614         clkdev32k_disable_and_flush_irq();
8615  
8616 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8617 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8618 +                         "at91_tick", regmap_st);
8619 +       if (ret)
8620 +               panic(pr_fmt("Unable to setup IRQ\n"));
8621 +
8622         /* PIT for periodic irqs; fixed rate of 1/HZ */
8623         irqmask = AT91_ST_PITS;
8624         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
8625 @@ -198,7 +217,7 @@ static void __init atmel_st_timer_init(struct device_node *node)
8626  {
8627         struct clk *sclk;
8628         unsigned int sclk_rate, val;
8629 -       int irq, ret;
8630 +       int ret;
8631  
8632         regmap_st = syscon_node_to_regmap(node);
8633         if (IS_ERR(regmap_st))
8634 @@ -210,17 +229,10 @@ static void __init atmel_st_timer_init(struct device_node *node)
8635         regmap_read(regmap_st, AT91_ST_SR, &val);
8636  
8637         /* Get the interrupts property */
8638 -       irq  = irq_of_parse_and_map(node, 0);
8639 -       if (!irq)
8640 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
8641 +       if (!atmel_st_irq)
8642                 panic(pr_fmt("Unable to get IRQ from DT\n"));
8643  
8644 -       /* Make IRQs happen for the system timer */
8645 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
8646 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8647 -                         "at91_tick", regmap_st);
8648 -       if (ret)
8649 -               panic(pr_fmt("Unable to setup IRQ\n"));
8650 -
8651         sclk = of_clk_get(node, 0);
8652         if (IS_ERR(sclk))
8653                 panic(pr_fmt("Unable to get slow clock\n"));
8654 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
8655 index c59bdcb83217..8f23161d80be 100644
8656 --- a/drivers/cpufreq/Kconfig.x86
8657 +++ b/drivers/cpufreq/Kconfig.x86
8658 @@ -123,7 +123,7 @@ config X86_POWERNOW_K7_ACPI
8659  
8660  config X86_POWERNOW_K8
8661         tristate "AMD Opteron/Athlon64 PowerNow!"
8662 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
8663 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
8664         help
8665           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
8666           Support for K10 and newer processors is now in acpi-cpufreq.
8667 diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
8668 index 344058f8501a..d5657d50ac40 100644
8669 --- a/drivers/cpuidle/coupled.c
8670 +++ b/drivers/cpuidle/coupled.c
8671 @@ -119,7 +119,6 @@ struct cpuidle_coupled {
8672  
8673  #define CPUIDLE_COUPLED_NOT_IDLE       (-1)
8674  
8675 -static DEFINE_MUTEX(cpuidle_coupled_lock);
8676  static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
8677  
8678  /*
8679 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8680 index 6ed7d63a0688..9da7482ad256 100644
8681 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8682 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8683 @@ -1264,7 +1264,9 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
8684         if (ret)
8685                 return ret;
8686  
8687 +#ifndef CONFIG_PREEMPT_RT_BASE
8688         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
8689 +#endif
8690  
8691         i915_gem_execbuffer_move_to_active(vmas, params->request);
8692         i915_gem_execbuffer_retire_commands(params);
8693 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
8694 index c0a96f1ee18e..deb1e207fa3c 100644
8695 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
8696 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
8697 @@ -39,7 +39,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
8698         if (!mutex_is_locked(mutex))
8699                 return false;
8700  
8701 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
8702 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
8703         return mutex->owner == task;
8704  #else
8705         /* Since UP may be pre-empted, we cannot assume that we own the lock */
8706 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
8707 index 0f42a2782afc..80a1db09a379 100644
8708 --- a/drivers/gpu/drm/i915/i915_irq.c
8709 +++ b/drivers/gpu/drm/i915/i915_irq.c
8710 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8711         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
8712  
8713         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8714 +       preempt_disable_rt();
8715  
8716         /* Get optional system timestamp before query. */
8717         if (stime)
8718 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8719                 *etime = ktime_get();
8720  
8721         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8722 +       preempt_enable_rt();
8723  
8724         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
8725  
8726 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
8727 index 909d1d71d130..8688709b4ffa 100644
8728 --- a/drivers/gpu/drm/i915/intel_display.c
8729 +++ b/drivers/gpu/drm/i915/intel_display.c
8730 @@ -11400,7 +11400,7 @@ void intel_check_page_flip(struct drm_device *dev, int pipe)
8731         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
8732         struct intel_unpin_work *work;
8733  
8734 -       WARN_ON(!in_interrupt());
8735 +       WARN_ON_NONRT(!in_interrupt());
8736  
8737         if (crtc == NULL)
8738                 return;
8739 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
8740 index 2cc6aa072f4c..b79d33f14868 100644
8741 --- a/drivers/gpu/drm/i915/intel_sprite.c
8742 +++ b/drivers/gpu/drm/i915/intel_sprite.c
8743 @@ -38,6 +38,7 @@
8744  #include "intel_drv.h"
8745  #include <drm/i915_drm.h>
8746  #include "i915_drv.h"
8747 +#include <linux/locallock.h>
8748  
8749  static bool
8750  format_is_yuv(uint32_t format)
8751 @@ -64,6 +65,8 @@ static int usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
8752                             1000 * adjusted_mode->crtc_htotal);
8753  }
8754  
8755 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
8756 +
8757  /**
8758   * intel_pipe_update_start() - start update of a set of display registers
8759   * @crtc: the crtc of which the registers are going to be updated
8760 @@ -96,7 +99,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8761         min = vblank_start - usecs_to_scanlines(adjusted_mode, 100);
8762         max = vblank_start - 1;
8763  
8764 -       local_irq_disable();
8765 +       local_lock_irq(pipe_update_lock);
8766  
8767         if (min <= 0 || max <= 0)
8768                 return;
8769 @@ -126,11 +129,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8770                         break;
8771                 }
8772  
8773 -               local_irq_enable();
8774 +               local_unlock_irq(pipe_update_lock);
8775  
8776                 timeout = schedule_timeout(timeout);
8777  
8778 -               local_irq_disable();
8779 +               local_lock_irq(pipe_update_lock);
8780         }
8781  
8782         finish_wait(wq, &wait);
8783 @@ -164,7 +167,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc)
8784  
8785         trace_i915_pipe_update_end(crtc, end_vbl_count, scanline_end);
8786  
8787 -       local_irq_enable();
8788 +       local_unlock_irq(pipe_update_lock);
8789  
8790         if (crtc->debug.start_vbl_count &&
8791             crtc->debug.start_vbl_count != end_vbl_count) {
8792 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
8793 index 3645b223aa37..642854b2ed2c 100644
8794 --- a/drivers/gpu/drm/radeon/radeon_display.c
8795 +++ b/drivers/gpu/drm/radeon/radeon_display.c
8796 @@ -1862,6 +1862,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8797         struct radeon_device *rdev = dev->dev_private;
8798  
8799         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8800 +       preempt_disable_rt();
8801  
8802         /* Get optional system timestamp before query. */
8803         if (stime)
8804 @@ -1954,6 +1955,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8805                 *etime = ktime_get();
8806  
8807         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8808 +       preempt_enable_rt();
8809  
8810         /* Decode into vertical and horizontal scanout position. */
8811         *vpos = position & 0x1fff;
8812 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
8813 index 509ed9731630..b2498b270f2c 100644
8814 --- a/drivers/hv/vmbus_drv.c
8815 +++ b/drivers/hv/vmbus_drv.c
8816 @@ -820,7 +820,7 @@ static void vmbus_isr(void)
8817                         tasklet_schedule(&msg_dpc);
8818         }
8819  
8820 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
8821 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, 0);
8822  }
8823  
8824  
8825 diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c
8826 index 08d26ba61ed3..46b89dd42b10 100644
8827 --- a/drivers/i2c/busses/i2c-omap.c
8828 +++ b/drivers/i2c/busses/i2c-omap.c
8829 @@ -995,15 +995,12 @@ omap_i2c_isr(int irq, void *dev_id)
8830         u16 mask;
8831         u16 stat;
8832  
8833 -       spin_lock(&omap->lock);
8834 -       mask = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
8835         stat = omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG);
8836 +       mask = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
8837  
8838         if (stat & mask)
8839                 ret = IRQ_WAKE_THREAD;
8840  
8841 -       spin_unlock(&omap->lock);
8842 -
8843         return ret;
8844  }
8845  
8846 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
8847 index 36f76e28a0bf..394f142f90c7 100644
8848 --- a/drivers/ide/alim15x3.c
8849 +++ b/drivers/ide/alim15x3.c
8850 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8851  
8852         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
8853  
8854 -       local_irq_save(flags);
8855 +       local_irq_save_nort(flags);
8856  
8857         if (m5229_revision < 0xC2) {
8858                 /*
8859 @@ -325,7 +325,7 @@ out:
8860         }
8861         pci_dev_put(north);
8862         pci_dev_put(isa_dev);
8863 -       local_irq_restore(flags);
8864 +       local_irq_restore_nort(flags);
8865         return 0;
8866  }
8867  
8868 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
8869 index 696b6c1ec940..0d0a96629b73 100644
8870 --- a/drivers/ide/hpt366.c
8871 +++ b/drivers/ide/hpt366.c
8872 @@ -1241,7 +1241,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8873  
8874         dma_old = inb(base + 2);
8875  
8876 -       local_irq_save(flags);
8877 +       local_irq_save_nort(flags);
8878  
8879         dma_new = dma_old;
8880         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
8881 @@ -1252,7 +1252,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8882         if (dma_new != dma_old)
8883                 outb(dma_new, base + 2);
8884  
8885 -       local_irq_restore(flags);
8886 +       local_irq_restore_nort(flags);
8887  
8888         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
8889                          hwif->name, base, base + 7);
8890 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
8891 index 19763977568c..4169433faab5 100644
8892 --- a/drivers/ide/ide-io-std.c
8893 +++ b/drivers/ide/ide-io-std.c
8894 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8895                 unsigned long uninitialized_var(flags);
8896  
8897                 if ((io_32bit & 2) && !mmio) {
8898 -                       local_irq_save(flags);
8899 +                       local_irq_save_nort(flags);
8900                         ata_vlb_sync(io_ports->nsect_addr);
8901                 }
8902  
8903 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8904                         insl(data_addr, buf, words);
8905  
8906                 if ((io_32bit & 2) && !mmio)
8907 -                       local_irq_restore(flags);
8908 +                       local_irq_restore_nort(flags);
8909  
8910                 if (((len + 1) & 3) < 2)
8911                         return;
8912 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8913                 unsigned long uninitialized_var(flags);
8914  
8915                 if ((io_32bit & 2) && !mmio) {
8916 -                       local_irq_save(flags);
8917 +                       local_irq_save_nort(flags);
8918                         ata_vlb_sync(io_ports->nsect_addr);
8919                 }
8920  
8921 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8922                         outsl(data_addr, buf, words);
8923  
8924                 if ((io_32bit & 2) && !mmio)
8925 -                       local_irq_restore(flags);
8926 +                       local_irq_restore_nort(flags);
8927  
8928                 if (((len + 1) & 3) < 2)
8929                         return;
8930 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
8931 index 669ea1e45795..e12e43e62245 100644
8932 --- a/drivers/ide/ide-io.c
8933 +++ b/drivers/ide/ide-io.c
8934 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
8935                 /* disable_irq_nosync ?? */
8936                 disable_irq(hwif->irq);
8937                 /* local CPU only, as if we were handling an interrupt */
8938 -               local_irq_disable();
8939 +               local_irq_disable_nort();
8940                 if (hwif->polling) {
8941                         startstop = handler(drive);
8942                 } else if (drive_is_ready(drive)) {
8943 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
8944 index 376f2dc410c5..f014dd1b73dc 100644
8945 --- a/drivers/ide/ide-iops.c
8946 +++ b/drivers/ide/ide-iops.c
8947 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
8948                                 if ((stat & ATA_BUSY) == 0)
8949                                         break;
8950  
8951 -                               local_irq_restore(flags);
8952 +                               local_irq_restore_nort(flags);
8953                                 *rstat = stat;
8954                                 return -EBUSY;
8955                         }
8956                 }
8957 -               local_irq_restore(flags);
8958 +               local_irq_restore_nort(flags);
8959         }
8960         /*
8961          * Allow status to settle, then read it again.
8962 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
8963 index 0b63facd1d87..4ceba37afc0c 100644
8964 --- a/drivers/ide/ide-probe.c
8965 +++ b/drivers/ide/ide-probe.c
8966 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
8967         int bswap = 1;
8968  
8969         /* local CPU only; some systems need this */
8970 -       local_irq_save(flags);
8971 +       local_irq_save_nort(flags);
8972         /* read 512 bytes of id info */
8973         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
8974 -       local_irq_restore(flags);
8975 +       local_irq_restore_nort(flags);
8976  
8977         drive->dev_flags |= IDE_DFLAG_ID_READ;
8978  #ifdef DEBUG
8979 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
8980 index a716693417a3..be0568c722d6 100644
8981 --- a/drivers/ide/ide-taskfile.c
8982 +++ b/drivers/ide/ide-taskfile.c
8983 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8984  
8985                 page_is_high = PageHighMem(page);
8986                 if (page_is_high)
8987 -                       local_irq_save(flags);
8988 +                       local_irq_save_nort(flags);
8989  
8990                 buf = kmap_atomic(page) + offset;
8991  
8992 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8993                 kunmap_atomic(buf);
8994  
8995                 if (page_is_high)
8996 -                       local_irq_restore(flags);
8997 +                       local_irq_restore_nort(flags);
8998  
8999                 len -= nr_bytes;
9000         }
9001 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
9002         }
9003  
9004         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
9005 -               local_irq_disable();
9006 +               local_irq_disable_nort();
9007  
9008         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
9009  
9010 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9011 index 87799de90a1d..66cdd37f8605 100644
9012 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9013 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9014 @@ -857,7 +857,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
9015  
9016         ipoib_dbg_mcast(priv, "restarting multicast task\n");
9017  
9018 -       local_irq_save(flags);
9019 +       local_irq_save_nort(flags);
9020         netif_addr_lock(dev);
9021         spin_lock(&priv->lock);
9022  
9023 @@ -939,7 +939,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
9024  
9025         spin_unlock(&priv->lock);
9026         netif_addr_unlock(dev);
9027 -       local_irq_restore(flags);
9028 +       local_irq_restore_nort(flags);
9029  
9030         /*
9031          * make sure the in-flight joins have finished before we attempt
9032 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
9033 index 4a2a9e370be7..e970d9afd179 100644
9034 --- a/drivers/input/gameport/gameport.c
9035 +++ b/drivers/input/gameport/gameport.c
9036 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
9037         tx = ~0;
9038  
9039         for (i = 0; i < 50; i++) {
9040 -               local_irq_save(flags);
9041 +               local_irq_save_nort(flags);
9042                 t1 = ktime_get_ns();
9043                 for (t = 0; t < 50; t++)
9044                         gameport_read(gameport);
9045                 t2 = ktime_get_ns();
9046                 t3 = ktime_get_ns();
9047 -               local_irq_restore(flags);
9048 +               local_irq_restore_nort(flags);
9049                 udelay(i * 10);
9050                 t = (t2 - t1) - (t3 - t2);
9051                 if (t < tx)
9052 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
9053         tx = 1 << 30;
9054  
9055         for(i = 0; i < 50; i++) {
9056 -               local_irq_save(flags);
9057 +               local_irq_save_nort(flags);
9058                 GET_TIME(t1);
9059                 for (t = 0; t < 50; t++) gameport_read(gameport);
9060                 GET_TIME(t2);
9061                 GET_TIME(t3);
9062 -               local_irq_restore(flags);
9063 +               local_irq_restore_nort(flags);
9064                 udelay(i * 10);
9065                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
9066         }
9067 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
9068         tx = 1 << 30;
9069  
9070         for(i = 0; i < 50; i++) {
9071 -               local_irq_save(flags);
9072 +               local_irq_save_nort(flags);
9073                 t1 = rdtsc();
9074                 for (t = 0; t < 50; t++) gameport_read(gameport);
9075                 t2 = rdtsc();
9076 -               local_irq_restore(flags);
9077 +               local_irq_restore_nort(flags);
9078                 udelay(i * 10);
9079                 if (t2 - t1 < tx) tx = t2 - t1;
9080         }
9081 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
9082 index 0397985a2601..bc0e7d2c5cff 100644
9083 --- a/drivers/iommu/amd_iommu.c
9084 +++ b/drivers/iommu/amd_iommu.c
9085 @@ -2019,10 +2019,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
9086         int ret;
9087  
9088         /*
9089 -        * Must be called with IRQs disabled. Warn here to detect early
9090 -        * when its not.
9094 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
9095 +        * detect early when it's not.
9093          */
9094 -       WARN_ON(!irqs_disabled());
9095 +       WARN_ON_NONRT(!irqs_disabled());
9096  
9097         /* lock domain */
9098         spin_lock(&domain->lock);
9099 @@ -2185,10 +2185,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
9100         struct protection_domain *domain;
9101  
9102         /*
9103 -        * Must be called with IRQs disabled. Warn here to detect early
9104 -        * when its not.
9108 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
9109 +        * detect early when it's not.
9107          */
9108 -       WARN_ON(!irqs_disabled());
9109 +       WARN_ON_NONRT(!irqs_disabled());
9110  
9111         if (WARN_ON(!dev_data->domain))
9112                 return;
9113 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
9114 index 5bda6a9b56bb..d6286584c807 100644
9115 --- a/drivers/leds/trigger/Kconfig
9116 +++ b/drivers/leds/trigger/Kconfig
9117 @@ -61,7 +61,7 @@ config LEDS_TRIGGER_BACKLIGHT
9118  
9119  config LEDS_TRIGGER_CPU
9120         bool "LED CPU Trigger"
9121 -       depends on LEDS_TRIGGERS
9122 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
9123         help
9124           This allows LEDs to be controlled by active CPUs. This shows
9125           the active CPUs across an array of LEDs so you can see which
9126 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
9127 index 4d200883c505..98b64ed5cb81 100644
9128 --- a/drivers/md/bcache/Kconfig
9129 +++ b/drivers/md/bcache/Kconfig
9130 @@ -1,6 +1,7 @@
9131  
9132  config BCACHE
9133         tristate "Block device as cache"
9134 +       depends on !PREEMPT_RT_FULL
9135         ---help---
9136         Allows a block device to be used as cache for other devices; uses
9137         a btree for indexing and the layout is optimized for SSDs.
9138 diff --git a/drivers/md/dm.c b/drivers/md/dm.c
9139 index 84aa8b1d0480..b7f070e3698e 100644
9140 --- a/drivers/md/dm.c
9141 +++ b/drivers/md/dm.c
9142 @@ -2127,7 +2127,7 @@ static void dm_request_fn(struct request_queue *q)
9143                 /* Establish tio->ti before queuing work (map_tio_request) */
9144                 tio->ti = ti;
9145                 queue_kthread_work(&md->kworker, &tio->work);
9146 -               BUG_ON(!irqs_disabled());
9147 +               BUG_ON_NONRT(!irqs_disabled());
9148         }
9149  
9150         goto out;
9151 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
9152 index 10ce885445f6..76f71791361c 100644
9153 --- a/drivers/md/raid5.c
9154 +++ b/drivers/md/raid5.c
9155 @@ -1920,8 +1920,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9156         struct raid5_percpu *percpu;
9157         unsigned long cpu;
9158  
9159 -       cpu = get_cpu();
9160 +       cpu = get_cpu_light();
9161         percpu = per_cpu_ptr(conf->percpu, cpu);
9162 +       spin_lock(&percpu->lock);
9163         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
9164                 ops_run_biofill(sh);
9165                 overlap_clear++;
9166 @@ -1977,7 +1978,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9167                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
9168                                 wake_up(&sh->raid_conf->wait_for_overlap);
9169                 }
9170 -       put_cpu();
9171 +       spin_unlock(&percpu->lock);
9172 +       put_cpu_light();
9173  }
9174  
9175  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
9176 @@ -6414,6 +6416,7 @@ static int raid5_alloc_percpu(struct r5conf *conf)
9177                                __func__, cpu);
9178                         break;
9179                 }
9180 +               spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
9181         }
9182         put_online_cpus();
9183  
9184 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
9185 index 517d4b68a1be..efe91887ecd7 100644
9186 --- a/drivers/md/raid5.h
9187 +++ b/drivers/md/raid5.h
9188 @@ -504,6 +504,7 @@ struct r5conf {
9189         int                     recovery_disabled;
9190         /* per cpu variables */
9191         struct raid5_percpu {
9192 +               spinlock_t      lock;           /* Protection for -RT */
9193                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
9194                 struct flex_array *scribble;   /* space for constructing buffer
9195                                               * lists and performing address
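/*
 * [Editor's aside, not part of the patch] The raid5 hunks above replace
 * get_cpu()/put_cpu() with get_cpu_light() plus a spinlock embedded in
 * struct raid5_percpu, so the per-CPU scribble buffers stay serialized even
 * though the section may now be preempted on -RT. A rough user-space analogy
 * of that "per-slot lock instead of disabling preemption" pattern, using
 * pthreads (all names here are illustrative, not kernel API):
 */
#include <pthread.h>
#include <stdio.h>

#define NR_SLOTS 4

struct percpu_scratch {
	pthread_mutex_t lock;		/* protects scratch, like raid5_percpu->lock */
	unsigned long scratch;		/* stand-in for spare_page/scribble */
};

static struct percpu_scratch slots[NR_SLOTS];

static void run_ops(int slot, unsigned long work)
{
	struct percpu_scratch *p = &slots[slot];

	pthread_mutex_lock(&p->lock);	/* serialize users of this slot's scratch */
	p->scratch += work;		/* safe even if "preempted" here */
	pthread_mutex_unlock(&p->lock);
}

int main(void)
{
	for (int i = 0; i < NR_SLOTS; i++)
		pthread_mutex_init(&slots[i].lock, NULL);
	run_ops(0, 5);
	run_ops(0, 7);
	printf("slot 0 scratch = %lu\n", slots[0].scratch);
	return 0;
}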
9196 diff --git a/drivers/media/platform/vsp1/vsp1_video.c b/drivers/media/platform/vsp1/vsp1_video.c
9197 index 5ce88e1f5d71..b4f8cd74ecb8 100644
9198 --- a/drivers/media/platform/vsp1/vsp1_video.c
9199 +++ b/drivers/media/platform/vsp1/vsp1_video.c
9200 @@ -520,7 +520,7 @@ static bool vsp1_pipeline_stopped(struct vsp1_pipeline *pipe)
9201         bool stopped;
9202  
9203         spin_lock_irqsave(&pipe->irqlock, flags);
9204 -       stopped = pipe->state == VSP1_PIPELINE_STOPPED,
9205 +       stopped = pipe->state == VSP1_PIPELINE_STOPPED;
9206         spin_unlock_irqrestore(&pipe->irqlock, flags);
9207  
9208         return stopped;
9209 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
9210 index 4bf7d50b1bc7..6f7e99ad6e29 100644
9211 --- a/drivers/misc/Kconfig
9212 +++ b/drivers/misc/Kconfig
9213 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
9214  config ATMEL_TCLIB
9215         bool "Atmel AT32/AT91 Timer/Counter Library"
9216         depends on (AVR32 || ARCH_AT91)
9217 +       default y if PREEMPT_RT_FULL
9218         help
9219           Select this if you want a library to allocate the Timer/Counter
9220           blocks found on many Atmel processors.  This facilitates using
9221 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
9222           are combined to make a single 32-bit timer.
9223  
9224           When GENERIC_CLOCKEVENTS is defined, the third timer channel
9225 -         may be used as a clock event device supporting oneshot mode
9226 -         (delays of up to two seconds) based on the 32 KiHz clock.
9227 +         may be used as a clock event device supporting oneshot mode.
9228  
9229  config ATMEL_TCB_CLKSRC_BLOCK
9230         int
9231 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
9232           TC can be used for other purposes, such as PWM generation and
9233           interval timing.
9234  
9235 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
9236 +       bool "TC Block use 32 KiHz clock"
9237 +       depends on ATMEL_TCB_CLKSRC
9238 +       default y if !PREEMPT_RT_FULL
9239 +       help
9240 +         Select this to use 32 KiHz base clock rate as TC block clock
9241 +         source for clock events.
9242 +
9243 +
9244  config DUMMY_IRQ
9245         tristate "Dummy IRQ handler"
9246         default n
9247 @@ -113,6 +122,35 @@ config IBM_ASM
9248           for information on the specific driver level and support statement
9249           for your IBM server.
9250  
9251 +config HWLAT_DETECTOR
9252 +       tristate "Testing module to detect hardware-induced latencies"
9253 +       depends on DEBUG_FS
9254 +       depends on RING_BUFFER
9255 +       default m
9256 +       ---help---
9257 +         A simple hardware latency detector. Use this module to detect
9258 +         large latencies introduced by the behavior of the underlying
9259 +         system firmware external to Linux. We do this using periodic
9260 +         use of stop_machine to grab all available CPUs and measure
9261 +         for unexplainable gaps in the CPU timestamp counter(s). By
9262 +         default, the module is not enabled until the "enable" file
9263 +         within the "hwlat_detector" debugfs directory is toggled.
9264 +
9265 +         This module is often used to detect SMI (System Management
9266 +         Interrupts) on x86 systems, though is not x86 specific. To
9267 +         this end, we default to using a sample window of 1 second,
9268 +         during which we will sample for 0.5 seconds. If an SMI or
9269 +         similar event occurs during that time, it is recorded
9273 +         into an 8K-sample global ring buffer until retrieved.
9271 +
9272 +         WARNING: This software should never be enabled (it can be built
9273 +         but should not be turned on after it is loaded) in a production
9274 +         environment where high latencies are a concern since the
9275 +         sampling mechanism actually introduces latencies for
9276 +         regular tasks while the CPU(s) are being held.
9277 +
9278 +         If unsure, say N
9279 +
9280  config PHANTOM
9281         tristate "Sensable PHANToM (PCI)"
9282         depends on PCI
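/*
 * [Editor's aside, not part of the patch] The HWLAT_DETECTOR help text above
 * describes the sampling idea: spin reading a timestamp for the "width"
 * portion of each window and report the largest gap between consecutive
 * readings. A minimal user-space sketch of that inner loop using
 * clock_gettime() follows; it only illustrates the idea, whereas the real
 * module disables interrupts and uses the TSC/trace clock (see
 * drivers/misc/hwlat_detector.c below).
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

int main(void)
{
	const uint64_t width_ns = 500ull * 1000 * 1000;	/* 0.5 s, the default sample width */
	uint64_t start = now_ns(), last = start, max_gap = 0;

	while (now_ns() - start < width_ns) {
		uint64_t t = now_ns();

		if (t - last > max_gap)		/* largest gap between two reads */
			max_gap = t - last;
		last = t;
	}
	printf("largest gap seen: %llu ns\n", (unsigned long long)max_gap);
	return 0;
}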
9283 diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
9284 index 537d7f3b78da..ec4aecba0656 100644
9285 --- a/drivers/misc/Makefile
9286 +++ b/drivers/misc/Makefile
9287 @@ -39,6 +39,7 @@ obj-$(CONFIG_C2PORT)          += c2port/
9288  obj-$(CONFIG_HMC6352)          += hmc6352.o
9289  obj-y                          += eeprom/
9290  obj-y                          += cb710/
9291 +obj-$(CONFIG_HWLAT_DETECTOR)   += hwlat_detector.o
9292  obj-$(CONFIG_SPEAR13XX_PCIE_GADGET)    += spear13xx_pcie_gadget.o
9293  obj-$(CONFIG_VMWARE_BALLOON)   += vmw_balloon.o
9294  obj-$(CONFIG_ARM_CHARLCD)      += arm-charlcd.o
9295 diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c
9296 new file mode 100644
9297 index 000000000000..52f5ad5fd9c0
9298 --- /dev/null
9299 +++ b/drivers/misc/hwlat_detector.c
9300 @@ -0,0 +1,1240 @@
9301 +/*
9302 + * hwlat_detector.c - A simple Hardware Latency detector.
9303 + *
9304 + * Use this module to detect large system latencies induced by the behavior of
9305 + * certain underlying system hardware or firmware, independent of Linux itself.
9306 + * The code was developed originally to detect the presence of SMIs on Intel
9307 + * and AMD systems, although there is no dependency upon x86 herein.
9308 + *
9309 + * The classical example usage of this module is in detecting the presence of
9310 + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
9311 + * somewhat special form of hardware interrupt spawned from earlier CPU debug
9312 + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
9313 + * LPC (or other device) to generate a special interrupt under certain
9314 + * circumstances, for example, upon expiration of a special SMI timer device,
9315 + * due to certain external thermal readings, on certain I/O address accesses,
9316 + * and other situations. An SMI hits a special CPU pin, triggers a special
9317 + * SMI mode (complete with special memory map), and the OS is unaware.
9318 + *
9322 + * Although certain hardware-induced latencies are necessary (for example,
9320 + * a modern system often requires an SMI handler for correct thermal control
9321 + * and remote management) they can wreak havoc upon any OS-level performance
9322 + * guarantees toward low-latency, especially when the OS is not even made
9323 + * aware of the presence of these interrupts. For this reason, we need a
9324 + * somewhat brute force mechanism to detect these interrupts. In this case,
9325 + * we do it by hogging all of the CPU(s) for configurable timer intervals,
9326 + * sampling the built-in CPU timer, looking for discontiguous readings.
9327 + *
9328 + * WARNING: This implementation necessarily introduces latencies. Therefore,
9329 + *          you should NEVER use this module in a production environment
9330 + *          requiring any kind of low-latency performance guarantee(s).
9331 + *
9332 + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
9333 + *
9334 + * Includes useful feedback from Clark Williams <clark@redhat.com>
9335 + *
9336 + * This file is licensed under the terms of the GNU General Public
9337 + * License version 2. This program is licensed "as is" without any
9338 + * warranty of any kind, whether express or implied.
9339 + */
9340 +
9341 +#include <linux/module.h>
9342 +#include <linux/init.h>
9343 +#include <linux/ring_buffer.h>
9344 +#include <linux/time.h>
9345 +#include <linux/hrtimer.h>
9346 +#include <linux/kthread.h>
9347 +#include <linux/debugfs.h>
9348 +#include <linux/seq_file.h>
9349 +#include <linux/uaccess.h>
9350 +#include <linux/version.h>
9351 +#include <linux/delay.h>
9352 +#include <linux/slab.h>
9353 +#include <linux/trace_clock.h>
9354 +
9355 +#define BUF_SIZE_DEFAULT       262144UL                /* 8K*(sizeof(entry)) */
9356 +#define BUF_FLAGS              (RB_FL_OVERWRITE)       /* no block on full */
9357 +#define U64STR_SIZE            22                      /* 20 digits max */
9358 +
9359 +#define VERSION                        "1.0.0"
9360 +#define BANNER                 "hwlat_detector: "
9361 +#define DRVNAME                        "hwlat_detector"
9362 +#define DEFAULT_SAMPLE_WINDOW  1000000                 /* 1s */
9363 +#define DEFAULT_SAMPLE_WIDTH   500000                  /* 0.5s */
9364 +#define DEFAULT_LAT_THRESHOLD  10                      /* 10us */
9365 +
9366 +/* Module metadata */
9367 +
9368 +MODULE_LICENSE("GPL");
9369 +MODULE_AUTHOR("Jon Masters <jcm@redhat.com>");
9370 +MODULE_DESCRIPTION("A simple hardware latency detector");
9371 +MODULE_VERSION(VERSION);
9372 +
9373 +/* Module parameters */
9374 +
9375 +static int debug;
9376 +static int enabled;
9377 +static int threshold;
9378 +
9379 +module_param(debug, int, 0);                   /* enable debug */
9380 +module_param(enabled, int, 0);                 /* enable detector */
9381 +module_param(threshold, int, 0);               /* latency threshold */
9382 +
9383 +/* Buffering and sampling */
9384 +
9385 +static struct ring_buffer *ring_buffer;                /* sample buffer */
9386 +static DEFINE_MUTEX(ring_buffer_mutex);                /* lock changes */
9387 +static unsigned long buf_size = BUF_SIZE_DEFAULT;
9388 +static struct task_struct *kthread;            /* sampling thread */
9389 +
9390 +/* DebugFS filesystem entries */
9391 +
9392 +static struct dentry *debug_dir;               /* debugfs directory */
9393 +static struct dentry *debug_max;               /* maximum TSC delta */
9394 +static struct dentry *debug_count;             /* total detect count */
9395 +static struct dentry *debug_sample_width;      /* sample width us */
9396 +static struct dentry *debug_sample_window;     /* sample window us */
9397 +static struct dentry *debug_sample;            /* raw samples us */
9398 +static struct dentry *debug_threshold;         /* threshold us */
9399 +static struct dentry *debug_enable;            /* enable/disable */
9400 +
9401 +/* Individual samples and global state */
9402 +
9403 +struct sample;                                 /* latency sample */
9404 +struct data;                                   /* Global state */
9405 +
9406 +/* Sampling functions */
9407 +static int __buffer_add_sample(struct sample *sample);
9408 +static struct sample *buffer_get_sample(struct sample *sample);
9409 +
9410 +/* Threading and state */
9411 +static int kthread_fn(void *unused);
9412 +static int start_kthread(void);
9413 +static int stop_kthread(void);
9414 +static void __reset_stats(void);
9415 +static int init_stats(void);
9416 +
9417 +/* Debugfs interface */
9418 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
9419 +                               size_t cnt, loff_t *ppos, const u64 *entry);
9420 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
9421 +                                size_t cnt, loff_t *ppos, u64 *entry);
9422 +static int debug_sample_fopen(struct inode *inode, struct file *filp);
9423 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
9424 +                                 size_t cnt, loff_t *ppos);
9425 +static int debug_sample_release(struct inode *inode, struct file *filp);
9426 +static int debug_enable_fopen(struct inode *inode, struct file *filp);
9427 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
9428 +                                 size_t cnt, loff_t *ppos);
9429 +static ssize_t debug_enable_fwrite(struct file *file,
9430 +                                  const char __user *user_buffer,
9431 +                                  size_t user_size, loff_t *offset);
9432 +
9433 +/* Initialization functions */
9434 +static int init_debugfs(void);
9435 +static void free_debugfs(void);
9436 +static int detector_init(void);
9437 +static void detector_exit(void);
9438 +
9439 +/* Individual latency samples are stored here when detected and packed into
9440 + * the ring_buffer circular buffer, where they are overwritten when
9441 + * more than buf_size/sizeof(sample) samples are received. */
9442 +struct sample {
9443 +       u64             seqnum;         /* unique sequence */
9444 +       u64             duration;       /* ktime delta */
9445 +       u64             outer_duration; /* ktime delta (outer loop) */
9446 +       struct timespec timestamp;      /* wall time */
9447 +       unsigned long   lost;
9448 +};
9449 +
9450 +/* keep the global state somewhere. */
9451 +static struct data {
9452 +
9453 +       struct mutex lock;              /* protect changes */
9454 +
9455 +       u64     count;                  /* total since reset */
9456 +       u64     max_sample;             /* max hardware latency */
9457 +       u64     threshold;              /* sample threshold level */
9458 +
9459 +       u64     sample_window;          /* total sampling window (on+off) */
9460 +       u64     sample_width;           /* active sampling portion of window */
9461 +
9462 +       atomic_t sample_open;           /* whether the sample file is open */
9463 +
9467 +       wait_queue_head_t wq;           /* waitqueue for new sample values */
9465 +
9466 +} data;
9467 +
9468 +/**
9469 + * __buffer_add_sample - add a new latency sample recording to the ring buffer
9470 + * @sample: The new latency sample value
9471 + *
9472 + * This receives a new latency sample and records it in a global ring buffer.
9473 + * No additional locking is used in this case.
9474 + */
9475 +static int __buffer_add_sample(struct sample *sample)
9476 +{
9477 +       return ring_buffer_write(ring_buffer,
9478 +                                sizeof(struct sample), sample);
9479 +}
9480 +
9481 +/**
9482 + * buffer_get_sample - remove a hardware latency sample from the ring buffer
9483 + * @sample: Pre-allocated storage for the sample
9484 + *
9485 + * This retrieves a hardware latency sample from the global circular buffer
9486 + */
9487 +static struct sample *buffer_get_sample(struct sample *sample)
9488 +{
9489 +       struct ring_buffer_event *e = NULL;
9490 +       struct sample *s = NULL;
9491 +       unsigned int cpu = 0;
9492 +
9493 +       if (!sample)
9494 +               return NULL;
9495 +
9496 +       mutex_lock(&ring_buffer_mutex);
9497 +       for_each_online_cpu(cpu) {
9498 +               e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost);
9499 +               if (e)
9500 +                       break;
9501 +       }
9502 +
9503 +       if (e) {
9504 +               s = ring_buffer_event_data(e);
9505 +               memcpy(sample, s, sizeof(struct sample));
9506 +       } else
9507 +               sample = NULL;
9508 +       mutex_unlock(&ring_buffer_mutex);
9509 +
9510 +       return sample;
9511 +}
9512 +
9513 +#ifndef CONFIG_TRACING
9514 +#define time_type      ktime_t
9515 +#define time_get()     ktime_get()
9516 +#define time_to_us(x)  ktime_to_us(x)
9517 +#define time_sub(a, b) ktime_sub(a, b)
9518 +#define init_time(a, b)        (a).tv64 = b
9519 +#define time_u64(a)    ((a).tv64)
9520 +#else
9521 +#define time_type      u64
9522 +#define time_get()     trace_clock_local()
9523 +#define time_to_us(x)  div_u64(x, 1000)
9524 +#define time_sub(a, b) ((a) - (b))
9525 +#define init_time(a, b)        (a = b)
9526 +#define time_u64(a)    a
9527 +#endif
9528 +/**
9529 + * get_sample - sample the CPU TSC and look for likely hardware latencies
9530 + *
9531 + * Used to repeatedly capture the CPU TSC (or similar), looking for potential
9532 + * hardware-induced latency. Called with interrupts disabled and with
9533 + * data.lock held.
9534 + */
9535 +static int get_sample(void)
9536 +{
9537 +       time_type start, t1, t2, last_t2;
9538 +       s64 diff, total = 0;
9539 +       u64 sample = 0;
9540 +       u64 outer_sample = 0;
9541 +       int ret = -1;
9542 +
9543 +       init_time(last_t2, 0);
9544 +       start = time_get(); /* start timestamp */
9545 +
9546 +       do {
9547 +
9548 +               t1 = time_get();        /* we'll look for a discontinuity */
9549 +               t2 = time_get();
9550 +
9551 +               if (time_u64(last_t2)) {
9552 +                       /* Check the delta from outer loop (t2 to next t1) */
9553 +                       diff = time_to_us(time_sub(t1, last_t2));
9554 +                       /* This shouldn't happen */
9555 +                       if (diff < 0) {
9556 +                               pr_err(BANNER "time running backwards\n");
9557 +                               goto out;
9558 +                       }
9559 +                       if (diff > outer_sample)
9560 +                               outer_sample = diff;
9561 +               }
9562 +               last_t2 = t2;
9563 +
9564 +               total = time_to_us(time_sub(t2, start)); /* sample width */
9565 +
9566 +               /* This checks the inner loop (t1 to t2) */
9567 +               diff = time_to_us(time_sub(t2, t1));     /* current diff */
9568 +
9569 +               /* This shouldn't happen */
9570 +               if (diff < 0) {
9571 +                       pr_err(BANNER "time running backwards\n");
9572 +                       goto out;
9573 +               }
9574 +
9575 +               if (diff > sample)
9576 +                       sample = diff; /* only want highest value */
9577 +
9578 +       } while (total <= data.sample_width);
9579 +
9580 +       ret = 0;
9581 +
9582 +       /* If we exceed the threshold value, we have found a hardware latency */
9583 +       if (sample > data.threshold || outer_sample > data.threshold) {
9584 +               struct sample s;
9585 +
9586 +               ret = 1;
9587 +
9588 +               data.count++;
9589 +               s.seqnum = data.count;
9590 +               s.duration = sample;
9591 +               s.outer_duration = outer_sample;
9592 +               s.timestamp = CURRENT_TIME;
9593 +               __buffer_add_sample(&s);
9594 +
9595 +               /* Keep a running maximum ever recorded hardware latency */
9596 +               if (sample > data.max_sample)
9597 +                       data.max_sample = sample;
9598 +       }
9599 +
9600 +out:
9601 +       return ret;
9602 +}
9603 +
9604 +/*
9605 + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
9606 + * @unused: A required part of the kthread API.
9607 + *
9608 + * Used to periodically sample the CPU TSC via a call to get_sample. We
9609 + * disable interrupts, which does (intentionally) introduce latency since we
9610 + * need to ensure nothing else might be running (and thus pre-empting).
9611 + * Obviously this should never be used in production environments.
9612 + *
9616 + * Currently this runs on whichever CPU it was scheduled on, but most
9617 + * real-world hardware latency situations occur across several CPUs. We
9618 + * might later generalize this if we find there are any actual
9616 + * systems with alternate SMI delivery or other hardware latencies.
9617 + */
9618 +static int kthread_fn(void *unused)
9619 +{
9620 +       int ret;
9621 +       u64 interval;
9622 +
9623 +       while (!kthread_should_stop()) {
9624 +
9625 +               mutex_lock(&data.lock);
9626 +
9627 +               local_irq_disable();
9628 +               ret = get_sample();
9629 +               local_irq_enable();
9630 +
9631 +               if (ret > 0)
9632 +                       wake_up(&data.wq); /* wake up reader(s) */
9633 +
9634 +               interval = data.sample_window - data.sample_width;
9635 +               do_div(interval, USEC_PER_MSEC); /* modifies interval value */
9636 +
9637 +               mutex_unlock(&data.lock);
9638 +
9639 +               if (msleep_interruptible(interval))
9640 +                       break;
9641 +       }
9642 +
9643 +       return 0;
9644 +}
9645 +
9646 +/**
9647 + * start_kthread - Kick off the hardware latency sampling/detector kthread
9648 + *
9649 + * This starts a kernel thread that will sit and sample the CPU timestamp
9650 + * counter (TSC or similar) and look for potential hardware latencies.
9651 + */
9652 +static int start_kthread(void)
9653 +{
9654 +       kthread = kthread_run(kthread_fn, NULL,
9655 +                                       DRVNAME);
9656 +       if (IS_ERR(kthread)) {
9657 +               pr_err(BANNER "could not start sampling thread\n");
9658 +               enabled = 0;
9659 +               return -ENOMEM;
9660 +       }
9661 +
9662 +       return 0;
9663 +}
9664 +
9665 +/**
9669 + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
9667 + *
9668 + * This kicks the running hardware latency sampling/detector kernel thread and
9669 + * tells it to stop sampling now. Use this on unload and at system shutdown.
9670 + */
9671 +static int stop_kthread(void)
9672 +{
9673 +       int ret;
9674 +
9675 +       ret = kthread_stop(kthread);
9676 +
9677 +       return ret;
9678 +}
9679 +
9680 +/**
9681 + * __reset_stats - Reset statistics for the hardware latency detector
9682 + *
9683 + * We use data to store various statistics and global state. We call this
9684 + * function in order to reset those when "enable" is toggled on or off, and
9685 + * also at initialization. Should be called with data.lock held.
9686 + */
9687 +static void __reset_stats(void)
9688 +{
9689 +       data.count = 0;
9690 +       data.max_sample = 0;
9691 +       ring_buffer_reset(ring_buffer); /* flush out old sample entries */
9692 +}
9693 +
9694 +/**
9695 + * init_stats - Setup global state statistics for the hardware latency detector
9696 + *
9697 + * We use data to store various statistics and global state. We also use
9698 + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware
9699 + * induced system latencies. This function initializes these structures and
9700 + * allocates the global ring buffer also.
9701 + */
9702 +static int init_stats(void)
9703 +{
9704 +       int ret = -ENOMEM;
9705 +
9706 +       mutex_init(&data.lock);
9707 +       init_waitqueue_head(&data.wq);
9708 +       atomic_set(&data.sample_open, 0);
9709 +
9710 +       ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS);
9711 +
9712 +       if (WARN(!ring_buffer, KERN_ERR BANNER
9713 +                              "failed to allocate ring buffer!\n"))
9714 +               goto out;
9715 +
9716 +       __reset_stats();
9717 +       data.threshold = threshold ?: DEFAULT_LAT_THRESHOLD; /* threshold us */
9718 +       data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */
9719 +       data.sample_width = DEFAULT_SAMPLE_WIDTH;   /* width us */
9720 +
9721 +       ret = 0;
9722 +
9723 +out:
9724 +       return ret;
9725 +
9726 +}
9727 +
9728 +/*
9729 + * simple_data_read - Wrapper read function for global state debugfs entries
9730 + * @filp: The active open file structure for the debugfs "file"
9731 + * @ubuf: The userspace provided buffer to read value into
9732 + * @cnt: The maximum number of bytes to read
9733 + * @ppos: The current "file" position
9734 + * @entry: The entry to read from
9735 + *
9736 + * This function provides a generic read implementation for the global state
9737 + * "data" structure debugfs filesystem entries. It would be nice to use
9738 + * simple_attr_read directly, but we need to make sure that the data.lock
9739 + * is held during the actual read.
9740 + */
9741 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
9742 +                               size_t cnt, loff_t *ppos, const u64 *entry)
9743 +{
9744 +       char buf[U64STR_SIZE];
9745 +       u64 val = 0;
9746 +       int len = 0;
9747 +
9748 +       memset(buf, 0, sizeof(buf));
9749 +
9750 +       if (!entry)
9751 +               return -EFAULT;
9752 +
9753 +       mutex_lock(&data.lock);
9754 +       val = *entry;
9755 +       mutex_unlock(&data.lock);
9756 +
9757 +       len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val);
9758 +
9759 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
9760 +
9761 +}
9762 +
9763 +/*
9764 + * simple_data_write - Wrapper write function for global state debugfs entries
9765 + * @filp: The active open file structure for the debugfs "file"
9766 + * @ubuf: The userspace provided buffer to write value from
9767 + * @cnt: The maximum number of bytes to write
9768 + * @ppos: The current "file" position
9769 + * @entry: The entry to write to
9770 + *
9771 + * This function provides a generic write implementation for the global state
9772 + * "data" structure debugfs filesystem entries. It would be nice to use
9773 + * simple_attr_write directly, but we need to make sure that the data.lock
9774 + * is held during the actual write.
9775 + */
9776 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
9777 +                                size_t cnt, loff_t *ppos, u64 *entry)
9778 +{
9779 +       char buf[U64STR_SIZE];
9780 +       int csize = min(cnt, sizeof(buf));
9781 +       u64 val = 0;
9782 +       int err = 0;
9783 +
9784 +       memset(buf, '\0', sizeof(buf));
9785 +       if (copy_from_user(buf, ubuf, csize))
9786 +               return -EFAULT;
9787 +
9788 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
9789 +       err = kstrtoull(buf, 10, &val);
9790 +       if (err)
9791 +               return -EINVAL;
9792 +
9793 +       mutex_lock(&data.lock);
9794 +       *entry = val;
9795 +       mutex_unlock(&data.lock);
9796 +
9797 +       return csize;
9798 +}
9799 +
9800 +/**
9801 + * debug_count_fopen - Open function for "count" debugfs entry
9802 + * @inode: The in-kernel inode representation of the debugfs "file"
9803 + * @filp: The active open file structure for the debugfs "file"
9804 + *
9805 + * This function provides an open implementation for the "count" debugfs
9806 + * interface to the hardware latency detector.
9807 + */
9808 +static int debug_count_fopen(struct inode *inode, struct file *filp)
9809 +{
9810 +       return 0;
9811 +}
9812 +
9813 +/**
9814 + * debug_count_fread - Read function for "count" debugfs entry
9815 + * @filp: The active open file structure for the debugfs "file"
9816 + * @ubuf: The userspace provided buffer to read value into
9817 + * @cnt: The maximum number of bytes to read
9818 + * @ppos: The current "file" position
9819 + *
9820 + * This function provides a read implementation for the "count" debugfs
9821 + * interface to the hardware latency detector. Can be used to read the
9822 + * number of latency readings exceeding the configured threshold since
9823 + * the detector was last reset (e.g. by writing a zero into "count").
9824 + */
9825 +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf,
9826 +                                    size_t cnt, loff_t *ppos)
9827 +{
9828 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.count);
9829 +}
9830 +
9831 +/**
9832 + * debug_count_fwrite - Write function for "count" debugfs entry
9833 + * @filp: The active open file structure for the debugfs "file"
9834 + * @ubuf: The user buffer that contains the value to write
9835 + * @cnt: The maximum number of bytes to write to "file"
9836 + * @ppos: The current position in the debugfs "file"
9837 + *
9838 + * This function provides a write implementation for the "count" debugfs
9839 + * interface to the hardware latency detector. Can be used to write a
9840 + * desired value, especially to zero the total count.
9841 + */
9842 +static ssize_t  debug_count_fwrite(struct file *filp,
9843 +                                      const char __user *ubuf,
9844 +                                      size_t cnt,
9845 +                                      loff_t *ppos)
9846 +{
9847 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.count);
9848 +}
9849 +
9850 +/**
9851 + * debug_enable_fopen - Dummy open function for "enable" debugfs interface
9852 + * @inode: The in-kernel inode representation of the debugfs "file"
9853 + * @filp: The active open file structure for the debugfs "file"
9854 + *
9855 + * This function provides an open implementation for the "enable" debugfs
9856 + * interface to the hardware latency detector.
9857 + */
9858 +static int debug_enable_fopen(struct inode *inode, struct file *filp)
9859 +{
9860 +       return 0;
9861 +}
9862 +
9863 +/**
9864 + * debug_enable_fread - Read function for "enable" debugfs interface
9865 + * @filp: The active open file structure for the debugfs "file"
9866 + * @ubuf: The userspace provided buffer to read value into
9867 + * @cnt: The maximum number of bytes to read
9868 + * @ppos: The current "file" position
9869 + *
9870 + * This function provides a read implementation for the "enable" debugfs
9871 + * interface to the hardware latency detector. Can be used to determine
9872 + * whether the detector is currently enabled ("0\n" or "1\n" returned).
9873 + */
9874 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
9875 +                                     size_t cnt, loff_t *ppos)
9876 +{
9877 +       char buf[4];
9878 +
9879 +       if ((cnt < sizeof(buf)) || (*ppos))
9880 +               return 0;
9881 +
9882 +       buf[0] = enabled ? '1' : '0';
9883 +       buf[1] = '\n';
9884 +       buf[2] = '\0';
9885 +       if (copy_to_user(ubuf, buf, strlen(buf)))
9886 +               return -EFAULT;
9887 +       return *ppos = strlen(buf);
9888 +}
9889 +
9890 +/**
9891 + * debug_enable_fwrite - Write function for "enable" debugfs interface
9892 + * @filp: The active open file structure for the debugfs "file"
9893 + * @ubuf: The user buffer that contains the value to write
9894 + * @cnt: The maximum number of bytes to write to "file"
9895 + * @ppos: The current position in the debugfs "file"
9896 + *
9897 + * This function provides a write implementation for the "enable" debugfs
9898 + * interface to the hardware latency detector. Can be used to enable or
9899 + * disable the detector, which will have the side-effect of possibly
9900 + * also resetting the global stats and kicking off the measuring
9901 + * kthread (on an enable) or the converse (upon a disable).
9902 + */
9903 +static ssize_t  debug_enable_fwrite(struct file *filp,
9904 +                                       const char __user *ubuf,
9905 +                                       size_t cnt,
9906 +                                       loff_t *ppos)
9907 +{
9908 +       char buf[4];
9909 +       int csize = min(cnt, sizeof(buf));
9910 +       long val = 0;
9911 +       int err = 0;
9912 +
9913 +       memset(buf, '\0', sizeof(buf));
9914 +       if (copy_from_user(buf, ubuf, csize))
9915 +               return -EFAULT;
9916 +
9917 +       buf[sizeof(buf)-1] = '\0';                      /* just in case */
9918 +       err = kstrtoul(buf, 10, &val);
9919 +       if (err)
9920 +               return -EINVAL;
9921 +
9922 +       if (val) {
9923 +               if (enabled)
9924 +                       goto unlock;
9925 +               enabled = 1;
9926 +               __reset_stats();
9927 +               if (start_kthread())
9928 +                       return -EFAULT;
9929 +       } else {
9930 +               if (!enabled)
9931 +                       goto unlock;
9932 +               enabled = 0;
9933 +               err = stop_kthread();
9934 +               if (err) {
9935 +                       pr_err(BANNER "cannot stop kthread\n");
9936 +                       return -EFAULT;
9937 +               }
9938 +               wake_up(&data.wq);              /* reader(s) should return */
9939 +       }
9940 +unlock:
9941 +       return csize;
9942 +}
9943 +
9944 +/**
9945 + * debug_max_fopen - Open function for "max" debugfs entry
9946 + * @inode: The in-kernel inode representation of the debugfs "file"
9947 + * @filp: The active open file structure for the debugfs "file"
9948 + *
9949 + * This function provides an open implementation for the "max" debugfs
9950 + * interface to the hardware latency detector.
9951 + */
9952 +static int debug_max_fopen(struct inode *inode, struct file *filp)
9953 +{
9954 +       return 0;
9955 +}
9956 +
9957 +/**
9958 + * debug_max_fread - Read function for "max" debugfs entry
9959 + * @filp: The active open file structure for the debugfs "file"
9960 + * @ubuf: The userspace provided buffer to read value into
9961 + * @cnt: The maximum number of bytes to read
9962 + * @ppos: The current "file" position
9963 + *
9964 + * This function provides a read implementation for the "max" debugfs
9965 + * interface to the hardware latency detector. Can be used to determine
9966 + * the maximum latency value observed since it was last reset.
9967 + */
9968 +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf,
9969 +                                  size_t cnt, loff_t *ppos)
9970 +{
9971 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample);
9972 +}
9973 +
9974 +/**
9975 + * debug_max_fwrite - Write function for "max" debugfs entry
9976 + * @filp: The active open file structure for the debugfs "file"
9977 + * @ubuf: The user buffer that contains the value to write
9978 + * @cnt: The maximum number of bytes to write to "file"
9979 + * @ppos: The current position in the debugfs "file"
9980 + *
9981 + * This function provides a write implementation for the "max" debugfs
9982 + * interface to the hardware latency detector. Can be used to reset the
9983 + * maximum or set it to some other desired value - if, then, subsequent
9984 + * measurements exceed this value, the maximum will be updated.
9985 + */
9986 +static ssize_t  debug_max_fwrite(struct file *filp,
9987 +                                    const char __user *ubuf,
9988 +                                    size_t cnt,
9989 +                                    loff_t *ppos)
9990 +{
9991 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample);
9992 +}
9993 +
9994 +
9995 +/**
9996 + * debug_sample_fopen - An open function for "sample" debugfs interface
9997 + * @inode: The in-kernel inode representation of this debugfs "file"
9998 + * @filp: The active open file structure for the debugfs "file"
9999 + *
10000 + * This function handles opening the "sample" file within the hardware
10001 + * latency detector debugfs directory interface. This file is used to read
10002 + * raw samples from the global ring_buffer and allows the user to see a
10003 + * running latency history. Can be opened blocking or non-blocking,
10004 + * affecting whether it behaves as a buffer read pipe, or does not.
10005 + * Implements simple locking to prevent multiple simultaneous use.
10006 + */
10007 +static int debug_sample_fopen(struct inode *inode, struct file *filp)
10008 +{
10009 +       if (!atomic_add_unless(&data.sample_open, 1, 1))
10010 +               return -EBUSY;
10011 +       else
10012 +               return 0;
10013 +}
10014 +
10015 +/**
10016 + * debug_sample_fread - A read function for "sample" debugfs interface
10017 + * @filp: The active open file structure for the debugfs "file"
10018 + * @ubuf: The user buffer that will contain the samples read
10019 + * @cnt: The maximum bytes to read from the debugfs "file"
10020 + * @ppos: The current position in the debugfs "file"
10021 + *
10022 + * This function handles reading from the "sample" file within the hardware
10023 + * latency detector debugfs directory interface. This file is used to read
10024 + * raw samples from the global ring_buffer and allows the user to see a
10025 + * running latency history. By default this will block pending a new
10026 + * value written into the sample buffer, unless there are already a
10027 + * number of value(s) waiting in the buffer, or the sample file was
10028 + * previously opened in a non-blocking mode of operation.
10029 + */
10030 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
10031 +                                       size_t cnt, loff_t *ppos)
10032 +{
10033 +       int len = 0;
10034 +       char buf[64];
10035 +       struct sample *sample = NULL;
10036 +
10037 +       if (!enabled)
10038 +               return 0;
10039 +
10040 +       sample = kzalloc(sizeof(struct sample), GFP_KERNEL);
10041 +       if (!sample)
10042 +               return -ENOMEM;
10043 +
10044 +       while (!buffer_get_sample(sample)) {
10045 +
10046 +               DEFINE_WAIT(wait);
10047 +
10048 +               if (filp->f_flags & O_NONBLOCK) {
10049 +                       len = -EAGAIN;
10050 +                       goto out;
10051 +               }
10052 +
10053 +               prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE);
10054 +               schedule();
10055 +               finish_wait(&data.wq, &wait);
10056 +
10057 +               if (signal_pending(current)) {
10058 +                       len = -EINTR;
10059 +                       goto out;
10060 +               }
10061 +
10062 +               if (!enabled) {                 /* enable was toggled */
10063 +                       len = 0;
10064 +                       goto out;
10065 +               }
10066 +       }
10067 +
10068 +       len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\t%llu\n",
10069 +                      sample->timestamp.tv_sec,
10070 +                      sample->timestamp.tv_nsec,
10071 +                      sample->duration,
10072 +                      sample->outer_duration);
10073 +
10074 +
10075 +       /* handling partial reads is more trouble than it's worth */
10076 +       if (len > cnt)
10077 +               goto out;
10078 +
10079 +       if (copy_to_user(ubuf, buf, len))
10080 +               len = -EFAULT;
10081 +
10082 +out:
10083 +       kfree(sample);
10084 +       return len;
10085 +}
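+
+/*
+ * Example of a line returned by the read above (values invented,
+ * purely illustrative): "0000001693.0000123456\t18\t21\n", i.e. the
+ * zero-padded timestamp as seconds.nanoseconds, followed by the inner
+ * and outer sample durations (usecs, same unit as the threshold),
+ * tab separated.
+ */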
10086 +
10087 +/**
10088 + * debug_sample_release - Release function for "sample" debugfs interface
10089 + * @inode: The in-kernel inode representation of the debugfs "file"
10090 + * @filp: The active open file structure for the debugfs "file"
10091 + *
10092 + * This function completes the close of the debugfs interface "sample" file.
10093 + * Frees the sample_open "lock" so that other users may open the interface.
10094 + */
10095 +static int debug_sample_release(struct inode *inode, struct file *filp)
10096 +{
10097 +       atomic_dec(&data.sample_open);
10098 +
10099 +       return 0;
10100 +}
10101 +
10102 +/**
10103 + * debug_threshold_fopen - Open function for "threshold" debugfs entry
10104 + * @inode: The in-kernel inode representation of the debugfs "file"
10105 + * @filp: The active open file structure for the debugfs "file"
10106 + *
10107 + * This function provides an open implementation for the "threshold" debugfs
10108 + * interface to the hardware latency detector.
10109 + */
10110 +static int debug_threshold_fopen(struct inode *inode, struct file *filp)
10111 +{
10112 +       return 0;
10113 +}
10114 +
10115 +/**
10116 + * debug_threshold_fread - Read function for "threshold" debugfs entry
10117 + * @filp: The active open file structure for the debugfs "file"
10118 + * @ubuf: The userspace provided buffer to read value into
10119 + * @cnt: The maximum number of bytes to read
10120 + * @ppos: The current "file" position
10121 + *
10122 + * This function provides a read implementation for the "threshold" debugfs
10123 + * interface to the hardware latency detector. It can be used to determine
10124 + * the current threshold level at which a latency will be recorded in the
10125 + * global ring buffer, typically on the order of 10us.
10126 + */
10127 +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf,
10128 +                                        size_t cnt, loff_t *ppos)
10129 +{
10130 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold);
10131 +}
10132 +
10133 +/**
10134 + * debug_threshold_fwrite - Write function for "threshold" debugfs entry
10135 + * @filp: The active open file structure for the debugfs "file"
10136 + * @ubuf: The user buffer that contains the value to write
10137 + * @cnt: The maximum number of bytes to write to "file"
10138 + * @ppos: The current position in the debugfs "file"
10139 + *
10140 + * This function provides a write implementation for the "threshold" debugfs
10141 + * interface to the hardware latency detector. It can be used to configure
10142 + * the threshold level at which any subsequently detected latencies will
10143 + * be recorded into the global ring buffer.
10144 + */
10145 +static ssize_t  debug_threshold_fwrite(struct file *filp,
10146 +                                       const char __user *ubuf,
10147 +                                       size_t cnt,
10148 +                                       loff_t *ppos)
10149 +{
10150 +       int ret;
10151 +
10152 +       ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold);
10153 +
10154 +       if (enabled)
10155 +               wake_up_process(kthread);
10156 +
10157 +       return ret;
10158 +}
10159 +
10160 +/**
10161 + * debug_width_fopen - Open function for "width" debugfs entry
10162 + * @inode: The in-kernel inode representation of the debugfs "file"
10163 + * @filp: The active open file structure for the debugfs "file"
10164 + *
10165 + * This function provides an open implementation for the "width" debugfs
10166 + * interface to the hardware latency detector.
10167 + */
10168 +static int debug_width_fopen(struct inode *inode, struct file *filp)
10169 +{
10170 +       return 0;
10171 +}
10172 +
10173 +/**
10174 + * debug_width_fread - Read function for "width" debugfs entry
10175 + * @filp: The active open file structure for the debugfs "file"
10176 + * @ubuf: The userspace provided buffer to read value into
10177 + * @cnt: The maximum number of bytes to read
10178 + * @ppos: The current "file" position
10179 + *
10180 + * This function provides a read implementation for the "width" debugfs
10181 + * interface to the hardware latency detector. It can be used to determine
10182 + * for how many us of the total window we will actively sample for any
10183 + * hardware-induced latency periods. Obviously, it is not possible to
10184 + * sample constantly and have the system respond to a sample reader, or,
10185 + * worse, without having the system appear to have gone out to lunch.
10186 + */
10187 +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf,
10188 +                                    size_t cnt, loff_t *ppos)
10189 +{
10190 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width);
10191 +}
10192 +
10193 +/**
10194 + * debug_width_fwrite - Write function for "width" debugfs entry
10195 + * @filp: The active open file structure for the debugfs "file"
10196 + * @ubuf: The user buffer that contains the value to write
10197 + * @cnt: The maximum number of bytes to write to "file"
10198 + * @ppos: The current position in the debugfs "file"
10199 + *
10200 + * This function provides a write implementation for the "width" debugfs
10201 + * interface to the hardware latency detector. It can be used to configure
10202 + * for how many us of the total window we will actively sample for any
10203 + * hardware-induced latency periods. Obviously, it is not possible to
10204 + * sample constantly and have the system respond to a sample reader, or,
10205 + * worse, without having the system appear to have gone out to lunch. It
10206 + * is enforced that width is less than the total window size.
10207 + */
10208 +static ssize_t  debug_width_fwrite(struct file *filp,
10209 +                                      const char __user *ubuf,
10210 +                                      size_t cnt,
10211 +                                      loff_t *ppos)
10212 +{
10213 +       char buf[U64STR_SIZE];
10214 +       int csize = min(cnt, sizeof(buf));
10215 +       u64 val = 0;
10216 +       int err = 0;
10217 +
10218 +       memset(buf, '\0', sizeof(buf));
10219 +       if (copy_from_user(buf, ubuf, csize))
10220 +               return -EFAULT;
10221 +
10222 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
10223 +       err = kstrtoull(buf, 10, &val);
10224 +       if (err)
10225 +               return -EINVAL;
10226 +
10227 +       mutex_lock(&data.lock);
10228 +       if (val < data.sample_window)
10229 +               data.sample_width = val;
10230 +       else {
10231 +               mutex_unlock(&data.lock);
10232 +               return -EINVAL;
10233 +       }
10234 +       mutex_unlock(&data.lock);
10235 +
10236 +       if (enabled)
10237 +               wake_up_process(kthread);
10238 +
10239 +       return csize;
10240 +}
10241 +
10242 +/**
10243 + * debug_window_fopen - Open function for "window" debugfs entry
10244 + * @inode: The in-kernel inode representation of the debugfs "file"
10245 + * @filp: The active open file structure for the debugfs "file"
10246 + *
10247 + * This function provides an open implementation for the "window" debugfs
10248 + * interface to the hardware latency detector. The window is the total time
10249 + * in us that will be considered one sample period. Conceptually, windows
10250 + * occur back-to-back and contain a sample width period during which
10251 + * actual sampling occurs.
10252 + */
10253 +static int debug_window_fopen(struct inode *inode, struct file *filp)
10254 +{
10255 +       return 0;
10256 +}
10257 +
10258 +/**
10259 + * debug_window_fread - Read function for "window" debugfs entry
10260 + * @filp: The active open file structure for the debugfs "file"
10261 + * @ubuf: The userspace provided buffer to read value into
10262 + * @cnt: The maximum number of bytes to read
10263 + * @ppos: The current "file" position
10264 + *
10265 + * This function provides a read implementation for the "window" debugfs
10266 + * interface to the hardware latency detector. The window is the total time
10267 + * in us that will be considered one sample period. Conceptually, windows
10268 + * occur back-to-back and contain a sample width period during which
10269 + * actual sampling occurs. Can be used to read the total window size.
10270 + */
10271 +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf,
10272 +                                     size_t cnt, loff_t *ppos)
10273 +{
10274 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window);
10275 +}
10276 +
10277 +/**
10278 + * debug_window_fwrite - Write function for "window" debugfs entry
10279 + * @filp: The active open file structure for the debugfs "file"
10280 + * @ubuf: The user buffer that contains the value to write
10281 + * @cnt: The maximum number of bytes to write to "file"
10282 + * @ppos: The current position in the debugfs "file"
10283 + *
10284 + * This function provides a write implementation for the "window" debugfs
10285 + * interface to the hardware latency detector. The window is the total time
10286 + * in us that will be considered one sample period. Conceptually, windows
10287 + * occur back-to-back and contain a sample width period during which
10288 + * actual sampling occurs. Can be used to write a new total window size. It
10289 + * is enforced that any value written must be greater than the sample width
10290 + * size, or an error results.
10291 + */
10292 +static ssize_t  debug_window_fwrite(struct file *filp,
10293 +                                       const char __user *ubuf,
10294 +                                       size_t cnt,
10295 +                                       loff_t *ppos)
10296 +{
10297 +       char buf[U64STR_SIZE];
10298 +       int csize = min(cnt, sizeof(buf));
10299 +       u64 val = 0;
10300 +       int err = 0;
10301 +
10302 +       memset(buf, '\0', sizeof(buf));
10303 +       if (copy_from_user(buf, ubuf, csize))
10304 +               return -EFAULT;
10305 +
10306 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
10307 +       err = kstrtoull(buf, 10, &val);
10308 +       if (err)
10309 +               return -EINVAL;
10310 +
10311 +       mutex_lock(&data.lock);
10312 +       if (data.sample_width < val)
10313 +               data.sample_window = val;
10314 +       else {
10315 +               mutex_unlock(&data.lock);
10316 +               return -EINVAL;
10317 +       }
10318 +       mutex_unlock(&data.lock);
10319 +
10320 +       return csize;
10321 +}
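+
+/*
+ * Usage note (editorial sketch, derived from the checks above): each of
+ * the width and window writes is validated against the *current* value
+ * of the other parameter. So when raising both, write the new (larger)
+ * window first and the width second; when shrinking both, write the
+ * width first. The wrong order fails the constraint check with -EINVAL.
+ */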
10322 +
10323 +/*
10324 + * Function pointers for the "count" debugfs file operations
10325 + */
10326 +static const struct file_operations count_fops = {
10327 +       .open           = debug_count_fopen,
10328 +       .read           = debug_count_fread,
10329 +       .write          = debug_count_fwrite,
10330 +       .owner          = THIS_MODULE,
10331 +};
10332 +
10333 +/*
10334 + * Function pointers for the "enable" debugfs file operations
10335 + */
10336 +static const struct file_operations enable_fops = {
10337 +       .open           = debug_enable_fopen,
10338 +       .read           = debug_enable_fread,
10339 +       .write          = debug_enable_fwrite,
10340 +       .owner          = THIS_MODULE,
10341 +};
10342 +
10343 +/*
10344 + * Function pointers for the "max" debugfs file operations
10345 + */
10346 +static const struct file_operations max_fops = {
10347 +       .open           = debug_max_fopen,
10348 +       .read           = debug_max_fread,
10349 +       .write          = debug_max_fwrite,
10350 +       .owner          = THIS_MODULE,
10351 +};
10352 +
10353 +/*
10354 + * Function pointers for the "sample" debugfs file operations
10355 + */
10356 +static const struct file_operations sample_fops = {
10357 +       .open           = debug_sample_fopen,
10358 +       .read           = debug_sample_fread,
10359 +       .release        = debug_sample_release,
10360 +       .owner          = THIS_MODULE,
10361 +};
10362 +
10363 +/*
10364 + * Function pointers for the "threshold" debugfs file operations
10365 + */
10366 +static const struct file_operations threshold_fops = {
10367 +       .open           = debug_threshold_fopen,
10368 +       .read           = debug_threshold_fread,
10369 +       .write          = debug_threshold_fwrite,
10370 +       .owner          = THIS_MODULE,
10371 +};
10372 +
10373 +/*
10374 + * Function pointers for the "width" debugfs file operations
10375 + */
10376 +static const struct file_operations width_fops = {
10377 +       .open           = debug_width_fopen,
10378 +       .read           = debug_width_fread,
10379 +       .write          = debug_width_fwrite,
10380 +       .owner          = THIS_MODULE,
10381 +};
10382 +
10383 +/*
10384 + * Function pointers for the "window" debugfs file operations
10385 + */
10386 +static const struct file_operations window_fops = {
10387 +       .open           = debug_window_fopen,
10388 +       .read           = debug_window_fread,
10389 +       .write          = debug_window_fwrite,
10390 +       .owner          = THIS_MODULE,
10391 +};
10392 +
10393 +/**
10394 + * init_debugfs - A function to initialize the debugfs interface files
10395 + *
10396 + * This function creates entries in debugfs for "hwlat_detector", including
10397 + * files to read values from the detector, current samples, and the
10398 + * maximum sample that has been captured since the hardware latency
10399 + * detector was started.
10400 + */
10401 +static int init_debugfs(void)
10402 +{
10403 +       int ret = -ENOMEM;
10404 +
10405 +       debug_dir = debugfs_create_dir(DRVNAME, NULL);
10406 +       if (!debug_dir)
10407 +               goto err_debug_dir;
10408 +
10409 +       debug_sample = debugfs_create_file("sample", 0444,
10410 +                                              debug_dir, NULL,
10411 +                                              &sample_fops);
10412 +       if (!debug_sample)
10413 +               goto err_sample;
10414 +
10415 +       debug_count = debugfs_create_file("count", 0444,
10416 +                                             debug_dir, NULL,
10417 +                                             &count_fops);
10418 +       if (!debug_count)
10419 +               goto err_count;
10420 +
10421 +       debug_max = debugfs_create_file("max", 0444,
10422 +                                           debug_dir, NULL,
10423 +                                           &max_fops);
10424 +       if (!debug_max)
10425 +               goto err_max;
10426 +
10427 +       debug_sample_window = debugfs_create_file("window", 0644,
10428 +                                                     debug_dir, NULL,
10429 +                                                     &window_fops);
10430 +       if (!debug_sample_window)
10431 +               goto err_window;
10432 +
10433 +       debug_sample_width = debugfs_create_file("width", 0644,
10434 +                                                    debug_dir, NULL,
10435 +                                                    &width_fops);
10436 +       if (!debug_sample_width)
10437 +               goto err_width;
10438 +
10439 +       debug_threshold = debugfs_create_file("threshold", 0644,
10440 +                                                 debug_dir, NULL,
10441 +                                                 &threshold_fops);
10442 +       if (!debug_threshold)
10443 +               goto err_threshold;
10444 +
10445 +       debug_enable = debugfs_create_file("enable", 0644,
10446 +                                              debug_dir, &enabled,
10447 +                                              &enable_fops);
10448 +       if (!debug_enable)
10449 +               goto err_enable;
10450 +
10451 +       else {
10452 +               ret = 0;
10453 +               goto out;
10454 +       }
10455 +
10456 +err_enable:
10457 +       debugfs_remove(debug_threshold);
10458 +err_threshold:
10459 +       debugfs_remove(debug_sample_width);
10460 +err_width:
10461 +       debugfs_remove(debug_sample_window);
10462 +err_window:
10463 +       debugfs_remove(debug_max);
10464 +err_max:
10465 +       debugfs_remove(debug_count);
10466 +err_count:
10467 +       debugfs_remove(debug_sample);
10468 +err_sample:
10469 +       debugfs_remove(debug_dir);
10470 +err_debug_dir:
10471 +out:
10472 +       return ret;
10473 +}
10474 +
10475 +/**
10476 + * free_debugfs - A function to cleanup the debugfs file interface
10477 + */
10478 +static void free_debugfs(void)
10479 +{
10480 +       /* could also use a debugfs_remove_recursive */
10481 +       debugfs_remove(debug_enable);
10482 +       debugfs_remove(debug_threshold);
10483 +       debugfs_remove(debug_sample_width);
10484 +       debugfs_remove(debug_sample_window);
10485 +       debugfs_remove(debug_max);
10486 +       debugfs_remove(debug_count);
10487 +       debugfs_remove(debug_sample);
10488 +       debugfs_remove(debug_dir);
10489 +}
10490 +
10491 +/**
10492 + * detector_init - Standard module initialization code
10493 + */
10494 +static int detector_init(void)
10495 +{
10496 +       int ret = -ENOMEM;
10497 +
10498 +       pr_info(BANNER "version %s\n", VERSION);
10499 +
10500 +       ret = init_stats();
10501 +       if (ret)
10502 +               goto out;
10503 +
10504 +       ret = init_debugfs();
10505 +       if (ret)
10506 +               goto err_stats;
10507 +
10508 +       if (enabled)
10509 +               ret = start_kthread();
10510 +
10511 +       goto out;
10512 +
10513 +err_stats:
10514 +       ring_buffer_free(ring_buffer);
10515 +out:
10516 +       return ret;
10517 +
10518 +}
10519 +
10520 +/**
10521 + * detector_exit - Standard module cleanup code
10522 + */
10523 +static void detector_exit(void)
10524 +{
10525 +       int err;
10526 +
10527 +       if (enabled) {
10528 +               enabled = 0;
10529 +               err = stop_kthread();
10530 +               if (err)
10531 +                       pr_err(BANNER "cannot stop kthread\n");
10532 +       }
10533 +
10534 +       free_debugfs();
10535 +       ring_buffer_free(ring_buffer);  /* free up the ring buffer */
10536 +
10537 +}
10538 +
10539 +module_init(detector_init);
10540 +module_exit(detector_exit);
10541 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
10542 index acece3299756..58ea04a03fa9 100644
10543 --- a/drivers/mmc/host/mmci.c
10544 +++ b/drivers/mmc/host/mmci.c
10545 @@ -1155,15 +1155,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
10546         struct sg_mapping_iter *sg_miter = &host->sg_miter;
10547         struct variant_data *variant = host->variant;
10548         void __iomem *base = host->base;
10549 -       unsigned long flags;
10550         u32 status;
10551  
10552         status = readl(base + MMCISTATUS);
10553  
10554         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
10555  
10556 -       local_irq_save(flags);
10557 -
10558         do {
10559                 unsigned int remain, len;
10560                 char *buffer;
10561 @@ -1203,8 +1200,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
10562  
10563         sg_miter_stop(sg_miter);
10564  
10565 -       local_irq_restore(flags);
10566 -
10567         /*
10568          * If we have less than the fifo 'half-full' threshold to transfer,
10569          * trigger a PIO interrupt as soon as any data is available.
10570 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
10571 index 2839af00f20c..4348b9c850d3 100644
10572 --- a/drivers/net/ethernet/3com/3c59x.c
10573 +++ b/drivers/net/ethernet/3com/3c59x.c
10574 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
10575  {
10576         struct vortex_private *vp = netdev_priv(dev);
10577         unsigned long flags;
10578 -       local_irq_save(flags);
10579 +       local_irq_save_nort(flags);
10580         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
10581 -       local_irq_restore(flags);
10582 +       local_irq_restore_nort(flags);
10583  }
10584  #endif
10585  
10586 @@ -1916,12 +1916,12 @@ static void vortex_tx_timeout(struct net_device *dev)
10587                          * Block interrupts because vortex_interrupt does a bare spin_lock()
10588                          */
10589                         unsigned long flags;
10590 -                       local_irq_save(flags);
10591 +                       local_irq_save_nort(flags);
10592                         if (vp->full_bus_master_tx)
10593                                 boomerang_interrupt(dev->irq, dev);
10594                         else
10595                                 vortex_interrupt(dev->irq, dev);
10596 -                       local_irq_restore(flags);
10597 +                       local_irq_restore_nort(flags);
10598                 }
10599         }
10600  
10601 diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10602 index 8b5988e210d5..cf9928ccdd7e 100644
10603 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10604 +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10605 @@ -2221,11 +2221,7 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb,
10606         }
10607  
10608         tpd_req = atl1c_cal_tpd_req(skb);
10609 -       if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) {
10610 -               if (netif_msg_pktdata(adapter))
10611 -                       dev_info(&adapter->pdev->dev, "tx locked\n");
10612 -               return NETDEV_TX_LOCKED;
10613 -       }
10614 +       spin_lock_irqsave(&adapter->tx_lock, flags);
10615  
10616         if (atl1c_tpd_avail(adapter, type) < tpd_req) {
10617                 /* no enough descriptor, just stop queue */
10618 diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10619 index 59a03a193e83..734f7a7ad2c3 100644
10620 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10621 +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10622 @@ -1880,8 +1880,7 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb,
10623                 return NETDEV_TX_OK;
10624         }
10625         tpd_req = atl1e_cal_tdp_req(skb);
10626 -       if (!spin_trylock_irqsave(&adapter->tx_lock, flags))
10627 -               return NETDEV_TX_LOCKED;
10628 +       spin_lock_irqsave(&adapter->tx_lock, flags);
10629  
10630         if (atl1e_tpd_avail(adapter) < tpd_req) {
10631                 /* no enough descriptor, just stop queue */
10632 diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c
10633 index 526ea74e82d9..86f467a2c485 100644
10634 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c
10635 +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c
10636 @@ -1664,8 +1664,7 @@ static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter,
10637         struct cmdQ *q = &sge->cmdQ[qid];
10638         unsigned int credits, pidx, genbit, count, use_sched_skb = 0;
10639  
10640 -       if (!spin_trylock(&q->lock))
10641 -               return NETDEV_TX_LOCKED;
10642 +       spin_lock(&q->lock);
10643  
10644         reclaim_completed_tx(sge, q);
10645  
10646 diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
10647 index 9ba975853ec6..813cfa698160 100644
10648 --- a/drivers/net/ethernet/neterion/s2io.c
10649 +++ b/drivers/net/ethernet/neterion/s2io.c
10650 @@ -4084,12 +4084,7 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev)
10651                         [skb->priority & (MAX_TX_FIFOS - 1)];
10652         fifo = &mac_control->fifos[queue];
10653  
10654 -       if (do_spin_lock)
10655 -               spin_lock_irqsave(&fifo->tx_lock, flags);
10656 -       else {
10657 -               if (unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags)))
10658 -                       return NETDEV_TX_LOCKED;
10659 -       }
10660 +       spin_lock_irqsave(&fifo->tx_lock, flags);
10661  
10662         if (sp->config.multiq) {
10663                 if (__netif_subqueue_stopped(dev, fifo->fifo_no)) {
10664 diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10665 index 3b98b263bad0..ca4add749410 100644
10666 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10667 +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10668 @@ -2137,10 +2137,8 @@ static int pch_gbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
10669         struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring;
10670         unsigned long flags;
10671  
10672 -       if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) {
10673 -               /* Collision - tell upper layer to requeue */
10674 -               return NETDEV_TX_LOCKED;
10675 -       }
10676 +       spin_lock_irqsave(&tx_ring->tx_lock, flags);
10677 +
10678         if (unlikely(!PCH_GBE_DESC_UNUSED(tx_ring))) {
10679                 netif_stop_queue(netdev);
10680                 spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
10681 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
10682 index ef668d300800..d987d571fdd6 100644
10683 --- a/drivers/net/ethernet/realtek/8139too.c
10684 +++ b/drivers/net/ethernet/realtek/8139too.c
10685 @@ -2229,7 +2229,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
10686         struct rtl8139_private *tp = netdev_priv(dev);
10687         const int irq = tp->pci_dev->irq;
10688  
10689 -       disable_irq(irq);
10690 +       disable_irq_nosync(irq);
10691         rtl8139_interrupt(irq, dev);
10692         enable_irq(irq);
10693  }
10694 diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c
10695 index 14c9d1baa85c..e1a5305418a8 100644
10696 --- a/drivers/net/ethernet/tehuti/tehuti.c
10697 +++ b/drivers/net/ethernet/tehuti/tehuti.c
10698 @@ -1629,13 +1629,8 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb,
10699         unsigned long flags;
10700  
10701         ENTER;
10702 -       local_irq_save(flags);
10703 -       if (!spin_trylock(&priv->tx_lock)) {
10704 -               local_irq_restore(flags);
10705 -               DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n",
10706 -                   BDX_DRV_NAME, ndev->name);
10707 -               return NETDEV_TX_LOCKED;
10708 -       }
10709 +
10710 +       spin_lock_irqsave(&priv->tx_lock, flags);
10711  
10712         /* build tx descriptor */
10713         BDX_ASSERT(f->m.wptr >= f->m.memsz);    /* started with valid wptr */
10714 diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
10715 index e7034c55e796..2e4ee0f912bf 100644
10716 --- a/drivers/net/rionet.c
10717 +++ b/drivers/net/rionet.c
10718 @@ -174,11 +174,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
10719         unsigned long flags;
10720         int add_num = 1;
10721  
10722 -       local_irq_save(flags);
10723 -       if (!spin_trylock(&rnet->tx_lock)) {
10724 -               local_irq_restore(flags);
10725 -               return NETDEV_TX_LOCKED;
10726 -       }
10727 +       spin_lock_irqsave(&rnet->tx_lock, flags);
10728  
10729         if (is_multicast_ether_addr(eth->h_dest))
10730                 add_num = nets[rnet->mport->id].nact;
10731 diff --git a/drivers/net/wireless/orinoco/orinoco_usb.c b/drivers/net/wireless/orinoco/orinoco_usb.c
10732 index f2cd513d54b2..6c0f4c9638a2 100644
10733 --- a/drivers/net/wireless/orinoco/orinoco_usb.c
10734 +++ b/drivers/net/wireless/orinoco/orinoco_usb.c
10735 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
10736                         while (!ctx->done.done && msecs--)
10737                                 udelay(1000);
10738                 } else {
10739 -                       wait_event_interruptible(ctx->done.wait,
10740 +                       swait_event_interruptible(ctx->done.wait,
10741                                                  ctx->done.done);
10742                 }
10743                 break;
10744 diff --git a/drivers/pci/access.c b/drivers/pci/access.c
10745 index 59ac36fe7c42..7a45a20af78a 100644
10746 --- a/drivers/pci/access.c
10747 +++ b/drivers/pci/access.c
10748 @@ -561,7 +561,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev)
10749         WARN_ON(!dev->block_cfg_access);
10750  
10751         dev->block_cfg_access = 0;
10752 -       wake_up_all(&pci_cfg_wait);
10753 +       wake_up_all_locked(&pci_cfg_wait);
10754         raw_spin_unlock_irqrestore(&pci_lock, flags);
10755  }
10756  EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
10757 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
10758 index f4424063b860..cbbbebd86c6e 100644
10759 --- a/drivers/scsi/fcoe/fcoe.c
10760 +++ b/drivers/scsi/fcoe/fcoe.c
10761 @@ -1286,7 +1286,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu)
10762         struct sk_buff *skb;
10763  #ifdef CONFIG_SMP
10764         struct fcoe_percpu_s *p0;
10765 -       unsigned targ_cpu = get_cpu();
10766 +       unsigned targ_cpu = get_cpu_light();
10767  #endif /* CONFIG_SMP */
10768  
10769         FCOE_DBG("Destroying receive thread for CPU %d\n", cpu);
10770 @@ -1342,7 +1342,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu)
10771                         kfree_skb(skb);
10772                 spin_unlock_bh(&p->fcoe_rx_list.lock);
10773         }
10774 -       put_cpu();
10775 +       put_cpu_light();
10776  #else
10777         /*
10778          * This a non-SMP scenario where the singular Rx thread is
10779 @@ -1566,11 +1566,11 @@ err2:
10780  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
10781  {
10782         struct fcoe_percpu_s *fps;
10783 -       int rc;
10784 +       int rc, cpu = get_cpu_light();
10785  
10786 -       fps = &get_cpu_var(fcoe_percpu);
10787 +       fps = &per_cpu(fcoe_percpu, cpu);
10788         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
10789 -       put_cpu_var(fcoe_percpu);
10790 +       put_cpu_light();
10791  
10792         return rc;
10793  }
10794 @@ -1766,11 +1766,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
10795                 return 0;
10796         }
10797  
10798 -       stats = per_cpu_ptr(lport->stats, get_cpu());
10799 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
10800         stats->InvalidCRCCount++;
10801         if (stats->InvalidCRCCount < 5)
10802                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
10803 -       put_cpu();
10804 +       put_cpu_light();
10805         return -EINVAL;
10806  }
10807  
10808 @@ -1814,7 +1814,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10809          */
10810         hp = (struct fcoe_hdr *) skb_network_header(skb);
10811  
10812 -       stats = per_cpu_ptr(lport->stats, get_cpu());
10813 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
10814         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
10815                 if (stats->ErrorFrames < 5)
10816                         printk(KERN_WARNING "fcoe: FCoE version "
10817 @@ -1846,13 +1846,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10818                 goto drop;
10819  
10820         if (!fcoe_filter_frames(lport, fp)) {
10821 -               put_cpu();
10822 +               put_cpu_light();
10823                 fc_exch_recv(lport, fp);
10824                 return;
10825         }
10826  drop:
10827         stats->ErrorFrames++;
10828 -       put_cpu();
10829 +       put_cpu_light();
10830         kfree_skb(skb);
10831  }
10832  
10833 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
10834 index 34a1b1f333b4..d91131210695 100644
10835 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
10836 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
10837 @@ -831,7 +831,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10838  
10839         INIT_LIST_HEAD(&del_list);
10840  
10841 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
10842 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
10843  
10844         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
10845                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
10846 @@ -867,7 +867,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10847                                 sel_time = fcf->time;
10848                 }
10849         }
10850 -       put_cpu();
10851 +       put_cpu_light();
10852  
10853         list_for_each_entry_safe(fcf, next, &del_list, list) {
10854                 /* Removes fcf from current list */
10855 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
10856 index 30f9ef0c0d4f..6c686bc01a82 100644
10857 --- a/drivers/scsi/libfc/fc_exch.c
10858 +++ b/drivers/scsi/libfc/fc_exch.c
10859 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
10860         }
10861         memset(ep, 0, sizeof(*ep));
10862  
10863 -       cpu = get_cpu();
10864 +       cpu = get_cpu_light();
10865         pool = per_cpu_ptr(mp->pool, cpu);
10866         spin_lock_bh(&pool->lock);
10867 -       put_cpu();
10868 +       put_cpu_light();
10869  
10870         /* peek cache of free slot */
10871         if (pool->left != FC_XID_UNKNOWN) {
10872 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
10873 index 9c706d8c1441..d968ffc79c08 100644
10874 --- a/drivers/scsi/libsas/sas_ata.c
10875 +++ b/drivers/scsi/libsas/sas_ata.c
10876 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10877         /* TODO: audit callers to ensure they are ready for qc_issue to
10878          * unconditionally re-enable interrupts
10879          */
10880 -       local_irq_save(flags);
10881 +       local_irq_save_nort(flags);
10882         spin_unlock(ap->lock);
10883  
10884         /* If the device fell off, no sense in issuing commands */
10885 @@ -255,7 +255,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10886  
10887   out:
10888         spin_lock(ap->lock);
10889 -       local_irq_restore(flags);
10890 +       local_irq_restore_nort(flags);
10891         return ret;
10892  }
10893  
10894 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
10895 index fee9eb7c8a60..b42d4adc42dc 100644
10896 --- a/drivers/scsi/qla2xxx/qla_inline.h
10897 +++ b/drivers/scsi/qla2xxx/qla_inline.h
10898 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
10899  {
10900         unsigned long flags;
10901         struct qla_hw_data *ha = rsp->hw;
10902 -       local_irq_save(flags);
10903 +       local_irq_save_nort(flags);
10904         if (IS_P3P_TYPE(ha))
10905                 qla82xx_poll(0, rsp);
10906         else
10907                 ha->isp_ops->intr_handler(0, rsp);
10908 -       local_irq_restore(flags);
10909 +       local_irq_restore_nort(flags);
10910  }
10911  
10912  static inline uint8_t *
10913 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
10914 index 7fc919f7da4d..e03fa17b8670 100644
10915 --- a/drivers/thermal/x86_pkg_temp_thermal.c
10916 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
10917 @@ -29,6 +29,7 @@
10918  #include <linux/pm.h>
10919  #include <linux/thermal.h>
10920  #include <linux/debugfs.h>
10921 +#include <linux/swork.h>
10922  #include <asm/cpu_device_id.h>
10923  #include <asm/mce.h>
10924  
10925 @@ -352,7 +353,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
10926         }
10927  }
10928  
10929 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10930 +static void platform_thermal_notify_work(struct swork_event *event)
10931  {
10932         unsigned long flags;
10933         int cpu = smp_processor_id();
10934 @@ -369,7 +370,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10935                         pkg_work_scheduled[phy_id]) {
10936                 disable_pkg_thres_interrupt();
10937                 spin_unlock_irqrestore(&pkg_work_lock, flags);
10938 -               return -EINVAL;
10939 +               return;
10940         }
10941         pkg_work_scheduled[phy_id] = 1;
10942         spin_unlock_irqrestore(&pkg_work_lock, flags);
10943 @@ -378,9 +379,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10944         schedule_delayed_work_on(cpu,
10945                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
10946                                 msecs_to_jiffies(notify_delay_ms));
10947 +}
10948 +
10949 +#ifdef CONFIG_PREEMPT_RT_FULL
10950 +static struct swork_event notify_work;
10951 +
10952 +static int thermal_notify_work_init(void)
10953 +{
10954 +       int err;
10955 +
10956 +       err = swork_get();
10957 +       if (err)
10958 +               return err;
10959 +
10960 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
10961         return 0;
10962  }
10963  
10964 +static void thermal_notify_work_cleanup(void)
10965 +{
10966 +       swork_put();
10967 +}
10968 +
10969 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10970 +{
10971 +       swork_queue(&notify_work);
10972 +       return 0;
10973 +}
10974 +
10975 +#else  /* !CONFIG_PREEMPT_RT_FULL */
10976 +
10977 +static int thermal_notify_work_init(void) { return 0; }
10978 +
10979 +static void thermal_notify_work_cleanup(void) {  }
10980 +
10981 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10982 +{
10983 +       platform_thermal_notify_work(NULL);
10984 +
10985 +       return 0;
10986 +}
10987 +#endif /* CONFIG_PREEMPT_RT_FULL */
10988 +
10989  static int find_siblings_cpu(int cpu)
10990  {
10991         int i;
10992 @@ -584,6 +624,9 @@ static int __init pkg_temp_thermal_init(void)
10993         if (!x86_match_cpu(pkg_temp_thermal_ids))
10994                 return -ENODEV;
10995  
10996 +       if (!thermal_notify_work_init())
10997 +               return -ENODEV;
10998 +
10999         spin_lock_init(&pkg_work_lock);
11000         platform_thermal_package_notify =
11001                         pkg_temp_thermal_platform_thermal_notify;
11002 @@ -608,7 +651,7 @@ err_ret:
11003         kfree(pkg_work_scheduled);
11004         platform_thermal_package_notify = NULL;
11005         platform_thermal_package_rate_control = NULL;
11006 -
11007 +       thermal_notify_work_cleanup();
11008         return -ENODEV;
11009  }
11010  
11011 @@ -633,6 +676,7 @@ static void __exit pkg_temp_thermal_exit(void)
11012         mutex_unlock(&phy_dev_list_mutex);
11013         platform_thermal_package_notify = NULL;
11014         platform_thermal_package_rate_control = NULL;
11015 +       thermal_notify_work_cleanup();
11016         for_each_online_cpu(i)
11017                 cancel_delayed_work_sync(
11018                         &per_cpu(pkg_temp_thermal_threshold_work, i));
11019 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
11020 index 39126460c1f5..af7701ca4d48 100644
11021 --- a/drivers/tty/serial/8250/8250_core.c
11022 +++ b/drivers/tty/serial/8250/8250_core.c
11023 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
11024  
11025  static unsigned int skip_txen_test; /* force skip of txen test at init time */
11026  
11027 -#define PASS_LIMIT     512
11028 +/*
11029 + * On -rt we can have more delays, and legitimately
11030 + * so - so don't drop work spuriously and spam the
11031 + * syslog:
11032 + */
11033 +#ifdef CONFIG_PREEMPT_RT_FULL
11034 +# define PASS_LIMIT    1000000
11035 +#else
11036 +# define PASS_LIMIT    512
11037 +#endif
11038  
11039  #include <asm/serial.h>
11040  /*
11041 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
11042 index 56ccbcefdd85..a0b9e854672c 100644
11043 --- a/drivers/tty/serial/8250/8250_port.c
11044 +++ b/drivers/tty/serial/8250/8250_port.c
11045 @@ -35,6 +35,7 @@
11046  #include <linux/nmi.h>
11047  #include <linux/mutex.h>
11048  #include <linux/slab.h>
11049 +#include <linux/kdb.h>
11050  #include <linux/uaccess.h>
11051  #include <linux/pm_runtime.h>
11052  
11053 @@ -2843,9 +2844,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
11054  
11055         serial8250_rpm_get(up);
11056  
11057 -       if (port->sysrq)
11058 +       if (port->sysrq || oops_in_progress)
11059                 locked = 0;
11060 -       else if (oops_in_progress)
11061 +       else if (in_kdb_printk())
11062                 locked = spin_trylock_irqsave(&port->lock, flags);
11063         else
11064                 spin_lock_irqsave(&port->lock, flags);
11065 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
11066 index 899a77187bde..3ff6363b3751 100644
11067 --- a/drivers/tty/serial/amba-pl011.c
11068 +++ b/drivers/tty/serial/amba-pl011.c
11069 @@ -2067,13 +2067,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
11070  
11071         clk_enable(uap->clk);
11072  
11073 -       local_irq_save(flags);
11074 +       /*
11075 +        * local_irq_save(flags);
11076 +        *
11077 +        * This local_irq_save() is nonsense. If we come in via sysrq
11078 +        * handling then interrupts are already disabled. Aside of
11079 +        * that the port.sysrq check is racy on SMP regardless.
11080 +       */
11081         if (uap->port.sysrq)
11082                 locked = 0;
11083         else if (oops_in_progress)
11084 -               locked = spin_trylock(&uap->port.lock);
11085 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
11086         else
11087 -               spin_lock(&uap->port.lock);
11088 +               spin_lock_irqsave(&uap->port.lock, flags);
11089  
11090         /*
11091          *      First save the CR then disable the interrupts
11092 @@ -2098,8 +2104,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
11093                 writew(old_cr, uap->port.membase + UART011_CR);
11094  
11095         if (locked)
11096 -               spin_unlock(&uap->port.lock);
11097 -       local_irq_restore(flags);
11098 +               spin_unlock_irqrestore(&uap->port.lock, flags);
11099  
11100         clk_disable(uap->clk);
11101  }
11102 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
11103 index 24280d9a05e9..9745fb8b7abb 100644
11104 --- a/drivers/tty/serial/omap-serial.c
11105 +++ b/drivers/tty/serial/omap-serial.c
11106 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
11107  
11108         pm_runtime_get_sync(up->dev);
11109  
11110 -       local_irq_save(flags);
11111 -       if (up->port.sysrq)
11112 -               locked = 0;
11113 -       else if (oops_in_progress)
11114 -               locked = spin_trylock(&up->port.lock);
11115 +       if (up->port.sysrq || oops_in_progress)
11116 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
11117         else
11118 -               spin_lock(&up->port.lock);
11119 +               spin_lock_irqsave(&up->port.lock, flags);
11120  
11121         /*
11122          * First save the IER then disable the interrupts
11123 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
11124         pm_runtime_mark_last_busy(up->dev);
11125         pm_runtime_put_autosuspend(up->dev);
11126         if (locked)
11127 -               spin_unlock(&up->port.lock);
11128 -       local_irq_restore(flags);
11129 +               spin_unlock_irqrestore(&up->port.lock, flags);
11130  }
11131  
11132  static int __init
11133 diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
11134 index edb5305b9d4d..7d5ee8a13ac6 100644
11135 --- a/drivers/tty/serial/sc16is7xx.c
11136 +++ b/drivers/tty/serial/sc16is7xx.c
11137 @@ -1230,7 +1230,7 @@ static int sc16is7xx_probe(struct device *dev,
11138  
11139         /* Setup interrupt */
11140         ret = devm_request_irq(dev, irq, sc16is7xx_irq,
11141 -                              IRQF_ONESHOT | flags, dev_name(dev), s);
11142 +                              flags, dev_name(dev), s);
11143         if (!ret)
11144                 return 0;
11145  
11146 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
11147 index f44ce09367bc..5fc9a14721bd 100644
11148 --- a/drivers/usb/core/hcd.c
11149 +++ b/drivers/usb/core/hcd.c
11150 @@ -1735,9 +1735,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
11151          * and no one may trigger the above deadlock situation when
11152          * running complete() in tasklet.
11153          */
11154 -       local_irq_save(flags);
11155 +       local_irq_save_nort(flags);
11156         urb->complete(urb);
11157 -       local_irq_restore(flags);
11158 +       local_irq_restore_nort(flags);
11159  
11160         usb_anchor_resume_wakeups(anchor);
11161         atomic_dec(&urb->use_count);
11162 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
11163 index 803c503a2e3d..8dd2720aab64 100644
11164 --- a/drivers/usb/gadget/function/f_fs.c
11165 +++ b/drivers/usb/gadget/function/f_fs.c
11166 @@ -1404,7 +1404,7 @@ static void ffs_data_put(struct ffs_data *ffs)
11167                 pr_info("%s(): freeing\n", __func__);
11168                 ffs_data_clear(ffs);
11169                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
11170 -                      waitqueue_active(&ffs->ep0req_completion.wait));
11171 +                      swait_active(&ffs->ep0req_completion.wait));
11172                 kfree(ffs->dev_name);
11173                 kfree(ffs);
11174         }
11175 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
11176 index e57f48f9528f..7544a54056e4 100644
11177 --- a/drivers/usb/gadget/legacy/inode.c
11178 +++ b/drivers/usb/gadget/legacy/inode.c
11179 @@ -345,7 +345,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
11180         spin_unlock_irq (&epdata->dev->lock);
11181  
11182         if (likely (value == 0)) {
11183 -               value = wait_event_interruptible (done.wait, done.done);
11184 +               value = swait_event_interruptible (done.wait, done.done);
11185                 if (value != 0) {
11186                         spin_lock_irq (&epdata->dev->lock);
11187                         if (likely (epdata->ep != NULL)) {
11188 @@ -354,7 +354,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
11189                                 usb_ep_dequeue (epdata->ep, epdata->req);
11190                                 spin_unlock_irq (&epdata->dev->lock);
11191  
11192 -                               wait_event (done.wait, done.done);
11193 +                               swait_event (done.wait, done.done);
11194                                 if (epdata->status == -ECONNRESET)
11195                                         epdata->status = -EINTR;
11196                         } else {
11197 diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c
11198 index f92f5aff0dd5..f9bba26e3655 100644
11199 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c
11200 +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c
11201 @@ -17,7 +17,9 @@
11202  #include <linux/device.h>
11203  #include <linux/dma-mapping.h>
11204  #include <linux/list.h>
11205 +#include <linux/mfd/syscon.h>
11206  #include <linux/platform_device.h>
11207 +#include <linux/regmap.h>
11208  #include <linux/usb/ch9.h>
11209  #include <linux/usb/gadget.h>
11210  #include <linux/usb/atmel_usba_udc.h>
11211 @@ -1888,20 +1890,15 @@ static int atmel_usba_stop(struct usb_gadget *gadget)
11212  #ifdef CONFIG_OF
11213  static void at91sam9rl_toggle_bias(struct usba_udc *udc, int is_on)
11214  {
11215 -       unsigned int uckr = at91_pmc_read(AT91_CKGR_UCKR);
11216 -
11217 -       if (is_on)
11218 -               at91_pmc_write(AT91_CKGR_UCKR, uckr | AT91_PMC_BIASEN);
11219 -       else
11220 -               at91_pmc_write(AT91_CKGR_UCKR, uckr & ~(AT91_PMC_BIASEN));
11221 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
11222 +                          is_on ? AT91_PMC_BIASEN : 0);
11223  }
11224  
11225  static void at91sam9g45_pulse_bias(struct usba_udc *udc)
11226  {
11227 -       unsigned int uckr = at91_pmc_read(AT91_CKGR_UCKR);
11228 -
11229 -       at91_pmc_write(AT91_CKGR_UCKR, uckr & ~(AT91_PMC_BIASEN));
11230 -       at91_pmc_write(AT91_CKGR_UCKR, uckr | AT91_PMC_BIASEN);
11231 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN, 0);
11232 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
11233 +                          AT91_PMC_BIASEN);
11234  }
11235  
11236  static const struct usba_udc_errata at91sam9rl_errata = {
11237 @@ -1938,6 +1935,9 @@ static struct usba_ep * atmel_udc_of_init(struct platform_device *pdev,
11238                 return ERR_PTR(-EINVAL);
11239  
11240         udc->errata = match->data;
11241 +       udc->pmc = syscon_regmap_lookup_by_compatible("atmel,at91sam9g45-pmc");
11242 +       if (udc->errata && IS_ERR(udc->pmc))
11243 +               return ERR_CAST(udc->pmc);
11244  
11245         udc->num_ep = 0;
11246  
11247 diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.h b/drivers/usb/gadget/udc/atmel_usba_udc.h
11248 index ea448a344767..3e1c9d589dfa 100644
11249 --- a/drivers/usb/gadget/udc/atmel_usba_udc.h
11250 +++ b/drivers/usb/gadget/udc/atmel_usba_udc.h
11251 @@ -354,6 +354,8 @@ struct usba_udc {
11252         struct dentry *debugfs_root;
11253         struct dentry *debugfs_regs;
11254  #endif
11255 +
11256 +       struct regmap *pmc;
11257  };
11258  
11259  static inline struct usba_ep *to_usba_ep(struct usb_ep *ep)
11260 diff --git a/fs/aio.c b/fs/aio.c
11261 index fe4f49212b99..c3194afdc3df 100644
11262 --- a/fs/aio.c
11263 +++ b/fs/aio.c
11264 @@ -40,6 +40,7 @@
11265  #include <linux/ramfs.h>
11266  #include <linux/percpu-refcount.h>
11267  #include <linux/mount.h>
11268 +#include <linux/swork.h>
11269  
11270  #include <asm/kmap_types.h>
11271  #include <asm/uaccess.h>
11272 @@ -115,7 +116,7 @@ struct kioctx {
11273         struct page             **ring_pages;
11274         long                    nr_pages;
11275  
11276 -       struct work_struct      free_work;
11277 +       struct swork_event      free_work;
11278  
11279         /*
11280          * signals when all in-flight requests are done
11281 @@ -258,6 +259,7 @@ static int __init aio_setup(void)
11282                 .mount          = aio_mount,
11283                 .kill_sb        = kill_anon_super,
11284         };
11285 +       BUG_ON(swork_get());
11286         aio_mnt = kern_mount(&aio_fs);
11287         if (IS_ERR(aio_mnt))
11288                 panic("Failed to create aio fs mount.");
11289 @@ -573,9 +575,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
11290         return cancel(&kiocb->common);
11291  }
11292  
11293 -static void free_ioctx(struct work_struct *work)
11294 +static void free_ioctx(struct swork_event *sev)
11295  {
11296 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
11297 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
11298  
11299         pr_debug("freeing %p\n", ctx);
11300  
11301 @@ -594,8 +596,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
11302         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
11303                 complete(&ctx->rq_wait->comp);
11304  
11305 -       INIT_WORK(&ctx->free_work, free_ioctx);
11306 -       schedule_work(&ctx->free_work);
11307 +       INIT_SWORK(&ctx->free_work, free_ioctx);
11308 +       swork_queue(&ctx->free_work);
11309  }
11310  
11311  /*
11312 @@ -603,9 +605,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
11313   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
11314   * now it's safe to cancel any that need to be.
11315   */
11316 -static void free_ioctx_users(struct percpu_ref *ref)
11317 +static void free_ioctx_users_work(struct swork_event *sev)
11318  {
11319 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
11320 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
11321         struct aio_kiocb *req;
11322  
11323         spin_lock_irq(&ctx->ctx_lock);
11324 @@ -624,6 +626,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
11325         percpu_ref_put(&ctx->reqs);
11326  }
11327  
11328 +static void free_ioctx_users(struct percpu_ref *ref)
11329 +{
11330 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
11331 +
11332 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
11333 +       swork_queue(&ctx->free_work);
11334 +}
11335 +
11336  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
11337  {
11338         unsigned i, new_nr;
11339 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
11340 index 502d3892d8a4..05af8d3e6e88 100644
11341 --- a/fs/autofs4/autofs_i.h
11342 +++ b/fs/autofs4/autofs_i.h
11343 @@ -34,6 +34,7 @@
11344  #include <linux/sched.h>
11345  #include <linux/mount.h>
11346  #include <linux/namei.h>
11347 +#include <linux/delay.h>
11348  #include <asm/current.h>
11349  #include <asm/uaccess.h>
11350  
11351 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
11352 index 7a5a598a2d94..d08bcdc30566 100644
11353 --- a/fs/autofs4/expire.c
11354 +++ b/fs/autofs4/expire.c
11355 @@ -150,7 +150,7 @@ again:
11356                         parent = p->d_parent;
11357                         if (!spin_trylock(&parent->d_lock)) {
11358                                 spin_unlock(&p->d_lock);
11359 -                               cpu_relax();
11360 +                               cpu_chill();
11361                                 goto relock;
11362                         }
11363                         spin_unlock(&p->d_lock);
11364 diff --git a/fs/buffer.c b/fs/buffer.c
11365 index 4f4cd959da7c..72b27e17b907 100644
11366 --- a/fs/buffer.c
11367 +++ b/fs/buffer.c
11368 @@ -305,8 +305,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11369          * decide that the page is now completely done.
11370          */
11371         first = page_buffers(page);
11372 -       local_irq_save(flags);
11373 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11374 +       flags = bh_uptodate_lock_irqsave(first);
11375         clear_buffer_async_read(bh);
11376         unlock_buffer(bh);
11377         tmp = bh;
11378 @@ -319,8 +318,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11379                 }
11380                 tmp = tmp->b_this_page;
11381         } while (tmp != bh);
11382 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11383 -       local_irq_restore(flags);
11384 +       bh_uptodate_unlock_irqrestore(first, flags);
11385  
11386         /*
11387          * If none of the buffers had errors and they are all
11388 @@ -332,9 +330,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11389         return;
11390  
11391  still_busy:
11392 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11393 -       local_irq_restore(flags);
11394 -       return;
11395 +       bh_uptodate_unlock_irqrestore(first, flags);
11396  }
11397  
11398  /*
11399 @@ -362,8 +358,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
11400         }
11401  
11402         first = page_buffers(page);
11403 -       local_irq_save(flags);
11404 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11405 +       flags = bh_uptodate_lock_irqsave(first);
11406  
11407         clear_buffer_async_write(bh);
11408         unlock_buffer(bh);
11409 @@ -375,15 +370,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
11410                 }
11411                 tmp = tmp->b_this_page;
11412         }
11413 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11414 -       local_irq_restore(flags);
11415 +       bh_uptodate_unlock_irqrestore(first, flags);
11416         end_page_writeback(page);
11417         return;
11418  
11419  still_busy:
11420 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11421 -       local_irq_restore(flags);
11422 -       return;
11423 +       bh_uptodate_unlock_irqrestore(first, flags);
11424  }
11425  EXPORT_SYMBOL(end_buffer_async_write);
11426  
11427 @@ -3325,6 +3317,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
11428         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
11429         if (ret) {
11430                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
11431 +               buffer_head_init_locks(ret);
11432                 preempt_disable();
11433                 __this_cpu_inc(bh_accounting.nr);
11434                 recalc_bh_state();
11435 diff --git a/fs/dcache.c b/fs/dcache.c
11436 index 71b6056ad35d..e80471cbfc19 100644
11437 --- a/fs/dcache.c
11438 +++ b/fs/dcache.c
11439 @@ -19,6 +19,7 @@
11440  #include <linux/mm.h>
11441  #include <linux/fs.h>
11442  #include <linux/fsnotify.h>
11443 +#include <linux/delay.h>
11444  #include <linux/slab.h>
11445  #include <linux/init.h>
11446  #include <linux/hash.h>
11447 @@ -747,6 +748,8 @@ static inline bool fast_dput(struct dentry *dentry)
11448   */
11449  void dput(struct dentry *dentry)
11450  {
11451 +       struct dentry *parent;
11452 +
11453         if (unlikely(!dentry))
11454                 return;
11455  
11456 @@ -783,9 +786,18 @@ repeat:
11457         return;
11458  
11459  kill_it:
11460 -       dentry = dentry_kill(dentry);
11461 -       if (dentry) {
11462 -               cond_resched();
11463 +       parent = dentry_kill(dentry);
11464 +       if (parent) {
11465 +               int r;
11466 +
11467 +               if (parent == dentry) {
11468 +                       /* the task with the highest priority won't schedule */
11469 +                       r = cond_resched();
11470 +                       if (!r)
11471 +                               cpu_chill();
11472 +               } else {
11473 +                       dentry = parent;
11474 +               }
11475                 goto repeat;
11476         }
11477  }
11478 @@ -2394,7 +2406,7 @@ again:
11479         if (dentry->d_lockref.count == 1) {
11480                 if (!spin_trylock(&inode->i_lock)) {
11481                         spin_unlock(&dentry->d_lock);
11482 -                       cpu_relax();
11483 +                       cpu_chill();
11484                         goto again;
11485                 }
11486                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
11487 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
11488 index 1e009cad8d5c..d0c12504d3b4 100644
11489 --- a/fs/eventpoll.c
11490 +++ b/fs/eventpoll.c
11491 @@ -505,12 +505,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
11492   */
11493  static void ep_poll_safewake(wait_queue_head_t *wq)
11494  {
11495 -       int this_cpu = get_cpu();
11496 +       int this_cpu = get_cpu_light();
11497  
11498         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
11499                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
11500  
11501 -       put_cpu();
11502 +       put_cpu_light();
11503  }
11504  
11505  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
11506 diff --git a/fs/exec.c b/fs/exec.c
11507 index b06623a9347f..e7760b7b692c 100644
11508 --- a/fs/exec.c
11509 +++ b/fs/exec.c
11510 @@ -865,12 +865,14 @@ static int exec_mmap(struct mm_struct *mm)
11511                 }
11512         }
11513         task_lock(tsk);
11514 +       preempt_disable_rt();
11515         active_mm = tsk->active_mm;
11516         tsk->mm = mm;
11517         tsk->active_mm = mm;
11518         activate_mm(active_mm, mm);
11519         tsk->mm->vmacache_seqnum = 0;
11520         vmacache_flush(tsk);
11521 +       preempt_enable_rt();
11522         task_unlock(tsk);
11523         if (old_mm) {
11524                 up_read(&old_mm->mmap_sem);
11525 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
11526 index 9db5500d63d9..5951c495d124 100644
11527 --- a/fs/f2fs/f2fs.h
11528 +++ b/fs/f2fs/f2fs.h
11529 @@ -24,7 +24,6 @@
11530  
11531  #ifdef CONFIG_F2FS_CHECK_FS
11532  #define f2fs_bug_on(sbi, condition)    BUG_ON(condition)
11533 -#define f2fs_down_write(x, y)  down_write_nest_lock(x, y)
11534  #else
11535  #define f2fs_bug_on(sbi, condition)                                    \
11536         do {                                                            \
11537 @@ -33,7 +32,6 @@
11538                         set_sbi_flag(sbi, SBI_NEED_FSCK);               \
11539                 }                                                       \
11540         } while (0)
11541 -#define f2fs_down_write(x, y)  down_write(x)
11542  #endif
11543  
11544  /*
11545 @@ -959,7 +957,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
11546  
11547  static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
11548  {
11549 -       f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
11550 +       down_write(&sbi->cp_rwsem);
11551  }
11552  
11553  static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
11554 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
11555 index 684996c8a3a4..6e18a06aaabe 100644
11556 --- a/fs/jbd2/checkpoint.c
11557 +++ b/fs/jbd2/checkpoint.c
11558 @@ -116,6 +116,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
11559         nblocks = jbd2_space_needed(journal);
11560         while (jbd2_log_space_left(journal) < nblocks) {
11561                 write_unlock(&journal->j_state_lock);
11562 +               if (current->plug)
11563 +                       io_schedule();
11564                 mutex_lock(&journal->j_checkpoint_mutex);
11565  
11566                 /*
11567 diff --git a/fs/namespace.c b/fs/namespace.c
11568 index 5be02a0635be..1f3725bbd04b 100644
11569 --- a/fs/namespace.c
11570 +++ b/fs/namespace.c
11571 @@ -14,6 +14,7 @@
11572  #include <linux/mnt_namespace.h>
11573  #include <linux/user_namespace.h>
11574  #include <linux/namei.h>
11575 +#include <linux/delay.h>
11576  #include <linux/security.h>
11577  #include <linux/idr.h>
11578  #include <linux/init.h>                /* init_rootfs */
11579 @@ -353,8 +354,11 @@ int __mnt_want_write(struct vfsmount *m)
11580          * incremented count after it has set MNT_WRITE_HOLD.
11581          */
11582         smp_mb();
11583 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
11584 -               cpu_relax();
11585 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
11586 +               preempt_enable();
11587 +               cpu_chill();
11588 +               preempt_disable();
11589 +       }
11590         /*
11591          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
11592          * be set to match its requirements. So we must not load that until
11593 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
11594 index 7521e11db728..f0de4b6b8bf3 100644
11595 --- a/fs/ntfs/aops.c
11596 +++ b/fs/ntfs/aops.c
11597 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11598                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
11599         }
11600         first = page_buffers(page);
11601 -       local_irq_save(flags);
11602 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11603 +       flags = bh_uptodate_lock_irqsave(first);
11604         clear_buffer_async_read(bh);
11605         unlock_buffer(bh);
11606         tmp = bh;
11607 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11608                 }
11609                 tmp = tmp->b_this_page;
11610         } while (tmp != bh);
11611 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11612 -       local_irq_restore(flags);
11613 +       bh_uptodate_unlock_irqrestore(first, flags);
11614         /*
11615          * If none of the buffers had errors then we can set the page uptodate,
11616          * but we first have to perform the post read mst fixups, if the
11617 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11618                 recs = PAGE_CACHE_SIZE / rec_size;
11619                 /* Should have been verified before we got here... */
11620                 BUG_ON(!recs);
11621 -               local_irq_save(flags);
11622 +               local_irq_save_nort(flags);
11623                 kaddr = kmap_atomic(page);
11624                 for (i = 0; i < recs; i++)
11625                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
11626                                         i * rec_size), rec_size);
11627                 kunmap_atomic(kaddr);
11628 -               local_irq_restore(flags);
11629 +               local_irq_restore_nort(flags);
11630                 flush_dcache_page(page);
11631                 if (likely(page_uptodate && !PageError(page)))
11632                         SetPageUptodate(page);
11633 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11634         unlock_page(page);
11635         return;
11636  still_busy:
11637 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11638 -       local_irq_restore(flags);
11639 -       return;
11640 +       bh_uptodate_unlock_irqrestore(first, flags);
11641  }
11642  
11643  /**
11644 diff --git a/fs/timerfd.c b/fs/timerfd.c
11645 index 053818dd6c18..c4bc14fe0085 100644
11646 --- a/fs/timerfd.c
11647 +++ b/fs/timerfd.c
11648 @@ -450,7 +450,10 @@ static int do_timerfd_settime(int ufd, int flags,
11649                                 break;
11650                 }
11651                 spin_unlock_irq(&ctx->wqh.lock);
11652 -               cpu_relax();
11653 +               if (isalarm(ctx))
11654 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
11655 +               else
11656 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
11657         }
11658  
11659         /*
11660 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
11661 index 323e5daece54..cc5fbd534fd4 100644
11662 --- a/include/acpi/platform/aclinux.h
11663 +++ b/include/acpi/platform/aclinux.h
11664 @@ -127,6 +127,7 @@
11665  
11666  #define acpi_cache_t                        struct kmem_cache
11667  #define acpi_spinlock                       spinlock_t *
11668 +#define acpi_raw_spinlock              raw_spinlock_t *
11669  #define acpi_cpu_flags                      unsigned long
11670  
11671  /* Use native linux version of acpi_os_allocate_zeroed */
11672 @@ -145,6 +146,20 @@
11673  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
11674  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
11675  
11676 +#define acpi_os_create_raw_lock(__handle)                      \
11677 +({                                                             \
11678 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
11679 +                                                               \
11680 +        if (lock) {                                            \
11681 +               *(__handle) = lock;                             \
11682 +               raw_spin_lock_init(*(__handle));                \
11683 +        }                                                      \
11684 +        lock ? AE_OK : AE_NO_MEMORY;                           \
11685 + })
11686 +
11687 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
11688 +
11689 +
11690  /*
11691   * OSL interfaces used by debugger/disassembler
11692   */
11693 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
11694 index 630dd2372238..850e4d993a88 100644
11695 --- a/include/asm-generic/bug.h
11696 +++ b/include/asm-generic/bug.h
11697 @@ -206,6 +206,20 @@ extern void warn_slowpath_null(const char *file, const int line);
11698  # define WARN_ON_SMP(x)                        ({0;})
11699  #endif
11700  
11701 +#ifdef CONFIG_PREEMPT_RT_BASE
11702 +# define BUG_ON_RT(c)                  BUG_ON(c)
11703 +# define BUG_ON_NONRT(c)               do { } while (0)
11704 +# define WARN_ON_RT(condition)         WARN_ON(condition)
11705 +# define WARN_ON_NONRT(condition)      do { } while (0)
11706 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
11707 +#else
11708 +# define BUG_ON_RT(c)                  do { } while (0)
11709 +# define BUG_ON_NONRT(c)               BUG_ON(c)
11710 +# define WARN_ON_RT(condition)         do { } while (0)
11711 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
11712 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
11713 +#endif
11714 +
11715  #endif /* __ASSEMBLY__ */
11716  
11717  #endif
11718 diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
11719 index 5d8ffa3e6f8c..c1cde3577551 100644
11720 --- a/include/asm-generic/preempt.h
11721 +++ b/include/asm-generic/preempt.h
11722 @@ -7,10 +7,10 @@
11723  
11724  static __always_inline int preempt_count(void)
11725  {
11726 -       return current_thread_info()->preempt_count;
11727 +       return READ_ONCE(current_thread_info()->preempt_count);
11728  }
11729  
11730 -static __always_inline int *preempt_count_ptr(void)
11731 +static __always_inline volatile int *preempt_count_ptr(void)
11732  {
11733         return &current_thread_info()->preempt_count;
11734  }
11735 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
11736 index daf17d70aeca..463df8954255 100644
11737 --- a/include/linux/blk-mq.h
11738 +++ b/include/linux/blk-mq.h
11739 @@ -212,6 +212,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
11740  
11741  struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
11742  struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
11743 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
11744  
11745  int blk_mq_request_started(struct request *rq);
11746  void blk_mq_start_request(struct request *rq);
11747 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
11748 index fe14382f9664..a82143ad6702 100644
11749 --- a/include/linux/blkdev.h
11750 +++ b/include/linux/blkdev.h
11751 @@ -89,6 +89,7 @@ struct request {
11752         struct list_head queuelist;
11753         union {
11754                 struct call_single_data csd;
11755 +               struct work_struct work;
11756                 unsigned long fifo_time;
11757         };
11758  
11759 @@ -455,7 +456,7 @@ struct request_queue {
11760         struct throtl_data *td;
11761  #endif
11762         struct rcu_head         rcu_head;
11763 -       wait_queue_head_t       mq_freeze_wq;
11764 +       struct swait_queue_head mq_freeze_wq;
11765         struct percpu_ref       q_usage_counter;
11766         struct list_head        all_q_node;
11767  
11768 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
11769 index 8fdcb783197d..d07dbeec7bc1 100644
11770 --- a/include/linux/bottom_half.h
11771 +++ b/include/linux/bottom_half.h
11772 @@ -3,6 +3,39 @@
11773  
11774  #include <linux/preempt.h>
11775  
11776 +#ifdef CONFIG_PREEMPT_RT_FULL
11777 +
11778 +extern void __local_bh_disable(void);
11779 +extern void _local_bh_enable(void);
11780 +extern void __local_bh_enable(void);
11781 +
11782 +static inline void local_bh_disable(void)
11783 +{
11784 +       __local_bh_disable();
11785 +}
11786 +
11787 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
11788 +{
11789 +       __local_bh_disable();
11790 +}
11791 +
11792 +static inline void local_bh_enable(void)
11793 +{
11794 +       __local_bh_enable();
11795 +}
11796 +
11797 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
11798 +{
11799 +       __local_bh_enable();
11800 +}
11801 +
11802 +static inline void local_bh_enable_ip(unsigned long ip)
11803 +{
11804 +       __local_bh_enable();
11805 +}
11806 +
11807 +#else
11808 +
11809  #ifdef CONFIG_TRACE_IRQFLAGS
11810  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
11811  #else
11812 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
11813  {
11814         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
11815  }
11816 +#endif
11817  
11818  #endif /* _LINUX_BH_H */
11819 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
11820 index 89d9aa9e79bf..4a201008b02d 100644
11821 --- a/include/linux/buffer_head.h
11822 +++ b/include/linux/buffer_head.h
11823 @@ -75,8 +75,50 @@ struct buffer_head {
11824         struct address_space *b_assoc_map;      /* mapping this buffer is
11825                                                    associated with */
11826         atomic_t b_count;               /* users using this buffer_head */
11827 +#ifdef CONFIG_PREEMPT_RT_BASE
11828 +       spinlock_t b_uptodate_lock;
11829 +#if IS_ENABLED(CONFIG_JBD2)
11830 +       spinlock_t b_state_lock;
11831 +       spinlock_t b_journal_head_lock;
11832 +#endif
11833 +#endif
11834  };
11835  
11836 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
11837 +{
11838 +       unsigned long flags;
11839 +
11840 +#ifndef CONFIG_PREEMPT_RT_BASE
11841 +       local_irq_save(flags);
11842 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
11843 +#else
11844 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
11845 +#endif
11846 +       return flags;
11847 +}
11848 +
11849 +static inline void
11850 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
11851 +{
11852 +#ifndef CONFIG_PREEMPT_RT_BASE
11853 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
11854 +       local_irq_restore(flags);
11855 +#else
11856 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
11857 +#endif
11858 +}
11859 +
11860 +static inline void buffer_head_init_locks(struct buffer_head *bh)
11861 +{
11862 +#ifdef CONFIG_PREEMPT_RT_BASE
11863 +       spin_lock_init(&bh->b_uptodate_lock);
11864 +#if IS_ENABLED(CONFIG_JBD2)
11865 +       spin_lock_init(&bh->b_state_lock);
11866 +       spin_lock_init(&bh->b_journal_head_lock);
11867 +#endif
11868 +#endif
11869 +}
11870 +
11871  /*
11872   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
11873   * and buffer_foo() functions.
11874 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
11875 index 8da263299754..0cc474291e08 100644
11876 --- a/include/linux/cgroup-defs.h
11877 +++ b/include/linux/cgroup-defs.h
11878 @@ -16,6 +16,7 @@
11879  #include <linux/percpu-refcount.h>
11880  #include <linux/percpu-rwsem.h>
11881  #include <linux/workqueue.h>
11882 +#include <linux/swork.h>
11883  
11884  #ifdef CONFIG_CGROUPS
11885  
11886 @@ -142,6 +143,7 @@ struct cgroup_subsys_state {
11887         /* percpu_ref killing and RCU release */
11888         struct rcu_head rcu_head;
11889         struct work_struct destroy_work;
11890 +       struct swork_event destroy_swork;
11891  };
11892  
11893  /*
11894 diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h
11895 index 1e6932222e11..17f413bbbedf 100644
11896 --- a/include/linux/clk/at91_pmc.h
11897 +++ b/include/linux/clk/at91_pmc.h
11898 @@ -16,18 +16,6 @@
11899  #ifndef AT91_PMC_H
11900  #define AT91_PMC_H
11901  
11902 -#ifndef __ASSEMBLY__
11903 -extern void __iomem *at91_pmc_base;
11904 -
11905 -#define at91_pmc_read(field) \
11906 -       readl_relaxed(at91_pmc_base + field)
11907 -
11908 -#define at91_pmc_write(field, value) \
11909 -       writel_relaxed(value, at91_pmc_base + field)
11910 -#else
11911 -.extern at91_pmc_base
11912 -#endif
11913 -
11914  #define        AT91_PMC_SCER           0x00                    /* System Clock Enable Register */
11915  #define        AT91_PMC_SCDR           0x04                    /* System Clock Disable Register */
11916  
11917 diff --git a/include/linux/completion.h b/include/linux/completion.h
11918 index 5d5aaae3af43..3bca1590e29f 100644
11919 --- a/include/linux/completion.h
11920 +++ b/include/linux/completion.h
11921 @@ -7,8 +7,7 @@
11922   * Atomic wait-for-completion handler data structures.
11923   * See kernel/sched/completion.c for details.
11924   */
11925 -
11926 -#include <linux/wait.h>
11927 +#include <linux/swait.h>
11928  
11929  /*
11930   * struct completion - structure used to maintain state for a "completion"
11931 @@ -24,11 +23,11 @@
11932   */
11933  struct completion {
11934         unsigned int done;
11935 -       wait_queue_head_t wait;
11936 +       struct swait_queue_head wait;
11937  };
11938  
11939  #define COMPLETION_INITIALIZER(work) \
11940 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11941 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11942  
11943  #define COMPLETION_INITIALIZER_ONSTACK(work) \
11944         ({ init_completion(&work); work; })
11945 @@ -73,7 +72,7 @@ struct completion {
11946  static inline void init_completion(struct completion *x)
11947  {
11948         x->done = 0;
11949 -       init_waitqueue_head(&x->wait);
11950 +       init_swait_queue_head(&x->wait);
11951  }
11952  
11953  /**
11954 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
11955 index d2ca8c38f9c4..94041d803d0b 100644
11956 --- a/include/linux/cpu.h
11957 +++ b/include/linux/cpu.h
11958 @@ -231,6 +231,8 @@ extern void get_online_cpus(void);
11959  extern void put_online_cpus(void);
11960  extern void cpu_hotplug_disable(void);
11961  extern void cpu_hotplug_enable(void);
11962 +extern void pin_current_cpu(void);
11963 +extern void unpin_current_cpu(void);
11964  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
11965  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
11966  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
11967 @@ -248,6 +250,8 @@ static inline void cpu_hotplug_done(void) {}
11968  #define put_online_cpus()      do { } while (0)
11969  #define cpu_hotplug_disable()  do { } while (0)
11970  #define cpu_hotplug_enable()   do { } while (0)
11971 +static inline void pin_current_cpu(void) { }
11972 +static inline void unpin_current_cpu(void) { }
11973  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
11974  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
11975  /* These aren't inline functions due to a GCC bug. */
11976 diff --git a/include/linux/delay.h b/include/linux/delay.h
11977 index a6ecb34cf547..37caab306336 100644
11978 --- a/include/linux/delay.h
11979 +++ b/include/linux/delay.h
11980 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
11981         msleep(seconds * 1000);
11982  }
11983  
11984 +#ifdef CONFIG_PREEMPT_RT_FULL
11985 +extern void cpu_chill(void);
11986 +#else
11987 +# define cpu_chill()   cpu_relax()
11988 +#endif
11989 +
11990  #endif /* defined(_LINUX_DELAY_H) */
11991 diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
11992 index 60048c50404e..f2cd67624f18 100644
11993 --- a/include/linux/ftrace.h
11994 +++ b/include/linux/ftrace.h
11995 @@ -694,6 +694,18 @@ static inline void __ftrace_enabled_restore(int enabled)
11996  #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
11997  #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
11998  
11999 +static inline unsigned long get_lock_parent_ip(void)
12000 +{
12001 +       unsigned long addr = CALLER_ADDR0;
12002 +
12003 +       if (!in_lock_functions(addr))
12004 +               return addr;
12005 +       addr = CALLER_ADDR1;
12006 +       if (!in_lock_functions(addr))
12007 +               return addr;
12008 +       return CALLER_ADDR2;
12009 +}
12010 +
12011  #ifdef CONFIG_IRQSOFF_TRACER
12012    extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
12013    extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
12014 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
12015 index bb3f3297062a..a117a33ef72c 100644
12016 --- a/include/linux/highmem.h
12017 +++ b/include/linux/highmem.h
12018 @@ -7,6 +7,7 @@
12019  #include <linux/mm.h>
12020  #include <linux/uaccess.h>
12021  #include <linux/hardirq.h>
12022 +#include <linux/sched.h>
12023  
12024  #include <asm/cacheflush.h>
12025  
12026 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
12027  
12028  static inline void *kmap_atomic(struct page *page)
12029  {
12030 -       preempt_disable();
12031 +       preempt_disable_nort();
12032         pagefault_disable();
12033         return page_address(page);
12034  }
12035 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
12036  static inline void __kunmap_atomic(void *addr)
12037  {
12038         pagefault_enable();
12039 -       preempt_enable();
12040 +       preempt_enable_nort();
12041  }
12042  
12043  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
12044 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
12045  
12046  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
12047  
12048 +#ifndef CONFIG_PREEMPT_RT_FULL
12049  DECLARE_PER_CPU(int, __kmap_atomic_idx);
12050 +#endif
12051  
12052  static inline int kmap_atomic_idx_push(void)
12053  {
12054 +#ifndef CONFIG_PREEMPT_RT_FULL
12055         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
12056  
12057 -#ifdef CONFIG_DEBUG_HIGHMEM
12058 +# ifdef CONFIG_DEBUG_HIGHMEM
12059         WARN_ON_ONCE(in_irq() && !irqs_disabled());
12060         BUG_ON(idx >= KM_TYPE_NR);
12061 -#endif
12062 +# endif
12063         return idx;
12064 +#else
12065 +       current->kmap_idx++;
12066 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
12067 +       return current->kmap_idx - 1;
12068 +#endif
12069  }
12070  
12071  static inline int kmap_atomic_idx(void)
12072  {
12073 +#ifndef CONFIG_PREEMPT_RT_FULL
12074         return __this_cpu_read(__kmap_atomic_idx) - 1;
12075 +#else
12076 +       return current->kmap_idx - 1;
12077 +#endif
12078  }
12079  
12080  static inline void kmap_atomic_idx_pop(void)
12081  {
12082 -#ifdef CONFIG_DEBUG_HIGHMEM
12083 +#ifndef CONFIG_PREEMPT_RT_FULL
12084 +# ifdef CONFIG_DEBUG_HIGHMEM
12085         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
12086  
12087         BUG_ON(idx < 0);
12088 -#else
12089 +# else
12090         __this_cpu_dec(__kmap_atomic_idx);
12091 +# endif
12092 +#else
12093 +       current->kmap_idx--;
12094 +# ifdef CONFIG_DEBUG_HIGHMEM
12095 +       BUG_ON(current->kmap_idx < 0);
12096 +# endif
12097  #endif
12098  }
12099  
12100 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
12101 index 2ead22dd74a0..8fbcdfa5dc77 100644
12102 --- a/include/linux/hrtimer.h
12103 +++ b/include/linux/hrtimer.h
12104 @@ -87,6 +87,9 @@ enum hrtimer_restart {
12105   * @function:  timer expiry callback function
12106   * @base:      pointer to the timer base (per cpu and per clock)
12107   * @state:     state information (See bit values above)
12108 + * @cb_entry:  list entry to defer timers from hardirq context
12109 + * @irqsafe:   timer can run in hardirq context
12110 + * @praecox:   timer expiry time if expired at the time of programming
12111   * @is_rel:    Set if the timer was armed relative
12112   * @start_pid:  timer statistics field to store the pid of the task which
12113   *             started the timer
12114 @@ -103,6 +106,11 @@ struct hrtimer {
12115         enum hrtimer_restart            (*function)(struct hrtimer *);
12116         struct hrtimer_clock_base       *base;
12117         u8                              state;
12118 +       struct list_head                cb_entry;
12119 +       int                             irqsafe;
12120 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
12121 +       ktime_t                         praecox;
12122 +#endif
12123         u8                              is_rel;
12124  #ifdef CONFIG_TIMER_STATS
12125         int                             start_pid;
12126 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
12127         struct task_struct *task;
12128  };
12129  
12130 -#ifdef CONFIG_64BIT
12131  # define HRTIMER_CLOCK_BASE_ALIGN      64
12132 -#else
12133 -# define HRTIMER_CLOCK_BASE_ALIGN      32
12134 -#endif
12135  
12136  /**
12137   * struct hrtimer_clock_base - the timer base for a specific clock
12138 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
12139   *                     timer to a base on another cpu.
12140   * @clockid:           clock id for per_cpu support
12141   * @active:            red black tree root node for the active timers
12142 + * @expired:           list head for deferred timers.
12143   * @get_time:          function to retrieve the current time of the clock
12144   * @offset:            offset of this clock to the monotonic base
12145   */
12146 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
12147         int                     index;
12148         clockid_t               clockid;
12149         struct timerqueue_head  active;
12150 +       struct list_head        expired;
12151         ktime_t                 (*get_time)(void);
12152         ktime_t                 offset;
12153  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
12154 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
12155         raw_spinlock_t                  lock;
12156         seqcount_t                      seq;
12157         struct hrtimer                  *running;
12158 +       struct hrtimer                  *running_soft;
12159         unsigned int                    cpu;
12160         unsigned int                    active_bases;
12161         unsigned int                    clock_was_set_seq;
12162 @@ -203,6 +210,9 @@ struct hrtimer_cpu_base {
12163         unsigned int                    nr_hangs;
12164         unsigned int                    max_hang_time;
12165  #endif
12166 +#ifdef CONFIG_PREEMPT_RT_BASE
12167 +       wait_queue_head_t               wait;
12168 +#endif
12169         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
12170  } ____cacheline_aligned;
12171  
12172 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
12173         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
12174  }
12175  
12176 +/* Softirq preemption could deadlock timer removal */
12177 +#ifdef CONFIG_PREEMPT_RT_BASE
12178 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
12179 +#else
12180 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
12181 +#endif
12182 +
12183  /* Query timers: */
12184  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
12185  
12186 @@ -436,7 +453,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
12187   * Helper function to check, whether the timer is running the callback
12188   * function
12189   */
12190 -static inline int hrtimer_callback_running(struct hrtimer *timer)
12191 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
12192  {
12193         return timer->base->cpu_base->running == timer;
12194  }
12195 diff --git a/include/linux/idr.h b/include/linux/idr.h
12196 index 013fd9bc4cb6..f62be0aec911 100644
12197 --- a/include/linux/idr.h
12198 +++ b/include/linux/idr.h
12199 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
12200   * Each idr_preload() should be matched with an invocation of this
12201   * function.  See idr_preload() for details.
12202   */
12203 +#ifdef CONFIG_PREEMPT_RT_FULL
12204 +void idr_preload_end(void);
12205 +#else
12206  static inline void idr_preload_end(void)
12207  {
12208         preempt_enable();
12209  }
12210 +#endif
12211  
12212  /**
12213   * idr_find - return pointer for given id
12214 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
12215 index 1c1ff7e4faa4..60fadde71a44 100644
12216 --- a/include/linux/init_task.h
12217 +++ b/include/linux/init_task.h
12218 @@ -148,9 +148,15 @@ extern struct task_group root_task_group;
12219  # define INIT_PERF_EVENTS(tsk)
12220  #endif
12221  
12222 +#ifdef CONFIG_PREEMPT_RT_BASE
12223 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
12224 +#else
12225 +# define INIT_TIMER_LIST
12226 +#endif
12227 +
12228  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
12229  # define INIT_VTIME(tsk)                                               \
12230 -       .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \
12231 +       .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
12232         .vtime_snap = 0,                                \
12233         .vtime_snap_whence = VTIME_SYS,
12234  #else
12235 @@ -239,6 +245,7 @@ extern struct task_group root_task_group;
12236         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
12237         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
12238         .timer_slack_ns = 50000, /* 50 usec default slack */            \
12239 +       INIT_TIMER_LIST                                                 \
12240         .pids = {                                                       \
12241                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
12242                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
12243 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
12244 index ad16809c8596..655cee096aed 100644
12245 --- a/include/linux/interrupt.h
12246 +++ b/include/linux/interrupt.h
12247 @@ -61,6 +61,7 @@
12248   *                interrupt handler after suspending interrupts. For system
12249   *                wakeup devices users need to implement wakeup detection in
12250   *                their interrupt handlers.
12251 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
12252   */
12253  #define IRQF_SHARED            0x00000080
12254  #define IRQF_PROBE_SHARED      0x00000100
12255 @@ -74,6 +75,7 @@
12256  #define IRQF_NO_THREAD         0x00010000
12257  #define IRQF_EARLY_RESUME      0x00020000
12258  #define IRQF_COND_SUSPEND      0x00040000
12259 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
12260  
12261  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
12262  
12263 @@ -186,7 +188,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
12264  #ifdef CONFIG_LOCKDEP
12265  # define local_irq_enable_in_hardirq() do { } while (0)
12266  #else
12267 -# define local_irq_enable_in_hardirq() local_irq_enable()
12268 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
12269  #endif
12270  
12271  extern void disable_irq_nosync(unsigned int irq);
12272 @@ -206,6 +208,7 @@ extern void resume_device_irqs(void);
12273   * @irq:               Interrupt to which notification applies
12274   * @kref:              Reference count, for internal use
12275   * @work:              Work item, for internal use
12276 + * @list:              List item for deferred callbacks
12277   * @notify:            Function to be called on change.  This will be
12278   *                     called in process context.
12279   * @release:           Function to be called on release.  This will be
12280 @@ -217,6 +220,7 @@ struct irq_affinity_notify {
12281         unsigned int irq;
12282         struct kref kref;
12283         struct work_struct work;
12284 +       struct list_head list;
12285         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
12286         void (*release)(struct kref *ref);
12287  };
12288 @@ -379,9 +383,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
12289                                  bool state);
12290  
12291  #ifdef CONFIG_IRQ_FORCED_THREADING
12292 +# ifndef CONFIG_PREEMPT_RT_BASE
12293  extern bool force_irqthreads;
12294 +# else
12295 +#  define force_irqthreads     (true)
12296 +# endif
12297  #else
12298 -#define force_irqthreads       (0)
12299 +#define force_irqthreads       (false)
12300  #endif
12301  
12302  #ifndef __ARCH_SET_SOFTIRQ_PENDING
12303 @@ -438,9 +446,10 @@ struct softirq_action
12304         void    (*action)(struct softirq_action *);
12305  };
12306  
12307 +#ifndef CONFIG_PREEMPT_RT_FULL
12308  asmlinkage void do_softirq(void);
12309  asmlinkage void __do_softirq(void);
12310 -
12311 +static inline void thread_do_softirq(void) { do_softirq(); }
12312  #ifdef __ARCH_HAS_DO_SOFTIRQ
12313  void do_softirq_own_stack(void);
12314  #else
12315 @@ -449,13 +458,25 @@ static inline void do_softirq_own_stack(void)
12316         __do_softirq();
12317  }
12318  #endif
12319 +#else
12320 +extern void thread_do_softirq(void);
12321 +#endif
12322  
12323  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
12324  extern void softirq_init(void);
12325  extern void __raise_softirq_irqoff(unsigned int nr);
12326 +#ifdef CONFIG_PREEMPT_RT_FULL
12327 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
12328 +#else
12329 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
12330 +{
12331 +       __raise_softirq_irqoff(nr);
12332 +}
12333 +#endif
12334  
12335  extern void raise_softirq_irqoff(unsigned int nr);
12336  extern void raise_softirq(unsigned int nr);
12337 +extern void softirq_check_pending_idle(void);
12338  
12339  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
12340  
12341 @@ -477,8 +498,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
12342       to be executed on some cpu at least once after this.
12343     * If the tasklet is already scheduled, but its execution is still not
12344       started, it will be executed only once.
12345 -   * If this tasklet is already running on another CPU (or schedule is called
12346 -     from tasklet itself), it is rescheduled for later.
12347 +   * If this tasklet is already running on another CPU, it is rescheduled
12348 +     for later.
12349 +   * Schedule must not be called from the tasklet itself (a lockup occurs)
12350     * Tasklet is strictly serialized wrt itself, but not
12351       wrt another tasklets. If client needs some intertask synchronization,
12352       he makes it with spinlocks.
12353 @@ -503,27 +525,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
12354  enum
12355  {
12356         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
12357 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
12358 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
12359 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
12360  };
12361  
12362 -#ifdef CONFIG_SMP
12363 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
12364 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
12365 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
12366 +
12367 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
12368  static inline int tasklet_trylock(struct tasklet_struct *t)
12369  {
12370         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
12371  }
12372  
12373 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
12374 +{
12375 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
12376 +}
12377 +
12378  static inline void tasklet_unlock(struct tasklet_struct *t)
12379  {
12380         smp_mb__before_atomic();
12381         clear_bit(TASKLET_STATE_RUN, &(t)->state);
12382  }
12383  
12384 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
12385 -{
12386 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
12387 -}
12388 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
12389 +
12390  #else
12391  #define tasklet_trylock(t) 1
12392 +#define tasklet_tryunlock(t)   1
12393  #define tasklet_unlock_wait(t) do { } while (0)
12394  #define tasklet_unlock(t) do { } while (0)
12395  #endif
12396 @@ -572,12 +603,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
12397         smp_mb();
12398  }
12399  
12400 -static inline void tasklet_enable(struct tasklet_struct *t)
12401 -{
12402 -       smp_mb__before_atomic();
12403 -       atomic_dec(&t->count);
12404 -}
12405 -
12406 +extern void tasklet_enable(struct tasklet_struct *t);
12407  extern void tasklet_kill(struct tasklet_struct *t);
12408  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
12409  extern void tasklet_init(struct tasklet_struct *t,
12410 @@ -608,6 +634,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
12411         tasklet_kill(&ttimer->tasklet);
12412  }
12413  
12414 +#ifdef CONFIG_PREEMPT_RT_FULL
12415 +extern void softirq_early_init(void);
12416 +#else
12417 +static inline void softirq_early_init(void) { }
12418 +#endif
12419 +
12420  /*
12421   * Autoprobing for irqs:
12422   *
12423 diff --git a/include/linux/irq.h b/include/linux/irq.h
12424 index f7cade00c525..dac9e11ba037 100644
12425 --- a/include/linux/irq.h
12426 +++ b/include/linux/irq.h
12427 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
12428   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
12429   *                               it from the spurious interrupt detection
12430   *                               mechanism and from core side polling.
12431 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
12432   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
12433   */
12434  enum {
12435 @@ -99,13 +100,14 @@ enum {
12436         IRQ_PER_CPU_DEVID       = (1 << 17),
12437         IRQ_IS_POLLED           = (1 << 18),
12438         IRQ_DISABLE_UNLAZY      = (1 << 19),
12439 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
12440  };
12441  
12442  #define IRQF_MODIFY_MASK       \
12443         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
12444          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
12445          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
12446 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
12447 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
12448  
12449  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
12450  
12451 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
12452 index 47b9ebd4a74f..2543aab05daa 100644
12453 --- a/include/linux/irq_work.h
12454 +++ b/include/linux/irq_work.h
12455 @@ -16,6 +16,7 @@
12456  #define IRQ_WORK_BUSY          2UL
12457  #define IRQ_WORK_FLAGS         3UL
12458  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
12459 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
12460  
12461  struct irq_work {
12462         unsigned long flags;
12463 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
12464  static inline void irq_work_run(void) { }
12465  #endif
12466  
12467 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12468 +void irq_work_tick_soft(void);
12469 +#else
12470 +static inline void irq_work_tick_soft(void) { }
12471 +#endif
12472 +
12473  #endif /* _LINUX_IRQ_WORK_H */
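
IRQ_WORK_HARD_IRQ marks work that must stay in hard interrupt context even on RT. A sketch of how a user might request that, assuming (as other RT users of the flag do) that the flag is set before the work is first queued; the example_* names are hypothetical and not part of the patch:

#include <linux/irq_work.h>

static void example_hard_work(struct irq_work *work)
{
        /* runs from the irq_work interrupt; the flag below keeps it out of
         * the softirq thread on PREEMPT_RT_FULL */
}

static struct irq_work example_work;

static void example_setup(void)
{
        init_irq_work(&example_work, example_hard_work);
        example_work.flags = IRQ_WORK_HARD_IRQ;        /* assumption: set once, before queueing */
        irq_work_queue(&example_work);
}
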
12474 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
12475 index a587a33363c7..ad57402a242d 100644
12476 --- a/include/linux/irqdesc.h
12477 +++ b/include/linux/irqdesc.h
12478 @@ -61,6 +61,7 @@ struct irq_desc {
12479         unsigned int            irqs_unhandled;
12480         atomic_t                threads_handled;
12481         int                     threads_handled_last;
12482 +       u64                     random_ip;
12483         raw_spinlock_t          lock;
12484         struct cpumask          *percpu_enabled;
12485  #ifdef CONFIG_SMP
12486 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
12487 index 5dd1272d1ab2..9b77034f7c5e 100644
12488 --- a/include/linux/irqflags.h
12489 +++ b/include/linux/irqflags.h
12490 @@ -25,8 +25,6 @@
12491  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
12492  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
12493  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
12494 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
12495 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
12496  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
12497  #else
12498  # define trace_hardirqs_on()           do { } while (0)
12499 @@ -39,9 +37,15 @@
12500  # define trace_softirqs_enabled(p)     0
12501  # define trace_hardirq_enter()         do { } while (0)
12502  # define trace_hardirq_exit()          do { } while (0)
12503 +# define INIT_TRACE_IRQFLAGS
12504 +#endif
12505 +
12506 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
12507 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
12508 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
12509 +#else
12510  # define lockdep_softirq_enter()       do { } while (0)
12511  # define lockdep_softirq_exit()                do { } while (0)
12512 -# define INIT_TRACE_IRQFLAGS
12513  #endif
12514  
12515  #if defined(CONFIG_IRQSOFF_TRACER) || \
12516 @@ -148,4 +152,23 @@
12517  
12518  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
12519  
12520 +/*
12521 + * local_irq* variants depending on RT/!RT
12522 + */
12523 +#ifdef CONFIG_PREEMPT_RT_FULL
12524 +# define local_irq_disable_nort()      do { } while (0)
12525 +# define local_irq_enable_nort()       do { } while (0)
12526 +# define local_irq_save_nort(flags)    local_save_flags(flags)
12527 +# define local_irq_restore_nort(flags) (void)(flags)
12528 +# define local_irq_disable_rt()                local_irq_disable()
12529 +# define local_irq_enable_rt()         local_irq_enable()
12530 +#else
12531 +# define local_irq_disable_nort()      local_irq_disable()
12532 +# define local_irq_enable_nort()       local_irq_enable()
12533 +# define local_irq_save_nort(flags)    local_irq_save(flags)
12534 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
12535 +# define local_irq_disable_rt()                do { } while (0)
12536 +# define local_irq_enable_rt()         do { } while (0)
12537 +#endif
12538 +
12539  #endif
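
The *_nort() variants really disable interrupts only on !RT; on RT they degrade to flag handling, because the data is expected to be protected by sleeping locks there. A minimal sketch (hypothetical example_* names, not part of the patch):

#include <linux/irqflags.h>

static unsigned long example_counter;

static void example_bump(void)
{
        unsigned long flags;

        /* !RT: interrupts are really disabled around the update.
         * RT: only the flags are saved/restored; interrupts stay enabled. */
        local_irq_save_nort(flags);
        example_counter++;
        local_irq_restore_nort(flags);
}
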
12540 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
12541 index 65407f6c9120..eb5aabe4e18c 100644
12542 --- a/include/linux/jbd2.h
12543 +++ b/include/linux/jbd2.h
12544 @@ -352,32 +352,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
12545  
12546  static inline void jbd_lock_bh_state(struct buffer_head *bh)
12547  {
12548 +#ifndef CONFIG_PREEMPT_RT_BASE
12549         bit_spin_lock(BH_State, &bh->b_state);
12550 +#else
12551 +       spin_lock(&bh->b_state_lock);
12552 +#endif
12553  }
12554  
12555  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
12556  {
12557 +#ifndef CONFIG_PREEMPT_RT_BASE
12558         return bit_spin_trylock(BH_State, &bh->b_state);
12559 +#else
12560 +       return spin_trylock(&bh->b_state_lock);
12561 +#endif
12562  }
12563  
12564  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
12565  {
12566 +#ifndef CONFIG_PREEMPT_RT_BASE
12567         return bit_spin_is_locked(BH_State, &bh->b_state);
12568 +#else
12569 +       return spin_is_locked(&bh->b_state_lock);
12570 +#endif
12571  }
12572  
12573  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
12574  {
12575 +#ifndef CONFIG_PREEMPT_RT_BASE
12576         bit_spin_unlock(BH_State, &bh->b_state);
12577 +#else
12578 +       spin_unlock(&bh->b_state_lock);
12579 +#endif
12580  }
12581  
12582  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
12583  {
12584 +#ifndef CONFIG_PREEMPT_RT_BASE
12585         bit_spin_lock(BH_JournalHead, &bh->b_state);
12586 +#else
12587 +       spin_lock(&bh->b_journal_head_lock);
12588 +#endif
12589  }
12590  
12591  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
12592  {
12593 +#ifndef CONFIG_PREEMPT_RT_BASE
12594         bit_spin_unlock(BH_JournalHead, &bh->b_state);
12595 +#else
12596 +       spin_unlock(&bh->b_journal_head_lock);
12597 +#endif
12598  }
12599  
12600  #define J_ASSERT(assert)       BUG_ON(!(assert))
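
Callers of the jbd_*_bh_state() helpers are unchanged; only the primitive underneath switches from a bit spinlock to b_state_lock on RT. A caller-side sketch (hypothetical helper, not part of the patch):

#include <linux/jbd2.h>

static void example_inspect_jh(struct buffer_head *bh)
{
        jbd_lock_bh_state(bh);
        /* fields serialized by BH_State, e.g. via bh2jh(bh), can be read here */
        jbd_unlock_bh_state(bh);
}
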
12601 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
12602 index a19bcf9e762e..897495386446 100644
12603 --- a/include/linux/kdb.h
12604 +++ b/include/linux/kdb.h
12605 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
12606  extern __printf(1, 2) int kdb_printf(const char *, ...);
12607  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
12608  
12609 +#define in_kdb_printk()        (kdb_trap_printk)
12610  extern void kdb_init(int level);
12611  
12612  /* Access to kdb specific polling devices */
12613 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
12614  extern int kdb_unregister(char *);
12615  #else /* ! CONFIG_KGDB_KDB */
12616  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
12617 +#define in_kdb_printk() (0)
12618  static inline void kdb_init(int level) {}
12619  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
12620                                char *help, short minlen) { return 0; }
12621 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
12622 index 50220cab738c..d68f639f7330 100644
12623 --- a/include/linux/kernel.h
12624 +++ b/include/linux/kernel.h
12625 @@ -188,6 +188,9 @@ extern int _cond_resched(void);
12626   */
12627  # define might_sleep() \
12628         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12629 +
12630 +# define might_sleep_no_state_check() \
12631 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12632  # define sched_annotate_sleep()        (current->task_state_change = 0)
12633  #else
12634    static inline void ___might_sleep(const char *file, int line,
12635 @@ -195,6 +198,7 @@ extern int _cond_resched(void);
12636    static inline void __might_sleep(const char *file, int line,
12637                                    int preempt_offset) { }
12638  # define might_sleep() do { might_resched(); } while (0)
12639 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
12640  # define sched_annotate_sleep() do { } while (0)
12641  #endif
12642  
12643 @@ -255,6 +259,7 @@ extern long (*panic_blink)(int state);
12644  __printf(1, 2)
12645  void panic(const char *fmt, ...)
12646         __noreturn __cold;
12647 +void nmi_panic(struct pt_regs *regs, const char *msg);
12648  extern void oops_enter(void);
12649  extern void oops_exit(void);
12650  void print_oops_end_marker(void);
12651 @@ -448,6 +453,14 @@ extern int sysctl_panic_on_stackoverflow;
12652  extern bool crash_kexec_post_notifiers;
12653  
12654  /*
12655 + * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
12656 + * holds a CPU number which is executing panic() currently. A value of
12657 + * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
12658 + */
12659 +extern atomic_t panic_cpu;
12660 +#define PANIC_CPU_INVALID      -1
12661 +
12662 +/*
12663   * Only to be used by arch init code. If the user over-wrote the default
12664   * CONFIG_PANIC_TIMEOUT, honor it.
12665   */
12666 @@ -475,6 +488,7 @@ extern enum system_states {
12667         SYSTEM_HALT,
12668         SYSTEM_POWER_OFF,
12669         SYSTEM_RESTART,
12670 +       SYSTEM_SUSPEND,
12671  } system_state;
12672  
12673  #define TAINT_PROPRIETARY_MODULE       0
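
nmi_panic() together with panic_cpu lets NMI-like contexts trigger a panic without deadlocking when another CPU is already panicking; only one CPU actually enters panic(). A hedged sketch of a caller (a hypothetical watchdog-style check, not part of the patch):

#include <linux/kernel.h>
#include <linux/ptrace.h>

static void example_nmi_check(struct pt_regs *regs, bool lockup_detected)
{
        /* safe even if several CPUs hit this at once: the panic_cpu
         * arbitration in nmi_panic() lets only one of them proceed */
        if (lockup_detected)
                nmi_panic(regs, "example: hard lockup detected");
}
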
12674 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
12675 index c923350ca20a..c690acc6900e 100644
12676 --- a/include/linux/kvm_host.h
12677 +++ b/include/linux/kvm_host.h
12678 @@ -25,6 +25,7 @@
12679  #include <linux/irqflags.h>
12680  #include <linux/context_tracking.h>
12681  #include <linux/irqbypass.h>
12682 +#include <linux/swait.h>
12683  #include <asm/signal.h>
12684  
12685  #include <linux/kvm.h>
12686 @@ -243,7 +244,7 @@ struct kvm_vcpu {
12687         int fpu_active;
12688         int guest_fpu_loaded, guest_xcr0_loaded;
12689         unsigned char fpu_counter;
12690 -       wait_queue_head_t wq;
12691 +       struct swait_queue_head wq;
12692         struct pid *pid;
12693         int sigset_active;
12694         sigset_t sigset;
12695 @@ -794,7 +795,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
12696  }
12697  #endif
12698  
12699 -static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
12700 +static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
12701  {
12702  #ifdef __KVM_HAVE_ARCH_WQP
12703         return vcpu->arch.wqp;
12704 diff --git a/include/linux/lglock.h b/include/linux/lglock.h
12705 index c92ebd100d9b..6f035f635d0e 100644
12706 --- a/include/linux/lglock.h
12707 +++ b/include/linux/lglock.h
12708 @@ -34,13 +34,30 @@
12709  #endif
12710  
12711  struct lglock {
12712 +#ifdef CONFIG_PREEMPT_RT_FULL
12713 +       struct rt_mutex __percpu *lock;
12714 +#else
12715         arch_spinlock_t __percpu *lock;
12716 +#endif
12717  #ifdef CONFIG_DEBUG_LOCK_ALLOC
12718         struct lock_class_key lock_key;
12719         struct lockdep_map    lock_dep_map;
12720  #endif
12721  };
12722  
12723 +#ifdef CONFIG_PREEMPT_RT_FULL
12724 +# define DEFINE_LGLOCK(name)                                           \
12725 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
12726 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
12727 +       struct lglock name = { .lock = &name ## _lock }
12728 +
12729 +# define DEFINE_STATIC_LGLOCK(name)                                    \
12730 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
12731 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
12732 +       static struct lglock name = { .lock = &name ## _lock }
12733 +
12734 +#else
12735 +
12736  #define DEFINE_LGLOCK(name)                                            \
12737         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
12738         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
12739 @@ -50,6 +67,7 @@ struct lglock {
12740         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
12741         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
12742         static struct lglock name = { .lock = &name ## _lock }
12743 +#endif
12744  
12745  void lg_lock_init(struct lglock *lg, char *name);
12746  
12747 @@ -64,6 +82,12 @@ void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
12748  void lg_global_lock(struct lglock *lg);
12749  void lg_global_unlock(struct lglock *lg);
12750  
12751 +#ifndef CONFIG_PREEMPT_RT_FULL
12752 +#define lg_global_trylock_relax(name)  lg_global_lock(name)
12753 +#else
12754 +void lg_global_trylock_relax(struct lglock *lg);
12755 +#endif
12756 +
12757  #else
12758  /* When !CONFIG_SMP, map lglock to spinlock */
12759  #define lglock spinlock
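
With the change above an lglock keeps its API but is backed by per-CPU rt_mutexes on RT. Usage stays the familiar local/global pattern; a sketch with hypothetical names (not part of the patch):

#include <linux/lglock.h>

static DEFINE_STATIC_LGLOCK(example_lglock);

static void example_update_this_cpu(void)
{
        /* per-CPU side: arch spinlock on !RT, a per-CPU rt_mutex on RT */
        lg_local_lock(&example_lglock);
        /* ... touch this CPU's share of the data ... */
        lg_local_unlock(&example_lglock);
}

static void example_walk_all_cpus(void)
{
        /* global side: acquires every per-CPU lock */
        lg_global_lock(&example_lglock);
        /* ... traverse the state of all CPUs ... */
        lg_global_unlock(&example_lglock);
}
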
12760 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
12761 index 8132214e8efd..89ffaa7bd342 100644
12762 --- a/include/linux/list_bl.h
12763 +++ b/include/linux/list_bl.h
12764 @@ -2,6 +2,7 @@
12765  #define _LINUX_LIST_BL_H
12766  
12767  #include <linux/list.h>
12768 +#include <linux/spinlock.h>
12769  #include <linux/bit_spinlock.h>
12770  
12771  /*
12772 @@ -32,13 +33,24 @@
12773  
12774  struct hlist_bl_head {
12775         struct hlist_bl_node *first;
12776 +#ifdef CONFIG_PREEMPT_RT_BASE
12777 +       raw_spinlock_t lock;
12778 +#endif
12779  };
12780  
12781  struct hlist_bl_node {
12782         struct hlist_bl_node *next, **pprev;
12783  };
12784 -#define INIT_HLIST_BL_HEAD(ptr) \
12785 -       ((ptr)->first = NULL)
12786 +
12787 +#ifdef CONFIG_PREEMPT_RT_BASE
12788 +#define INIT_HLIST_BL_HEAD(h)          \
12789 +do {                                   \
12790 +       (h)->first = NULL;              \
12791 +       raw_spin_lock_init(&(h)->lock); \
12792 +} while (0)
12793 +#else
12794 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
12795 +#endif
12796  
12797  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
12798  {
12799 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
12800  
12801  static inline void hlist_bl_lock(struct hlist_bl_head *b)
12802  {
12803 +#ifndef CONFIG_PREEMPT_RT_BASE
12804         bit_spin_lock(0, (unsigned long *)b);
12805 +#else
12806 +       raw_spin_lock(&b->lock);
12807 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12808 +       __set_bit(0, (unsigned long *)b);
12809 +#endif
12810 +#endif
12811  }
12812  
12813  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
12814  {
12815 +#ifndef CONFIG_PREEMPT_RT_BASE
12816         __bit_spin_unlock(0, (unsigned long *)b);
12817 +#else
12818 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12819 +       __clear_bit(0, (unsigned long *)b);
12820 +#endif
12821 +       raw_spin_unlock(&b->lock);
12822 +#endif
12823  }
12824  
12825  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
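
Bit-spinlocked hash buckets keep their interface; on RT the lock bit is shadowed by a real raw spinlock embedded in hlist_bl_head. A bucket-style usage sketch with hypothetical names (not part of the patch):

#include <linux/list_bl.h>

#define EXAMPLE_BUCKETS 64

static struct hlist_bl_head example_hash[EXAMPLE_BUCKETS];

static void example_hash_init(void)
{
        int i;

        /* on RT this also initializes the per-bucket raw spinlock */
        for (i = 0; i < EXAMPLE_BUCKETS; i++)
                INIT_HLIST_BL_HEAD(&example_hash[i]);
}

static void example_hash_add(struct hlist_bl_node *n, unsigned int bucket)
{
        hlist_bl_lock(&example_hash[bucket]);
        hlist_bl_add_head(n, &example_hash[bucket]);
        hlist_bl_unlock(&example_hash[bucket]);
}
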
12826 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
12827 new file mode 100644
12828 index 000000000000..e572a3971631
12829 --- /dev/null
12830 +++ b/include/linux/locallock.h
12831 @@ -0,0 +1,276 @@
12832 +#ifndef _LINUX_LOCALLOCK_H
12833 +#define _LINUX_LOCALLOCK_H
12834 +
12835 +#include <linux/percpu.h>
12836 +#include <linux/spinlock.h>
12837 +
12838 +#ifdef CONFIG_PREEMPT_RT_BASE
12839 +
12840 +#ifdef CONFIG_DEBUG_SPINLOCK
12841 +# define LL_WARN(cond) WARN_ON(cond)
12842 +#else
12843 +# define LL_WARN(cond) do { } while (0)
12844 +#endif
12845 +
12846 +/*
12847 + * per cpu lock based substitute for local_irq_*()
12848 + */
12849 +struct local_irq_lock {
12850 +       spinlock_t              lock;
12851 +       struct task_struct      *owner;
12852 +       int                     nestcnt;
12853 +       unsigned long           flags;
12854 +};
12855 +
12856 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
12857 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
12858 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
12859 +
12860 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
12861 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
12862 +
12863 +#define local_irq_lock_init(lvar)                                      \
12864 +       do {                                                            \
12865 +               int __cpu;                                              \
12866 +               for_each_possible_cpu(__cpu)                            \
12867 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
12868 +       } while (0)
12869 +
12870 +/*
12871 + * spin_lock|trylock|unlock_local flavours that do not disable migration;
12872 + * used for __local_lock|trylock|unlock, where get_local_var/put_local_var
12873 + * already take care of the migrate_disable/enable.
12874 + * For CONFIG_PREEMPT_BASE they map to the normal spin_* calls.
12875 + */
12876 +#ifdef CONFIG_PREEMPT_RT_FULL
12877 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
12878 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
12879 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
12880 +#else
12881 +# define spin_lock_local(lock)                 spin_lock(lock)
12882 +# define spin_trylock_local(lock)              spin_trylock(lock)
12883 +# define spin_unlock_local(lock)               spin_unlock(lock)
12884 +#endif
12885 +
12886 +static inline void __local_lock(struct local_irq_lock *lv)
12887 +{
12888 +       if (lv->owner != current) {
12889 +               spin_lock_local(&lv->lock);
12890 +               LL_WARN(lv->owner);
12891 +               LL_WARN(lv->nestcnt);
12892 +               lv->owner = current;
12893 +       }
12894 +       lv->nestcnt++;
12895 +}
12896 +
12897 +#define local_lock(lvar)                                       \
12898 +       do { __local_lock(&get_local_var(lvar)); } while (0)
12899 +
12900 +#define local_lock_on(lvar, cpu)                               \
12901 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
12902 +
12903 +static inline int __local_trylock(struct local_irq_lock *lv)
12904 +{
12905 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
12906 +               LL_WARN(lv->owner);
12907 +               LL_WARN(lv->nestcnt);
12908 +               lv->owner = current;
12909 +               lv->nestcnt = 1;
12910 +               return 1;
12911 +       }
12912 +       return 0;
12913 +}
12914 +
12915 +#define local_trylock(lvar)                                            \
12916 +       ({                                                              \
12917 +               int __locked;                                           \
12918 +               __locked = __local_trylock(&get_local_var(lvar));       \
12919 +               if (!__locked)                                          \
12920 +                       put_local_var(lvar);                            \
12921 +               __locked;                                               \
12922 +       })
12923 +
12924 +static inline void __local_unlock(struct local_irq_lock *lv)
12925 +{
12926 +       LL_WARN(lv->nestcnt == 0);
12927 +       LL_WARN(lv->owner != current);
12928 +       if (--lv->nestcnt)
12929 +               return;
12930 +
12931 +       lv->owner = NULL;
12932 +       spin_unlock_local(&lv->lock);
12933 +}
12934 +
12935 +#define local_unlock(lvar)                                     \
12936 +       do {                                                    \
12937 +               __local_unlock(this_cpu_ptr(&lvar));            \
12938 +               put_local_var(lvar);                            \
12939 +       } while (0)
12940 +
12941 +#define local_unlock_on(lvar, cpu)                       \
12942 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
12943 +
12944 +static inline void __local_lock_irq(struct local_irq_lock *lv)
12945 +{
12946 +       spin_lock_irqsave(&lv->lock, lv->flags);
12947 +       LL_WARN(lv->owner);
12948 +       LL_WARN(lv->nestcnt);
12949 +       lv->owner = current;
12950 +       lv->nestcnt = 1;
12951 +}
12952 +
12953 +#define local_lock_irq(lvar)                                           \
12954 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
12955 +
12956 +#define local_lock_irq_on(lvar, cpu)                                   \
12957 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
12958 +
12959 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
12960 +{
12961 +       LL_WARN(!lv->nestcnt);
12962 +       LL_WARN(lv->owner != current);
12963 +       lv->owner = NULL;
12964 +       lv->nestcnt = 0;
12965 +       spin_unlock_irq(&lv->lock);
12966 +}
12967 +
12968 +#define local_unlock_irq(lvar)                                         \
12969 +       do {                                                            \
12970 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
12971 +               put_local_var(lvar);                                    \
12972 +       } while (0)
12973 +
12974 +#define local_unlock_irq_on(lvar, cpu)                                 \
12975 +       do {                                                            \
12976 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
12977 +       } while (0)
12978 +
12979 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
12980 +{
12981 +       if (lv->owner != current) {
12982 +               __local_lock_irq(lv);
12983 +               return 0;
12984 +       } else {
12985 +               lv->nestcnt++;
12986 +               return 1;
12987 +       }
12988 +}
12989 +
12990 +#define local_lock_irqsave(lvar, _flags)                               \
12991 +       do {                                                            \
12992 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
12993 +                       put_local_var(lvar);                            \
12994 +               _flags = __this_cpu_read(lvar.flags);                   \
12995 +       } while (0)
12996 +
12997 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
12998 +       do {                                                            \
12999 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
13000 +               _flags = per_cpu(lvar, cpu).flags;                      \
13001 +       } while (0)
13002 +
13003 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
13004 +                                           unsigned long flags)
13005 +{
13006 +       LL_WARN(!lv->nestcnt);
13007 +       LL_WARN(lv->owner != current);
13008 +       if (--lv->nestcnt)
13009 +               return 0;
13010 +
13011 +       lv->owner = NULL;
13012 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
13013 +       return 1;
13014 +}
13015 +
13016 +#define local_unlock_irqrestore(lvar, flags)                           \
13017 +       do {                                                            \
13018 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
13019 +                       put_local_var(lvar);                            \
13020 +       } while (0)
13021 +
13022 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
13023 +       do {                                                            \
13024 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
13025 +       } while (0)
13026 +
13027 +#define local_spin_trylock_irq(lvar, lock)                             \
13028 +       ({                                                              \
13029 +               int __locked;                                           \
13030 +               local_lock_irq(lvar);                                   \
13031 +               __locked = spin_trylock(lock);                          \
13032 +               if (!__locked)                                          \
13033 +                       local_unlock_irq(lvar);                         \
13034 +               __locked;                                               \
13035 +       })
13036 +
13037 +#define local_spin_lock_irq(lvar, lock)                                        \
13038 +       do {                                                            \
13039 +               local_lock_irq(lvar);                                   \
13040 +               spin_lock(lock);                                        \
13041 +       } while (0)
13042 +
13043 +#define local_spin_unlock_irq(lvar, lock)                              \
13044 +       do {                                                            \
13045 +               spin_unlock(lock);                                      \
13046 +               local_unlock_irq(lvar);                                 \
13047 +       } while (0)
13048 +
13049 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
13050 +       do {                                                            \
13051 +               local_lock_irqsave(lvar, flags);                        \
13052 +               spin_lock(lock);                                        \
13053 +       } while (0)
13054 +
13055 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
13056 +       do {                                                            \
13057 +               spin_unlock(lock);                                      \
13058 +               local_unlock_irqrestore(lvar, flags);                   \
13059 +       } while (0)
13060 +
13061 +#define get_locked_var(lvar, var)                                      \
13062 +       (*({                                                            \
13063 +               local_lock(lvar);                                       \
13064 +               this_cpu_ptr(&var);                                     \
13065 +       }))
13066 +
13067 +#define put_locked_var(lvar, var)      local_unlock(lvar);
13068 +
13069 +#define local_lock_cpu(lvar)                                           \
13070 +       ({                                                              \
13071 +               local_lock(lvar);                                       \
13072 +               smp_processor_id();                                     \
13073 +       })
13074 +
13075 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
13076 +
13077 +#else /* PREEMPT_RT_BASE */
13078 +
13079 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
13080 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
13081 +
13082 +static inline void local_irq_lock_init(int lvar) { }
13083 +
13084 +#define local_lock(lvar)                       preempt_disable()
13085 +#define local_unlock(lvar)                     preempt_enable()
13086 +#define local_lock_irq(lvar)                   local_irq_disable()
13087 +#define local_unlock_irq(lvar)                 local_irq_enable()
13088 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
13089 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
13090 +
13091 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
13092 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
13093 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
13094 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
13095 +       spin_lock_irqsave(lock, flags)
13096 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
13097 +       spin_unlock_irqrestore(lock, flags)
13098 +
13099 +#define get_locked_var(lvar, var)              get_cpu_var(var)
13100 +#define put_locked_var(lvar, var)              put_cpu_var(var)
13101 +
13102 +#define local_lock_cpu(lvar)                   get_cpu()
13103 +#define local_unlock_cpu(lvar)                 put_cpu()
13104 +
13105 +#endif
13106 +
13107 +#endif
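
A typical consumer of the new locallock API pairs a per-CPU lock with per-CPU data, so that !RT keeps the old local_irq_save() behaviour while RT takes a per-CPU spinlock and stays preemptible. A minimal sketch (hypothetical example_* names, not part of the patch):

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_stat);
static DEFINE_LOCAL_IRQ_LOCK(example_lock);

static void example_account(void)
{
        unsigned long flags;

        /* !RT: behaves like local_irq_save()/local_irq_restore().
         * RT: takes the per-CPU example_lock, interrupts stay enabled. */
        local_lock_irqsave(example_lock, flags);
        __this_cpu_inc(example_stat);
        local_unlock_irqrestore(example_lock, flags);
}
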
13108 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
13109 index f8d1492a114f..b238ebfbb4d6 100644
13110 --- a/include/linux/mm_types.h
13111 +++ b/include/linux/mm_types.h
13112 @@ -11,6 +11,7 @@
13113  #include <linux/completion.h>
13114  #include <linux/cpumask.h>
13115  #include <linux/uprobes.h>
13116 +#include <linux/rcupdate.h>
13117  #include <linux/page-flags-layout.h>
13118  #include <asm/page.h>
13119  #include <asm/mmu.h>
13120 @@ -504,6 +505,9 @@ struct mm_struct {
13121         bool tlb_flush_pending;
13122  #endif
13123         struct uprobes_state uprobes_state;
13124 +#ifdef CONFIG_PREEMPT_RT_BASE
13125 +       struct rcu_head delayed_drop;
13126 +#endif
13127  #ifdef CONFIG_X86_INTEL_MPX
13128         /* address of the bounds directory */
13129         void __user *bd_addr;
13130 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
13131 index 2cb7531e7d7a..b3fdfc820216 100644
13132 --- a/include/linux/mutex.h
13133 +++ b/include/linux/mutex.h
13134 @@ -19,6 +19,17 @@
13135  #include <asm/processor.h>
13136  #include <linux/osq_lock.h>
13137  
13138 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13139 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13140 +       , .dep_map = { .name = #lockname }
13141 +#else
13142 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13143 +#endif
13144 +
13145 +#ifdef CONFIG_PREEMPT_RT_FULL
13146 +# include <linux/mutex_rt.h>
13147 +#else
13148 +
13149  /*
13150   * Simple, straightforward mutexes with strict semantics:
13151   *
13152 @@ -99,13 +110,6 @@ do {                                                        \
13153  static inline void mutex_destroy(struct mutex *lock) {}
13154  #endif
13155  
13156 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
13157 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13158 -               , .dep_map = { .name = #lockname }
13159 -#else
13160 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13161 -#endif
13162 -
13163  #define __MUTEX_INITIALIZER(lockname) \
13164                 { .count = ATOMIC_INIT(1) \
13165                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
13166 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
13167  extern int mutex_trylock(struct mutex *lock);
13168  extern void mutex_unlock(struct mutex *lock);
13169  
13170 +#endif /* !PREEMPT_RT_FULL */
13171 +
13172  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
13173  
13174  #endif /* __LINUX_MUTEX_H */
13175 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
13176 new file mode 100644
13177 index 000000000000..c38a44b14da5
13178 --- /dev/null
13179 +++ b/include/linux/mutex_rt.h
13180 @@ -0,0 +1,84 @@
13181 +#ifndef __LINUX_MUTEX_RT_H
13182 +#define __LINUX_MUTEX_RT_H
13183 +
13184 +#ifndef __LINUX_MUTEX_H
13185 +#error "Please include mutex.h"
13186 +#endif
13187 +
13188 +#include <linux/rtmutex.h>
13189 +
13190 +/* FIXME: Just for __lockfunc */
13191 +#include <linux/spinlock.h>
13192 +
13193 +struct mutex {
13194 +       struct rt_mutex         lock;
13195 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13196 +       struct lockdep_map      dep_map;
13197 +#endif
13198 +};
13199 +
13200 +#define __MUTEX_INITIALIZER(mutexname)                                 \
13201 +       {                                                               \
13202 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
13203 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
13204 +       }
13205 +
13206 +#define DEFINE_MUTEX(mutexname)                                                \
13207 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
13208 +
13209 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
13210 +extern void __lockfunc _mutex_lock(struct mutex *lock);
13211 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
13212 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
13213 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
13214 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
13215 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
13216 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
13217 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
13218 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
13219 +
13220 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
13221 +#define mutex_lock(l)                  _mutex_lock(l)
13222 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
13223 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
13224 +#define mutex_trylock(l)               _mutex_trylock(l)
13225 +#define mutex_unlock(l)                        _mutex_unlock(l)
13226 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
13227 +
13228 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13229 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
13230 +# define mutex_lock_interruptible_nested(l, s) \
13231 +                                       _mutex_lock_interruptible_nested(l, s)
13232 +# define mutex_lock_killable_nested(l, s) \
13233 +                                       _mutex_lock_killable_nested(l, s)
13234 +
13235 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
13236 +do {                                                                   \
13237 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
13238 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
13239 +} while (0)
13240 +
13241 +#else
13242 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
13243 +# define mutex_lock_interruptible_nested(l, s) \
13244 +                                       _mutex_lock_interruptible(l)
13245 +# define mutex_lock_killable_nested(l, s) \
13246 +                                       _mutex_lock_killable(l)
13247 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
13248 +#endif
13249 +
13250 +# define mutex_init(mutex)                             \
13251 +do {                                                   \
13252 +       static struct lock_class_key __key;             \
13253 +                                                       \
13254 +       rt_mutex_init(&(mutex)->lock);                  \
13255 +       __mutex_do_init((mutex), #mutex, &__key);       \
13256 +} while (0)
13257 +
13258 +# define __mutex_init(mutex, name, key)                        \
13259 +do {                                                   \
13260 +       rt_mutex_init(&(mutex)->lock);                  \
13261 +       __mutex_do_init((mutex), name, key);            \
13262 +} while (0)
13263 +
13264 +#endif
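
Callers never see the substitution: the usual mutex API keeps working and simply resolves to the rt_mutex based wrappers declared above when PREEMPT_RT_FULL is enabled. A caller-side sketch (hypothetical names, not part of the patch):

#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);
static int example_setting;

static void example_set(int v)
{
        mutex_lock(&example_mutex);     /* _mutex_lock() on RT, regular mutex otherwise */
        example_setting = v;
        mutex_unlock(&example_mutex);
}
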
13265 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
13266 index 12b4d54a8ffa..a2e7d1816b4c 100644
13267 --- a/include/linux/netdevice.h
13268 +++ b/include/linux/netdevice.h
13269 @@ -2248,11 +2248,20 @@ void netdev_freemem(struct net_device *dev);
13270  void synchronize_net(void);
13271  int init_dummy_netdev(struct net_device *dev);
13272  
13273 +#ifdef CONFIG_PREEMPT_RT_FULL
13274 +static inline int dev_recursion_level(void)
13275 +{
13276 +       return current->xmit_recursion;
13277 +}
13278 +
13279 +#else
13280 +
13281  DECLARE_PER_CPU(int, xmit_recursion);
13282  static inline int dev_recursion_level(void)
13283  {
13284         return this_cpu_read(xmit_recursion);
13285  }
13286 +#endif
13287  
13288  struct net_device *dev_get_by_index(struct net *net, int ifindex);
13289  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
13290 @@ -2563,6 +2572,7 @@ struct softnet_data {
13291         unsigned int            dropped;
13292         struct sk_buff_head     input_pkt_queue;
13293         struct napi_struct      backlog;
13294 +       struct sk_buff_head     tofree_queue;
13295  
13296  };
13297  
13298 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
13299 index 04078e8a4803..a61c9609e32f 100644
13300 --- a/include/linux/netfilter/x_tables.h
13301 +++ b/include/linux/netfilter/x_tables.h
13302 @@ -4,6 +4,7 @@
13303  
13304  #include <linux/netdevice.h>
13305  #include <linux/static_key.h>
13306 +#include <linux/locallock.h>
13307  #include <uapi/linux/netfilter/x_tables.h>
13308  
13309  /**
13310 @@ -289,6 +290,8 @@ void xt_free_table_info(struct xt_table_info *info);
13311   */
13312  DECLARE_PER_CPU(seqcount_t, xt_recseq);
13313  
13314 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
13315 +
13316  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
13317   *
13318   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
13319 @@ -309,6 +312,9 @@ static inline unsigned int xt_write_recseq_begin(void)
13320  {
13321         unsigned int addend;
13322  
13323 +       /* RT protection */
13324 +       local_lock(xt_write_lock);
13325 +
13326         /*
13327          * Low order bit of sequence is set if we already
13328          * called xt_write_recseq_begin().
13329 @@ -339,6 +345,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
13330         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
13331         smp_wmb();
13332         __this_cpu_add(xt_recseq.sequence, addend);
13333 +       local_unlock(xt_write_lock);
13334  }
13335  
13336  /*
13337 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
13338 index d14a4c362465..2e4414a0c1c4 100644
13339 --- a/include/linux/notifier.h
13340 +++ b/include/linux/notifier.h
13341 @@ -6,7 +6,7 @@
13342   *
13343   *                             Alan Cox <Alan.Cox@linux.org>
13344   */
13345
13346 +
13347  #ifndef _LINUX_NOTIFIER_H
13348  #define _LINUX_NOTIFIER_H
13349  #include <linux/errno.h>
13350 @@ -42,9 +42,7 @@
13351   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
13352   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
13353   * SRCU notifier chains should be used when the chain will be called very
13354 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
13355 - * chains are slightly more difficult to use because they require special
13356 - * runtime initialization.
13357 + * often but notifier_blocks will seldom be removed.
13358   */
13359  
13360  typedef        int (*notifier_fn_t)(struct notifier_block *nb,
13361 @@ -88,7 +86,7 @@ struct srcu_notifier_head {
13362                 (name)->head = NULL;            \
13363         } while (0)
13364  
13365 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
13366 +/* srcu_notifier_heads must be cleaned up dynamically */
13367  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13368  #define srcu_cleanup_notifier_head(name)       \
13369                 cleanup_srcu_struct(&(name)->srcu);
13370 @@ -101,7 +99,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13371                 .head = NULL }
13372  #define RAW_NOTIFIER_INIT(name)        {                               \
13373                 .head = NULL }
13374 -/* srcu_notifier_heads cannot be initialized statically */
13375 +
13376 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
13377 +       {                                                       \
13378 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
13379 +               .head = NULL,                                   \
13380 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
13381 +       }
13382  
13383  #define ATOMIC_NOTIFIER_HEAD(name)                             \
13384         struct atomic_notifier_head name =                      \
13385 @@ -113,6 +117,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13386         struct raw_notifier_head name =                         \
13387                 RAW_NOTIFIER_INIT(name)
13388  
13389 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
13390 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
13391 +                       name##_head_srcu_array);                \
13392 +       mod struct srcu_notifier_head name =                    \
13393 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
13394 +
13395 +#define SRCU_NOTIFIER_HEAD(name)                               \
13396 +       _SRCU_NOTIFIER_HEAD(name, )
13397 +
13398 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
13399 +       _SRCU_NOTIFIER_HEAD(name, static)
13400 +
13401  #ifdef __KERNEL__
13402  
13403  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
13404 @@ -182,12 +198,12 @@ static inline int notifier_to_errno(int ret)
13405  
13406  /*
13407   *     Declared notifiers so far. I can imagine quite a few more chains
13408 - *     over time (eg laptop power reset chains, reboot chain (to clean 
13409 + *     over time (eg laptop power reset chains, reboot chain (to clean
13410   *     device units up), device [un]mount chain, module load/unload chain,
13411 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
13412 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
13413   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
13414   */
13415
13416 +
13417  /* CPU notfiers are defined in include/linux/cpu.h. */
13418  
13419  /* netdevice notifiers are defined in include/linux/netdevice.h */
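
SRCU_NOTIFIER_HEAD() removes the need for runtime srcu_init_notifier_head() by initializing the head, its mutex and its srcu_struct statically. A sketch of defining and using such a chain (hypothetical example_* names, not part of the patch):

#include <linux/notifier.h>

static SRCU_NOTIFIER_HEAD(example_chain);

static int example_event(struct notifier_block *nb, unsigned long action,
                         void *data)
{
        return NOTIFY_OK;
}

static struct notifier_block example_nb = {
        .notifier_call = example_event,
};

static void example_use_chain(void)
{
        srcu_notifier_chain_register(&example_chain, &example_nb);
        srcu_notifier_call_chain(&example_chain, 0, NULL);
}
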
13420 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
13421 index caebf2a758dc..53a60a51c758 100644
13422 --- a/include/linux/percpu.h
13423 +++ b/include/linux/percpu.h
13424 @@ -24,6 +24,35 @@
13425          PERCPU_MODULE_RESERVE)
13426  #endif
13427  
13428 +#ifdef CONFIG_PREEMPT_RT_FULL
13429 +
13430 +#define get_local_var(var) (*({                \
13431 +              migrate_disable();       \
13432 +              this_cpu_ptr(&var);      }))
13433 +
13434 +#define put_local_var(var) do {        \
13435 +       (void)&(var);           \
13436 +       migrate_enable();       \
13437 +} while (0)
13438 +
13439 +# define get_local_ptr(var) ({         \
13440 +               migrate_disable();      \
13441 +               this_cpu_ptr(var);      })
13442 +
13443 +# define put_local_ptr(var) do {       \
13444 +       (void)(var);                    \
13445 +       migrate_enable();               \
13446 +} while (0)
13447 +
13448 +#else
13449 +
13450 +#define get_local_var(var)     get_cpu_var(var)
13451 +#define put_local_var(var)     put_cpu_var(var)
13452 +#define get_local_ptr(var)     get_cpu_ptr(var)
13453 +#define put_local_ptr(var)     put_cpu_ptr(var)
13454 +
13455 +#endif
13456 +
13457  /* minimum unit size, also is the maximum supported allocation size */
13458  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
13459  
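
get_local_var()/put_local_var() give per-CPU access that only disables migration on RT (instead of preemption), so the section may still take sleeping locks. A minimal sketch (hypothetical names, not part of the patch):

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_events);

static void example_count_event(void)
{
        unsigned long *cnt;

        /* !RT: get_cpu_var() semantics, i.e. preemption disabled.
         * RT: only migration is disabled, the section stays preemptible. */
        cnt = &get_local_var(example_events);
        (*cnt)++;
        put_local_var(example_events);
}
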
13460 diff --git a/include/linux/pid.h b/include/linux/pid.h
13461 index 23705a53abba..2cc64b779f03 100644
13462 --- a/include/linux/pid.h
13463 +++ b/include/linux/pid.h
13464 @@ -2,6 +2,7 @@
13465  #define _LINUX_PID_H
13466  
13467  #include <linux/rcupdate.h>
13468 +#include <linux/atomic.h>
13469  
13470  enum pid_type
13471  {
13472 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
13473 index 75e4e30677f1..1cfb1cb72354 100644
13474 --- a/include/linux/preempt.h
13475 +++ b/include/linux/preempt.h
13476 @@ -50,7 +50,11 @@
13477  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
13478  #define NMI_OFFSET     (1UL << NMI_SHIFT)
13479  
13480 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13481 +#ifndef CONFIG_PREEMPT_RT_FULL
13482 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
13483 +#else
13484 +# define SOFTIRQ_DISABLE_OFFSET                (0)
13485 +#endif
13486  
13487  /* We use the MSB mostly because its available */
13488  #define PREEMPT_NEED_RESCHED   0x80000000
13489 @@ -59,9 +63,15 @@
13490  #include <asm/preempt.h>
13491  
13492  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
13493 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
13494  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
13495                                  | NMI_MASK))
13496 +#ifndef CONFIG_PREEMPT_RT_FULL
13497 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
13498 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
13499 +#else
13500 +# define softirq_count()       (0UL)
13501 +extern int in_serving_softirq(void);
13502 +#endif
13503  
13504  /*
13505   * Are we doing bottom half or hardware interrupt processing?
13506 @@ -72,7 +82,6 @@
13507  #define in_irq()               (hardirq_count())
13508  #define in_softirq()           (softirq_count())
13509  #define in_interrupt()         (irq_count())
13510 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
13511  
13512  /*
13513   * Are we in NMI context?
13514 @@ -91,7 +100,11 @@
13515  /*
13516   * The preempt_count offset after spin_lock()
13517   */
13518 +#if !defined(CONFIG_PREEMPT_RT_FULL)
13519  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
13520 +#else
13521 +#define PREEMPT_LOCK_OFFSET    0
13522 +#endif
13523  
13524  /*
13525   * The preempt_count offset needed for things like:
13526 @@ -140,6 +153,20 @@ extern void preempt_count_sub(int val);
13527  #define preempt_count_inc() preempt_count_add(1)
13528  #define preempt_count_dec() preempt_count_sub(1)
13529  
13530 +#ifdef CONFIG_PREEMPT_LAZY
13531 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
13532 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
13533 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
13534 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
13535 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
13536 +#else
13537 +#define add_preempt_lazy_count(val)    do { } while (0)
13538 +#define sub_preempt_lazy_count(val)    do { } while (0)
13539 +#define inc_preempt_lazy_count()       do { } while (0)
13540 +#define dec_preempt_lazy_count()       do { } while (0)
13541 +#define preempt_lazy_count()           (0)
13542 +#endif
13543 +
13544  #ifdef CONFIG_PREEMPT_COUNT
13545  
13546  #define preempt_disable() \
13547 @@ -148,13 +175,25 @@ do { \
13548         barrier(); \
13549  } while (0)
13550  
13551 +#define preempt_lazy_disable() \
13552 +do { \
13553 +       inc_preempt_lazy_count(); \
13554 +       barrier(); \
13555 +} while (0)
13556 +
13557  #define sched_preempt_enable_no_resched() \
13558  do { \
13559         barrier(); \
13560         preempt_count_dec(); \
13561  } while (0)
13562  
13563 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13564 +#ifdef CONFIG_PREEMPT_RT_BASE
13565 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13566 +# define preempt_check_resched_rt() preempt_check_resched()
13567 +#else
13568 +# define preempt_enable_no_resched() preempt_enable()
13569 +# define preempt_check_resched_rt() barrier();
13570 +#endif
13571  
13572  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
13573  
13574 @@ -179,6 +218,13 @@ do { \
13575                 __preempt_schedule(); \
13576  } while (0)
13577  
13578 +#define preempt_lazy_enable() \
13579 +do { \
13580 +       dec_preempt_lazy_count(); \
13581 +       barrier(); \
13582 +       preempt_check_resched(); \
13583 +} while (0)
13584 +
13585  #else /* !CONFIG_PREEMPT */
13586  #define preempt_enable() \
13587  do { \
13588 @@ -224,6 +270,7 @@ do { \
13589  #define preempt_disable_notrace()              barrier()
13590  #define preempt_enable_no_resched_notrace()    barrier()
13591  #define preempt_enable_notrace()               barrier()
13592 +#define preempt_check_resched_rt()             barrier()
13593  #define preemptible()                          0
13594  
13595  #endif /* CONFIG_PREEMPT_COUNT */
13596 @@ -244,10 +291,31 @@ do { \
13597  } while (0)
13598  #define preempt_fold_need_resched() \
13599  do { \
13600 -       if (tif_need_resched()) \
13601 +       if (tif_need_resched_now()) \
13602                 set_preempt_need_resched(); \
13603  } while (0)
13604  
13605 +#ifdef CONFIG_PREEMPT_RT_FULL
13606 +# define preempt_disable_rt()          preempt_disable()
13607 +# define preempt_enable_rt()           preempt_enable()
13608 +# define preempt_disable_nort()                barrier()
13609 +# define preempt_enable_nort()         barrier()
13610 +# ifdef CONFIG_SMP
13611 +   extern void migrate_disable(void);
13612 +   extern void migrate_enable(void);
13613 +# else /* CONFIG_SMP */
13614 +#  define migrate_disable()            barrier()
13615 +#  define migrate_enable()             barrier()
13616 +# endif /* CONFIG_SMP */
13617 +#else
13618 +# define preempt_disable_rt()          barrier()
13619 +# define preempt_enable_rt()           barrier()
13620 +# define preempt_disable_nort()                preempt_disable()
13621 +# define preempt_enable_nort()         preempt_enable()
13622 +# define migrate_disable()             preempt_disable()
13623 +# define migrate_enable()              preempt_enable()
13624 +#endif
13625 +
13626  #ifdef CONFIG_PREEMPT_NOTIFIERS
13627  
13628  struct preempt_notifier;
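
migrate_disable()/migrate_enable() pin a task to its current CPU without disabling preemption on RT; on !RT they fall back to preempt_disable()/preempt_enable(). A sketch (hypothetical function, not part of the patch):

#include <linux/preempt.h>
#include <linux/smp.h>

static void example_cpu_bound_step(void)
{
        int cpu;

        migrate_disable();              /* stay on this CPU, remain preemptible on RT */
        cpu = smp_processor_id();
        /* ... per-CPU work that must not hop between CPUs ... */
        migrate_enable();
}
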
13629 diff --git a/include/linux/printk.h b/include/linux/printk.h
13630 index 9729565c25ff..9cdca696b718 100644
13631 --- a/include/linux/printk.h
13632 +++ b/include/linux/printk.h
13633 @@ -117,9 +117,11 @@ int no_printk(const char *fmt, ...)
13634  #ifdef CONFIG_EARLY_PRINTK
13635  extern asmlinkage __printf(1, 2)
13636  void early_printk(const char *fmt, ...);
13637 +extern void printk_kill(void);
13638  #else
13639  static inline __printf(1, 2) __cold
13640  void early_printk(const char *s, ...) { }
13641 +static inline void printk_kill(void) { }
13642  #endif
13643  
13644  typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
13645 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
13646 index 5d5174b59802..8ddbd6e15a3c 100644
13647 --- a/include/linux/radix-tree.h
13648 +++ b/include/linux/radix-tree.h
13649 @@ -277,8 +277,13 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
13650  unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
13651                         void ***results, unsigned long *indices,
13652                         unsigned long first_index, unsigned int max_items);
13653 +#ifndef CONFIG_PREEMPT_RT_FULL
13654  int radix_tree_preload(gfp_t gfp_mask);
13655  int radix_tree_maybe_preload(gfp_t gfp_mask);
13656 +#else
13657 +static inline int radix_tree_preload(gfp_t gm) { return 0; }
13658 +static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
13659 +#endif
13660  void radix_tree_init(void);
13661  void *radix_tree_tag_set(struct radix_tree_root *root,
13662                         unsigned long index, unsigned int tag);
13663 @@ -303,7 +308,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
13664  
13665  static inline void radix_tree_preload_end(void)
13666  {
13667 -       preempt_enable();
13668 +       preempt_enable_nort();
13669  }
13670  
13671  /**
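
On RT the preload step becomes a no-op stub because it relies on disabled preemption, but the caller pattern is unchanged. A sketch of the usual insert path (hypothetical tree and lock names, not part of the patch):

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static RADIX_TREE(example_tree, GFP_ATOMIC);
static DEFINE_SPINLOCK(example_tree_lock);

static int example_store(unsigned long index, void *item)
{
        int err;

        err = radix_tree_preload(GFP_KERNEL);   /* returns 0 unconditionally on RT */
        if (err)
                return err;

        spin_lock(&example_tree_lock);
        err = radix_tree_insert(&example_tree, index, item);
        spin_unlock(&example_tree_lock);

        radix_tree_preload_end();               /* preempt_enable_nort(), per the hunk above */
        return err;
}
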
13672 diff --git a/include/linux/random.h b/include/linux/random.h
13673 index a75840c1aa71..1a804361670c 100644
13674 --- a/include/linux/random.h
13675 +++ b/include/linux/random.h
13676 @@ -20,7 +20,7 @@ struct random_ready_callback {
13677  extern void add_device_randomness(const void *, unsigned int);
13678  extern void add_input_randomness(unsigned int type, unsigned int code,
13679                                  unsigned int value);
13680 -extern void add_interrupt_randomness(int irq, int irq_flags);
13681 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip);
13682  
13683  extern void get_random_bytes(void *buf, int nbytes);
13684  extern int add_random_ready_callback(struct random_ready_callback *rdy);
13685 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
13686 index a5aa7ae671f4..24ddffd25492 100644
13687 --- a/include/linux/rbtree.h
13688 +++ b/include/linux/rbtree.h
13689 @@ -31,7 +31,6 @@
13690  
13691  #include <linux/kernel.h>
13692  #include <linux/stddef.h>
13693 -#include <linux/rcupdate.h>
13694  
13695  struct rb_node {
13696         unsigned long  __rb_parent_color;
13697 @@ -86,14 +85,8 @@ static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
13698         *rb_link = node;
13699  }
13700  
13701 -static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
13702 -                                   struct rb_node **rb_link)
13703 -{
13704 -       node->__rb_parent_color = (unsigned long)parent;
13705 -       node->rb_left = node->rb_right = NULL;
13706 -
13707 -       rcu_assign_pointer(*rb_link, node);
13708 -}
13709 +void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
13710 +                     struct rb_node **rb_link);
13711  
13712  #define rb_entry_safe(ptr, type, member) \
13713         ({ typeof(ptr) ____ptr = (ptr); \
13714 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
13715 index a0189ba67fde..c2f5f955163d 100644
13716 --- a/include/linux/rcupdate.h
13717 +++ b/include/linux/rcupdate.h
13718 @@ -169,6 +169,9 @@ void call_rcu(struct rcu_head *head,
13719  
13720  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
13721  
13722 +#ifdef CONFIG_PREEMPT_RT_FULL
13723 +#define call_rcu_bh    call_rcu
13724 +#else
13725  /**
13726   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
13727   * @head: structure to be used for queueing the RCU updates.
13728 @@ -192,6 +195,7 @@ void call_rcu(struct rcu_head *head,
13729   */
13730  void call_rcu_bh(struct rcu_head *head,
13731                  rcu_callback_t func);
13732 +#endif
13733  
13734  /**
13735   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
13736 @@ -292,6 +296,11 @@ void synchronize_rcu(void);
13737   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
13738   */
13739  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
13740 +#ifndef CONFIG_PREEMPT_RT_FULL
13741 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
13742 +#else
13743 +static inline int sched_rcu_preempt_depth(void) { return 0; }
13744 +#endif
13745  
13746  #else /* #ifdef CONFIG_PREEMPT_RCU */
13747  
13748 @@ -317,6 +326,8 @@ static inline int rcu_preempt_depth(void)
13749         return 0;
13750  }
13751  
13752 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
13753 +
13754  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
13755  
13756  /* Internal to kernel */
13757 @@ -489,7 +500,14 @@ extern struct lockdep_map rcu_callback_map;
13758  int debug_lockdep_rcu_enabled(void);
13759  
13760  int rcu_read_lock_held(void);
13761 +#ifdef CONFIG_PREEMPT_RT_FULL
13762 +static inline int rcu_read_lock_bh_held(void)
13763 +{
13764 +       return rcu_read_lock_held();
13765 +}
13766 +#else
13767  int rcu_read_lock_bh_held(void);
13768 +#endif
13769  
13770  /**
13771   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
13772 @@ -937,10 +955,14 @@ static inline void rcu_read_unlock(void)
13773  static inline void rcu_read_lock_bh(void)
13774  {
13775         local_bh_disable();
13776 +#ifdef CONFIG_PREEMPT_RT_FULL
13777 +       rcu_read_lock();
13778 +#else
13779         __acquire(RCU_BH);
13780         rcu_lock_acquire(&rcu_bh_lock_map);
13781         RCU_LOCKDEP_WARN(!rcu_is_watching(),
13782                          "rcu_read_lock_bh() used illegally while idle");
13783 +#endif
13784  }
13785  
13786  /*
13787 @@ -950,10 +972,14 @@ static inline void rcu_read_lock_bh(void)
13788   */
13789  static inline void rcu_read_unlock_bh(void)
13790  {
13791 +#ifdef CONFIG_PREEMPT_RT_FULL
13792 +       rcu_read_unlock();
13793 +#else
13794         RCU_LOCKDEP_WARN(!rcu_is_watching(),
13795                          "rcu_read_unlock_bh() used illegally while idle");
13796         rcu_lock_release(&rcu_bh_lock_map);
13797         __release(RCU_BH);
13798 +#endif
13799         local_bh_enable();
13800  }
13801  
13802 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
13803 index 60d15a080d7c..436c9e62bfc6 100644
13804 --- a/include/linux/rcutree.h
13805 +++ b/include/linux/rcutree.h
13806 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
13807         rcu_note_context_switch();
13808  }
13809  
13810 +#ifdef CONFIG_PREEMPT_RT_FULL
13811 +# define synchronize_rcu_bh    synchronize_rcu
13812 +#else
13813  void synchronize_rcu_bh(void);
13814 +#endif
13815  void synchronize_sched_expedited(void);
13816  void synchronize_rcu_expedited(void);
13817  
13818 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
13819  }
13820  
13821  void rcu_barrier(void);
13822 +#ifdef CONFIG_PREEMPT_RT_FULL
13823 +# define rcu_barrier_bh                rcu_barrier
13824 +#else
13825  void rcu_barrier_bh(void);
13826 +#endif
13827  void rcu_barrier_sched(void);
13828  unsigned long get_state_synchronize_rcu(void);
13829  void cond_synchronize_rcu(unsigned long oldstate);
13830 @@ -85,12 +93,10 @@ unsigned long rcu_batches_started(void);
13831  unsigned long rcu_batches_started_bh(void);
13832  unsigned long rcu_batches_started_sched(void);
13833  unsigned long rcu_batches_completed(void);
13834 -unsigned long rcu_batches_completed_bh(void);
13835  unsigned long rcu_batches_completed_sched(void);
13836  void show_rcu_gp_kthreads(void);
13837  
13838  void rcu_force_quiescent_state(void);
13839 -void rcu_bh_force_quiescent_state(void);
13840  void rcu_sched_force_quiescent_state(void);
13841  
13842  void rcu_idle_enter(void);
13843 @@ -105,6 +111,14 @@ extern int rcu_scheduler_active __read_mostly;
13844  
13845  bool rcu_is_watching(void);
13846  
13847 +#ifndef CONFIG_PREEMPT_RT_FULL
13848 +void rcu_bh_force_quiescent_state(void);
13849 +unsigned long rcu_batches_completed_bh(void);
13850 +#else
13851 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
13852 +# define rcu_batches_completed_bh      rcu_batches_completed
13853 +#endif
13854 +
13855  void rcu_all_qs(void);
13856  
13857  #endif /* __LINUX_RCUTREE_H */
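
The rcutree.h hunks alias the _bh grace-period and barrier primitives to their plain RCU counterparts on PREEMPT_RT_FULL. A minimal teardown sketch, with hypothetical names:

#include <linux/rcupdate.h>

struct my_data { int val; };
static struct my_data __rcu *my_data_ptr;

static void my_data_shutdown(void)
{
        RCU_INIT_POINTER(my_data_ptr, NULL);
        synchronize_rcu_bh();   /* wait for rcu_read_lock_bh() readers; plain
                                   synchronize_rcu() on PREEMPT_RT_FULL */
        rcu_barrier_bh();       /* wait for pending call_rcu_bh() callbacks;
                                   plain rcu_barrier() on PREEMPT_RT_FULL */
}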
13858 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
13859 index 1abba5ce2a2f..30211c627511 100644
13860 --- a/include/linux/rtmutex.h
13861 +++ b/include/linux/rtmutex.h
13862 @@ -13,11 +13,15 @@
13863  #define __LINUX_RT_MUTEX_H
13864  
13865  #include <linux/linkage.h>
13866 +#include <linux/spinlock_types_raw.h>
13867  #include <linux/rbtree.h>
13868 -#include <linux/spinlock_types.h>
13869  
13870  extern int max_lock_depth; /* for sysctl */
13871  
13872 +#ifdef CONFIG_DEBUG_MUTEXES
13873 +#include <linux/debug_locks.h>
13874 +#endif
13875 +
13876  /**
13877   * The rt_mutex structure
13878   *
13879 @@ -31,8 +35,8 @@ struct rt_mutex {
13880         struct rb_root          waiters;
13881         struct rb_node          *waiters_leftmost;
13882         struct task_struct      *owner;
13883 -#ifdef CONFIG_DEBUG_RT_MUTEXES
13884         int                     save_state;
13885 +#ifdef CONFIG_DEBUG_RT_MUTEXES
13886         const char              *name, *file;
13887         int                     line;
13888         void                    *magic;
13889 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
13890  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
13891  #endif
13892  
13893 +# define rt_mutex_init(mutex)                                  \
13894 +       do {                                                    \
13895 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
13896 +               __rt_mutex_init(mutex, #mutex);                 \
13897 +       } while (0)
13898 +
13899  #ifdef CONFIG_DEBUG_RT_MUTEXES
13900  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
13901         , .name = #mutexname, .file = __FILE__, .line = __LINE__
13902 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
13903   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
13904  #else
13905  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
13906 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
13907  # define rt_mutex_debug_task_free(t)                   do { } while (0)
13908  #endif
13909  
13910 -#define __RT_MUTEX_INITIALIZER(mutexname) \
13911 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
13912 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
13913 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
13914         , .waiters = RB_ROOT \
13915         , .owner = NULL \
13916 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
13917 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
13918 +
13919 +#define __RT_MUTEX_INITIALIZER(mutexname) \
13920 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
13921 +
13922 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
13923 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
13924 +       , .save_state = 1 }
13925  
13926  #define DEFINE_RT_MUTEX(mutexname) \
13927         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
13928 @@ -91,6 +106,7 @@ extern void rt_mutex_destroy(struct rt_mutex *lock);
13929  
13930  extern void rt_mutex_lock(struct rt_mutex *lock);
13931  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
13932 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
13933  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
13934                                struct hrtimer_sleeper *timeout);
13935  
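
The rtmutex.h changes make rt_mutex_init() initialize the wait_lock unconditionally, move save_state out of the debug-only section, and export rt_mutex_lock_killable(). A minimal caller sketch with hypothetical names (rt_mutex_unlock() is the existing counterpart, not shown in this hunk):

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(my_lock);

static int my_update(void)
{
        int ret;

        /* The wait can only be interrupted by a fatal signal. */
        ret = rt_mutex_lock_killable(&my_lock);
        if (ret)
                return ret;             /* killed while blocked */

        /* ... critical section, may sleep ... */

        rt_mutex_unlock(&my_lock);
        return 0;
}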
13936 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
13937 new file mode 100644
13938 index 000000000000..49ed2d45d3be
13939 --- /dev/null
13940 +++ b/include/linux/rwlock_rt.h
13941 @@ -0,0 +1,99 @@
13942 +#ifndef __LINUX_RWLOCK_RT_H
13943 +#define __LINUX_RWLOCK_RT_H
13944 +
13945 +#ifndef __LINUX_SPINLOCK_H
13946 +#error Do not include directly. Use spinlock.h
13947 +#endif
13948 +
13949 +#define rwlock_init(rwl)                               \
13950 +do {                                                   \
13951 +       static struct lock_class_key __key;             \
13952 +                                                       \
13953 +       rt_mutex_init(&(rwl)->lock);                    \
13954 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
13955 +} while (0)
13956 +
13957 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
13958 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
13959 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
13960 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
13961 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
13962 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
13963 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
13964 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
13965 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
13966 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
13967 +
13968 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
13969 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
13970 +
13971 +#define write_trylock_irqsave(lock, flags)     \
13972 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
13973 +
13974 +#define read_lock_irqsave(lock, flags)                 \
13975 +       do {                                            \
13976 +               typecheck(unsigned long, flags);        \
13977 +               flags = rt_read_lock_irqsave(lock);     \
13978 +       } while (0)
13979 +
13980 +#define write_lock_irqsave(lock, flags)                        \
13981 +       do {                                            \
13982 +               typecheck(unsigned long, flags);        \
13983 +               flags = rt_write_lock_irqsave(lock);    \
13984 +       } while (0)
13985 +
13986 +#define read_lock(lock)                rt_read_lock(lock)
13987 +
13988 +#define read_lock_bh(lock)                             \
13989 +       do {                                            \
13990 +               local_bh_disable();                     \
13991 +               rt_read_lock(lock);                     \
13992 +       } while (0)
13993 +
13994 +#define read_lock_irq(lock)    read_lock(lock)
13995 +
13996 +#define write_lock(lock)       rt_write_lock(lock)
13997 +
13998 +#define write_lock_bh(lock)                            \
13999 +       do {                                            \
14000 +               local_bh_disable();                     \
14001 +               rt_write_lock(lock);                    \
14002 +       } while (0)
14003 +
14004 +#define write_lock_irq(lock)   write_lock(lock)
14005 +
14006 +#define read_unlock(lock)      rt_read_unlock(lock)
14007 +
14008 +#define read_unlock_bh(lock)                           \
14009 +       do {                                            \
14010 +               rt_read_unlock(lock);                   \
14011 +               local_bh_enable();                      \
14012 +       } while (0)
14013 +
14014 +#define read_unlock_irq(lock)  read_unlock(lock)
14015 +
14016 +#define write_unlock(lock)     rt_write_unlock(lock)
14017 +
14018 +#define write_unlock_bh(lock)                          \
14019 +       do {                                            \
14020 +               rt_write_unlock(lock);                  \
14021 +               local_bh_enable();                      \
14022 +       } while (0)
14023 +
14024 +#define write_unlock_irq(lock) write_unlock(lock)
14025 +
14026 +#define read_unlock_irqrestore(lock, flags)            \
14027 +       do {                                            \
14028 +               typecheck(unsigned long, flags);        \
14029 +               (void) flags;                           \
14030 +               rt_read_unlock(lock);                   \
14031 +       } while (0)
14032 +
14033 +#define write_unlock_irqrestore(lock, flags) \
14034 +       do {                                            \
14035 +               typecheck(unsigned long, flags);        \
14036 +               (void) flags;                           \
14037 +               rt_write_unlock(lock);                  \
14038 +       } while (0)
14039 +
14040 +#endif
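
rwlock_rt.h maps the whole rwlock API onto rt_mutex based primitives: the _bh variants still toggle BHs, while the _irq/_irqsave variants no longer touch the interrupt state. A minimal usage sketch with hypothetical names:

#include <linux/spinlock.h>

static DEFINE_RWLOCK(my_rwlock);
static int my_shared;

static int my_read(void)
{
        int v;

        read_lock(&my_rwlock);                  /* rt_read_lock(): may sleep */
        v = my_shared;
        read_unlock(&my_rwlock);
        return v;
}

static void my_write(int v)
{
        unsigned long flags;

        /* On RT this does not disable interrupts; flags is only kept to
         * satisfy the generic API. */
        write_lock_irqsave(&my_rwlock, flags);
        my_shared = v;
        write_unlock_irqrestore(&my_rwlock, flags);
}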
14041 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
14042 index cc0072e93e36..d0da966ad7a0 100644
14043 --- a/include/linux/rwlock_types.h
14044 +++ b/include/linux/rwlock_types.h
14045 @@ -1,6 +1,10 @@
14046  #ifndef __LINUX_RWLOCK_TYPES_H
14047  #define __LINUX_RWLOCK_TYPES_H
14048  
14049 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
14050 +# error "Do not include directly, include spinlock_types.h"
14051 +#endif
14052 +
14053  /*
14054   * include/linux/rwlock_types.h - generic rwlock type definitions
14055   *                               and initializers
14056 @@ -43,6 +47,7 @@ typedef struct {
14057                                 RW_DEP_MAP_INIT(lockname) }
14058  #endif
14059  
14060 -#define DEFINE_RWLOCK(x)       rwlock_t x = __RW_LOCK_UNLOCKED(x)
14061 +#define DEFINE_RWLOCK(name) \
14062 +       rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
14063  
14064  #endif /* __LINUX_RWLOCK_TYPES_H */
14065 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
14066 new file mode 100644
14067 index 000000000000..b13832119591
14068 --- /dev/null
14069 +++ b/include/linux/rwlock_types_rt.h
14070 @@ -0,0 +1,33 @@
14071 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
14072 +#define __LINUX_RWLOCK_TYPES_RT_H
14073 +
14074 +#ifndef __LINUX_SPINLOCK_TYPES_H
14075 +#error "Do not include directly. Include spinlock_types.h instead"
14076 +#endif
14077 +
14078 +/*
14079 + * rwlocks - rtmutex which allows single reader recursion
14080 + */
14081 +typedef struct {
14082 +       struct rt_mutex         lock;
14083 +       int                     read_depth;
14084 +       unsigned int            break_lock;
14085 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14086 +       struct lockdep_map      dep_map;
14087 +#endif
14088 +} rwlock_t;
14089 +
14090 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14091 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
14092 +#else
14093 +# define RW_DEP_MAP_INIT(lockname)
14094 +#endif
14095 +
14096 +#define __RW_LOCK_UNLOCKED(name) \
14097 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
14098 +         RW_DEP_MAP_INIT(name) }
14099 +
14100 +#define DEFINE_RWLOCK(name) \
14101 +       rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
14102 +
14103 +#endif
14104 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
14105 index 8f498cdde280..2b2148431f14 100644
14106 --- a/include/linux/rwsem.h
14107 +++ b/include/linux/rwsem.h
14108 @@ -18,6 +18,10 @@
14109  #include <linux/osq_lock.h>
14110  #endif
14111  
14112 +#ifdef CONFIG_PREEMPT_RT_FULL
14113 +#include <linux/rwsem_rt.h>
14114 +#else /* PREEMPT_RT_FULL */
14115 +
14116  struct rw_semaphore;
14117  
14118  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
14119 @@ -177,4 +181,6 @@ extern void up_read_non_owner(struct rw_semaphore *sem);
14120  # define up_read_non_owner(sem)                        up_read(sem)
14121  #endif
14122  
14123 +#endif /* !PREEMPT_RT_FULL */
14124 +
14125  #endif /* _LINUX_RWSEM_H */
14126 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
14127 new file mode 100644
14128 index 000000000000..f97860b2e2a4
14129 --- /dev/null
14130 +++ b/include/linux/rwsem_rt.h
14131 @@ -0,0 +1,152 @@
14132 +#ifndef _LINUX_RWSEM_RT_H
14133 +#define _LINUX_RWSEM_RT_H
14134 +
14135 +#ifndef _LINUX_RWSEM_H
14136 +#error "Include rwsem.h"
14137 +#endif
14138 +
14139 +/*
14140 + * RW-semaphores are a spinlock plus a reader-depth count.
14141 + *
14142 + * Note that the semantics are different from the usual
14143 + * Linux rw-sems, in PREEMPT_RT mode we do not allow
14144 + * multiple readers to hold the lock at once, we only allow
14145 + * a read-lock owner to read-lock recursively. This is
14146 + * better for latency, makes the implementation inherently
14147 + * fair and makes it simpler as well.
14148 + */
14149 +
14150 +#include <linux/rtmutex.h>
14151 +
14152 +struct rw_semaphore {
14153 +       struct rt_mutex         lock;
14154 +       int                     read_depth;
14155 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14156 +       struct lockdep_map      dep_map;
14157 +#endif
14158 +};
14159 +
14160 +#define __RWSEM_INITIALIZER(name) \
14161 +       { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
14162 +         RW_DEP_MAP_INIT(name) }
14163 +
14164 +#define DECLARE_RWSEM(lockname) \
14165 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
14166 +
14167 +extern void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
14168 +                                    struct lock_class_key *key);
14169 +
14170 +#define __rt_init_rwsem(sem, name, key)                        \
14171 +       do {                                            \
14172 +               rt_mutex_init(&(sem)->lock);            \
14173 +               __rt_rwsem_init((sem), (name), (key));\
14174 +       } while (0)
14175 +
14176 +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
14177 +
14178 +# define rt_init_rwsem(sem)                            \
14179 +do {                                                   \
14180 +       static struct lock_class_key __key;             \
14181 +                                                       \
14182 +       __rt_init_rwsem((sem), #sem, &__key);           \
14183 +} while (0)
14184 +
14185 +extern void rt_down_write(struct rw_semaphore *rwsem);
14186 +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
14187 +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
14188 +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
14189 +                                     struct lockdep_map *nest);
14190 +extern void rt__down_read(struct rw_semaphore *rwsem);
14191 +extern void rt_down_read(struct rw_semaphore *rwsem);
14192 +extern int  rt_down_write_trylock(struct rw_semaphore *rwsem);
14193 +extern int  rt__down_read_trylock(struct rw_semaphore *rwsem);
14194 +extern int  rt_down_read_trylock(struct rw_semaphore *rwsem);
14195 +extern void __rt_up_read(struct rw_semaphore *rwsem);
14196 +extern void rt_up_read(struct rw_semaphore *rwsem);
14197 +extern void rt_up_write(struct rw_semaphore *rwsem);
14198 +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
14199 +
14200 +#define init_rwsem(sem)                rt_init_rwsem(sem)
14201 +#define rwsem_is_locked(s)     rt_mutex_is_locked(&(s)->lock)
14202 +
14203 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
14204 +{
14205 +       /* rt_mutex_has_waiters() */
14206 +       return !RB_EMPTY_ROOT(&sem->lock.waiters);
14207 +}
14208 +
14209 +static inline void __down_read(struct rw_semaphore *sem)
14210 +{
14211 +       rt__down_read(sem);
14212 +}
14213 +
14214 +static inline void down_read(struct rw_semaphore *sem)
14215 +{
14216 +       rt_down_read(sem);
14217 +}
14218 +
14219 +static inline int __down_read_trylock(struct rw_semaphore *sem)
14220 +{
14221 +       return rt__down_read_trylock(sem);
14222 +}
14223 +
14224 +static inline int down_read_trylock(struct rw_semaphore *sem)
14225 +{
14226 +       return rt_down_read_trylock(sem);
14227 +}
14228 +
14229 +static inline void down_write(struct rw_semaphore *sem)
14230 +{
14231 +       rt_down_write(sem);
14232 +}
14233 +
14234 +static inline int down_write_trylock(struct rw_semaphore *sem)
14235 +{
14236 +       return rt_down_write_trylock(sem);
14237 +}
14238 +
14239 +static inline void __up_read(struct rw_semaphore *sem)
14240 +{
14241 +       __rt_up_read(sem);
14242 +}
14243 +
14244 +static inline void up_read(struct rw_semaphore *sem)
14245 +{
14246 +       rt_up_read(sem);
14247 +}
14248 +
14249 +static inline void up_write(struct rw_semaphore *sem)
14250 +{
14251 +       rt_up_write(sem);
14252 +}
14253 +
14254 +static inline void downgrade_write(struct rw_semaphore *sem)
14255 +{
14256 +       rt_downgrade_write(sem);
14257 +}
14258 +
14259 +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
14260 +{
14261 +       return rt_down_read_nested(sem, subclass);
14262 +}
14263 +
14264 +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
14265 +{
14266 +       rt_down_write_nested(sem, subclass);
14267 +}
14268 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14269 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
14270 +               struct rw_semaphore *nest_lock)
14271 +{
14272 +       rt_down_write_nested_lock(sem, &nest_lock->dep_map);
14273 +}
14274 +
14275 +#else
14276 +
14277 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
14278 +               struct rw_semaphore *nest_lock)
14279 +{
14280 +       rt_down_write_nested_lock(sem, NULL);
14281 +}
14282 +#endif
14283 +#endif
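
rwsem_rt.h replaces rw_semaphore with an rt_mutex plus a read_depth count, so only one reader (recursively, by the same owner) or one writer holds the lock at a time. A minimal usage sketch with hypothetical names:

#include <linux/types.h>
#include <linux/list.h>
#include <linux/rwsem.h>

static DECLARE_RWSEM(my_sem);
static LIST_HEAD(my_items);

static void my_add(struct list_head *entry)
{
        down_write(&my_sem);            /* rt_down_write() on RT */
        list_add_tail(entry, &my_items);
        up_write(&my_sem);
}

static bool my_empty(void)
{
        bool empty;

        down_read(&my_sem);             /* exclusive on RT; only the owner
                                           may read-lock recursively */
        empty = list_empty(&my_items);
        up_read(&my_sem);
        return empty;
}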
14284 diff --git a/include/linux/sched.h b/include/linux/sched.h
14285 index 1c0193baea2a..0570d8e022ec 100644
14286 --- a/include/linux/sched.h
14287 +++ b/include/linux/sched.h
14288 @@ -26,6 +26,7 @@ struct sched_param {
14289  #include <linux/nodemask.h>
14290  #include <linux/mm_types.h>
14291  #include <linux/preempt.h>
14292 +#include <asm/kmap_types.h>
14293  
14294  #include <asm/page.h>
14295  #include <asm/ptrace.h>
14296 @@ -182,8 +183,6 @@ extern void update_cpu_load_nohz(void);
14297  static inline void update_cpu_load_nohz(void) { }
14298  #endif
14299  
14300 -extern unsigned long get_parent_ip(unsigned long addr);
14301 -
14302  extern void dump_cpu_task(int cpu);
14303  
14304  struct seq_file;
14305 @@ -242,10 +241,7 @@ extern char ___assert_task_state[1 - 2*!!(
14306                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
14307                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
14308  
14309 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
14310  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
14311 -#define task_is_stopped_or_traced(task)        \
14312 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
14313  #define task_contributes_to_load(task) \
14314                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
14315                                  (task->flags & PF_FROZEN) == 0 && \
14316 @@ -311,6 +307,11 @@ extern char ___assert_task_state[1 - 2*!!(
14317  
14318  #endif
14319  
14320 +#define __set_current_state_no_track(state_value)      \
14321 +       do { current->state = (state_value); } while (0)
14322 +#define set_current_state_no_track(state_value)                \
14323 +       set_mb(current->state, (state_value))
14324 +
14325  /* Task command name length */
14326  #define TASK_COMM_LEN 16
14327  
14328 @@ -970,8 +971,18 @@ struct wake_q_head {
14329         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
14330  
14331  extern void wake_q_add(struct wake_q_head *head,
14332 -                      struct task_struct *task);
14333 -extern void wake_up_q(struct wake_q_head *head);
14334 +                             struct task_struct *task);
14335 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
14336 +
14337 +static inline void wake_up_q(struct wake_q_head *head)
14338 +{
14339 +       __wake_up_q(head, false);
14340 +}
14341 +
14342 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
14343 +{
14344 +       __wake_up_q(head, true);
14345 +}
14346  
14347  /*
14348   * sched-domains (multiprocessor balancing) declarations:
14349 @@ -1379,6 +1390,7 @@ struct tlbflush_unmap_batch {
14350  
14351  struct task_struct {
14352         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
14353 +       volatile long saved_state;      /* saved state for "spinlock sleepers" */
14354         void *stack;
14355         atomic_t usage;
14356         unsigned int flags;     /* per process flags, defined below */
14357 @@ -1415,6 +1427,12 @@ struct task_struct {
14358  #endif
14359  
14360         unsigned int policy;
14361 +#ifdef CONFIG_PREEMPT_RT_FULL
14362 +       int migrate_disable;
14363 +# ifdef CONFIG_SCHED_DEBUG
14364 +       int migrate_disable_atomic;
14365 +# endif
14366 +#endif
14367         int nr_cpus_allowed;
14368         cpumask_t cpus_allowed;
14369  
14370 @@ -1522,11 +1540,14 @@ struct task_struct {
14371         cputime_t gtime;
14372         struct prev_cputime prev_cputime;
14373  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
14374 -       seqlock_t vtime_seqlock;
14375 +       seqcount_t vtime_seqcount;
14376         unsigned long long vtime_snap;
14377         enum {
14378 -               VTIME_SLEEPING = 0,
14379 +               /* Task is sleeping or running in a CPU with VTIME inactive */
14380 +               VTIME_INACTIVE = 0,
14381 +               /* Task runs in userspace in a CPU with VTIME active */
14382                 VTIME_USER,
14383 +               /* Task runs in kernelspace in a CPU with VTIME active */
14384                 VTIME_SYS,
14385         } vtime_snap_whence;
14386  #endif
14387 @@ -1538,6 +1559,9 @@ struct task_struct {
14388  
14389         struct task_cputime cputime_expires;
14390         struct list_head cpu_timers[3];
14391 +#ifdef CONFIG_PREEMPT_RT_BASE
14392 +       struct task_struct *posix_timer_list;
14393 +#endif
14394  
14395  /* process credentials */
14396         const struct cred __rcu *real_cred; /* objective and real subjective task
14397 @@ -1568,10 +1592,15 @@ struct task_struct {
14398  /* signal handlers */
14399         struct signal_struct *signal;
14400         struct sighand_struct *sighand;
14401 +       struct sigqueue *sigqueue_cache;
14402  
14403         sigset_t blocked, real_blocked;
14404         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
14405         struct sigpending pending;
14406 +#ifdef CONFIG_PREEMPT_RT_FULL
14407 +       /* TODO: move me into ->restart_block ? */
14408 +       struct siginfo forced_info;
14409 +#endif
14410  
14411         unsigned long sas_ss_sp;
14412         size_t sas_ss_size;
14413 @@ -1795,6 +1824,12 @@ struct task_struct {
14414         unsigned long trace;
14415         /* bitmask and counter of trace recursion */
14416         unsigned long trace_recursion;
14417 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
14418 +       u64 preempt_timestamp_hist;
14419 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
14420 +       long timer_offset;
14421 +#endif
14422 +#endif
14423  #endif /* CONFIG_TRACING */
14424  #ifdef CONFIG_MEMCG
14425         struct mem_cgroup *memcg_in_oom;
14426 @@ -1811,9 +1846,23 @@ struct task_struct {
14427         unsigned int    sequential_io;
14428         unsigned int    sequential_io_avg;
14429  #endif
14430 +#ifdef CONFIG_PREEMPT_RT_BASE
14431 +       struct rcu_head put_rcu;
14432 +       int softirq_nestcnt;
14433 +       unsigned int softirqs_raised;
14434 +#endif
14435 +#ifdef CONFIG_PREEMPT_RT_FULL
14436 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
14437 +       int kmap_idx;
14438 +       pte_t kmap_pte[KM_TYPE_NR];
14439 +# endif
14440 +#endif
14441  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
14442         unsigned long   task_state_change;
14443  #endif
14444 +#ifdef CONFIG_PREEMPT_RT_FULL
14445 +       int xmit_recursion;
14446 +#endif
14447         int pagefault_disabled;
14448  /* CPU-specific state of this task */
14449         struct thread_struct thread;
14450 @@ -1831,9 +1880,6 @@ extern int arch_task_struct_size __read_mostly;
14451  # define arch_task_struct_size (sizeof(struct task_struct))
14452  #endif
14453  
14454 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
14455 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
14456 -
14457  #define TNF_MIGRATED   0x01
14458  #define TNF_NO_GROUP   0x02
14459  #define TNF_SHARED     0x04
14460 @@ -2023,6 +2069,15 @@ extern struct pid *cad_pid;
14461  extern void free_task(struct task_struct *tsk);
14462  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
14463  
14464 +#ifdef CONFIG_PREEMPT_RT_BASE
14465 +extern void __put_task_struct_cb(struct rcu_head *rhp);
14466 +
14467 +static inline void put_task_struct(struct task_struct *t)
14468 +{
14469 +       if (atomic_dec_and_test(&t->usage))
14470 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
14471 +}
14472 +#else
14473  extern void __put_task_struct(struct task_struct *t);
14474  
14475  static inline void put_task_struct(struct task_struct *t)
14476 @@ -2030,6 +2085,7 @@ static inline void put_task_struct(struct task_struct *t)
14477         if (atomic_dec_and_test(&t->usage))
14478                 __put_task_struct(t);
14479  }
14480 +#endif
14481  
14482  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
14483  extern void task_cputime(struct task_struct *t,
14484 @@ -2068,6 +2124,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
14485  /*
14486   * Per process flags
14487   */
14488 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
14489  #define PF_EXITING     0x00000004      /* getting shut down */
14490  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
14491  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
14492 @@ -2232,6 +2289,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
14493  
14494  extern int set_cpus_allowed_ptr(struct task_struct *p,
14495                                 const struct cpumask *new_mask);
14496 +int migrate_me(void);
14497 +void tell_sched_cpu_down_begin(int cpu);
14498 +void tell_sched_cpu_down_done(int cpu);
14499 +
14500  #else
14501  static inline void do_set_cpus_allowed(struct task_struct *p,
14502                                       const struct cpumask *new_mask)
14503 @@ -2244,6 +2305,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
14504                 return -EINVAL;
14505         return 0;
14506  }
14507 +static inline int migrate_me(void) { return 0; }
14508 +static inline void tell_sched_cpu_down_begin(int cpu) { }
14509 +static inline void tell_sched_cpu_down_done(int cpu) { }
14510  #endif
14511  
14512  #ifdef CONFIG_NO_HZ_COMMON
14513 @@ -2453,6 +2517,7 @@ extern void xtime_update(unsigned long ticks);
14514  
14515  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
14516  extern int wake_up_process(struct task_struct *tsk);
14517 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
14518  extern void wake_up_new_task(struct task_struct *tsk);
14519  #ifdef CONFIG_SMP
14520   extern void kick_process(struct task_struct *tsk);
14521 @@ -2576,12 +2641,24 @@ extern struct mm_struct * mm_alloc(void);
14522  
14523  /* mmdrop drops the mm and the page tables */
14524  extern void __mmdrop(struct mm_struct *);
14525 +
14526  static inline void mmdrop(struct mm_struct * mm)
14527  {
14528         if (unlikely(atomic_dec_and_test(&mm->mm_count)))
14529                 __mmdrop(mm);
14530  }
14531  
14532 +#ifdef CONFIG_PREEMPT_RT_BASE
14533 +extern void __mmdrop_delayed(struct rcu_head *rhp);
14534 +static inline void mmdrop_delayed(struct mm_struct *mm)
14535 +{
14536 +       if (atomic_dec_and_test(&mm->mm_count))
14537 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
14538 +}
14539 +#else
14540 +# define mmdrop_delayed(mm)    mmdrop(mm)
14541 +#endif
14542 +
14543  /* mmput gets rid of the mappings and all user-space */
14544  extern void mmput(struct mm_struct *);
14545  /* Grab a reference to a task's mm, if it is not already going away */
14546 @@ -2891,6 +2968,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
14547         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
14548  }
14549  
14550 +#ifdef CONFIG_PREEMPT_LAZY
14551 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
14552 +{
14553 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14554 +}
14555 +
14556 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
14557 +{
14558 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14559 +}
14560 +
14561 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
14562 +{
14563 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
14564 +}
14565 +
14566 +static inline int need_resched_lazy(void)
14567 +{
14568 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
14569 +}
14570 +
14571 +static inline int need_resched_now(void)
14572 +{
14573 +       return test_thread_flag(TIF_NEED_RESCHED);
14574 +}
14575 +
14576 +#else
14577 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
14578 +static inline int need_resched_lazy(void) { return 0; }
14579 +
14580 +static inline int need_resched_now(void)
14581 +{
14582 +       return test_thread_flag(TIF_NEED_RESCHED);
14583 +}
14584 +
14585 +#endif
14586 +
14587  static inline int restart_syscall(void)
14588  {
14589         set_tsk_thread_flag(current, TIF_SIGPENDING);
14590 @@ -2922,6 +3036,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
14591         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
14592  }
14593  
14594 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
14595 +{
14596 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
14597 +               return true;
14598 +#ifdef CONFIG_PREEMPT_RT_FULL
14599 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
14600 +               return true;
14601 +#endif
14602 +       return false;
14603 +}
14604 +
14605 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
14606 +{
14607 +       bool traced_stopped;
14608 +
14609 +#ifdef CONFIG_PREEMPT_RT_FULL
14610 +       unsigned long flags;
14611 +
14612 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
14613 +       traced_stopped = __task_is_stopped_or_traced(task);
14614 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14615 +#else
14616 +       traced_stopped = __task_is_stopped_or_traced(task);
14617 +#endif
14618 +       return traced_stopped;
14619 +}
14620 +
14621 +static inline bool task_is_traced(struct task_struct *task)
14622 +{
14623 +       bool traced = false;
14624 +
14625 +       if (task->state & __TASK_TRACED)
14626 +               return true;
14627 +#ifdef CONFIG_PREEMPT_RT_FULL
14628 +       /* in case the task is sleeping on tasklist_lock */
14629 +       raw_spin_lock_irq(&task->pi_lock);
14630 +       if (task->state & __TASK_TRACED)
14631 +               traced = true;
14632 +       else if (task->saved_state & __TASK_TRACED)
14633 +               traced = true;
14634 +       raw_spin_unlock_irq(&task->pi_lock);
14635 +#endif
14636 +       return traced;
14637 +}
14638 +
14639  /*
14640   * cond_resched() and cond_resched_lock(): latency reduction via
14641   * explicit rescheduling in places that are safe. The return
14642 @@ -2943,12 +3102,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
14643         __cond_resched_lock(lock);                              \
14644  })
14645  
14646 +#ifndef CONFIG_PREEMPT_RT_FULL
14647  extern int __cond_resched_softirq(void);
14648  
14649  #define cond_resched_softirq() ({                                      \
14650         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
14651         __cond_resched_softirq();                                       \
14652  })
14653 +#else
14654 +# define cond_resched_softirq()                cond_resched()
14655 +#endif
14656  
14657  static inline void cond_resched_rcu(void)
14658  {
14659 @@ -3110,6 +3273,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
14660  
14661  #endif /* CONFIG_SMP */
14662  
14663 +static inline int __migrate_disabled(struct task_struct *p)
14664 +{
14665 +#ifdef CONFIG_PREEMPT_RT_FULL
14666 +       return p->migrate_disable;
14667 +#else
14668 +       return 0;
14669 +#endif
14670 +}
14671 +
14672 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
14673 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
14674 +{
14675 +       if (__migrate_disabled(p))
14676 +               return cpumask_of(task_cpu(p));
14677 +
14678 +       return &p->cpus_allowed;
14679 +}
14680 +
14681 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
14682 +{
14683 +       if (__migrate_disabled(p))
14684 +               return 1;
14685 +       return p->nr_cpus_allowed;
14686 +}
14687 +
14688  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
14689  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
14690  
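
Among the sched.h changes, tsk_cpus_allowed() and tsk_nr_cpus_allowed() become migrate_disable()-aware accessors instead of plain field lookups. A small caller sketch; the helper name is made up:

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/printk.h>

static void my_report_affinity(struct task_struct *p)
{
        const struct cpumask *mask = tsk_cpus_allowed(p);

        /* With migrate_disable() in effect the accessors report a single
         * CPU, regardless of p->cpus_allowed. */
        pr_info("%s/%d runnable on %d CPU(s), first CPU %u\n",
                p->comm, p->pid, tsk_nr_cpus_allowed(p),
                cpumask_first(mask));
}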
14691 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
14692 index e0582106ef4f..b14f4d2368aa 100644
14693 --- a/include/linux/seqlock.h
14694 +++ b/include/linux/seqlock.h
14695 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
14696         return __read_seqcount_retry(s, start);
14697  }
14698  
14699 -
14700 -
14701 -static inline void raw_write_seqcount_begin(seqcount_t *s)
14702 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
14703  {
14704         s->sequence++;
14705         smp_wmb();
14706  }
14707  
14708 -static inline void raw_write_seqcount_end(seqcount_t *s)
14709 +static inline void raw_write_seqcount_begin(seqcount_t *s)
14710 +{
14711 +       preempt_disable_rt();
14712 +       __raw_write_seqcount_begin(s);
14713 +}
14714 +
14715 +static inline void __raw_write_seqcount_end(seqcount_t *s)
14716  {
14717         smp_wmb();
14718         s->sequence++;
14719  }
14720  
14721 +static inline void raw_write_seqcount_end(seqcount_t *s)
14722 +{
14723 +       __raw_write_seqcount_end(s);
14724 +       preempt_enable_rt();
14725 +}
14726 +
14727  /**
14728   * raw_write_seqcount_barrier - do a seq write barrier
14729   * @s: pointer to seqcount_t
14730 @@ -425,10 +435,32 @@ typedef struct {
14731  /*
14732   * Read side functions for starting and finalizing a read side section.
14733   */
14734 +#ifndef CONFIG_PREEMPT_RT_FULL
14735  static inline unsigned read_seqbegin(const seqlock_t *sl)
14736  {
14737         return read_seqcount_begin(&sl->seqcount);
14738  }
14739 +#else
14740 +/*
14741 + * Starvation safe read side for RT
14742 + */
14743 +static inline unsigned read_seqbegin(seqlock_t *sl)
14744 +{
14745 +       unsigned ret;
14746 +
14747 +repeat:
14748 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
14749 +       if (unlikely(ret & 1)) {
14750 +                * Take the lock and let the writer proceed (i.e. possibly
14751 +                * boost it), otherwise we could loop here forever.
14752 +                * boost it), otherwise we could loop here forever.
14753 +                */
14754 +               spin_unlock_wait(&sl->lock);
14755 +               goto repeat;
14756 +       }
14757 +       return ret;
14758 +}
14759 +#endif
14760  
14761  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
14762  {
14763 @@ -443,36 +475,36 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
14764  static inline void write_seqlock(seqlock_t *sl)
14765  {
14766         spin_lock(&sl->lock);
14767 -       write_seqcount_begin(&sl->seqcount);
14768 +       __raw_write_seqcount_begin(&sl->seqcount);
14769  }
14770  
14771  static inline void write_sequnlock(seqlock_t *sl)
14772  {
14773 -       write_seqcount_end(&sl->seqcount);
14774 +       __raw_write_seqcount_end(&sl->seqcount);
14775         spin_unlock(&sl->lock);
14776  }
14777  
14778  static inline void write_seqlock_bh(seqlock_t *sl)
14779  {
14780         spin_lock_bh(&sl->lock);
14781 -       write_seqcount_begin(&sl->seqcount);
14782 +       __raw_write_seqcount_begin(&sl->seqcount);
14783  }
14784  
14785  static inline void write_sequnlock_bh(seqlock_t *sl)
14786  {
14787 -       write_seqcount_end(&sl->seqcount);
14788 +       __raw_write_seqcount_end(&sl->seqcount);
14789         spin_unlock_bh(&sl->lock);
14790  }
14791  
14792  static inline void write_seqlock_irq(seqlock_t *sl)
14793  {
14794         spin_lock_irq(&sl->lock);
14795 -       write_seqcount_begin(&sl->seqcount);
14796 +       __raw_write_seqcount_begin(&sl->seqcount);
14797  }
14798  
14799  static inline void write_sequnlock_irq(seqlock_t *sl)
14800  {
14801 -       write_seqcount_end(&sl->seqcount);
14802 +       __raw_write_seqcount_end(&sl->seqcount);
14803         spin_unlock_irq(&sl->lock);
14804  }
14805  
14806 @@ -481,7 +513,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
14807         unsigned long flags;
14808  
14809         spin_lock_irqsave(&sl->lock, flags);
14810 -       write_seqcount_begin(&sl->seqcount);
14811 +       __raw_write_seqcount_begin(&sl->seqcount);
14812         return flags;
14813  }
14814  
14815 @@ -491,7 +523,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
14816  static inline void
14817  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
14818  {
14819 -       write_seqcount_end(&sl->seqcount);
14820 +       __raw_write_seqcount_end(&sl->seqcount);
14821         spin_unlock_irqrestore(&sl->lock, flags);
14822  }
14823  
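
The seqlock.h changes split out the raw seqcount helpers so the seqlock write side, which already holds sl->lock, does not add a preempt-disabled region, and they give RT readers a starvation-safe read_seqbegin(). Callers keep the classic pattern; a sketch with hypothetical names:

#include <linux/types.h>
#include <linux/seqlock.h>

static DEFINE_SEQLOCK(my_seqlock);
static u64 my_counter;

static void my_counter_inc(void)
{
        /* Takes sl->lock (a sleeping lock on RT) and bumps the sequence
         * via __raw_write_seqcount_begin()/_end() as above. */
        write_seqlock(&my_seqlock);
        my_counter++;
        write_sequnlock(&my_seqlock);
}

static u64 my_counter_get(void)
{
        unsigned int seq;
        u64 val;

        do {
                /* On RT, read_seqbegin() waits on the writer's lock instead
                 * of spinning on an odd sequence forever. */
                seq = read_seqbegin(&my_seqlock);
                val = my_counter;
        } while (read_seqretry(&my_seqlock, seq));

        return val;
}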
14824 diff --git a/include/linux/signal.h b/include/linux/signal.h
14825 index d80259afb9e5..ddd1e6866a54 100644
14826 --- a/include/linux/signal.h
14827 +++ b/include/linux/signal.h
14828 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
14829  }
14830  
14831  extern void flush_sigqueue(struct sigpending *queue);
14832 +extern void flush_task_sigqueue(struct task_struct *tsk);
14833  
14834  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
14835  static inline int valid_signal(unsigned long sig)
14836 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
14837 index d443d9ab0236..2d1c7f9b7fd0 100644
14838 --- a/include/linux/skbuff.h
14839 +++ b/include/linux/skbuff.h
14840 @@ -203,6 +203,7 @@ struct sk_buff_head {
14841  
14842         __u32           qlen;
14843         spinlock_t      lock;
14844 +       raw_spinlock_t  raw_lock;
14845  };
14846  
14847  struct sk_buff;
14848 @@ -1465,6 +1466,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
14849         __skb_queue_head_init(list);
14850  }
14851  
14852 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
14853 +{
14854 +       raw_spin_lock_init(&list->raw_lock);
14855 +       __skb_queue_head_init(list);
14856 +}
14857 +
14858  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
14859                 struct lock_class_key *class)
14860  {
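
The skbuff.h hunk adds a raw_lock to sk_buff_head plus skb_queue_head_init_raw(). A sketch of one plausible pairing (an assumption, not shown in this hunk) with the lockless __skb_queue_* helpers; names are hypothetical:

#include <linux/skbuff.h>

static struct sk_buff_head my_queue;

static void my_queue_init(void)
{
        /* Uses the raw_lock field added above; the regular skb_queue_*
         * helpers keep using ->lock as before. */
        skb_queue_head_init_raw(&my_queue);
}

static void my_enqueue(struct sk_buff *skb)
{
        raw_spin_lock(&my_queue.raw_lock);
        __skb_queue_tail(&my_queue, skb);
        raw_spin_unlock(&my_queue.raw_lock);
}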
14861 diff --git a/include/linux/smp.h b/include/linux/smp.h
14862 index c4414074bd88..e6ab36aeaaab 100644
14863 --- a/include/linux/smp.h
14864 +++ b/include/linux/smp.h
14865 @@ -185,6 +185,9 @@ static inline void smp_init(void) { }
14866  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
14867  #define put_cpu()              preempt_enable()
14868  
14869 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
14870 +#define put_cpu_light()                migrate_enable()
14871 +
14872  /*
14873   * Callback to arch code if there's nosmp or maxcpus=0 on the
14874   * boot command line:
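
smp.h adds get_cpu_light()/put_cpu_light(), which pin the task via migrate_disable() instead of disabling preemption. A minimal sketch with a hypothetical function name:

#include <linux/smp.h>
#include <linux/printk.h>

static void my_pinned_work(void)
{
        int cpu = get_cpu_light();      /* migrate_disable() + smp_processor_id() */

        /* Unlike get_cpu(), preemption stays enabled, so this section may
         * sleep on PREEMPT_RT_FULL while remaining pinned to @cpu. */
        pr_debug("running pinned on CPU %d\n", cpu);

        put_cpu_light();                /* migrate_enable() */
}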
14875 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
14876 index 47dd0cebd204..02928fa5499d 100644
14877 --- a/include/linux/spinlock.h
14878 +++ b/include/linux/spinlock.h
14879 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
14880  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
14881  
14882  /* Include rwlock functions */
14883 -#include <linux/rwlock.h>
14884 +#ifdef CONFIG_PREEMPT_RT_FULL
14885 +# include <linux/rwlock_rt.h>
14886 +#else
14887 +# include <linux/rwlock.h>
14888 +#endif
14889  
14890  /*
14891   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
14892 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
14893  # include <linux/spinlock_api_up.h>
14894  #endif
14895  
14896 +#ifdef CONFIG_PREEMPT_RT_FULL
14897 +# include <linux/spinlock_rt.h>
14898 +#else /* PREEMPT_RT_FULL */
14899 +
14900  /*
14901   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
14902   */
14903 @@ -347,6 +355,12 @@ static __always_inline void spin_unlock(spinlock_t *lock)
14904         raw_spin_unlock(&lock->rlock);
14905  }
14906  
14907 +static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
14908 +{
14909 +       raw_spin_unlock(&lock->rlock);
14910 +       return 0;
14911 +}
14912 +
14913  static __always_inline void spin_unlock_bh(spinlock_t *lock)
14914  {
14915         raw_spin_unlock_bh(&lock->rlock);
14916 @@ -416,4 +430,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
14917  #define atomic_dec_and_lock(atomic, lock) \
14918                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
14919  
14920 +#endif /* !PREEMPT_RT_FULL */
14921 +
14922  #endif /* __LINUX_SPINLOCK_H */
14923 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
14924 index 5344268e6e62..043263f30e81 100644
14925 --- a/include/linux/spinlock_api_smp.h
14926 +++ b/include/linux/spinlock_api_smp.h
14927 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
14928         return 0;
14929  }
14930  
14931 -#include <linux/rwlock_api_smp.h>
14932 +#ifndef CONFIG_PREEMPT_RT_FULL
14933 +# include <linux/rwlock_api_smp.h>
14934 +#endif
14935  
14936  #endif /* __LINUX_SPINLOCK_API_SMP_H */
14937 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
14938 new file mode 100644
14939 index 000000000000..7eb87584e843
14940 --- /dev/null
14941 +++ b/include/linux/spinlock_rt.h
14942 @@ -0,0 +1,165 @@
14943 +#ifndef __LINUX_SPINLOCK_RT_H
14944 +#define __LINUX_SPINLOCK_RT_H
14945 +
14946 +#ifndef __LINUX_SPINLOCK_H
14947 +#error Do not include directly. Use spinlock.h
14948 +#endif
14949 +
14950 +#include <linux/bug.h>
14951 +
14952 +extern void
14953 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
14954 +
14955 +#define spin_lock_init(slock)                          \
14956 +do {                                                   \
14957 +       static struct lock_class_key __key;             \
14958 +                                                       \
14959 +       rt_mutex_init(&(slock)->lock);                  \
14960 +       __rt_spin_lock_init(slock, #slock, &__key);     \
14961 +} while (0)
14962 +
14963 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
14964 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
14965 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
14966 +
14967 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
14968 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
14969 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
14970 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
14971 +extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
14972 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
14973 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
14974 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
14975 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
14976 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
14977 +
14978 +/*
14979 + * lockdep-less calls, for derived types like rwlock:
14980 + * (for trylock they can use rt_mutex_trylock() directly.
14981 + */
14982 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
14983 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
14984 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
14985 +extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
14986 +
14987 +#define spin_lock(lock)                        rt_spin_lock(lock)
14988 +
14989 +#define spin_lock_bh(lock)                     \
14990 +       do {                                    \
14991 +               local_bh_disable();             \
14992 +               rt_spin_lock(lock);             \
14993 +       } while (0)
14994 +
14995 +#define spin_lock_irq(lock)            spin_lock(lock)
14996 +
14997 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
14998 +
14999 +#define spin_trylock(lock)                     \
15000 +({                                             \
15001 +       int __locked;                           \
15002 +       __locked = spin_do_trylock(lock);       \
15003 +       __locked;                               \
15004 +})
15005 +
15006 +#ifdef CONFIG_LOCKDEP
15007 +# define spin_lock_nested(lock, subclass)              \
15008 +       do {                                            \
15009 +               rt_spin_lock_nested(lock, subclass);    \
15010 +       } while (0)
15011 +
15012 +#define spin_lock_bh_nested(lock, subclass)            \
15013 +       do {                                            \
15014 +               local_bh_disable();                     \
15015 +               rt_spin_lock_nested(lock, subclass);    \
15016 +       } while (0)
15017 +
15018 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15019 +       do {                                             \
15020 +               typecheck(unsigned long, flags);         \
15021 +               flags = 0;                               \
15022 +               rt_spin_lock_nested(lock, subclass);     \
15023 +       } while (0)
15024 +#else
15025 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
15026 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
15027 +
15028 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15029 +       do {                                             \
15030 +               typecheck(unsigned long, flags);         \
15031 +               flags = 0;                               \
15032 +               spin_lock(lock);                         \
15033 +       } while (0)
15034 +#endif
15035 +
15036 +#define spin_lock_irqsave(lock, flags)                  \
15037 +       do {                                             \
15038 +               typecheck(unsigned long, flags);         \
15039 +               flags = 0;                               \
15040 +               spin_lock(lock);                         \
15041 +       } while (0)
15042 +
15043 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
15044 +{
15045 +       unsigned long flags = 0;
15046 +#ifdef CONFIG_TRACE_IRQFLAGS
15047 +       flags = rt_spin_lock_trace_flags(lock);
15048 +#else
15049 +       spin_lock(lock); /* lock_local */
15050 +#endif
15051 +       return flags;
15052 +}
15053 +
15054 +/* FIXME: we need rt_spin_lock_nest_lock */
15055 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
15056 +
15057 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
15058 +#define spin_unlock_no_deboost(lock)           rt_spin_unlock_no_deboost(lock)
15059 +
15060 +#define spin_unlock_bh(lock)                           \
15061 +       do {                                            \
15062 +               rt_spin_unlock(lock);                   \
15063 +               local_bh_enable();                      \
15064 +       } while (0)
15065 +
15066 +#define spin_unlock_irq(lock)          spin_unlock(lock)
15067 +
15068 +#define spin_unlock_irqrestore(lock, flags)            \
15069 +       do {                                            \
15070 +               typecheck(unsigned long, flags);        \
15071 +               (void) flags;                           \
15072 +               spin_unlock(lock);                      \
15073 +       } while (0)
15074 +
15075 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
15076 +#define spin_trylock_irq(lock) spin_trylock(lock)
15077 +
15078 +#define spin_trylock_irqsave(lock, flags)      \
15079 +       rt_spin_trylock_irqsave(lock, &(flags))
15080 +
15081 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
15082 +
15083 +#ifdef CONFIG_GENERIC_LOCKBREAK
15084 +# define spin_is_contended(lock)       ((lock)->break_lock)
15085 +#else
15086 +# define spin_is_contended(lock)       (((void)(lock), 0))
15087 +#endif
15088 +
15089 +static inline int spin_can_lock(spinlock_t *lock)
15090 +{
15091 +       return !rt_mutex_is_locked(&lock->lock);
15092 +}
15093 +
15094 +static inline int spin_is_locked(spinlock_t *lock)
15095 +{
15096 +       return rt_mutex_is_locked(&lock->lock);
15097 +}
15098 +
15099 +static inline void assert_spin_locked(spinlock_t *lock)
15100 +{
15101 +       BUG_ON(!spin_is_locked(lock));
15102 +}
15103 +
15104 +#define atomic_dec_and_lock(atomic, lock) \
15105 +       atomic_dec_and_spin_lock(atomic, lock)
15106 +
15107 +#endif
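
spinlock_rt.h turns spinlock_t into a sleeping rt_mutex based lock; spin_lock_irq()/spin_lock_irqsave() no longer disable interrupts and merely zero flags, so code that must run in hardirq context has to be converted to raw_spinlock_t. A minimal usage sketch with hypothetical names:

#include <linux/spinlock.h>
#include <linux/list.h>

static DEFINE_SPINLOCK(my_lock);
static LIST_HEAD(my_items);

static void my_queue_item(struct list_head *item)
{
        unsigned long flags;

        /* Maps to rt_spin_lock(); interrupts stay enabled and flags is
         * simply set to 0 by the macro above. */
        spin_lock_irqsave(&my_lock, flags);
        list_add_tail(item, &my_items);
        spin_unlock_irqrestore(&my_lock, flags);
}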
15108 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
15109 index 73548eb13a5d..10bac715ea96 100644
15110 --- a/include/linux/spinlock_types.h
15111 +++ b/include/linux/spinlock_types.h
15112 @@ -9,80 +9,15 @@
15113   * Released under the General Public License (GPL).
15114   */
15115  
15116 -#if defined(CONFIG_SMP)
15117 -# include <asm/spinlock_types.h>
15118 -#else
15119 -# include <linux/spinlock_types_up.h>
15120 -#endif
15121 -
15122 -#include <linux/lockdep.h>
15123 -
15124 -typedef struct raw_spinlock {
15125 -       arch_spinlock_t raw_lock;
15126 -#ifdef CONFIG_GENERIC_LOCKBREAK
15127 -       unsigned int break_lock;
15128 -#endif
15129 -#ifdef CONFIG_DEBUG_SPINLOCK
15130 -       unsigned int magic, owner_cpu;
15131 -       void *owner;
15132 -#endif
15133 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15134 -       struct lockdep_map dep_map;
15135 -#endif
15136 -} raw_spinlock_t;
15137 -
15138 -#define SPINLOCK_MAGIC         0xdead4ead
15139 -
15140 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15141 -
15142 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15143 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15144 -#else
15145 -# define SPIN_DEP_MAP_INIT(lockname)
15146 -#endif
15147 +#include <linux/spinlock_types_raw.h>
15148  
15149 -#ifdef CONFIG_DEBUG_SPINLOCK
15150 -# define SPIN_DEBUG_INIT(lockname)             \
15151 -       .magic = SPINLOCK_MAGIC,                \
15152 -       .owner_cpu = -1,                        \
15153 -       .owner = SPINLOCK_OWNER_INIT,
15154 +#ifndef CONFIG_PREEMPT_RT_FULL
15155 +# include <linux/spinlock_types_nort.h>
15156 +# include <linux/rwlock_types.h>
15157  #else
15158 -# define SPIN_DEBUG_INIT(lockname)
15159 +# include <linux/rtmutex.h>
15160 +# include <linux/spinlock_types_rt.h>
15161 +# include <linux/rwlock_types_rt.h>
15162  #endif
15163  
15164 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15165 -       {                                       \
15166 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15167 -       SPIN_DEBUG_INIT(lockname)               \
15168 -       SPIN_DEP_MAP_INIT(lockname) }
15169 -
15170 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15171 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15172 -
15173 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15174 -
15175 -typedef struct spinlock {
15176 -       union {
15177 -               struct raw_spinlock rlock;
15178 -
15179 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15180 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15181 -               struct {
15182 -                       u8 __padding[LOCK_PADSIZE];
15183 -                       struct lockdep_map dep_map;
15184 -               };
15185 -#endif
15186 -       };
15187 -} spinlock_t;
15188 -
15189 -#define __SPIN_LOCK_INITIALIZER(lockname) \
15190 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15191 -
15192 -#define __SPIN_LOCK_UNLOCKED(lockname) \
15193 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15194 -
15195 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15196 -
15197 -#include <linux/rwlock_types.h>
15198 -
15199  #endif /* __LINUX_SPINLOCK_TYPES_H */
15200 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
15201 new file mode 100644
15202 index 000000000000..f1dac1fb1d6a
15203 --- /dev/null
15204 +++ b/include/linux/spinlock_types_nort.h
15205 @@ -0,0 +1,33 @@
15206 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
15207 +#define __LINUX_SPINLOCK_TYPES_NORT_H
15208 +
15209 +#ifndef __LINUX_SPINLOCK_TYPES_H
15210 +#error "Do not include directly. Include spinlock_types.h instead"
15211 +#endif
15212 +
15213 +/*
15214 + * The non RT version maps spinlocks to raw_spinlocks
15215 + */
15216 +typedef struct spinlock {
15217 +       union {
15218 +               struct raw_spinlock rlock;
15219 +
15220 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15221 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15222 +               struct {
15223 +                       u8 __padding[LOCK_PADSIZE];
15224 +                       struct lockdep_map dep_map;
15225 +               };
15226 +#endif
15227 +       };
15228 +} spinlock_t;
15229 +
15230 +#define __SPIN_LOCK_INITIALIZER(lockname) \
15231 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15232 +
15233 +#define __SPIN_LOCK_UNLOCKED(lockname) \
15234 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15235 +
15236 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15237 +
15238 +#endif
15239 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
15240 new file mode 100644
15241 index 000000000000..edffc4d53fc9
15242 --- /dev/null
15243 +++ b/include/linux/spinlock_types_raw.h
15244 @@ -0,0 +1,56 @@
15245 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
15246 +#define __LINUX_SPINLOCK_TYPES_RAW_H
15247 +
15248 +#if defined(CONFIG_SMP)
15249 +# include <asm/spinlock_types.h>
15250 +#else
15251 +# include <linux/spinlock_types_up.h>
15252 +#endif
15253 +
15254 +#include <linux/lockdep.h>
15255 +
15256 +typedef struct raw_spinlock {
15257 +       arch_spinlock_t raw_lock;
15258 +#ifdef CONFIG_GENERIC_LOCKBREAK
15259 +       unsigned int break_lock;
15260 +#endif
15261 +#ifdef CONFIG_DEBUG_SPINLOCK
15262 +       unsigned int magic, owner_cpu;
15263 +       void *owner;
15264 +#endif
15265 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15266 +       struct lockdep_map dep_map;
15267 +#endif
15268 +} raw_spinlock_t;
15269 +
15270 +#define SPINLOCK_MAGIC         0xdead4ead
15271 +
15272 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15273 +
15274 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15275 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15276 +#else
15277 +# define SPIN_DEP_MAP_INIT(lockname)
15278 +#endif
15279 +
15280 +#ifdef CONFIG_DEBUG_SPINLOCK
15281 +# define SPIN_DEBUG_INIT(lockname)             \
15282 +       .magic = SPINLOCK_MAGIC,                \
15283 +       .owner_cpu = -1,                        \
15284 +       .owner = SPINLOCK_OWNER_INIT,
15285 +#else
15286 +# define SPIN_DEBUG_INIT(lockname)
15287 +#endif
15288 +
15289 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15290 +       {                                       \
15291 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15292 +       SPIN_DEBUG_INIT(lockname)               \
15293 +       SPIN_DEP_MAP_INIT(lockname) }
15294 +
15295 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15296 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15297 +
15298 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15299 +
15300 +#endif
15301 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
15302 new file mode 100644
15303 index 000000000000..9fd431967abc
15304 --- /dev/null
15305 +++ b/include/linux/spinlock_types_rt.h
15306 @@ -0,0 +1,51 @@
15307 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
15308 +#define __LINUX_SPINLOCK_TYPES_RT_H
15309 +
15310 +#ifndef __LINUX_SPINLOCK_TYPES_H
15311 +#error "Do not include directly. Include spinlock_types.h instead"
15312 +#endif
15313 +
15314 +#include <linux/cache.h>
15315 +
15316 +/*
15317 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
15318 + */
15319 +typedef struct spinlock {
15320 +       struct rt_mutex         lock;
15321 +       unsigned int            break_lock;
15322 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15323 +       struct lockdep_map      dep_map;
15324 +#endif
15325 +} spinlock_t;
15326 +
15327 +#ifdef CONFIG_DEBUG_RT_MUTEXES
15328 +# define __RT_SPIN_INITIALIZER(name) \
15329 +       { \
15330 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15331 +       .save_state = 1, \
15332 +       .file = __FILE__, \
15333 +       .line = __LINE__ , \
15334 +       }
15335 +#else
15336 +# define __RT_SPIN_INITIALIZER(name) \
15337 +       {                                                               \
15338 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
15339 +       .save_state = 1, \
15340 +       }
15341 +#endif
15342 +
15343 +/*
15344 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
15345 +*/
15346 +
15347 +#define __SPIN_LOCK_UNLOCKED(name)                     \
15348 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
15349 +         SPIN_DEP_MAP_INIT(name) }
15350 +
15351 +#define __DEFINE_SPINLOCK(name) \
15352 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
15353 +
15354 +#define DEFINE_SPINLOCK(name) \
15355 +       spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name)
15356 +
15357 +#endif
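The two spinlock_types_* headers above keep the spinlock_t API constant while swapping the backing type: the non-RT variant wraps a raw_spinlock, the RT variant wraps an rt_mutex so the lock becomes sleepable and priority-inheriting. A minimal caller sketch (the lock, counter and function names are hypothetical, not part of this patch) showing that lock users build unchanged against either definition:

/* Hypothetical caller; the same source builds on RT and non-RT kernels. */
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);	/* raw_spinlock-backed on !RT, rt_mutex-backed on PREEMPT_RT_FULL */
static unsigned long example_count;

static void example_inc(void)
{
	spin_lock(&example_lock);	/* spins on !RT; may sleep with PI on RT */
	example_count++;
	spin_unlock(&example_lock);
}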
15358 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
15359 index f5f80c5643ac..ec1a8f01563c 100644
15360 --- a/include/linux/srcu.h
15361 +++ b/include/linux/srcu.h
15362 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
15363  
15364  void process_srcu(struct work_struct *work);
15365  
15366 -#define __SRCU_STRUCT_INIT(name)                                       \
15367 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
15368         {                                                               \
15369                 .completed = -300,                                      \
15370 -               .per_cpu_ref = &name##_srcu_array,                      \
15371 +               .per_cpu_ref = &pcpu_name,                              \
15372                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
15373                 .running = false,                                       \
15374                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
15375 @@ -104,7 +104,7 @@ void process_srcu(struct work_struct *work);
15376   */
15377  #define __DEFINE_SRCU(name, is_static)                                 \
15378         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
15379 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15380 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
15381  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
15382  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
15383  
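The srcu.h hunk passes the per-cpu array into __SRCU_STRUCT_INIT() as an explicit second argument instead of rebuilding the name##_srcu_array token inside the initializer. A rough post-patch expansion of DEFINE_STATIC_SRCU(foo), with foo as an illustrative name:

/* Approximate expansion of DEFINE_STATIC_SRCU(foo) after this change: */
static DEFINE_PER_CPU(struct srcu_struct_array, foo_srcu_array);
static struct srcu_struct foo = __SRCU_STRUCT_INIT(foo, foo_srcu_array);
/* ...so .per_cpu_ref = &foo_srcu_array now comes from the explicit argument. */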
15384 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
15385 index 8b6ec7ef0854..9b77d4cc929f 100644
15386 --- a/include/linux/suspend.h
15387 +++ b/include/linux/suspend.h
15388 @@ -194,6 +194,12 @@ struct platform_freeze_ops {
15389         void (*end)(void);
15390  };
15391  
15392 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
15393 +extern bool pm_in_action;
15394 +#else
15395 +# define pm_in_action false
15396 +#endif
15397 +
15398  #ifdef CONFIG_SUSPEND
15399  /**
15400   * suspend_set_ops - set platform dependent suspend operations
15401 diff --git a/include/linux/swait.h b/include/linux/swait.h
15402 new file mode 100644
15403 index 000000000000..83f004a72320
15404 --- /dev/null
15405 +++ b/include/linux/swait.h
15406 @@ -0,0 +1,173 @@
15407 +#ifndef _LINUX_SWAIT_H
15408 +#define _LINUX_SWAIT_H
15409 +
15410 +#include <linux/list.h>
15411 +#include <linux/stddef.h>
15412 +#include <linux/spinlock.h>
15413 +#include <asm/current.h>
15414 +
15415 +/*
15416 + * Simple wait queues
15417 + *
15418 + * While these are very similar to the other/complex wait queues (wait.h) the
15419 + * most important difference is that the simple waitqueue allows for
15420 + * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
15421 + * times.
15422 + *
15423 + * In order to make this so, we had to drop a fair number of features of the
15424 + * other waitqueue code; notably:
15425 + *
15426 + *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
15427 + *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
15428 + *    sleeper state.
15429 + *
15430 + *  - the exclusive mode; because this requires preserving the list order
15431 + *    and this is hard.
15432 + *
15433 + *  - custom wake functions; because you cannot give any guarantees about
15434 + *    random code.
15435 + *
15436 + * As a side effect of this; the data structures are slimmer.
15437 + *
15438 + * One would recommend using this wait queue where possible.
15439 + */
15440 +
15441 +struct task_struct;
15442 +
15443 +struct swait_queue_head {
15444 +       raw_spinlock_t          lock;
15445 +       struct list_head        task_list;
15446 +};
15447 +
15448 +struct swait_queue {
15449 +       struct task_struct      *task;
15450 +       struct list_head        task_list;
15451 +};
15452 +
15453 +#define __SWAITQUEUE_INITIALIZER(name) {                               \
15454 +       .task           = current,                                      \
15455 +       .task_list      = LIST_HEAD_INIT((name).task_list),             \
15456 +}
15457 +
15458 +#define DECLARE_SWAITQUEUE(name)                                       \
15459 +       struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
15460 +
15461 +#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) {                         \
15462 +       .lock           = __RAW_SPIN_LOCK_UNLOCKED(name.lock),          \
15463 +       .task_list      = LIST_HEAD_INIT((name).task_list),             \
15464 +}
15465 +
15466 +#define DECLARE_SWAIT_QUEUE_HEAD(name)                                 \
15467 +       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
15468 +
15469 +extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
15470 +                                   struct lock_class_key *key);
15471 +
15472 +#define init_swait_queue_head(q)                               \
15473 +       do {                                                    \
15474 +               static struct lock_class_key __key;             \
15475 +               __init_swait_queue_head((q), #q, &__key);       \
15476 +       } while (0)
15477 +
15478 +#ifdef CONFIG_LOCKDEP
15479 +# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)                 \
15480 +       ({ init_swait_queue_head(&name); name; })
15481 +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
15482 +       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
15483 +#else
15484 +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
15485 +       DECLARE_SWAIT_QUEUE_HEAD(name)
15486 +#endif
15487 +
15488 +static inline int swait_active(struct swait_queue_head *q)
15489 +{
15490 +       return !list_empty(&q->task_list);
15491 +}
15492 +
15493 +extern void swake_up(struct swait_queue_head *q);
15494 +extern void swake_up_all(struct swait_queue_head *q);
15495 +extern void swake_up_locked(struct swait_queue_head *q);
15496 +extern void swake_up_all_locked(struct swait_queue_head *q);
15497 +
15498 +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
15499 +extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
15500 +extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
15501 +
15502 +extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
15503 +extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
15504 +
15505 +/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
15506 +#define ___swait_event(wq, condition, state, ret, cmd)                 \
15507 +({                                                                     \
15508 +       struct swait_queue __wait;                                      \
15509 +       long __ret = ret;                                               \
15510 +                                                                       \
15511 +       INIT_LIST_HEAD(&__wait.task_list);                              \
15512 +       for (;;) {                                                      \
15513 +               long __int = prepare_to_swait_event(&wq, &__wait, state);\
15514 +                                                                       \
15515 +               if (condition)                                          \
15516 +                       break;                                          \
15517 +                                                                       \
15518 +               if (___wait_is_interruptible(state) && __int) {         \
15519 +                       __ret = __int;                                  \
15520 +                       break;                                          \
15521 +               }                                                       \
15522 +                                                                       \
15523 +               cmd;                                                    \
15524 +       }                                                               \
15525 +       finish_swait(&wq, &__wait);                                     \
15526 +       __ret;                                                          \
15527 +})
15528 +
15529 +#define __swait_event(wq, condition)                                   \
15530 +       (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,    \
15531 +                           schedule())
15532 +
15533 +#define swait_event(wq, condition)                                     \
15534 +do {                                                                   \
15535 +       if (condition)                                                  \
15536 +               break;                                                  \
15537 +       __swait_event(wq, condition);                                   \
15538 +} while (0)
15539 +
15540 +#define __swait_event_timeout(wq, condition, timeout)                  \
15541 +       ___swait_event(wq, ___wait_cond_timeout(condition),             \
15542 +                     TASK_UNINTERRUPTIBLE, timeout,                    \
15543 +                     __ret = schedule_timeout(__ret))
15544 +
15545 +#define swait_event_timeout(wq, condition, timeout)                    \
15546 +({                                                                     \
15547 +       long __ret = timeout;                                           \
15548 +       if (!___wait_cond_timeout(condition))                           \
15549 +               __ret = __swait_event_timeout(wq, condition, timeout);  \
15550 +       __ret;                                                          \
15551 +})
15552 +
15553 +#define __swait_event_interruptible(wq, condition)                     \
15554 +       ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,            \
15555 +                     schedule())
15556 +
15557 +#define swait_event_interruptible(wq, condition)                       \
15558 +({                                                                     \
15559 +       int __ret = 0;                                                  \
15560 +       if (!(condition))                                               \
15561 +               __ret = __swait_event_interruptible(wq, condition);     \
15562 +       __ret;                                                          \
15563 +})
15564 +
15565 +#define __swait_event_interruptible_timeout(wq, condition, timeout)    \
15566 +       ___swait_event(wq, ___wait_cond_timeout(condition),             \
15567 +                     TASK_INTERRUPTIBLE, timeout,                      \
15568 +                     __ret = schedule_timeout(__ret))
15569 +
15570 +#define swait_event_interruptible_timeout(wq, condition, timeout)      \
15571 +({                                                                     \
15572 +       long __ret = timeout;                                           \
15573 +       if (!___wait_cond_timeout(condition))                           \
15574 +               __ret = __swait_event_interruptible_timeout(wq,         \
15575 +                                               condition, timeout);    \
15576 +       __ret;                                                          \
15577 +})
15578 +
15579 +#endif /* _LINUX_SWAIT_H */
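A short usage sketch of the simple waitqueue interface declared above; the queue, flag and functions are hypothetical and only illustrate the intended calling pattern:

/* Hypothetical user of the swait API declared above. */
static DECLARE_SWAIT_QUEUE_HEAD(example_wq);
static bool example_done;

static int example_waiter(void)
{
	/* Sleeps interruptibly until example_done becomes true. */
	return swait_event_interruptible(example_wq, example_done);
}

static void example_complete(void)
{
	example_done = true;
	swake_up(&example_wq);		/* wakes one waiter, always TASK_NORMAL */
}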
15580 diff --git a/include/linux/swap.h b/include/linux/swap.h
15581 index d8ca2eaa3a8b..19e038054914 100644
15582 --- a/include/linux/swap.h
15583 +++ b/include/linux/swap.h
15584 @@ -11,6 +11,7 @@
15585  #include <linux/fs.h>
15586  #include <linux/atomic.h>
15587  #include <linux/page-flags.h>
15588 +#include <linux/locallock.h>
15589  #include <asm/page.h>
15590  
15591  struct notifier_block;
15592 @@ -252,7 +253,8 @@ struct swap_info_struct {
15593  void *workingset_eviction(struct address_space *mapping, struct page *page);
15594  bool workingset_refault(void *shadow);
15595  void workingset_activation(struct page *page);
15596 -extern struct list_lru workingset_shadow_nodes;
15597 +extern struct list_lru __workingset_shadow_nodes;
15598 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
15599  
15600  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
15601  {
15602 @@ -298,6 +300,7 @@ extern unsigned long nr_free_pagecache_pages(void);
15603  
15604  
15605  /* linux/mm/swap.c */
15606 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
15607  extern void lru_cache_add(struct page *);
15608  extern void lru_cache_add_anon(struct page *page);
15609  extern void lru_cache_add_file(struct page *page);
15610 diff --git a/include/linux/swork.h b/include/linux/swork.h
15611 new file mode 100644
15612 index 000000000000..f175fa9a6016
15613 --- /dev/null
15614 +++ b/include/linux/swork.h
15615 @@ -0,0 +1,24 @@
15616 +#ifndef _LINUX_SWORK_H
15617 +#define _LINUX_SWORK_H
15618 +
15619 +#include <linux/list.h>
15620 +
15621 +struct swork_event {
15622 +       struct list_head item;
15623 +       unsigned long flags;
15624 +       void (*func)(struct swork_event *);
15625 +};
15626 +
15627 +static inline void INIT_SWORK(struct swork_event *event,
15628 +                             void (*func)(struct swork_event *))
15629 +{
15630 +       event->flags = 0;
15631 +       event->func = func;
15632 +}
15633 +
15634 +bool swork_queue(struct swork_event *sev);
15635 +
15636 +int swork_get(void);
15637 +void swork_put(void);
15638 +
15639 +#endif /* _LINUX_SWORK_H */
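swork.h only declares the interface; a hypothetical user would bracket its use of the worker thread with swork_get()/swork_put() and submit events with swork_queue(), roughly like this (names are illustrative):

/* Hypothetical user of the swork interface declared above. */
static void example_swork_fn(struct swork_event *sev)
{
	/* Runs later, in the context of the swork worker thread. */
}

static struct swork_event example_event;

static int example_init(void)
{
	int ret = swork_get();			/* bring up / reference the worker */

	if (ret)
		return ret;
	INIT_SWORK(&example_event, example_swork_fn);
	swork_queue(&example_event);		/* hand the event to the worker */
	return 0;
}

static void example_exit(void)
{
	swork_put();				/* drop the worker reference */
}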
15640 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
15641 index ff307b548ed3..be9f9dc6a4e1 100644
15642 --- a/include/linux/thread_info.h
15643 +++ b/include/linux/thread_info.h
15644 @@ -102,7 +102,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
15645  #define test_thread_flag(flag) \
15646         test_ti_thread_flag(current_thread_info(), flag)
15647  
15648 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
15649 +#ifdef CONFIG_PREEMPT_LAZY
15650 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
15651 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
15652 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
15653 +#define tif_need_resched_lazy()        (test_thread_flag(TIF_NEED_RESCHED_LAZY))
15654 +
15655 +#else
15656 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
15657 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
15658 +#define tif_need_resched_lazy()        0
15659 +#endif
15660  
15661  #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
15662  /*
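With CONFIG_PREEMPT_LAZY the combined tif_need_resched() above reports either flag, while the _now()/_lazy() variants let a caller distinguish an immediate reschedule request from one that may wait for the next preemption point. A purely illustrative check, not part of this patch:

/* Illustrative only: using the split predicates added above. */
static inline bool example_must_resched_now(void)
{
	return tif_need_resched_now();	/* TIF_NEED_RESCHED is set */
}

static inline bool example_resched_pending(void)
{
	return tif_need_resched();	/* NEED_RESCHED, or NEED_RESCHED_LAZY under PREEMPT_LAZY */
}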
15663 diff --git a/include/linux/timer.h b/include/linux/timer.h
15664 index 61aa61dc410c..299d2b78591f 100644
15665 --- a/include/linux/timer.h
15666 +++ b/include/linux/timer.h
15667 @@ -225,7 +225,7 @@ extern void add_timer(struct timer_list *timer);
15668  
15669  extern int try_to_del_timer_sync(struct timer_list *timer);
15670  
15671 -#ifdef CONFIG_SMP
15672 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
15673    extern int del_timer_sync(struct timer_list *timer);
15674  #else
15675  # define del_timer_sync(t)             del_timer(t)
15676 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
15677 index 925730bc9fc1..a591f414da6f 100644
15678 --- a/include/linux/trace_events.h
15679 +++ b/include/linux/trace_events.h
15680 @@ -66,6 +66,9 @@ struct trace_entry {
15681         unsigned char           flags;
15682         unsigned char           preempt_count;
15683         int                     pid;
15684 +       unsigned short          migrate_disable;
15685 +       unsigned short          padding;
15686 +       unsigned char           preempt_lazy_count;
15687  };
15688  
15689  #define TRACE_EVENT_TYPE_MAX                                           \
15690 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
15691 index 558129af828a..cf5c472bbc79 100644
15692 --- a/include/linux/uaccess.h
15693 +++ b/include/linux/uaccess.h
15694 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
15695   */
15696  static inline void pagefault_disable(void)
15697  {
15698 +       migrate_disable();
15699         pagefault_disabled_inc();
15700         /*
15701          * make sure to have issued the store before a pagefault
15702 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
15703          */
15704         barrier();
15705         pagefault_disabled_dec();
15706 +       migrate_enable();
15707  }
15708  
15709  /*
15710 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
15711 index 4a29c75b146e..0a294e950df8 100644
15712 --- a/include/linux/uprobes.h
15713 +++ b/include/linux/uprobes.h
15714 @@ -27,6 +27,7 @@
15715  #include <linux/errno.h>
15716  #include <linux/rbtree.h>
15717  #include <linux/types.h>
15718 +#include <linux/wait.h>
15719  
15720  struct vm_area_struct;
15721  struct mm_struct;
15722 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
15723 index 3e5d9075960f..7eaa847cd5a5 100644
15724 --- a/include/linux/vmstat.h
15725 +++ b/include/linux/vmstat.h
15726 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
15727   */
15728  static inline void __count_vm_event(enum vm_event_item item)
15729  {
15730 +       preempt_disable_rt();
15731         raw_cpu_inc(vm_event_states.event[item]);
15732 +       preempt_enable_rt();
15733  }
15734  
15735  static inline void count_vm_event(enum vm_event_item item)
15736 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
15737  
15738  static inline void __count_vm_events(enum vm_event_item item, long delta)
15739  {
15740 +       preempt_disable_rt();
15741         raw_cpu_add(vm_event_states.event[item], delta);
15742 +       preempt_enable_rt();
15743  }
15744  
15745  static inline void count_vm_events(enum vm_event_item item, long delta)
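The vmstat counters above are updated with raw per-cpu operations, so the RT patch brackets them with preempt_disable_rt()/preempt_enable_rt(). Those helpers are introduced elsewhere in this patch; the sketch below shows their assumed shape (real preemption disabling only when PREEMPT_RT_BASE is enabled) and is an assumption for illustration, not quoted from this hunk:

/* Assumed shape of the helpers (defined elsewhere in the RT patch, not here): */
#ifdef CONFIG_PREEMPT_RT_BASE
# define preempt_disable_rt()		preempt_disable()
# define preempt_enable_rt()		preempt_enable()
#else
# define preempt_disable_rt()		barrier()
# define preempt_enable_rt()		barrier()
#endif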
15746 diff --git a/include/linux/wait.h b/include/linux/wait.h
15747 index 513b36f04dfd..981c8a840f96 100644
15748 --- a/include/linux/wait.h
15749 +++ b/include/linux/wait.h
15750 @@ -8,6 +8,7 @@
15751  #include <linux/spinlock.h>
15752  #include <asm/current.h>
15753  #include <uapi/linux/wait.h>
15754 +#include <linux/atomic.h>
15755  
15756  typedef struct __wait_queue wait_queue_t;
15757  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
15758 diff --git a/include/net/dst.h b/include/net/dst.h
15759 index c7329dcd90cc..35c3dba16728 100644
15760 --- a/include/net/dst.h
15761 +++ b/include/net/dst.h
15762 @@ -437,7 +437,7 @@ static inline void dst_confirm(struct dst_entry *dst)
15763  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
15764                                    struct sk_buff *skb)
15765  {
15766 -       const struct hh_cache *hh;
15767 +       struct hh_cache *hh;
15768  
15769         if (dst->pending_confirm) {
15770                 unsigned long now = jiffies;
15771 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
15772 index 8b683841e574..bf656008f6e7 100644
15773 --- a/include/net/neighbour.h
15774 +++ b/include/net/neighbour.h
15775 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
15776  }
15777  #endif
15778  
15779 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
15780 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
15781  {
15782         unsigned int seq;
15783         int hh_len;
15784 @@ -501,7 +501,7 @@ struct neighbour_cb {
15785  
15786  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
15787  
15788 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
15789 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
15790                                      const struct net_device *dev)
15791  {
15792         unsigned int seq;
15793 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
15794 index c68926b4899c..dd0751e76065 100644
15795 --- a/include/net/netns/ipv4.h
15796 +++ b/include/net/netns/ipv4.h
15797 @@ -70,6 +70,7 @@ struct netns_ipv4 {
15798  
15799         int sysctl_icmp_echo_ignore_all;
15800         int sysctl_icmp_echo_ignore_broadcasts;
15801 +       int sysctl_icmp_echo_sysrq;
15802         int sysctl_icmp_ignore_bogus_error_responses;
15803         int sysctl_icmp_ratelimit;
15804         int sysctl_icmp_ratemask;
15805 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
15806 new file mode 100644
15807 index 000000000000..f7710de1b1f3
15808 --- /dev/null
15809 +++ b/include/trace/events/hist.h
15810 @@ -0,0 +1,73 @@
15811 +#undef TRACE_SYSTEM
15812 +#define TRACE_SYSTEM hist
15813 +
15814 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
15815 +#define _TRACE_HIST_H
15816 +
15817 +#include "latency_hist.h"
15818 +#include <linux/tracepoint.h>
15819 +
15820 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
15821 +#define trace_preemptirqsoff_hist(a, b)
15822 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
15823 +#else
15824 +TRACE_EVENT(preemptirqsoff_hist,
15825 +
15826 +       TP_PROTO(int reason, int starthist),
15827 +
15828 +       TP_ARGS(reason, starthist),
15829 +
15830 +       TP_STRUCT__entry(
15831 +               __field(int,    reason)
15832 +               __field(int,    starthist)
15833 +       ),
15834 +
15835 +       TP_fast_assign(
15836 +               __entry->reason         = reason;
15837 +               __entry->starthist      = starthist;
15838 +       ),
15839 +
15840 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
15841 +                 __entry->starthist ? "start" : "stop")
15842 +);
15843 +#endif
15844 +
15845 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
15846 +#define trace_hrtimer_interrupt(a, b, c, d)
15847 +#else
15848 +TRACE_EVENT(hrtimer_interrupt,
15849 +
15850 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
15851 +               struct task_struct *task),
15852 +
15853 +       TP_ARGS(cpu, offset, curr, task),
15854 +
15855 +       TP_STRUCT__entry(
15856 +               __field(int,            cpu)
15857 +               __field(long long,      offset)
15858 +               __array(char,           ccomm,  TASK_COMM_LEN)
15859 +               __field(int,            cprio)
15860 +               __array(char,           tcomm,  TASK_COMM_LEN)
15861 +               __field(int,            tprio)
15862 +       ),
15863 +
15864 +       TP_fast_assign(
15865 +               __entry->cpu    = cpu;
15866 +               __entry->offset = offset;
15867 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
15868 +               __entry->cprio  = curr->prio;
15869 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
15870 +                       task != NULL ? TASK_COMM_LEN : 7);
15871 +               __entry->tprio  = task != NULL ? task->prio : -1;
15872 +       ),
15873 +
15874 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
15875 +               __entry->cpu, __entry->offset, __entry->ccomm,
15876 +               __entry->cprio, __entry->tcomm, __entry->tprio)
15877 +);
15878 +#endif
15879 +
15880 +#endif /* _TRACE_HIST_H */
15881 +
15882 +/* This part must be outside protection */
15883 +#include <trace/define_trace.h>
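The preemptirqsoff_hist event above takes a reason from the hist_action enum (declared in latency_hist.h, added just below) plus a start/stop flag. A hedged sketch of how tracing code might emit it, using the _rcuidle variant that TRACE_EVENT() also generates; the hook functions are hypothetical:

/* Hypothetical call sites; the real ones live in the tracing code. */
static void example_irqs_disabled_hook(void)
{
	trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);	/* start the latency measurement */
}

static void example_irqs_enabled_hook(void)
{
	trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);	/* stop the latency measurement */
}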
15884 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
15885 new file mode 100644
15886 index 000000000000..d3f2fbd560b1
15887 --- /dev/null
15888 +++ b/include/trace/events/latency_hist.h
15889 @@ -0,0 +1,29 @@
15890 +#ifndef _LATENCY_HIST_H
15891 +#define _LATENCY_HIST_H
15892 +
15893 +enum hist_action {
15894 +       IRQS_ON,
15895 +       PREEMPT_ON,
15896 +       TRACE_STOP,
15897 +       IRQS_OFF,
15898 +       PREEMPT_OFF,
15899 +       TRACE_START,
15900 +};
15901 +
15902 +static char *actions[] = {
15903 +       "IRQS_ON",
15904 +       "PREEMPT_ON",
15905 +       "TRACE_STOP",
15906 +       "IRQS_OFF",
15907 +       "PREEMPT_OFF",
15908 +       "TRACE_START",
15909 +};
15910 +
15911 +static inline char *getaction(int action)
15912 +{
15913 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
15914 +               return actions[action];
15915 +       return "unknown";
15916 +}
15917 +
15918 +#endif /* _LATENCY_HIST_H */
15919 diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
15920 index fff846b512e6..73614ce1d204 100644
15921 --- a/include/trace/events/writeback.h
15922 +++ b/include/trace/events/writeback.h
15923 @@ -134,58 +134,28 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
15924  #ifdef CREATE_TRACE_POINTS
15925  #ifdef CONFIG_CGROUP_WRITEBACK
15926  
15927 -static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
15928 +static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
15929  {
15930 -       return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
15931 +       return wb->memcg_css->cgroup->kn->ino;
15932  }
15933  
15934 -static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
15935 -{
15936 -       struct cgroup *cgrp = wb->memcg_css->cgroup;
15937 -       char *path;
15938 -
15939 -       path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
15940 -       WARN_ON_ONCE(path != buf);
15941 -}
15942 -
15943 -static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
15944 -{
15945 -       if (wbc->wb)
15946 -               return __trace_wb_cgroup_size(wbc->wb);
15947 -       else
15948 -               return 2;
15949 -}
15950 -
15951 -static inline void __trace_wbc_assign_cgroup(char *buf,
15952 -                                            struct writeback_control *wbc)
15953 +static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
15954  {
15955         if (wbc->wb)
15956 -               __trace_wb_assign_cgroup(buf, wbc->wb);
15957 +               return __trace_wb_assign_cgroup(wbc->wb);
15958         else
15959 -               strcpy(buf, "/");
15960 +               return -1U;
15961  }
15962 -
15963  #else  /* CONFIG_CGROUP_WRITEBACK */
15964  
15965 -static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
15966 -{
15967 -       return 2;
15968 -}
15969 -
15970 -static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
15971 -{
15972 -       strcpy(buf, "/");
15973 -}
15974 -
15975 -static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
15976 +static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
15977  {
15978 -       return 2;
15979 +       return -1U;
15980  }
15981  
15982 -static inline void __trace_wbc_assign_cgroup(char *buf,
15983 -                                            struct writeback_control *wbc)
15984 +static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
15985  {
15986 -       strcpy(buf, "/");
15987 +       return -1U;
15988  }
15989  
15990  #endif /* CONFIG_CGROUP_WRITEBACK */
15991 @@ -201,7 +171,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
15992                 __array(char, name, 32)
15993                 __field(unsigned long, ino)
15994                 __field(int, sync_mode)
15995 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
15996 +               __field(unsigned int, cgroup_ino)
15997         ),
15998  
15999         TP_fast_assign(
16000 @@ -209,14 +179,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
16001                         dev_name(inode_to_bdi(inode)->dev), 32);
16002                 __entry->ino            = inode->i_ino;
16003                 __entry->sync_mode      = wbc->sync_mode;
16004 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16005 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16006         ),
16007  
16008 -       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
16009 +       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%u",
16010                 __entry->name,
16011                 __entry->ino,
16012                 __entry->sync_mode,
16013 -               __get_str(cgroup)
16014 +               __entry->cgroup_ino
16015         )
16016  );
16017  
16018 @@ -246,7 +216,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16019                 __field(int, range_cyclic)
16020                 __field(int, for_background)
16021                 __field(int, reason)
16022 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16023 +               __field(unsigned int, cgroup_ino)
16024         ),
16025         TP_fast_assign(
16026                 strncpy(__entry->name,
16027 @@ -258,10 +228,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16028                 __entry->range_cyclic = work->range_cyclic;
16029                 __entry->for_background = work->for_background;
16030                 __entry->reason = work->reason;
16031 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16032 +               __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
16033         ),
16034         TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
16035 -                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
16036 +                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%u",
16037                   __entry->name,
16038                   MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
16039                   __entry->nr_pages,
16040 @@ -270,7 +240,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16041                   __entry->range_cyclic,
16042                   __entry->for_background,
16043                   __print_symbolic(__entry->reason, WB_WORK_REASON),
16044 -                 __get_str(cgroup)
16045 +                 __entry->cgroup_ino
16046         )
16047  );
16048  #define DEFINE_WRITEBACK_WORK_EVENT(name) \
16049 @@ -300,15 +270,15 @@ DECLARE_EVENT_CLASS(writeback_class,
16050         TP_ARGS(wb),
16051         TP_STRUCT__entry(
16052                 __array(char, name, 32)
16053 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16054 +               __field(unsigned int, cgroup_ino)
16055         ),
16056         TP_fast_assign(
16057                 strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
16058 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16059 +               __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
16060         ),
16061 -       TP_printk("bdi %s: cgroup=%s",
16062 +       TP_printk("bdi %s: cgroup_ino=%u",
16063                   __entry->name,
16064 -                 __get_str(cgroup)
16065 +                 __entry->cgroup_ino
16066         )
16067  );
16068  #define DEFINE_WRITEBACK_EVENT(name) \
16069 @@ -347,7 +317,7 @@ DECLARE_EVENT_CLASS(wbc_class,
16070                 __field(int, range_cyclic)
16071                 __field(long, range_start)
16072                 __field(long, range_end)
16073 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16074 +               __field(unsigned int, cgroup_ino)
16075         ),
16076  
16077         TP_fast_assign(
16078 @@ -361,12 +331,12 @@ DECLARE_EVENT_CLASS(wbc_class,
16079                 __entry->range_cyclic   = wbc->range_cyclic;
16080                 __entry->range_start    = (long)wbc->range_start;
16081                 __entry->range_end      = (long)wbc->range_end;
16082 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16083 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16084         ),
16085  
16086         TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
16087                 "bgrd=%d reclm=%d cyclic=%d "
16088 -               "start=0x%lx end=0x%lx cgroup=%s",
16089 +               "start=0x%lx end=0x%lx cgroup_ino=%u",
16090                 __entry->name,
16091                 __entry->nr_to_write,
16092                 __entry->pages_skipped,
16093 @@ -377,7 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
16094                 __entry->range_cyclic,
16095                 __entry->range_start,
16096                 __entry->range_end,
16097 -               __get_str(cgroup)
16098 +               __entry->cgroup_ino
16099         )
16100  )
16101  
16102 @@ -398,7 +368,7 @@ TRACE_EVENT(writeback_queue_io,
16103                 __field(long,           age)
16104                 __field(int,            moved)
16105                 __field(int,            reason)
16106 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16107 +               __field(unsigned int,   cgroup_ino)
16108         ),
16109         TP_fast_assign(
16110                 unsigned long *older_than_this = work->older_than_this;
16111 @@ -408,15 +378,15 @@ TRACE_EVENT(writeback_queue_io,
16112                                   (jiffies - *older_than_this) * 1000 / HZ : -1;
16113                 __entry->moved  = moved;
16114                 __entry->reason = work->reason;
16115 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16116 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16117         ),
16118 -       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
16119 +       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%u",
16120                 __entry->name,
16121                 __entry->older, /* older_than_this in jiffies */
16122                 __entry->age,   /* older_than_this in relative milliseconds */
16123                 __entry->moved,
16124                 __print_symbolic(__entry->reason, WB_WORK_REASON),
16125 -               __get_str(cgroup)
16126 +               __entry->cgroup_ino
16127         )
16128  );
16129  
16130 @@ -484,7 +454,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16131                 __field(unsigned long,  dirty_ratelimit)
16132                 __field(unsigned long,  task_ratelimit)
16133                 __field(unsigned long,  balanced_dirty_ratelimit)
16134 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16135 +               __field(unsigned int,   cgroup_ino)
16136         ),
16137  
16138         TP_fast_assign(
16139 @@ -496,13 +466,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16140                 __entry->task_ratelimit = KBps(task_ratelimit);
16141                 __entry->balanced_dirty_ratelimit =
16142                                         KBps(wb->balanced_dirty_ratelimit);
16143 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16144 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16145         ),
16146  
16147         TP_printk("bdi %s: "
16148                   "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
16149                   "dirty_ratelimit=%lu task_ratelimit=%lu "
16150 -                 "balanced_dirty_ratelimit=%lu cgroup=%s",
16151 +                 "balanced_dirty_ratelimit=%lu cgroup_ino=%u",
16152                   __entry->bdi,
16153                   __entry->write_bw,            /* write bandwidth */
16154                   __entry->avg_write_bw,        /* avg write bandwidth */
16155 @@ -510,7 +480,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16156                   __entry->dirty_ratelimit,     /* base ratelimit */
16157                   __entry->task_ratelimit, /* ratelimit with position control */
16158                   __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
16159 -                 __get_str(cgroup)
16160 +                 __entry->cgroup_ino
16161         )
16162  );
16163  
16164 @@ -548,7 +518,7 @@ TRACE_EVENT(balance_dirty_pages,
16165                 __field(         long,  pause)
16166                 __field(unsigned long,  period)
16167                 __field(         long,  think)
16168 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16169 +               __field(unsigned int,   cgroup_ino)
16170         ),
16171  
16172         TP_fast_assign(
16173 @@ -571,7 +541,7 @@ TRACE_EVENT(balance_dirty_pages,
16174                 __entry->period         = period * 1000 / HZ;
16175                 __entry->pause          = pause * 1000 / HZ;
16176                 __entry->paused         = (jiffies - start_time) * 1000 / HZ;
16177 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16178 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16179         ),
16180  
16181  
16182 @@ -580,7 +550,7 @@ TRACE_EVENT(balance_dirty_pages,
16183                   "bdi_setpoint=%lu bdi_dirty=%lu "
16184                   "dirty_ratelimit=%lu task_ratelimit=%lu "
16185                   "dirtied=%u dirtied_pause=%u "
16186 -                 "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
16187 +                 "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%u",
16188                   __entry->bdi,
16189                   __entry->limit,
16190                   __entry->setpoint,
16191 @@ -595,7 +565,7 @@ TRACE_EVENT(balance_dirty_pages,
16192                   __entry->pause,       /* ms */
16193                   __entry->period,      /* ms */
16194                   __entry->think,       /* ms */
16195 -                 __get_str(cgroup)
16196 +                 __entry->cgroup_ino
16197           )
16198  );
16199  
16200 @@ -609,8 +579,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
16201                 __field(unsigned long, ino)
16202                 __field(unsigned long, state)
16203                 __field(unsigned long, dirtied_when)
16204 -               __dynamic_array(char, cgroup,
16205 -                               __trace_wb_cgroup_size(inode_to_wb(inode)))
16206 +               __field(unsigned int, cgroup_ino)
16207         ),
16208  
16209         TP_fast_assign(
16210 @@ -619,16 +588,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
16211                 __entry->ino            = inode->i_ino;
16212                 __entry->state          = inode->i_state;
16213                 __entry->dirtied_when   = inode->dirtied_when;
16214 -               __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
16215 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(inode_to_wb(inode));
16216         ),
16217  
16218 -       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
16219 +       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%u",
16220                   __entry->name,
16221                   __entry->ino,
16222                   show_inode_state(__entry->state),
16223                   __entry->dirtied_when,
16224                   (jiffies - __entry->dirtied_when) / HZ,
16225 -                 __get_str(cgroup)
16226 +                 __entry->cgroup_ino
16227         )
16228  );
16229  
16230 @@ -684,7 +653,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16231                 __field(unsigned long, writeback_index)
16232                 __field(long, nr_to_write)
16233                 __field(unsigned long, wrote)
16234 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16235 +               __field(unsigned int, cgroup_ino)
16236         ),
16237  
16238         TP_fast_assign(
16239 @@ -696,11 +665,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16240                 __entry->writeback_index = inode->i_mapping->writeback_index;
16241                 __entry->nr_to_write    = nr_to_write;
16242                 __entry->wrote          = nr_to_write - wbc->nr_to_write;
16243 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16244 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16245         ),
16246  
16247         TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
16248 -                 "index=%lu to_write=%ld wrote=%lu cgroup=%s",
16249 +                 "index=%lu to_write=%ld wrote=%lu cgroup_ino=%u",
16250                   __entry->name,
16251                   __entry->ino,
16252                   show_inode_state(__entry->state),
16253 @@ -709,7 +678,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16254                   __entry->writeback_index,
16255                   __entry->nr_to_write,
16256                   __entry->wrote,
16257 -                 __get_str(cgroup)
16258 +                 __entry->cgroup_ino
16259         )
16260  );
16261  
16262 diff --git a/init/Kconfig b/init/Kconfig
16263 index 235c7a2c0d20..a7c81c0911da 100644
16264 --- a/init/Kconfig
16265 +++ b/init/Kconfig
16266 @@ -498,7 +498,7 @@ config TINY_RCU
16267  
16268  config RCU_EXPERT
16269         bool "Make expert-level adjustments to RCU configuration"
16270 -       default n
16271 +       default y if PREEMPT_RT_FULL
16272         help
16273           This option needs to be enabled if you wish to make
16274           expert-level adjustments to RCU configuration.  By default,
16275 @@ -614,7 +614,7 @@ config RCU_FANOUT_LEAF
16276  
16277  config RCU_FAST_NO_HZ
16278         bool "Accelerate last non-dyntick-idle CPU's grace periods"
16279 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
16280 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
16281         default n
16282         help
16283           This option permits CPUs to enter dynticks-idle state even if
16284 @@ -641,7 +641,7 @@ config TREE_RCU_TRACE
16285  config RCU_BOOST
16286         bool "Enable RCU priority boosting"
16287         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
16288 -       default n
16289 +       default y if PREEMPT_RT_FULL
16290         help
16291           This option boosts the priority of preempted RCU readers that
16292           block the current preemptible RCU grace period for too long.
16293 @@ -1106,6 +1106,7 @@ config CFS_BANDWIDTH
16294  config RT_GROUP_SCHED
16295         bool "Group scheduling for SCHED_RR/FIFO"
16296         depends on CGROUP_SCHED
16297 +       depends on !PREEMPT_RT_FULL
16298         default n
16299         help
16300           This feature lets you explicitly allocate real CPU bandwidth
16301 @@ -1719,6 +1720,7 @@ choice
16302  
16303  config SLAB
16304         bool "SLAB"
16305 +       depends on !PREEMPT_RT_FULL
16306         help
16307           The regular slab allocator that is established and known to work
16308           well in all environments. It organizes cache hot objects in
16309 @@ -1737,6 +1739,7 @@ config SLUB
16310  config SLOB
16311         depends on EXPERT
16312         bool "SLOB (Simple Allocator)"
16313 +       depends on !PREEMPT_RT_FULL
16314         help
16315            SLOB replaces the stock allocator with a drastically simpler
16316            allocator. SLOB is generally more space efficient but
16317 @@ -1746,7 +1749,7 @@ endchoice
16318  
16319  config SLUB_CPU_PARTIAL
16320         default y
16321 -       depends on SLUB && SMP
16322 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
16323         bool "SLUB per cpu partial cache"
16324         help
16325           Per cpu partial caches accellerate objects allocation and freeing
16326 diff --git a/init/Makefile b/init/Makefile
16327 index 7bc47ee31c36..88cf473554e0 100644
16328 --- a/init/Makefile
16329 +++ b/init/Makefile
16330 @@ -33,4 +33,4 @@ silent_chk_compile.h = :
16331  include/generated/compile.h: FORCE
16332         @$($(quiet)chk_compile.h)
16333         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
16334 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
16335 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
16336 diff --git a/init/main.c b/init/main.c
16337 index 9e64d7097f1a..4a76e629c137 100644
16338 --- a/init/main.c
16339 +++ b/init/main.c
16340 @@ -530,6 +530,7 @@ asmlinkage __visible void __init start_kernel(void)
16341         setup_command_line(command_line);
16342         setup_nr_cpu_ids();
16343         setup_per_cpu_areas();
16344 +       softirq_early_init();
16345         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
16346  
16347         build_all_zonelists(NULL, NULL);
16348 diff --git a/ipc/msg.c b/ipc/msg.c
16349 index c6521c205cb4..996d89023552 100644
16350 --- a/ipc/msg.c
16351 +++ b/ipc/msg.c
16352 @@ -183,20 +183,14 @@ static void ss_wakeup(struct list_head *h, int kill)
16353         }
16354  }
16355  
16356 -static void expunge_all(struct msg_queue *msq, int res)
16357 +static void expunge_all(struct msg_queue *msq, int res,
16358 +                       struct wake_q_head *wake_q)
16359  {
16360         struct msg_receiver *msr, *t;
16361  
16362         list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
16363 -               msr->r_msg = NULL; /* initialize expunge ordering */
16364 -               wake_up_process(msr->r_tsk);
16365 -               /*
16366 -                * Ensure that the wakeup is visible before setting r_msg as
16367 -                * the receiving end depends on it: either spinning on a nil,
16368 -                * or dealing with -EAGAIN cases. See lockless receive part 1
16369 -                * and 2 in do_msgrcv().
16370 -                */
16371 -               smp_wmb(); /* barrier (B) */
16372 +
16373 +               wake_q_add(wake_q, msr->r_tsk);
16374                 msr->r_msg = ERR_PTR(res);
16375         }
16376  }
16377 @@ -213,11 +207,13 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
16378  {
16379         struct msg_msg *msg, *t;
16380         struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
16381 +       WAKE_Q(wake_q);
16382  
16383 -       expunge_all(msq, -EIDRM);
16384 +       expunge_all(msq, -EIDRM, &wake_q);
16385         ss_wakeup(&msq->q_senders, 1);
16386         msg_rmid(ns, msq);
16387         ipc_unlock_object(&msq->q_perm);
16388 +       wake_up_q(&wake_q);
16389         rcu_read_unlock();
16390  
16391         list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
16392 @@ -342,6 +338,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16393         struct kern_ipc_perm *ipcp;
16394         struct msqid64_ds uninitialized_var(msqid64);
16395         struct msg_queue *msq;
16396 +       WAKE_Q(wake_q);
16397         int err;
16398  
16399         if (cmd == IPC_SET) {
16400 @@ -389,7 +386,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16401                 /* sleeping receivers might be excluded by
16402                  * stricter permissions.
16403                  */
16404 -               expunge_all(msq, -EAGAIN);
16405 +               expunge_all(msq, -EAGAIN, &wake_q);
16406                 /* sleeping senders might be able to send
16407                  * due to a larger queue size.
16408                  */
16409 @@ -402,6 +399,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16410  
16411  out_unlock0:
16412         ipc_unlock_object(&msq->q_perm);
16413 +       wake_up_q(&wake_q);
16414  out_unlock1:
16415         rcu_read_unlock();
16416  out_up:
16417 @@ -566,7 +564,8 @@ static int testmsg(struct msg_msg *msg, long type, int mode)
16418         return 0;
16419  }
16420  
16421 -static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
16422 +static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg,
16423 +                                struct wake_q_head *wake_q)
16424  {
16425         struct msg_receiver *msr, *t;
16426  
16427 @@ -577,27 +576,13 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
16428  
16429                         list_del(&msr->r_list);
16430                         if (msr->r_maxsize < msg->m_ts) {
16431 -                               /* initialize pipelined send ordering */
16432 -                               msr->r_msg = NULL;
16433 -                               wake_up_process(msr->r_tsk);
16434 -                               /* barrier (B) see barrier comment below */
16435 -                               smp_wmb();
16436 +                               wake_q_add(wake_q, msr->r_tsk);
16437                                 msr->r_msg = ERR_PTR(-E2BIG);
16438                         } else {
16439 -                               msr->r_msg = NULL;
16440                                 msq->q_lrpid = task_pid_vnr(msr->r_tsk);
16441                                 msq->q_rtime = get_seconds();
16442 -                               wake_up_process(msr->r_tsk);
16443 -                               /*
16444 -                                * Ensure that the wakeup is visible before
16445 -                                * setting r_msg, as the receiving can otherwise
16446 -                                * exit - once r_msg is set, the receiver can
16447 -                                * continue. See lockless receive part 1 and 2
16448 -                                * in do_msgrcv(). Barrier (B).
16449 -                                */
16450 -                               smp_wmb();
16451 +                               wake_q_add(wake_q, msr->r_tsk);
16452                                 msr->r_msg = msg;
16453 -
16454                                 return 1;
16455                         }
16456                 }
16457 @@ -613,6 +598,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16458         struct msg_msg *msg;
16459         int err;
16460         struct ipc_namespace *ns;
16461 +       WAKE_Q(wake_q);
16462  
16463         ns = current->nsproxy->ipc_ns;
16464  
16465 @@ -698,7 +684,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16466         msq->q_lspid = task_tgid_vnr(current);
16467         msq->q_stime = get_seconds();
16468  
16469 -       if (!pipelined_send(msq, msg)) {
16470 +       if (!pipelined_send(msq, msg, &wake_q)) {
16471                 /* no one is waiting for this message, enqueue it */
16472                 list_add_tail(&msg->m_list, &msq->q_messages);
16473                 msq->q_cbytes += msgsz;
16474 @@ -712,6 +698,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16475  
16476  out_unlock0:
16477         ipc_unlock_object(&msq->q_perm);
16478 +       wake_up_q(&wake_q);
16479  out_unlock1:
16480         rcu_read_unlock();
16481         if (msg != NULL)
16482 @@ -932,57 +919,25 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
16483                 rcu_read_lock();
16484  
16485                 /* Lockless receive, part 2:
16486 -                * Wait until pipelined_send or expunge_all are outside of
16487 -                * wake_up_process(). There is a race with exit(), see
16488 -                * ipc/mqueue.c for the details. The correct serialization
16489 -                * ensures that a receiver cannot continue without the wakeup
16490 -                * being visibible _before_ setting r_msg:
16491 +                * The work in pipelined_send() and expunge_all():
16492 +                * - Set pointer to message
16493 +                * - Queue the receiver task for later wakeup
16494 +                * - Wake up the process after the lock is dropped.
16495                  *
16496 -                * CPU 0                             CPU 1
16497 -                * <loop receiver>
16498 -                *   smp_rmb(); (A) <-- pair -.      <waker thread>
16499 -                *   <load ->r_msg>           |        msr->r_msg = NULL;
16500 -                *                            |        wake_up_process();
16501 -                * <continue>                 `------> smp_wmb(); (B)
16502 -                *                                     msr->r_msg = msg;
16503 -                *
16504 -                * Where (A) orders the message value read and where (B) orders
16505 -                * the write to the r_msg -- done in both pipelined_send and
16506 -                * expunge_all.
16507 +                * Should the process wake up before this wakeup (due to a
16508 +                * signal) it will either see the message and continue â€¦
16509                  */
16510 -               for (;;) {
16511 -                       /*
16512 -                        * Pairs with writer barrier in pipelined_send
16513 -                        * or expunge_all.
16514 -                        */
16515 -                       smp_rmb(); /* barrier (A) */
16516 -                       msg = (struct msg_msg *)msr_d.r_msg;
16517 -                       if (msg)
16518 -                               break;
16519  
16520 -                       /*
16521 -                        * The cpu_relax() call is a compiler barrier
16522 -                        * which forces everything in this loop to be
16523 -                        * re-loaded.
16524 -                        */
16525 -                       cpu_relax();
16526 -               }
16527 -
16528 -               /* Lockless receive, part 3:
16529 -                * If there is a message or an error then accept it without
16530 -                * locking.
16531 -                */
16532 +               msg = (struct msg_msg *)msr_d.r_msg;
16533                 if (msg != ERR_PTR(-EAGAIN))
16534                         goto out_unlock1;
16535  
16536 -               /* Lockless receive, part 3:
16537 -                * Acquire the queue spinlock.
16538 -                */
16539 +                /*
16540 +                 * â€¦ or see -EAGAIN, acquire the lock to check the message
16541 +                 * again.
16542 +                 */
16543                 ipc_lock_object(&msq->q_perm);
16544  
16545 -               /* Lockless receive, part 4:
16546 -                * Repeat test after acquiring the spinlock.
16547 -                */
16548                 msg = (struct msg_msg *)msr_d.r_msg;
16549                 if (msg != ERR_PTR(-EAGAIN))
16550                         goto out_unlock0;
16551 diff --git a/ipc/sem.c b/ipc/sem.c
16552 index 9862c3d1c26d..ef34d7376697 100644
16553 --- a/ipc/sem.c
16554 +++ b/ipc/sem.c
16555 @@ -708,6 +708,13 @@ undo:
16556  static void wake_up_sem_queue_prepare(struct list_head *pt,
16557                                 struct sem_queue *q, int error)
16558  {
16559 +#ifdef CONFIG_PREEMPT_RT_BASE
16560 +       struct task_struct *p = q->sleeper;
16561 +       get_task_struct(p);
16562 +       q->status = error;
16563 +       wake_up_process(p);
16564 +       put_task_struct(p);
16565 +#else
16566         if (list_empty(pt)) {
16567                 /*
16568                  * Hold preempt off so that we don't get preempted and have the
16569 @@ -719,6 +726,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
16570         q->pid = error;
16571  
16572         list_add_tail(&q->list, pt);
16573 +#endif
16574  }
16575  
16576  /**
16577 @@ -732,6 +740,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
16578   */
16579  static void wake_up_sem_queue_do(struct list_head *pt)
16580  {
16581 +#ifndef CONFIG_PREEMPT_RT_BASE
16582         struct sem_queue *q, *t;
16583         int did_something;
16584  
16585 @@ -744,6 +753,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
16586         }
16587         if (did_something)
16588                 preempt_enable();
16589 +#endif
16590  }
16591  
16592  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
16593 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
16594 index ebdb0043203a..b9e6aa7e5aa6 100644
16595 --- a/kernel/Kconfig.locks
16596 +++ b/kernel/Kconfig.locks
16597 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
16598  
16599  config MUTEX_SPIN_ON_OWNER
16600         def_bool y
16601 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
16602 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16603  
16604  config RWSEM_SPIN_ON_OWNER
16605         def_bool y
16606 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
16607 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16608  
16609  config LOCK_SPIN_ON_OWNER
16610         def_bool y
16611 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
16612 index 3f9c97419f02..11dbe26a8279 100644
16613 --- a/kernel/Kconfig.preempt
16614 +++ b/kernel/Kconfig.preempt
16615 @@ -1,3 +1,16 @@
16616 +config PREEMPT
16617 +       bool
16618 +       select PREEMPT_COUNT
16619 +
16620 +config PREEMPT_RT_BASE
16621 +       bool
16622 +       select PREEMPT
16623 +
16624 +config HAVE_PREEMPT_LAZY
16625 +       bool
16626 +
16627 +config PREEMPT_LAZY
16628 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
16629  
16630  choice
16631         prompt "Preemption Model"
16632 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
16633  
16634           Select this if you are building a kernel for a desktop system.
16635  
16636 -config PREEMPT
16637 +config PREEMPT__LL
16638         bool "Preemptible Kernel (Low-Latency Desktop)"
16639 -       select PREEMPT_COUNT
16640 +       select PREEMPT
16641         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
16642         help
16643           This option reduces the latency of the kernel by making
16644 @@ -52,6 +65,22 @@ config PREEMPT
16645           embedded system with latency requirements in the milliseconds
16646           range.
16647  
16648 +config PREEMPT_RTB
16649 +       bool "Preemptible Kernel (Basic RT)"
16650 +       select PREEMPT_RT_BASE
16651 +       help
16652 +         This option is basically the same as (Low-Latency Desktop) but
16653 +         enables changes which are preliminary for the full preemptible
16654 +         RT kernel.
16655 +
16656 +config PREEMPT_RT_FULL
16657 +       bool "Fully Preemptible Kernel (RT)"
16658 +       depends on IRQ_FORCED_THREADING
16659 +       select PREEMPT_RT_BASE
16660 +       select PREEMPT_RCU
16661 +       help
16662 +         All and everything
16663 +
16664  endchoice
16665  
16666  config PREEMPT_COUNT
16667 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
16668 index a3424f28aaf4..69434d231e21 100644
16669 --- a/kernel/cgroup.c
16670 +++ b/kernel/cgroup.c
16671 @@ -4737,10 +4737,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
16672         queue_work(cgroup_destroy_wq, &css->destroy_work);
16673  }
16674  
16675 -static void css_release_work_fn(struct work_struct *work)
16676 +static void css_release_work_fn(struct swork_event *sev)
16677  {
16678         struct cgroup_subsys_state *css =
16679 -               container_of(work, struct cgroup_subsys_state, destroy_work);
16680 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
16681         struct cgroup_subsys *ss = css->ss;
16682         struct cgroup *cgrp = css->cgroup;
16683  
16684 @@ -4779,8 +4779,8 @@ static void css_release(struct percpu_ref *ref)
16685         struct cgroup_subsys_state *css =
16686                 container_of(ref, struct cgroup_subsys_state, refcnt);
16687  
16688 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
16689 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
16690 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
16691 +       swork_queue(&css->destroy_swork);
16692  }
16693  
16694  static void init_and_link_css(struct cgroup_subsys_state *css,
16695 @@ -5397,6 +5397,7 @@ static int __init cgroup_wq_init(void)
16696          */
16697         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
16698         BUG_ON(!cgroup_destroy_wq);
16699 +       BUG_ON(swork_get());
16700  
16701         /*
16702          * Used to destroy pidlists and separate to serve as flush domain.
16703 diff --git a/kernel/cpu.c b/kernel/cpu.c
16704 index 85ff5e26e23b..8edd3c716092 100644
16705 --- a/kernel/cpu.c
16706 +++ b/kernel/cpu.c
16707 @@ -75,8 +75,8 @@ static struct {
16708  #endif
16709  } cpu_hotplug = {
16710         .active_writer = NULL,
16711 -       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
16712         .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
16713 +       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
16714  #ifdef CONFIG_DEBUG_LOCK_ALLOC
16715         .dep_map = {.name = "cpu_hotplug.lock" },
16716  #endif
16717 @@ -89,6 +89,289 @@ static struct {
16718  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
16719  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
16720  
16721 +/**
16722 + * hotplug_pcp - per cpu hotplug descriptor
16723 + * @unplug:    set when pin_current_cpu() needs to sync tasks
16724 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
16725 + * @refcount:  counter of tasks in pinned sections
16726 + * @grab_lock: set when the tasks entering pinned sections should wait
16727 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
16728 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
16729 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
16730 + *
16731 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
16732 + * is used as a flag and still exists after @sync_tsk has exited and
16733 + * @sync_tsk set to NULL.
16734 + */
16735 +struct hotplug_pcp {
16736 +       struct task_struct *unplug;
16737 +       struct task_struct *sync_tsk;
16738 +       int refcount;
16739 +       int grab_lock;
16740 +       struct completion synced;
16741 +       struct completion unplug_wait;
16742 +#ifdef CONFIG_PREEMPT_RT_FULL
16743 +       /*
16744 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
16745 +        * the task, otherwise the mutex will cause the task to fail
16746 +        * to sleep when required. (Because it's called from migrate_disable())
16747 +        *
16748 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
16749 +        * state.
16750 +        */
16751 +       spinlock_t lock;
16752 +#else
16753 +       struct mutex mutex;
16754 +#endif
16755 +       int mutex_init;
16756 +};
16757 +
16758 +#ifdef CONFIG_PREEMPT_RT_FULL
16759 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
16760 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
16761 +#else
16762 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
16763 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
16764 +#endif
16765 +
16766 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
16767 +
16768 +/**
16769 + * pin_current_cpu - Prevent the current cpu from being unplugged
16770 + *
16771 + * Lightweight version of get_online_cpus() to prevent cpu from being
16772 + * unplugged when code runs in a migration disabled region.
16773 + *
16774 + * Must be called with preemption disabled (preempt_count = 1)!
16775 + */
16776 +void pin_current_cpu(void)
16777 +{
16778 +       struct hotplug_pcp *hp;
16779 +       int force = 0;
16780 +
16781 +retry:
16782 +       hp = this_cpu_ptr(&hotplug_pcp);
16783 +
16784 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
16785 +           hp->unplug == current) {
16786 +               hp->refcount++;
16787 +               return;
16788 +       }
16789 +       if (hp->grab_lock) {
16790 +               preempt_enable();
16791 +               hotplug_lock(hp);
16792 +               hotplug_unlock(hp);
16793 +       } else {
16794 +               preempt_enable();
16795 +               /*
16796 +                * Try to push this task off of this CPU.
16797 +                */
16798 +               if (!migrate_me()) {
16799 +                       preempt_disable();
16800 +                       hp = this_cpu_ptr(&hotplug_pcp);
16801 +                       if (!hp->grab_lock) {
16802 +                               /*
16803 +                                * Just let it continue, it's already pinned
16804 +                                * or about to sleep.
16805 +                                */
16806 +                               force = 1;
16807 +                               goto retry;
16808 +                       }
16809 +                       preempt_enable();
16810 +               }
16811 +       }
16812 +       preempt_disable();
16813 +       goto retry;
16814 +}
16815 +
16816 +/**
16817 + * unpin_current_cpu - Allow unplug of current cpu
16818 + *
16819 + * Must be called with preemption or interrupts disabled!
16820 + */
16821 +void unpin_current_cpu(void)
16822 +{
16823 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
16824 +
16825 +       WARN_ON(hp->refcount <= 0);
16826 +
16827 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
16828 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
16829 +               wake_up_process(hp->unplug);
16830 +}
16831 +
16832 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
16833 +{
16834 +       set_current_state(TASK_UNINTERRUPTIBLE);
16835 +       while (hp->refcount) {
16836 +               schedule_preempt_disabled();
16837 +               set_current_state(TASK_UNINTERRUPTIBLE);
16838 +       }
16839 +}
16840 +
16841 +static int sync_unplug_thread(void *data)
16842 +{
16843 +       struct hotplug_pcp *hp = data;
16844 +
16845 +       wait_for_completion(&hp->unplug_wait);
16846 +       preempt_disable();
16847 +       hp->unplug = current;
16848 +       wait_for_pinned_cpus(hp);
16849 +
16850 +       /*
16851 +        * This thread will synchronize the cpu_down() with threads
16852 +        * that have pinned the CPU. When the pinned CPU count reaches
16853 +        * zero, we inform the cpu_down code to continue to the next step.
16854 +        */
16855 +       set_current_state(TASK_UNINTERRUPTIBLE);
16856 +       preempt_enable();
16857 +       complete(&hp->synced);
16858 +
16859 +       /*
16860 +        * If all succeeds, the next step will need tasks to wait till
16861 +        * the CPU is offline before continuing. To do this, the grab_lock
16862 +        * is set and tasks going into pin_current_cpu() will block on the
16863 +        * mutex. But we still need to wait for those that are already in
16864 +        * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
16865 +        * will kick this thread out.
16866 +        */
16867 +       while (!hp->grab_lock && !kthread_should_stop()) {
16868 +               schedule();
16869 +               set_current_state(TASK_UNINTERRUPTIBLE);
16870 +       }
16871 +
16872 +       /* Make sure grab_lock is seen before we see a stale completion */
16873 +       smp_mb();
16874 +
16875 +       /*
16876 +        * Now just before cpu_down() enters stop machine, we need to make
16877 +        * sure all tasks that are in pinned CPU sections are out, and new
16878 +        * tasks will now grab the lock, keeping them from entering pinned
16879 +        * CPU sections.
16880 +        */
16881 +       if (!kthread_should_stop()) {
16882 +               preempt_disable();
16883 +               wait_for_pinned_cpus(hp);
16884 +               preempt_enable();
16885 +               complete(&hp->synced);
16886 +       }
16887 +
16888 +       set_current_state(TASK_UNINTERRUPTIBLE);
16889 +       while (!kthread_should_stop()) {
16890 +               schedule();
16891 +               set_current_state(TASK_UNINTERRUPTIBLE);
16892 +       }
16893 +       set_current_state(TASK_RUNNING);
16894 +
16895 +       /*
16896 +        * Force this thread off this CPU as it's going down and
16897 +        * we don't want any more work on this CPU.
16898 +        */
16899 +       current->flags &= ~PF_NO_SETAFFINITY;
16900 +       set_cpus_allowed_ptr(current, cpu_present_mask);
16901 +       migrate_me();
16902 +       return 0;
16903 +}
16904 +
16905 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
16906 +{
16907 +       wake_up_process(hp->sync_tsk);
16908 +       wait_for_completion(&hp->synced);
16909 +}
16910 +
16911 +static void __cpu_unplug_wait(unsigned int cpu)
16912 +{
16913 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16914 +
16915 +       complete(&hp->unplug_wait);
16916 +       wait_for_completion(&hp->synced);
16917 +}
16918 +
16919 +/*
16920 + * Start the sync_unplug_thread on the target cpu and wait for it to
16921 + * complete.
16922 + */
16923 +static int cpu_unplug_begin(unsigned int cpu)
16924 +{
16925 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16926 +       int err;
16927 +
16928 +       /* Protected by cpu_hotplug.lock */
16929 +       if (!hp->mutex_init) {
16930 +#ifdef CONFIG_PREEMPT_RT_FULL
16931 +               spin_lock_init(&hp->lock);
16932 +#else
16933 +               mutex_init(&hp->mutex);
16934 +#endif
16935 +               hp->mutex_init = 1;
16936 +       }
16937 +
16938 +       /* Inform the scheduler to migrate tasks off this CPU */
16939 +       tell_sched_cpu_down_begin(cpu);
16940 +
16941 +       init_completion(&hp->synced);
16942 +       init_completion(&hp->unplug_wait);
16943 +
16944 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
16945 +       if (IS_ERR(hp->sync_tsk)) {
16946 +               err = PTR_ERR(hp->sync_tsk);
16947 +               hp->sync_tsk = NULL;
16948 +               return err;
16949 +       }
16950 +       kthread_bind(hp->sync_tsk, cpu);
16951 +
16952 +       /*
16953 +        * Wait for tasks to get out of the pinned sections,
16954 +        * it's still OK if new tasks enter. Some CPU notifiers will
16955 +        * wait for tasks that are going to enter these sections and
16956 +        * we must not have them block.
16957 +        */
16958 +       wake_up_process(hp->sync_tsk);
16959 +       return 0;
16960 +}
16961 +
16962 +static void cpu_unplug_sync(unsigned int cpu)
16963 +{
16964 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16965 +
16966 +       init_completion(&hp->synced);
16967 +       /* The completion needs to be initialized before setting grab_lock */
16968 +       smp_wmb();
16969 +
16970 +       /* Grab the mutex before setting grab_lock */
16971 +       hotplug_lock(hp);
16972 +       hp->grab_lock = 1;
16973 +
16974 +       /*
16975 +        * The CPU notifiers have been completed.
16976 +        * Wait for tasks to get out of pinned CPU sections and have new
16977 +        * tasks block until the CPU is completely down.
16978 +        */
16979 +       __cpu_unplug_sync(hp);
16980 +
16981 +       /* All done with the sync thread */
16982 +       kthread_stop(hp->sync_tsk);
16983 +       hp->sync_tsk = NULL;
16984 +}
16985 +
16986 +static void cpu_unplug_done(unsigned int cpu)
16987 +{
16988 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16989 +
16990 +       hp->unplug = NULL;
16991 +       /* Let all tasks know cpu unplug is finished before cleaning up */
16992 +       smp_wmb();
16993 +
16994 +       if (hp->sync_tsk)
16995 +               kthread_stop(hp->sync_tsk);
16996 +
16997 +       if (hp->grab_lock) {
16998 +               hotplug_unlock(hp);
16999 +               /* protected by cpu_hotplug.lock */
17000 +               hp->grab_lock = 0;
17001 +       }
17002 +       tell_sched_cpu_down_done(cpu);
17003 +}
17004  
17005  void get_online_cpus(void)
17006  {
17007 @@ -338,13 +621,15 @@ static int take_cpu_down(void *_param)
17008  /* Requires cpu_add_remove_lock to be held */
17009  static int _cpu_down(unsigned int cpu, int tasks_frozen)
17010  {
17011 -       int err, nr_calls = 0;
17012 +       int mycpu, err, nr_calls = 0;
17013         void *hcpu = (void *)(long)cpu;
17014         unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
17015         struct take_cpu_down_param tcd_param = {
17016                 .mod = mod,
17017                 .hcpu = hcpu,
17018         };
17019 +       cpumask_var_t cpumask;
17020 +       cpumask_var_t cpumask_org;
17021  
17022         if (num_online_cpus() == 1)
17023                 return -EBUSY;
17024 @@ -352,7 +637,34 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17025         if (!cpu_online(cpu))
17026                 return -EINVAL;
17027  
17028 +       /* Move the downtaker off the unplug cpu */
17029 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
17030 +               return -ENOMEM;
17031 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
17032 +               free_cpumask_var(cpumask);
17033 +               return -ENOMEM;
17034 +       }
17035 +
17036 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
17037 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
17038 +       set_cpus_allowed_ptr(current, cpumask);
17039 +       free_cpumask_var(cpumask);
17040 +       migrate_disable();
17041 +       mycpu = smp_processor_id();
17042 +       if (mycpu == cpu) {
17043 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
17044 +               migrate_enable();
17045 +               err = -EBUSY;
17046 +               goto restore_cpus;
17047 +       }
17048 +       migrate_enable();
17049 +
17050         cpu_hotplug_begin();
17051 +       err = cpu_unplug_begin(cpu);
17052 +       if (err) {
17053 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
17054 +               goto out_cancel;
17055 +       }
17056  
17057         err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
17058         if (err) {
17059 @@ -378,8 +690,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17060         else
17061                 synchronize_rcu();
17062  
17063 +       __cpu_unplug_wait(cpu);
17064         smpboot_park_threads(cpu);
17065  
17066 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
17067 +       cpu_unplug_sync(cpu);
17068 +
17069         /*
17070          * Prevent irq alloc/free while the dying cpu reorganizes the
17071          * interrupt affinities.
17072 @@ -424,9 +740,14 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17073         check_for_tasks(cpu);
17074  
17075  out_release:
17076 +       cpu_unplug_done(cpu);
17077 +out_cancel:
17078         cpu_hotplug_done();
17079         if (!err)
17080                 cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
17081 +restore_cpus:
17082 +       set_cpus_allowed_ptr(current, cpumask_org);
17083 +       free_cpumask_var(cpumask_org);
17084         return err;
17085  }
17086  
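
The pin_current_cpu()/unpin_current_cpu() helpers introduced above are driven from the migrate_disable()/migrate_enable() path in the RT tree; the sketch below only illustrates the calling convention documented in their kernel-doc comments (preemption disabled around both calls), not an actual in-tree caller:

    preempt_disable();
    pin_current_cpu();              /* this CPU can no longer be unplugged */

    /* ... per-CPU work that must not race with cpu_down() ... */

    unpin_current_cpu();            /* wakes the sync_unplug thread if it waits */
    preempt_enable();
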
17087 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
17088 index fc1ef736253c..83c666537a7a 100644
17089 --- a/kernel/debug/kdb/kdb_io.c
17090 +++ b/kernel/debug/kdb/kdb_io.c
17091 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
17092         int linecount;
17093         int colcount;
17094         int logging, saved_loglevel = 0;
17095 -       int saved_trap_printk;
17096         int got_printf_lock = 0;
17097         int retlen = 0;
17098         int fnd, len;
17099 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
17100         unsigned long uninitialized_var(flags);
17101  
17102         preempt_disable();
17103 -       saved_trap_printk = kdb_trap_printk;
17104 -       kdb_trap_printk = 0;
17105  
17106         /* Serialize kdb_printf if multiple cpus try to write at once.
17107          * But if any cpu goes recursive in kdb, just print the output,
17108 @@ -855,7 +852,6 @@ kdb_print_out:
17109         } else {
17110                 __release(kdb_printf_lock);
17111         }
17112 -       kdb_trap_printk = saved_trap_printk;
17113         preempt_enable();
17114         return retlen;
17115  }
17116 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
17117         va_list ap;
17118         int r;
17119  
17120 +       kdb_trap_printk++;
17121         va_start(ap, fmt);
17122         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
17123         va_end(ap);
17124 +       kdb_trap_printk--;
17125  
17126         return r;
17127  }
17128 diff --git a/kernel/events/core.c b/kernel/events/core.c
17129 index bc6371b0e4fb..388de1dc27d9 100644
17130 --- a/kernel/events/core.c
17131 +++ b/kernel/events/core.c
17132 @@ -802,6 +802,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
17133         raw_spin_lock_init(&cpuctx->hrtimer_lock);
17134         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
17135         timer->function = perf_mux_hrtimer_handler;
17136 +       timer->irqsafe = 1;
17137  }
17138  
17139  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
17140 @@ -7240,6 +7241,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
17141  
17142         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17143         hwc->hrtimer.function = perf_swevent_hrtimer;
17144 +       hwc->hrtimer.irqsafe = 1;
17145  
17146         /*
17147          * Since hrtimers have a fixed rate, we can do a static freq->period
17148 diff --git a/kernel/exit.c b/kernel/exit.c
17149 index ffba5df4abd5..e199407f8831 100644
17150 --- a/kernel/exit.c
17151 +++ b/kernel/exit.c
17152 @@ -144,7 +144,7 @@ static void __exit_signal(struct task_struct *tsk)
17153          * Do this under ->siglock, we can race with another thread
17154          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
17155          */
17156 -       flush_sigqueue(&tsk->pending);
17157 +       flush_task_sigqueue(tsk);
17158         tsk->sighand = NULL;
17159         spin_unlock(&sighand->siglock);
17160  
17161 diff --git a/kernel/fork.c b/kernel/fork.c
17162 index 7161ebe67cbb..3b880312b385 100644
17163 --- a/kernel/fork.c
17164 +++ b/kernel/fork.c
17165 @@ -108,7 +108,7 @@ int max_threads;            /* tunable limit on nr_threads */
17166  
17167  DEFINE_PER_CPU(unsigned long, process_counts) = 0;
17168  
17169 -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
17170 +DEFINE_RWLOCK(tasklist_lock);  /* outer */
17171  
17172  #ifdef CONFIG_PROVE_RCU
17173  int lockdep_tasklist_lock_is_held(void)
17174 @@ -244,7 +244,9 @@ static inline void put_signal_struct(struct signal_struct *sig)
17175         if (atomic_dec_and_test(&sig->sigcnt))
17176                 free_signal_struct(sig);
17177  }
17178 -
17179 +#ifdef CONFIG_PREEMPT_RT_BASE
17180 +static
17181 +#endif
17182  void __put_task_struct(struct task_struct *tsk)
17183  {
17184         WARN_ON(!tsk->exit_state);
17185 @@ -261,7 +263,18 @@ void __put_task_struct(struct task_struct *tsk)
17186         if (!profile_handoff_task(tsk))
17187                 free_task(tsk);
17188  }
17189 +#ifndef CONFIG_PREEMPT_RT_BASE
17190  EXPORT_SYMBOL_GPL(__put_task_struct);
17191 +#else
17192 +void __put_task_struct_cb(struct rcu_head *rhp)
17193 +{
17194 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
17195 +
17196 +       __put_task_struct(tsk);
17197 +
17198 +}
17199 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
17200 +#endif
17201  
17202  void __init __weak arch_task_cache_init(void) { }
17203  
17204 @@ -689,6 +702,19 @@ void __mmdrop(struct mm_struct *mm)
17205  }
17206  EXPORT_SYMBOL_GPL(__mmdrop);
17207  
17208 +#ifdef CONFIG_PREEMPT_RT_BASE
17209 +/*
17210 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
17211 + * want another facility to make this work.
17212 + */
17213 +void __mmdrop_delayed(struct rcu_head *rhp)
17214 +{
17215 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
17216 +
17217 +       __mmdrop(mm);
17218 +}
17219 +#endif
17220 +
17221  /*
17222   * Decrement the use count and release all resources for an mm.
17223   */
17224 @@ -1239,6 +1265,9 @@ static void rt_mutex_init_task(struct task_struct *p)
17225   */
17226  static void posix_cpu_timers_init(struct task_struct *tsk)
17227  {
17228 +#ifdef CONFIG_PREEMPT_RT_BASE
17229 +       tsk->posix_timer_list = NULL;
17230 +#endif
17231         tsk->cputime_expires.prof_exp = 0;
17232         tsk->cputime_expires.virt_exp = 0;
17233         tsk->cputime_expires.sched_exp = 0;
17234 @@ -1364,15 +1393,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
17235         spin_lock_init(&p->alloc_lock);
17236  
17237         init_sigpending(&p->pending);
17238 +       p->sigqueue_cache = NULL;
17239  
17240         p->utime = p->stime = p->gtime = 0;
17241         p->utimescaled = p->stimescaled = 0;
17242         prev_cputime_init(&p->prev_cputime);
17243  
17244  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
17245 -       seqlock_init(&p->vtime_seqlock);
17246 +       seqcount_init(&p->vtime_seqcount);
17247         p->vtime_snap = 0;
17248 -       p->vtime_snap_whence = VTIME_SLEEPING;
17249 +       p->vtime_snap_whence = VTIME_INACTIVE;
17250  #endif
17251  
17252  #if defined(SPLIT_RSS_COUNTING)
17253 diff --git a/kernel/futex.c b/kernel/futex.c
17254 index 9d8163afd87c..059623427b99 100644
17255 --- a/kernel/futex.c
17256 +++ b/kernel/futex.c
17257 @@ -815,7 +815,9 @@ void exit_pi_state_list(struct task_struct *curr)
17258                  * task still owns the PI-state:
17259                  */
17260                 if (head->next != next) {
17261 +                       raw_spin_unlock_irq(&curr->pi_lock);
17262                         spin_unlock(&hb->lock);
17263 +                       raw_spin_lock_irq(&curr->pi_lock);
17264                         continue;
17265                 }
17266  
17267 @@ -1210,6 +1212,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17268         struct futex_pi_state *pi_state = this->pi_state;
17269         u32 uninitialized_var(curval), newval;
17270         WAKE_Q(wake_q);
17271 +       WAKE_Q(wake_sleeper_q);
17272         bool deboost;
17273         int ret = 0;
17274  
17275 @@ -1223,7 +1226,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17276         if (pi_state->owner != current)
17277                 return -EINVAL;
17278  
17279 -       raw_spin_lock(&pi_state->pi_mutex.wait_lock);
17280 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
17281         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
17282  
17283         /*
17284 @@ -1259,24 +1262,25 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17285                         ret = -EINVAL;
17286         }
17287         if (ret) {
17288 -               raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
17289 +               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
17290                 return ret;
17291         }
17292  
17293 -       raw_spin_lock_irq(&pi_state->owner->pi_lock);
17294 +       raw_spin_lock(&pi_state->owner->pi_lock);
17295         WARN_ON(list_empty(&pi_state->list));
17296         list_del_init(&pi_state->list);
17297 -       raw_spin_unlock_irq(&pi_state->owner->pi_lock);
17298 +       raw_spin_unlock(&pi_state->owner->pi_lock);
17299  
17300 -       raw_spin_lock_irq(&new_owner->pi_lock);
17301 +       raw_spin_lock(&new_owner->pi_lock);
17302         WARN_ON(!list_empty(&pi_state->list));
17303         list_add(&pi_state->list, &new_owner->pi_state_list);
17304         pi_state->owner = new_owner;
17305 -       raw_spin_unlock_irq(&new_owner->pi_lock);
17306 +       raw_spin_unlock(&new_owner->pi_lock);
17307  
17308 -       raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
17309 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
17310  
17311 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
17312 +       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
17313 +                                       &wake_sleeper_q);
17314  
17315         /*
17316          * First unlock HB so the waiter does not spin on it once he got woken
17317 @@ -1284,8 +1288,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17318          * deboost first (and lose our higher priority), then the task might get
17319          * scheduled away before the wake up can take place.
17320          */
17321 -       spin_unlock(&hb->lock);
17322 +       deboost |= spin_unlock_no_deboost(&hb->lock);
17323         wake_up_q(&wake_q);
17324 +       wake_up_q_sleeper(&wake_sleeper_q);
17325         if (deboost)
17326                 rt_mutex_adjust_prio(current);
17327  
17328 @@ -1822,6 +1827,16 @@ retry_private:
17329                                 requeue_pi_wake_futex(this, &key2, hb2);
17330                                 drop_count++;
17331                                 continue;
17332 +                       } else if (ret == -EAGAIN) {
17333 +                               /*
17334 +                                * Waiter was woken by timeout or
17335 +                                * signal and has set pi_blocked_on to
17336 +                                * PI_WAKEUP_INPROGRESS before we
17337 +                                * tried to enqueue it on the rtmutex.
17338 +                                */
17339 +                               this->pi_state = NULL;
17340 +                               free_pi_state(pi_state);
17341 +                               continue;
17342                         } else if (ret) {
17343                                 /* -EDEADLK */
17344                                 this->pi_state = NULL;
17345 @@ -2139,11 +2154,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
17346                  * we returned due to timeout or signal without taking the
17347                  * rt_mutex. Too late.
17348                  */
17349 -               raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
17350 +               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
17351                 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
17352                 if (!owner)
17353                         owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
17354 -               raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
17355 +               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
17356                 ret = fixup_pi_state_owner(uaddr, q, owner);
17357                 goto out;
17358         }
17359 @@ -2691,7 +2706,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17360         struct hrtimer_sleeper timeout, *to = NULL;
17361         struct rt_mutex_waiter rt_waiter;
17362         struct rt_mutex *pi_mutex = NULL;
17363 -       struct futex_hash_bucket *hb;
17364 +       struct futex_hash_bucket *hb, *hb2;
17365         union futex_key key2 = FUTEX_KEY_INIT;
17366         struct futex_q q = futex_q_init;
17367         int res, ret;
17368 @@ -2716,10 +2731,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17369          * The waiter is allocated on our stack, manipulated by the requeue
17370          * code while we sleep on uaddr.
17371          */
17372 -       debug_rt_mutex_init_waiter(&rt_waiter);
17373 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
17374 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
17375 -       rt_waiter.task = NULL;
17376 +       rt_mutex_init_waiter(&rt_waiter, false);
17377  
17378         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
17379         if (unlikely(ret != 0))
17380 @@ -2750,20 +2762,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17381         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
17382         futex_wait_queue_me(hb, &q, to);
17383  
17384 -       spin_lock(&hb->lock);
17385 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17386 -       spin_unlock(&hb->lock);
17387 -       if (ret)
17388 -               goto out_put_keys;
17389 +       /*
17390 +        * On RT we must avoid races with requeue and trying to block
17391 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
17392 +        * serializing access to pi_blocked_on with pi_lock.
17393 +        */
17394 +       raw_spin_lock_irq(&current->pi_lock);
17395 +       if (current->pi_blocked_on) {
17396 +               /*
17397 +                * We have been requeued or are in the process of
17398 +                * being requeued.
17399 +                */
17400 +               raw_spin_unlock_irq(&current->pi_lock);
17401 +       } else {
17402 +               /*
17403 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
17404 +                * prevents a concurrent requeue from moving us to the
17405 +                * uaddr2 rtmutex. After that we can safely acquire
17406 +                * (and possibly block on) hb->lock.
17407 +                */
17408 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
17409 +               raw_spin_unlock_irq(&current->pi_lock);
17410 +
17411 +               spin_lock(&hb->lock);
17412 +
17413 +               /*
17414 +                * Clean up pi_blocked_on. We might leak it otherwise
17415 +                * when we succeeded with the hb->lock in the fast
17416 +                * path.
17417 +                */
17418 +               raw_spin_lock_irq(&current->pi_lock);
17419 +               current->pi_blocked_on = NULL;
17420 +               raw_spin_unlock_irq(&current->pi_lock);
17421 +
17422 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17423 +               spin_unlock(&hb->lock);
17424 +               if (ret)
17425 +                       goto out_put_keys;
17426 +       }
17427  
17428         /*
17429 -        * In order for us to be here, we know our q.key == key2, and since
17430 -        * we took the hb->lock above, we also know that futex_requeue() has
17431 -        * completed and we no longer have to concern ourselves with a wakeup
17432 -        * race with the atomic proxy lock acquisition by the requeue code. The
17433 -        * futex_requeue dropped our key1 reference and incremented our key2
17434 -        * reference count.
17435 +        * In order to be here, we have either been requeued, are in
17436 +        * the process of being requeued, or requeue successfully
17437 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
17438 +        * non-null above, we may be racing with a requeue.  Do not
17439 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
17440 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
17441 +        * reference and incremented our key2 reference count.
17442          */
17443 +       hb2 = hash_futex(&key2);
17444  
17445         /* Check if the requeue code acquired the second futex for us. */
17446         if (!q.rt_waiter) {
17447 @@ -2772,14 +2819,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17448                  * did a lock-steal - fix up the PI-state in that case.
17449                  */
17450                 if (q.pi_state && (q.pi_state->owner != current)) {
17451 -                       spin_lock(q.lock_ptr);
17452 +                       spin_lock(&hb2->lock);
17453 +                       BUG_ON(&hb2->lock != q.lock_ptr);
17454                         ret = fixup_pi_state_owner(uaddr2, &q, current);
17455                         /*
17456                          * Drop the reference to the pi state which
17457                          * the requeue_pi() code acquired for us.
17458                          */
17459                         free_pi_state(q.pi_state);
17460 -                       spin_unlock(q.lock_ptr);
17461 +                       spin_unlock(&hb2->lock);
17462                 }
17463         } else {
17464                 /*
17465 @@ -2792,7 +2840,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17466                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
17467                 debug_rt_mutex_free_waiter(&rt_waiter);
17468  
17469 -               spin_lock(q.lock_ptr);
17470 +               spin_lock(&hb2->lock);
17471 +               BUG_ON(&hb2->lock != q.lock_ptr);
17472                 /*
17473                  * Fixup the pi_state owner and possibly acquire the lock if we
17474                  * haven't already.
17475 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
17476 index 57bff7857e87..6c65c9252991 100644
17477 --- a/kernel/irq/handle.c
17478 +++ b/kernel/irq/handle.c
17479 @@ -134,6 +134,8 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
17480  
17481  irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
17482  {
17483 +       struct pt_regs *regs = get_irq_regs();
17484 +       u64 ip = regs ? instruction_pointer(regs) : 0;
17485         irqreturn_t retval = IRQ_NONE;
17486         unsigned int flags = 0, irq = desc->irq_data.irq;
17487         struct irqaction *action = desc->action;
17488 @@ -176,7 +178,11 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
17489                 action = action->next;
17490         }
17491  
17492 -       add_interrupt_randomness(irq, flags);
17493 +#ifdef CONFIG_PREEMPT_RT_FULL
17494 +       desc->random_ip = ip;
17495 +#else
17496 +       add_interrupt_randomness(irq, flags, ip);
17497 +#endif
17498  
17499         if (!noirqdebug)
17500                 note_interrupt(desc, retval);
17501 diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
17502 index 239e2ae2c947..0b73349a42d5 100644
17503 --- a/kernel/irq/irqdesc.c
17504 +++ b/kernel/irq/irqdesc.c
17505 @@ -24,10 +24,27 @@
17506  static struct lock_class_key irq_desc_lock_class;
17507  
17508  #if defined(CONFIG_SMP)
17509 +static int __init irq_affinity_setup(char *str)
17510 +{
17511 +       zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
17512 +       cpulist_parse(str, irq_default_affinity);
17513 +       /*
17514 +        * Set at least the boot cpu. We don't want to end up with
17515 +        * bugreports caused by random comandline masks
17516 +        */
17517 +       cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
17518 +       return 1;
17519 +}
17520 +__setup("irqaffinity=", irq_affinity_setup);
17521 +
17522  static void __init init_irq_default_affinity(void)
17523  {
17524 -       alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
17525 -       cpumask_setall(irq_default_affinity);
17526 +#ifdef CONFIG_CPUMASK_OFFSTACK
17527 +       if (!irq_default_affinity)
17528 +               zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
17529 +#endif
17530 +       if (cpumask_empty(irq_default_affinity))
17531 +               cpumask_setall(irq_default_affinity);
17532  }
17533  #else
17534  static void __init init_irq_default_affinity(void)
17535 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
17536 index 6ead200370da..8e89554aa345 100644
17537 --- a/kernel/irq/manage.c
17538 +++ b/kernel/irq/manage.c
17539 @@ -22,6 +22,7 @@
17540  #include "internals.h"
17541  
17542  #ifdef CONFIG_IRQ_FORCED_THREADING
17543 +# ifndef CONFIG_PREEMPT_RT_BASE
17544  __read_mostly bool force_irqthreads;
17545  
17546  static int __init setup_forced_irqthreads(char *arg)
17547 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
17548         return 0;
17549  }
17550  early_param("threadirqs", setup_forced_irqthreads);
17551 +# endif
17552  #endif
17553  
17554  static void __synchronize_hardirq(struct irq_desc *desc)
17555 @@ -181,6 +183,62 @@ static inline void
17556  irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
17557  #endif
17558  
17559 +#ifdef CONFIG_PREEMPT_RT_FULL
17560 +static void _irq_affinity_notify(struct irq_affinity_notify *notify);
17561 +static struct task_struct *set_affinity_helper;
17562 +static LIST_HEAD(affinity_list);
17563 +static DEFINE_RAW_SPINLOCK(affinity_list_lock);
17564 +
17565 +static int set_affinity_thread(void *unused)
17566 +{
17567 +       while (1) {
17568 +               struct irq_affinity_notify *notify;
17569 +               int empty;
17570 +
17571 +               set_current_state(TASK_INTERRUPTIBLE);
17572 +
17573 +               raw_spin_lock_irq(&affinity_list_lock);
17574 +               empty = list_empty(&affinity_list);
17575 +               raw_spin_unlock_irq(&affinity_list_lock);
17576 +
17577 +               if (empty)
17578 +                       schedule();
17579 +               if (kthread_should_stop())
17580 +                       break;
17581 +               set_current_state(TASK_RUNNING);
17582 +try_next:
17583 +               notify = NULL;
17584 +
17585 +               raw_spin_lock_irq(&affinity_list_lock);
17586 +               if (!list_empty(&affinity_list)) {
17587 +                       notify = list_first_entry(&affinity_list,
17588 +                                       struct irq_affinity_notify, list);
17589 +                       list_del_init(&notify->list);
17590 +               }
17591 +               raw_spin_unlock_irq(&affinity_list_lock);
17592 +
17593 +               if (!notify)
17594 +                       continue;
17595 +               _irq_affinity_notify(notify);
17596 +               goto try_next;
17597 +       }
17598 +       return 0;
17599 +}
17600 +
17601 +static void init_helper_thread(void)
17602 +{
17603 +       if (set_affinity_helper)
17604 +               return;
17605 +       set_affinity_helper = kthread_run(set_affinity_thread, NULL,
17606 +                       "affinity-cb");
17607 +       WARN_ON(IS_ERR(set_affinity_helper));
17608 +}
17609 +#else
17610 +
17611 +static inline void init_helper_thread(void) { }
17612 +
17613 +#endif
17614 +
17615  int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
17616                         bool force)
17617  {
17618 @@ -220,7 +278,17 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
17619  
17620         if (desc->affinity_notify) {
17621                 kref_get(&desc->affinity_notify->kref);
17622 +
17623 +#ifdef CONFIG_PREEMPT_RT_FULL
17624 +               raw_spin_lock(&affinity_list_lock);
17625 +               if (list_empty(&desc->affinity_notify->list))
17626 +                       list_add_tail(&affinity_list,
17627 +                                       &desc->affinity_notify->list);
17628 +               raw_spin_unlock(&affinity_list_lock);
17629 +               wake_up_process(set_affinity_helper);
17630 +#else
17631                 schedule_work(&desc->affinity_notify->work);
17632 +#endif
17633         }
17634         irqd_set(data, IRQD_AFFINITY_SET);
17635  
17636 @@ -258,10 +326,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
17637  }
17638  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
17639  
17640 -static void irq_affinity_notify(struct work_struct *work)
17641 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
17642  {
17643 -       struct irq_affinity_notify *notify =
17644 -               container_of(work, struct irq_affinity_notify, work);
17645         struct irq_desc *desc = irq_to_desc(notify->irq);
17646         cpumask_var_t cpumask;
17647         unsigned long flags;
17648 @@ -283,6 +349,13 @@ out:
17649         kref_put(&notify->kref, notify->release);
17650  }
17651  
17652 +static void irq_affinity_notify(struct work_struct *work)
17653 +{
17654 +       struct irq_affinity_notify *notify =
17655 +               container_of(work, struct irq_affinity_notify, work);
17656 +       _irq_affinity_notify(notify);
17657 +}
17658 +
17659  /**
17660   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
17661   *     @irq:           Interrupt for which to enable/disable notification
17662 @@ -312,6 +385,8 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
17663                 notify->irq = irq;
17664                 kref_init(&notify->kref);
17665                 INIT_WORK(&notify->work, irq_affinity_notify);
17666 +               INIT_LIST_HEAD(&notify->list);
17667 +               init_helper_thread();
17668         }
17669  
17670         raw_spin_lock_irqsave(&desc->lock, flags);
17671 @@ -865,7 +940,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
17672         local_bh_disable();
17673         ret = action->thread_fn(action->irq, action->dev_id);
17674         irq_finalize_oneshot(desc, action);
17675 -       local_bh_enable();
17676 +       /*
17677 +        * Interrupts which have real time requirements can be set up
17678 +        * to avoid softirq processing in the thread handler. This is
17679 +        * safe as these interrupts do not raise soft interrupts.
17680 +        */
17681 +       if (irq_settings_no_softirq_call(desc))
17682 +               _local_bh_enable();
17683 +       else
17684 +               local_bh_enable();
17685         return ret;
17686  }
17687  
17688 @@ -962,6 +1045,12 @@ static int irq_thread(void *data)
17689                 if (action_ret == IRQ_WAKE_THREAD)
17690                         irq_wake_secondary(desc, action);
17691  
17692 +#ifdef CONFIG_PREEMPT_RT_FULL
17693 +               migrate_disable();
17694 +               add_interrupt_randomness(action->irq, 0,
17695 +                                desc->random_ip ^ (unsigned long) action);
17696 +               migrate_enable();
17697 +#endif
17698                 wake_threads_waitq(desc);
17699         }
17700  
17701 @@ -1315,6 +1404,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
17702                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
17703                 }
17704  
17705 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
17706 +                       irq_settings_set_no_softirq_call(desc);
17707 +
17708                 /* Set default affinity mask once everything is setup */
17709                 setup_affinity(desc, mask);
17710  
17711 @@ -1968,7 +2060,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
17712   *     This call sets the internal irqchip state of an interrupt,
17713   *     depending on the value of @which.
17714   *
17715 - *     This function should be called with preemption disabled if the
17716 + *     This function should be called with migration disabled if the
17717   *     interrupt controller has per-cpu registers.
17718   */
17719  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
17720 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
17721 index 320579d89091..2df2d4445b1e 100644
17722 --- a/kernel/irq/settings.h
17723 +++ b/kernel/irq/settings.h
17724 @@ -16,6 +16,7 @@ enum {
17725         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
17726         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
17727         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
17728 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
17729         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
17730  };
17731  
17732 @@ -30,6 +31,7 @@ enum {
17733  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
17734  #define IRQ_IS_POLLED          GOT_YOU_MORON
17735  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
17736 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
17737  #undef IRQF_MODIFY_MASK
17738  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
17739  
17740 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
17741         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
17742  }
17743  
17744 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
17745 +{
17746 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
17747 +}
17748 +
17749 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
17750 +{
17751 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
17752 +}
17753 +
17754  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
17755  {
17756         return desc->status_use_accessors & _IRQ_PER_CPU;
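
Together with the IRQF_NO_SOFTIRQ_CALL flag added elsewhere in this patch, the settings bit above lets a threaded handler return through _local_bh_enable() without a softirq flush. A hedged example of requesting such an interrupt (irq number, device name and handler are made up):

    static irqreturn_t my_rt_thread_fn(int irq, void *dev_id)
    {
            /* must not raise softirqs itself */
            return IRQ_HANDLED;
    }

    /* somewhere in a driver's probe path: */
    ret = request_threaded_irq(irq, NULL, my_rt_thread_fn,
                               IRQF_ONESHOT | IRQF_NO_SOFTIRQ_CALL,
                               "my-rt-device", dev);
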
17757 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
17758 index 32144175458d..ed26f2554972 100644
17759 --- a/kernel/irq/spurious.c
17760 +++ b/kernel/irq/spurious.c
17761 @@ -444,6 +444,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
17762  
17763  static int __init irqfixup_setup(char *str)
17764  {
17765 +#ifdef CONFIG_PREEMPT_RT_BASE
17766 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17767 +       return 1;
17768 +#endif
17769         irqfixup = 1;
17770         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
17771         printk(KERN_WARNING "This may impact system performance.\n");
17772 @@ -456,6 +460,10 @@ module_param(irqfixup, int, 0644);
17773  
17774  static int __init irqpoll_setup(char *str)
17775  {
17776 +#ifdef CONFIG_PREEMPT_RT_BASE
17777 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17778 +       return 1;
17779 +#endif
17780         irqfixup = 2;
17781         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
17782                                 "enabled\n");
17783 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
17784 index bcf107ce0854..2899ba0d23d1 100644
17785 --- a/kernel/irq_work.c
17786 +++ b/kernel/irq_work.c
17787 @@ -17,6 +17,7 @@
17788  #include <linux/cpu.h>
17789  #include <linux/notifier.h>
17790  #include <linux/smp.h>
17791 +#include <linux/interrupt.h>
17792  #include <asm/processor.h>
17793  
17794  
17795 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
17796   */
17797  bool irq_work_queue_on(struct irq_work *work, int cpu)
17798  {
17799 +       struct llist_head *list;
17800 +
17801         /* All work should have been flushed before going offline */
17802         WARN_ON_ONCE(cpu_is_offline(cpu));
17803  
17804 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
17805         if (!irq_work_claim(work))
17806                 return false;
17807  
17808 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
17809 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
17810 +               list = &per_cpu(lazy_list, cpu);
17811 +       else
17812 +               list = &per_cpu(raised_list, cpu);
17813 +
17814 +       if (llist_add(&work->llnode, list))
17815                 arch_send_call_function_single_ipi(cpu);
17816  
17817         return true;
17818 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
17819  /* Enqueue the irq work @work on the current CPU */
17820  bool irq_work_queue(struct irq_work *work)
17821  {
17822 +       struct llist_head *list;
17823 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17824 +
17825         /* Only queue if not already pending */
17826         if (!irq_work_claim(work))
17827                 return false;
17828 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
17829         /* Queue the entry and raise the IPI if needed. */
17830         preempt_disable();
17831  
17832 -       /* If the work is "lazy", handle it from next tick if any */
17833 -       if (work->flags & IRQ_WORK_LAZY) {
17834 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
17835 -                   tick_nohz_tick_stopped())
17836 -                       arch_irq_work_raise();
17837 -       } else {
17838 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
17839 +       lazy_work = work->flags & IRQ_WORK_LAZY;
17840 +
17841 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
17842 +               list = this_cpu_ptr(&lazy_list);
17843 +       else
17844 +               list = this_cpu_ptr(&raised_list);
17845 +
17846 +       if (llist_add(&work->llnode, list)) {
17847 +               if (!lazy_work || tick_nohz_tick_stopped())
17848                         arch_irq_work_raise();
17849         }
17850  
17851 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
17852         raised = this_cpu_ptr(&raised_list);
17853         lazy = this_cpu_ptr(&lazy_list);
17854  
17855 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
17856 -               if (llist_empty(lazy))
17857 -                       return false;
17858 +       if (llist_empty(raised) && llist_empty(lazy))
17859 +               return false;
17860  
17861         /* All work should have been flushed before going offline */
17862         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
17863 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
17864         struct irq_work *work;
17865         struct llist_node *llnode;
17866  
17867 -       BUG_ON(!irqs_disabled());
17868 +       BUG_ON_NONRT(!irqs_disabled());
17869  
17870         if (llist_empty(list))
17871                 return;
17872 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
17873  void irq_work_run(void)
17874  {
17875         irq_work_run_list(this_cpu_ptr(&raised_list));
17876 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
17877 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
17878 +               /*
17879 +                * NOTE: we raise softirq via IPI for safety,
17880 +                * and execute in irq_work_tick() to move the
17881 +                * overhead from hard to soft irq context.
17882 +                */
17883 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
17884 +                       raise_softirq(TIMER_SOFTIRQ);
17885 +       } else
17886 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
17887  }
17888  EXPORT_SYMBOL_GPL(irq_work_run);
17889  
17890 @@ -179,8 +200,17 @@ void irq_work_tick(void)
17891  
17892         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
17893                 irq_work_run_list(raised);
17894 +
17895 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
17896 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
17897 +}
17898 +
17899 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
17900 +void irq_work_tick_soft(void)
17901 +{
17902         irq_work_run_list(this_cpu_ptr(&lazy_list));
17903  }
17904 +#endif
17905  
17906  /*
17907   * Synchronize against the irq_work @entry, ensures the entry is not
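
On PREEMPT_RT_FULL the lazy list is now flushed from TIMER_SOFTIRQ via irq_work_tick_soft(), so only items flagged IRQ_WORK_HARD_IRQ still run from hard interrupt context. A sketch of flagging such an item (names invented):

    static void my_hard_work_fn(struct irq_work *work)
    {
            /* stays on the raised list, runs in hard-irq context even on RT */
    }

    static struct irq_work my_hard_work;

    static void queue_hard_work(void)
    {
            init_irq_work(&my_hard_work, my_hard_work_fn);
            my_hard_work.flags |= IRQ_WORK_HARD_IRQ;
            irq_work_queue(&my_hard_work);
    }
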
17908 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
17909 index e83b26464061..c0e08d1cf33e 100644
17910 --- a/kernel/ksysfs.c
17911 +++ b/kernel/ksysfs.c
17912 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
17913  
17914  #endif /* CONFIG_KEXEC_CORE */
17915  
17916 +#if defined(CONFIG_PREEMPT_RT_FULL)
17917 +static ssize_t  realtime_show(struct kobject *kobj,
17918 +                             struct kobj_attribute *attr, char *buf)
17919 +{
17920 +       return sprintf(buf, "%d\n", 1);
17921 +}
17922 +KERNEL_ATTR_RO(realtime);
17923 +#endif
17924 +
17925  /* whether file capabilities are enabled */
17926  static ssize_t fscaps_show(struct kobject *kobj,
17927                                   struct kobj_attribute *attr, char *buf)
17928 @@ -203,6 +212,9 @@ static struct attribute * kernel_attrs[] = {
17929         &vmcoreinfo_attr.attr,
17930  #endif
17931         &rcu_expedited_attr.attr,
17932 +#ifdef CONFIG_PREEMPT_RT_FULL
17933 +       &realtime_attr.attr,
17934 +#endif
17935         NULL
17936  };
17937  
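
Since the attribute is built only when CONFIG_PREEMPT_RT_FULL is set and always prints 1, user space can probe for an RT kernel through /sys/kernel/realtime; a small stand-alone check might look like this:

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/kernel/realtime", "r");
            int rt = 0;

            if (f) {
                    if (fscanf(f, "%d", &rt) != 1)
                            rt = 0;
                    fclose(f);
            }
            printf("PREEMPT_RT_FULL: %s\n", rt == 1 ? "yes" : "no");
            return 0;
    }
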
17938 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
17939 index 8e96f6cc2a4a..447b03082d88 100644
17940 --- a/kernel/locking/Makefile
17941 +++ b/kernel/locking/Makefile
17942 @@ -1,5 +1,5 @@
17943  
17944 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
17945 +obj-y += semaphore.o percpu-rwsem.o
17946  
17947  ifdef CONFIG_FUNCTION_TRACER
17948  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
17949 @@ -8,7 +8,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
17950  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
17951  endif
17952  
17953 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17954 +obj-y += mutex.o
17955  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
17956 +obj-y += rwsem.o
17957 +endif
17958  obj-$(CONFIG_LOCKDEP) += lockdep.o
17959  ifeq ($(CONFIG_PROC_FS),y)
17960  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
17961 @@ -22,7 +26,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
17962  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
17963  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
17964  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
17965 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17966  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
17967  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
17968 +endif
17969 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
17970  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
17971  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
17972 diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
17973 index 951cfcd10b4a..57e0ea72c28a 100644
17974 --- a/kernel/locking/lglock.c
17975 +++ b/kernel/locking/lglock.c
17976 @@ -4,6 +4,15 @@
17977  #include <linux/cpu.h>
17978  #include <linux/string.h>
17979  
17980 +#ifndef CONFIG_PREEMPT_RT_FULL
17981 +# define lg_lock_ptr           arch_spinlock_t
17982 +# define lg_do_lock(l)         arch_spin_lock(l)
17983 +# define lg_do_unlock(l)       arch_spin_unlock(l)
17984 +#else
17985 +# define lg_lock_ptr           struct rt_mutex
17986 +# define lg_do_lock(l)         __rt_spin_lock__no_mg(l)
17987 +# define lg_do_unlock(l)       __rt_spin_unlock(l)
17988 +#endif
17989  /*
17990   * Note there is no uninit, so lglocks cannot be defined in
17991   * modules (but it's fine to use them from there)
17992 @@ -12,51 +21,60 @@
17993  
17994  void lg_lock_init(struct lglock *lg, char *name)
17995  {
17996 +#ifdef CONFIG_PREEMPT_RT_FULL
17997 +       int i;
17998 +
17999 +       for_each_possible_cpu(i) {
18000 +               struct rt_mutex *lock = per_cpu_ptr(lg->lock, i);
18001 +
18002 +               rt_mutex_init(lock);
18003 +       }
18004 +#endif
18005         LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
18006  }
18007  EXPORT_SYMBOL(lg_lock_init);
18008  
18009  void lg_local_lock(struct lglock *lg)
18010  {
18011 -       arch_spinlock_t *lock;
18012 +       lg_lock_ptr *lock;
18013  
18014 -       preempt_disable();
18015 +       migrate_disable();
18016         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18017         lock = this_cpu_ptr(lg->lock);
18018 -       arch_spin_lock(lock);
18019 +       lg_do_lock(lock);
18020  }
18021  EXPORT_SYMBOL(lg_local_lock);
18022  
18023  void lg_local_unlock(struct lglock *lg)
18024  {
18025 -       arch_spinlock_t *lock;
18026 +       lg_lock_ptr *lock;
18027  
18028         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18029         lock = this_cpu_ptr(lg->lock);
18030 -       arch_spin_unlock(lock);
18031 -       preempt_enable();
18032 +       lg_do_unlock(lock);
18033 +       migrate_enable();
18034  }
18035  EXPORT_SYMBOL(lg_local_unlock);
18036  
18037  void lg_local_lock_cpu(struct lglock *lg, int cpu)
18038  {
18039 -       arch_spinlock_t *lock;
18040 +       lg_lock_ptr *lock;
18041  
18042 -       preempt_disable();
18043 +       preempt_disable_nort();
18044         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18045         lock = per_cpu_ptr(lg->lock, cpu);
18046 -       arch_spin_lock(lock);
18047 +       lg_do_lock(lock);
18048  }
18049  EXPORT_SYMBOL(lg_local_lock_cpu);
18050  
18051  void lg_local_unlock_cpu(struct lglock *lg, int cpu)
18052  {
18053 -       arch_spinlock_t *lock;
18054 +       lg_lock_ptr *lock;
18055  
18056         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18057         lock = per_cpu_ptr(lg->lock, cpu);
18058 -       arch_spin_unlock(lock);
18059 -       preempt_enable();
18060 +       lg_do_unlock(lock);
18061 +       preempt_enable_nort();
18062  }
18063  EXPORT_SYMBOL(lg_local_unlock_cpu);
18064  
18065 @@ -68,30 +86,30 @@ void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
18066         if (cpu2 < cpu1)
18067                 swap(cpu1, cpu2);
18068  
18069 -       preempt_disable();
18070 +       preempt_disable_nort();
18071         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18072 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
18073 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
18074 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu1));
18075 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu2));
18076  }
18077  
18078  void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
18079  {
18080         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18081 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
18082 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
18083 -       preempt_enable();
18084 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu1));
18085 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu2));
18086 +       preempt_enable_nort();
18087  }
18088  
18089  void lg_global_lock(struct lglock *lg)
18090  {
18091         int i;
18092  
18093 -       preempt_disable();
18094 +       preempt_disable_nort();
18095         lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18096         for_each_possible_cpu(i) {
18097 -               arch_spinlock_t *lock;
18098 +               lg_lock_ptr *lock;
18099                 lock = per_cpu_ptr(lg->lock, i);
18100 -               arch_spin_lock(lock);
18101 +               lg_do_lock(lock);
18102         }
18103  }
18104  EXPORT_SYMBOL(lg_global_lock);
18105 @@ -102,10 +120,35 @@ void lg_global_unlock(struct lglock *lg)
18106  
18107         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18108         for_each_possible_cpu(i) {
18109 -               arch_spinlock_t *lock;
18110 +               lg_lock_ptr *lock;
18111                 lock = per_cpu_ptr(lg->lock, i);
18112 -               arch_spin_unlock(lock);
18113 +               lg_do_unlock(lock);
18114         }
18115 -       preempt_enable();
18116 +       preempt_enable_nort();
18117  }
18118  EXPORT_SYMBOL(lg_global_unlock);
18119 +
18120 +#ifdef CONFIG_PREEMPT_RT_FULL
18121 +/*
18122 + * HACK: If you use this, you get to keep the pieces.
18123 + * Used in queue_stop_cpus_work() when stop machinery
18124 + * is called from an inactive CPU, so we can't schedule.
18125 + */
18126 +# define lg_do_trylock_relax(l)                        \
18127 +       do {                                    \
18128 +               while (!__rt_spin_trylock(l))   \
18129 +                       cpu_relax();            \
18130 +       } while (0)
18131 +
18132 +void lg_global_trylock_relax(struct lglock *lg)
18133 +{
18134 +       int i;
18135 +
18136 +       lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18137 +       for_each_possible_cpu(i) {
18138 +               lg_lock_ptr *lock;
18139 +               lock = per_cpu_ptr(lg->lock, i);
18140 +               lg_do_trylock_relax(lock);
18141 +       }
18142 +}
18143 +#endif
18144 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
18145 index 60ace56618f6..e98ee958a353 100644
18146 --- a/kernel/locking/lockdep.c
18147 +++ b/kernel/locking/lockdep.c
18148 @@ -3525,6 +3525,7 @@ static void check_flags(unsigned long flags)
18149                 }
18150         }
18151  
18152 +#ifndef CONFIG_PREEMPT_RT_FULL
18153         /*
18154          * We dont accurately track softirq state in e.g.
18155          * hardirq contexts (such as on 4KSTACKS), so only
18156 @@ -3539,6 +3540,7 @@ static void check_flags(unsigned long flags)
18157                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
18158                 }
18159         }
18160 +#endif
18161  
18162         if (!debug_locks)
18163                 print_irqtrace_events(current);
18164 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
18165 index 8ef1919d63b2..291fc19e28e0 100644
18166 --- a/kernel/locking/locktorture.c
18167 +++ b/kernel/locking/locktorture.c
18168 @@ -26,7 +26,6 @@
18169  #include <linux/kthread.h>
18170  #include <linux/sched/rt.h>
18171  #include <linux/spinlock.h>
18172 -#include <linux/rwlock.h>
18173  #include <linux/mutex.h>
18174  #include <linux/rwsem.h>
18175  #include <linux/smp.h>
18176 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
18177 new file mode 100644
18178 index 000000000000..d4ab61c1848b
18179 --- /dev/null
18180 +++ b/kernel/locking/rt.c
18181 @@ -0,0 +1,474 @@
18182 +/*
18183 + * kernel/rt.c
18184 + *
18185 + * Real-Time Preemption Support
18186 + *
18187 + * started by Ingo Molnar:
18188 + *
18189 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
18190 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18191 + *
18192 + * historic credit for proving that Linux spinlocks can be implemented via
18193 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
18194 + * and others) who prototyped it on 2.4 and did lots of comparative
18195 + * research and analysis; TimeSys, for proving that you can implement a
18196 + * fully preemptible kernel via the use of IRQ threading and mutexes;
18197 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
18198 + * right one; and to MontaVista, who ported pmutexes to 2.6.
18199 + *
18200 + * This code is a from-scratch implementation and is not based on pmutexes,
18201 + * but the idea of converting spinlocks to mutexes is used here too.
18202 + *
18203 + * lock debugging, locking tree, deadlock detection:
18204 + *
18205 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
18206 + *  Released under the General Public License (GPL).
18207 + *
18208 + * Includes portions of the generic R/W semaphore implementation from:
18209 + *
18210 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
18211 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
18212 + *  - Derived also from comments by Linus
18213 + *
18214 + * Pending ownership of locks and ownership stealing:
18215 + *
18216 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
18217 + *
18218 + *   (also by Steven Rostedt)
18219 + *    - Converted single pi_lock to individual task locks.
18220 + *
18221 + * By Esben Nielsen:
18222 + *    Doing priority inheritance with help of the scheduler.
18223 + *
18224 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18225 + *  - major rework based on Esben Nielsen's initial patch
18226 + *  - replaced thread_info references by task_struct refs
18227 + *  - removed task->pending_owner dependency
18228 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
18229 + *    in the scheduler return path as discussed with Steven Rostedt
18230 + *
18231 + *  Copyright (C) 2006, Kihon Technologies Inc.
18232 + *    Steven Rostedt <rostedt@goodmis.org>
18233 + *  - debugged and patched Thomas Gleixner's rework.
18234 + *  - added back the cmpxchg to the rework.
18235 + *  - turned atomic require back on for SMP.
18236 + */
18237 +
18238 +#include <linux/spinlock.h>
18239 +#include <linux/rtmutex.h>
18240 +#include <linux/sched.h>
18241 +#include <linux/delay.h>
18242 +#include <linux/module.h>
18243 +#include <linux/kallsyms.h>
18244 +#include <linux/syscalls.h>
18245 +#include <linux/interrupt.h>
18246 +#include <linux/plist.h>
18247 +#include <linux/fs.h>
18248 +#include <linux/futex.h>
18249 +#include <linux/hrtimer.h>
18250 +
18251 +#include "rtmutex_common.h"
18252 +
18253 +/*
18254 + * struct mutex functions
18255 + */
18256 +void __mutex_do_init(struct mutex *mutex, const char *name,
18257 +                    struct lock_class_key *key)
18258 +{
18259 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18260 +       /*
18261 +        * Make sure we are not reinitializing a held lock:
18262 +        */
18263 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
18264 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
18265 +#endif
18266 +       mutex->lock.save_state = 0;
18267 +}
18268 +EXPORT_SYMBOL(__mutex_do_init);
18269 +
18270 +void __lockfunc _mutex_lock(struct mutex *lock)
18271 +{
18272 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18273 +       rt_mutex_lock(&lock->lock);
18274 +}
18275 +EXPORT_SYMBOL(_mutex_lock);
18276 +
18277 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
18278 +{
18279 +       int ret;
18280 +
18281 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18282 +       ret = rt_mutex_lock_interruptible(&lock->lock);
18283 +       if (ret)
18284 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18285 +       return ret;
18286 +}
18287 +EXPORT_SYMBOL(_mutex_lock_interruptible);
18288 +
18289 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
18290 +{
18291 +       int ret;
18292 +
18293 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18294 +       ret = rt_mutex_lock_killable(&lock->lock);
18295 +       if (ret)
18296 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18297 +       return ret;
18298 +}
18299 +EXPORT_SYMBOL(_mutex_lock_killable);
18300 +
18301 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18302 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
18303 +{
18304 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18305 +       rt_mutex_lock(&lock->lock);
18306 +}
18307 +EXPORT_SYMBOL(_mutex_lock_nested);
18308 +
18309 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
18310 +{
18311 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
18312 +       rt_mutex_lock(&lock->lock);
18313 +}
18314 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
18315 +
18316 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
18317 +{
18318 +       int ret;
18319 +
18320 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18321 +       ret = rt_mutex_lock_interruptible(&lock->lock);
18322 +       if (ret)
18323 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18324 +       return ret;
18325 +}
18326 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
18327 +
18328 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
18329 +{
18330 +       int ret;
18331 +
18332 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
18333 +       ret = rt_mutex_lock_killable(&lock->lock);
18334 +       if (ret)
18335 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18336 +       return ret;
18337 +}
18338 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
18339 +#endif
18340 +
18341 +int __lockfunc _mutex_trylock(struct mutex *lock)
18342 +{
18343 +       int ret = rt_mutex_trylock(&lock->lock);
18344 +
18345 +       if (ret)
18346 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18347 +
18348 +       return ret;
18349 +}
18350 +EXPORT_SYMBOL(_mutex_trylock);
18351 +
18352 +void __lockfunc _mutex_unlock(struct mutex *lock)
18353 +{
18354 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
18355 +       rt_mutex_unlock(&lock->lock);
18356 +}
18357 +EXPORT_SYMBOL(_mutex_unlock);
18358 +
18359 +/*
18360 + * rwlock_t functions
18361 + */
18362 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
18363 +{
18364 +       int ret;
18365 +
18366 +       migrate_disable();
18367 +       ret = rt_mutex_trylock(&rwlock->lock);
18368 +       if (ret)
18369 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
18370 +       else
18371 +               migrate_enable();
18372 +
18373 +       return ret;
18374 +}
18375 +EXPORT_SYMBOL(rt_write_trylock);
18376 +
18377 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
18378 +{
18379 +       int ret;
18380 +
18381 +       *flags = 0;
18382 +       ret = rt_write_trylock(rwlock);
18383 +       return ret;
18384 +}
18385 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
18386 +
18387 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
18388 +{
18389 +       struct rt_mutex *lock = &rwlock->lock;
18390 +       int ret = 1;
18391 +
18392 +       /*
18393 +        * recursive read locks succeed when current owns the lock,
18394 +        * but not when read_depth == 0 which means that the lock is
18395 +        * write locked.
18396 +        */
18397 +       if (rt_mutex_owner(lock) != current) {
18398 +               migrate_disable();
18399 +               ret = rt_mutex_trylock(lock);
18400 +               if (ret)
18401 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
18402 +               else
18403 +                       migrate_enable();
18404 +
18405 +       } else if (!rwlock->read_depth) {
18406 +               ret = 0;
18407 +       }
18408 +
18409 +       if (ret)
18410 +               rwlock->read_depth++;
18411 +
18412 +       return ret;
18413 +}
18414 +EXPORT_SYMBOL(rt_read_trylock);
18415 +
18416 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
18417 +{
18418 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
18419 +       __rt_spin_lock(&rwlock->lock);
18420 +}
18421 +EXPORT_SYMBOL(rt_write_lock);
18422 +
18423 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
18424 +{
18425 +       struct rt_mutex *lock = &rwlock->lock;
18426 +
18427 +
18428 +       /*
18429 +        * recursive read locks succeed when current owns the lock
18430 +        */
18431 +       if (rt_mutex_owner(lock) != current) {
18432 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
18433 +               __rt_spin_lock(lock);
18434 +       }
18435 +       rwlock->read_depth++;
18436 +}
18437 +
18438 +EXPORT_SYMBOL(rt_read_lock);
18439 +
18440 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
18441 +{
18442 +       /* NOTE: we always pass in '1' for nested, for simplicity */
18443 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
18444 +       __rt_spin_unlock(&rwlock->lock);
18445 +       migrate_enable();
18446 +}
18447 +EXPORT_SYMBOL(rt_write_unlock);
18448 +
18449 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
18450 +{
18451 +       /* Release the lock only when read_depth is down to 0 */
18452 +       if (--rwlock->read_depth == 0) {
18453 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
18454 +               __rt_spin_unlock(&rwlock->lock);
18455 +               migrate_enable();
18456 +       }
18457 +}
18458 +EXPORT_SYMBOL(rt_read_unlock);
18459 +
18460 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
18461 +{
18462 +       rt_write_lock(rwlock);
18463 +
18464 +       return 0;
18465 +}
18466 +EXPORT_SYMBOL(rt_write_lock_irqsave);
18467 +
18468 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
18469 +{
18470 +       rt_read_lock(rwlock);
18471 +
18472 +       return 0;
18473 +}
18474 +EXPORT_SYMBOL(rt_read_lock_irqsave);
18475 +
18476 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
18477 +{
18478 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18479 +       /*
18480 +        * Make sure we are not reinitializing a held lock:
18481 +        */
18482 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
18483 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
18484 +#endif
18485 +       rwlock->lock.save_state = 1;
18486 +       rwlock->read_depth = 0;
18487 +}
18488 +EXPORT_SYMBOL(__rt_rwlock_init);
18489 +
18490 +/*
18491 + * rw_semaphores
18492 + */
18493 +
18494 +void  rt_up_write(struct rw_semaphore *rwsem)
18495 +{
18496 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
18497 +       rt_mutex_unlock(&rwsem->lock);
18498 +}
18499 +EXPORT_SYMBOL(rt_up_write);
18500 +
18501 +void __rt_up_read(struct rw_semaphore *rwsem)
18502 +{
18503 +       if (--rwsem->read_depth == 0)
18504 +               rt_mutex_unlock(&rwsem->lock);
18505 +}
18506 +
18507 +void  rt_up_read(struct rw_semaphore *rwsem)
18508 +{
18509 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
18510 +       __rt_up_read(rwsem);
18511 +}
18512 +EXPORT_SYMBOL(rt_up_read);
18513 +
18514 +/*
18515 + * downgrade a write lock into a read lock
18516 + * - just wake up any readers at the front of the queue
18517 + */
18518 +void  rt_downgrade_write(struct rw_semaphore *rwsem)
18519 +{
18520 +       BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
18521 +       rwsem->read_depth = 1;
18522 +}
18523 +EXPORT_SYMBOL(rt_downgrade_write);
18524 +
18525 +int  rt_down_write_trylock(struct rw_semaphore *rwsem)
18526 +{
18527 +       int ret = rt_mutex_trylock(&rwsem->lock);
18528 +
18529 +       if (ret)
18530 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
18531 +       return ret;
18532 +}
18533 +EXPORT_SYMBOL(rt_down_write_trylock);
18534 +
18535 +void  rt_down_write(struct rw_semaphore *rwsem)
18536 +{
18537 +       rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
18538 +       rt_mutex_lock(&rwsem->lock);
18539 +}
18540 +EXPORT_SYMBOL(rt_down_write);
18541 +
18542 +void  rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
18543 +{
18544 +       rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
18545 +       rt_mutex_lock(&rwsem->lock);
18546 +}
18547 +EXPORT_SYMBOL(rt_down_write_nested);
18548 +
18549 +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
18550 +                              struct lockdep_map *nest)
18551 +{
18552 +       rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
18553 +       rt_mutex_lock(&rwsem->lock);
18554 +}
18555 +EXPORT_SYMBOL(rt_down_write_nested_lock);
18556 +
18557 +int rt__down_read_trylock(struct rw_semaphore *rwsem)
18558 +{
18559 +       struct rt_mutex *lock = &rwsem->lock;
18560 +       int ret = 1;
18561 +
18562 +       /*
18563 +        * recursive read locks succeed when current owns the rwsem,
18564 +        * but not when read_depth == 0 which means that the rwsem is
18565 +        * write locked.
18566 +        */
18567 +       if (rt_mutex_owner(lock) != current)
18568 +               ret = rt_mutex_trylock(&rwsem->lock);
18569 +       else if (!rwsem->read_depth)
18570 +               ret = 0;
18571 +
18572 +       if (ret)
18573 +               rwsem->read_depth++;
18574 +       return ret;
18575 +
18576 +}
18577 +
18578 +int  rt_down_read_trylock(struct rw_semaphore *rwsem)
18579 +{
18580 +       int ret;
18581 +
18582 +       ret = rt__down_read_trylock(rwsem);
18583 +       if (ret)
18584 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
18585 +
18586 +       return ret;
18587 +}
18588 +EXPORT_SYMBOL(rt_down_read_trylock);
18589 +
18590 +void rt__down_read(struct rw_semaphore *rwsem)
18591 +{
18592 +       struct rt_mutex *lock = &rwsem->lock;
18593 +
18594 +       if (rt_mutex_owner(lock) != current)
18595 +               rt_mutex_lock(&rwsem->lock);
18596 +       rwsem->read_depth++;
18597 +}
18598 +EXPORT_SYMBOL(rt__down_read);
18599 +
18600 +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
18601 +{
18602 +       rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
18603 +       rt__down_read(rwsem);
18604 +}
18605 +
18606 +void  rt_down_read(struct rw_semaphore *rwsem)
18607 +{
18608 +       __rt_down_read(rwsem, 0);
18609 +}
18610 +EXPORT_SYMBOL(rt_down_read);
18611 +
18612 +void  rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
18613 +{
18614 +       __rt_down_read(rwsem, subclass);
18615 +}
18616 +EXPORT_SYMBOL(rt_down_read_nested);
18617 +
18618 +void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
18619 +                             struct lock_class_key *key)
18620 +{
18621 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18622 +       /*
18623 +        * Make sure we are not reinitializing a held lock:
18624 +        */
18625 +       debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
18626 +       lockdep_init_map(&rwsem->dep_map, name, key, 0);
18627 +#endif
18628 +       rwsem->read_depth = 0;
18629 +       rwsem->lock.save_state = 0;
18630 +}
18631 +EXPORT_SYMBOL(__rt_rwsem_init);
18632 +
18633 +/**
18634 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
18635 + * @cnt: the atomic which we are to dec
18636 + * @lock: the mutex to return holding if we dec to 0
18637 + *
18638 + * return true and hold lock if we dec to 0, return false otherwise
18639 + */
18640 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
18641 +{
18642 +       /* dec if we can't possibly hit 0 */
18643 +       if (atomic_add_unless(cnt, -1, 1))
18644 +               return 0;
18645 +       /* we might hit 0, so take the lock */
18646 +       mutex_lock(lock);
18647 +       if (!atomic_dec_and_test(cnt)) {
18648 +               /* when we actually did the dec, we didn't hit 0 */
18649 +               mutex_unlock(lock);
18650 +               return 0;
18651 +       }
18652 +       /* we hit 0, and we hold the lock */
18653 +       return 1;
18654 +}
18655 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
18656 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
18657 index 8251e75dd9c0..6759a798c927 100644
18658 --- a/kernel/locking/rtmutex.c
18659 +++ b/kernel/locking/rtmutex.c
18660 @@ -7,6 +7,11 @@
18661   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18662   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
18663   *  Copyright (C) 2006 Esben Nielsen
18664 + *  Adaptive Spinlocks:
18665 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
18666 + *                                  and Peter Morreale,
18667 + * Adaptive Spinlocks simplification:
18668 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
18669   *
18670   *  See Documentation/locking/rt-mutex-design.txt for details.
18671   */
18672 @@ -16,6 +21,7 @@
18673  #include <linux/sched/rt.h>
18674  #include <linux/sched/deadline.h>
18675  #include <linux/timer.h>
18676 +#include <linux/ww_mutex.h>
18677  
18678  #include "rtmutex_common.h"
18679  
18680 @@ -69,6 +75,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
18681                 clear_rt_mutex_waiters(lock);
18682  }
18683  
18684 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
18685 +{
18686 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
18687 +               waiter != PI_REQUEUE_INPROGRESS;
18688 +}
18689 +
18690  /*
18691   * We can speed up the acquire/release, if there's no debugging state to be
18692   * set up.
18693 @@ -99,13 +111,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
18694   * 2) Drop lock->wait_lock
18695   * 3) Try to unlock the lock with cmpxchg
18696   */
18697 -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
18698 +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
18699 +                                       unsigned long flags)
18700         __releases(lock->wait_lock)
18701  {
18702         struct task_struct *owner = rt_mutex_owner(lock);
18703  
18704         clear_rt_mutex_waiters(lock);
18705 -       raw_spin_unlock(&lock->wait_lock);
18706 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18707         /*
18708          * If a new waiter comes in between the unlock and the cmpxchg
18709          * we have two situations:
18710 @@ -147,11 +160,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
18711  /*
18712   * Simple slow path only version: lock->owner is protected by lock->wait_lock.
18713   */
18714 -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
18715 +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
18716 +                                       unsigned long flags)
18717         __releases(lock->wait_lock)
18718  {
18719         lock->owner = NULL;
18720 -       raw_spin_unlock(&lock->wait_lock);
18721 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18722         return true;
18723  }
18724  #endif
18725 @@ -348,6 +362,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
18726         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
18727  }
18728  
18729 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
18730 +{
18731 +       if (waiter->savestate)
18732 +               wake_up_lock_sleeper(waiter->task);
18733 +       else
18734 +               wake_up_process(waiter->task);
18735 +}
18736 +
18737  /*
18738   * Max number of times we'll walk the boosting chain:
18739   */
18740 @@ -355,7 +377,8 @@ int max_lock_depth = 1024;
18741  
18742  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
18743  {
18744 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
18745 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
18746 +               p->pi_blocked_on->lock : NULL;
18747  }
18748  
18749  /*
18750 @@ -433,7 +456,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18751         int ret = 0, depth = 0;
18752         struct rt_mutex *lock;
18753         bool detect_deadlock;
18754 -       unsigned long flags;
18755         bool requeue = true;
18756  
18757         detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
18758 @@ -476,7 +498,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18759         /*
18760          * [1] Task cannot go away as we did a get_task() before !
18761          */
18762 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
18763 +       raw_spin_lock_irq(&task->pi_lock);
18764  
18765         /*
18766          * [2] Get the waiter on which @task is blocked on.
18767 @@ -492,7 +514,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18768          * reached or the state of the chain has changed while we
18769          * dropped the locks.
18770          */
18771 -       if (!waiter)
18772 +       if (!rt_mutex_real_waiter(waiter))
18773                 goto out_unlock_pi;
18774  
18775         /*
18776 @@ -560,7 +582,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18777          * operations.
18778          */
18779         if (!raw_spin_trylock(&lock->wait_lock)) {
18780 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18781 +               raw_spin_unlock_irq(&task->pi_lock);
18782                 cpu_relax();
18783                 goto retry;
18784         }
18785 @@ -591,7 +613,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18786                 /*
18787                  * No requeue[7] here. Just release @task [8]
18788                  */
18789 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18790 +               raw_spin_unlock(&task->pi_lock);
18791                 put_task_struct(task);
18792  
18793                 /*
18794 @@ -599,14 +621,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18795                  * If there is no owner of the lock, end of chain.
18796                  */
18797                 if (!rt_mutex_owner(lock)) {
18798 -                       raw_spin_unlock(&lock->wait_lock);
18799 +                       raw_spin_unlock_irq(&lock->wait_lock);
18800                         return 0;
18801                 }
18802  
18803                 /* [10] Grab the next task, i.e. owner of @lock */
18804                 task = rt_mutex_owner(lock);
18805                 get_task_struct(task);
18806 -               raw_spin_lock_irqsave(&task->pi_lock, flags);
18807 +               raw_spin_lock(&task->pi_lock);
18808  
18809                 /*
18810                  * No requeue [11] here. We just do deadlock detection.
18811 @@ -621,8 +643,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18812                 top_waiter = rt_mutex_top_waiter(lock);
18813  
18814                 /* [13] Drop locks */
18815 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18816 -               raw_spin_unlock(&lock->wait_lock);
18817 +               raw_spin_unlock(&task->pi_lock);
18818 +               raw_spin_unlock_irq(&lock->wait_lock);
18819  
18820                 /* If owner is not blocked, end of chain. */
18821                 if (!next_lock)
18822 @@ -643,7 +665,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18823         rt_mutex_enqueue(lock, waiter);
18824  
18825         /* [8] Release the task */
18826 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18827 +       raw_spin_unlock(&task->pi_lock);
18828         put_task_struct(task);
18829  
18830         /*
18831 @@ -654,21 +676,24 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18832          * follow here. This is the end of the chain we are walking.
18833          */
18834         if (!rt_mutex_owner(lock)) {
18835 +               struct rt_mutex_waiter *lock_top_waiter;
18836 +
18837                 /*
18838                  * If the requeue [7] above changed the top waiter,
18839                  * then we need to wake the new top waiter up to try
18840                  * to get the lock.
18841                  */
18842 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
18843 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
18844 -               raw_spin_unlock(&lock->wait_lock);
18845 +               lock_top_waiter = rt_mutex_top_waiter(lock);
18846 +               if (prerequeue_top_waiter != lock_top_waiter)
18847 +                       rt_mutex_wake_waiter(lock_top_waiter);
18848 +               raw_spin_unlock_irq(&lock->wait_lock);
18849                 return 0;
18850         }
18851  
18852         /* [10] Grab the next task, i.e. the owner of @lock */
18853         task = rt_mutex_owner(lock);
18854         get_task_struct(task);
18855 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
18856 +       raw_spin_lock(&task->pi_lock);
18857  
18858         /* [11] requeue the pi waiters if necessary */
18859         if (waiter == rt_mutex_top_waiter(lock)) {
18860 @@ -722,8 +747,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18861         top_waiter = rt_mutex_top_waiter(lock);
18862  
18863         /* [13] Drop the locks */
18864 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18865 -       raw_spin_unlock(&lock->wait_lock);
18866 +       raw_spin_unlock(&task->pi_lock);
18867 +       raw_spin_unlock_irq(&lock->wait_lock);
18868  
18869         /*
18870          * Make the actual exit decisions [12], based on the stored
18871 @@ -746,28 +771,46 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18872         goto again;
18873  
18874   out_unlock_pi:
18875 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18876 +       raw_spin_unlock_irq(&task->pi_lock);
18877   out_put_task:
18878         put_task_struct(task);
18879  
18880         return ret;
18881  }
18882  
18883 +
18884 +#define STEAL_NORMAL  0
18885 +#define STEAL_LATERAL 1
18886 +
18887 +/*
18888 + * Note that RT tasks are excluded from lateral-steals to prevent the
18889 + * introduction of an unbounded latency
18890 + */
18891 +static inline int lock_is_stealable(struct task_struct *task,
18892 +                                   struct task_struct *pendowner, int mode)
18893 +{
18894 +    if (mode == STEAL_NORMAL || rt_task(task)) {
18895 +           if (task->prio >= pendowner->prio)
18896 +                   return 0;
18897 +    } else if (task->prio > pendowner->prio)
18898 +           return 0;
18899 +    return 1;
18900 +}
18901 +
18902  /*
18903   * Try to take an rt-mutex
18904   *
18905 - * Must be called with lock->wait_lock held.
18906 + * Must be called with lock->wait_lock held and interrupts disabled
18907   *
18908   * @lock:   The lock to be acquired.
18909   * @task:   The task which wants to acquire the lock
18910   * @waiter: The waiter that is queued to the lock's wait tree if the
18911   *         callsite called task_blocked_on_lock(), otherwise NULL
18912   */
18913 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18914 -                               struct rt_mutex_waiter *waiter)
18915 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
18916 +                                 struct task_struct *task,
18917 +                                 struct rt_mutex_waiter *waiter, int mode)
18918  {
18919 -       unsigned long flags;
18920 -
18921         /*
18922          * Before testing whether we can acquire @lock, we set the
18923          * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
18924 @@ -803,8 +846,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18925                  * If waiter is not the highest priority waiter of
18926                  * @lock, give up.
18927                  */
18928 -               if (waiter != rt_mutex_top_waiter(lock))
18929 +               if (waiter != rt_mutex_top_waiter(lock)) {
18930 +                       /* XXX lock_is_stealable() ? */
18931                         return 0;
18932 +               }
18933  
18934                 /*
18935                  * We can acquire the lock. Remove the waiter from the
18936 @@ -822,14 +867,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18937                  * not need to be dequeued.
18938                  */
18939                 if (rt_mutex_has_waiters(lock)) {
18940 -                       /*
18941 -                        * If @task->prio is greater than or equal to
18942 -                        * the top waiter priority (kernel view),
18943 -                        * @task lost.
18944 -                        */
18945 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
18946 -                               return 0;
18947 +                       struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
18948  
18949 +                       if (task != pown && !lock_is_stealable(task, pown, mode))
18950 +                               return 0;
18951                         /*
18952                          * The current top waiter stays enqueued. We
18953                          * don't have to change anything in the lock
18954 @@ -852,7 +893,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18955          * case, but conditionals are more expensive than a redundant
18956          * store.
18957          */
18958 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
18959 +       raw_spin_lock(&task->pi_lock);
18960         task->pi_blocked_on = NULL;
18961         /*
18962          * Finish the lock acquisition. @task is the new owner. If
18963 @@ -861,7 +902,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18964          */
18965         if (rt_mutex_has_waiters(lock))
18966                 rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
18967 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18968 +       raw_spin_unlock(&task->pi_lock);
18969  
18970  takeit:
18971         /* We got the lock. */
18972 @@ -878,12 +919,444 @@ takeit:
18973         return 1;
18974  }
18975  
18976 +#ifdef CONFIG_PREEMPT_RT_FULL
18977 +/*
18978 + * preemptible spin_lock functions:
18979 + */
18980 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
18981 +                                        void  (*slowfn)(struct rt_mutex *lock,
18982 +                                                        bool mg_off),
18983 +                                        bool do_mig_dis)
18984 +{
18985 +       might_sleep_no_state_check();
18986 +
18987 +       if (do_mig_dis)
18988 +               migrate_disable();
18989 +
18990 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18991 +               rt_mutex_deadlock_account_lock(lock, current);
18992 +       else
18993 +               slowfn(lock, do_mig_dis);
18994 +}
18995 +
18996 +static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
18997 +                                          int  (*slowfn)(struct rt_mutex *lock))
18998 +{
18999 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
19000 +               rt_mutex_deadlock_account_unlock(current);
19001 +               return 0;
19002 +       }
19003 +       return slowfn(lock);
19004 +}
19005 +#ifdef CONFIG_SMP
19006 +/*
19007 + * Note that owner is a speculative pointer and dereferencing relies
19008 + * on rcu_read_lock() and the check against the lock owner.
19009 + */
19010 +static int adaptive_wait(struct rt_mutex *lock,
19011 +                        struct task_struct *owner)
19012 +{
19013 +       int res = 0;
19014 +
19015 +       rcu_read_lock();
19016 +       for (;;) {
19017 +               if (owner != rt_mutex_owner(lock))
19018 +                       break;
19019 +               /*
19020 +                * Ensure that owner->on_cpu is dereferenced _after_
19021 +                * checking the above to be valid.
19022 +                */
19023 +               barrier();
19024 +               if (!owner->on_cpu) {
19025 +                       res = 1;
19026 +                       break;
19027 +               }
19028 +               cpu_relax();
19029 +       }
19030 +       rcu_read_unlock();
19031 +       return res;
19032 +}
19033 +#else
19034 +static int adaptive_wait(struct rt_mutex *lock,
19035 +                        struct task_struct *orig_owner)
19036 +{
19037 +       return 1;
19038 +}
19039 +#endif
19040 +
19041 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19042 +                                  struct rt_mutex_waiter *waiter,
19043 +                                  struct task_struct *task,
19044 +                                  enum rtmutex_chainwalk chwalk);
19045 +/*
19046 + * Slow path lock function spin_lock style: this variant is very
19047 + * careful not to miss any non-lock wakeups.
19048 + *
19049 + * We store the current state under p->pi_lock in p->saved_state and
19050 + * the try_to_wake_up() code handles this accordingly.
19051 + */
19052 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
19053 +                                                   bool mg_off)
19054 +{
19055 +       struct task_struct *lock_owner, *self = current;
19056 +       struct rt_mutex_waiter waiter, *top_waiter;
19057 +       unsigned long flags;
19058 +       int ret;
19059 +
19060 +       rt_mutex_init_waiter(&waiter, true);
19061 +
19062 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19063 +
19064 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
19065 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19066 +               return;
19067 +       }
19068 +
19069 +       BUG_ON(rt_mutex_owner(lock) == self);
19070 +
19071 +       /*
19072 +        * We save whatever state the task is in and we'll restore it
19073 +        * after acquiring the lock taking real wakeups into account
19074 +        * as well. We are serialized via pi_lock against wakeups. See
19075 +        * try_to_wake_up().
19076 +        */
19077 +       raw_spin_lock(&self->pi_lock);
19078 +       self->saved_state = self->state;
19079 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19080 +       raw_spin_unlock(&self->pi_lock);
19081 +
19082 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
19083 +       BUG_ON(ret);
19084 +
19085 +       for (;;) {
19086 +               /* Try to acquire the lock again. */
19087 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
19088 +                       break;
19089 +
19090 +               top_waiter = rt_mutex_top_waiter(lock);
19091 +               lock_owner = rt_mutex_owner(lock);
19092 +
19093 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19094 +
19095 +               debug_rt_mutex_print_deadlock(&waiter);
19096 +
19097 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
19098 +                       if (mg_off)
19099 +                               migrate_enable();
19100 +                       schedule();
19101 +                       if (mg_off)
19102 +                               migrate_disable();
19103 +               }
19104 +
19105 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
19106 +
19107 +               raw_spin_lock(&self->pi_lock);
19108 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19109 +               raw_spin_unlock(&self->pi_lock);
19110 +       }
19111 +
19112 +       /*
19113 +        * Restore the task state to current->saved_state. We set it
19114 +        * to the original state above and the try_to_wake_up() code
19115 +        * has possibly updated it when a real (non-rtmutex) wakeup
19116 +        * happened while we were blocked. Clear saved_state so
19117 +        * try_to_wake_up() does not get confused.
19118 +        */
19119 +       raw_spin_lock(&self->pi_lock);
19120 +       __set_current_state_no_track(self->saved_state);
19121 +       self->saved_state = TASK_RUNNING;
19122 +       raw_spin_unlock(&self->pi_lock);
19123 +
19124 +       /*
19125 +        * try_to_take_rt_mutex() sets the waiter bit
19126 +        * unconditionally. We might have to fix that up:
19127 +        */
19128 +       fixup_rt_mutex_waiters(lock);
19129 +
19130 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
19131 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
19132 +
19133 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19134 +
19135 +       debug_rt_mutex_free_waiter(&waiter);
19136 +}
19137 +
19138 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
19139 +                                   struct wake_q_head *wake_sleeper_q,
19140 +                                   struct rt_mutex *lock);
19141 +/*
19142 + * Slow path to release a rt_mutex spin_lock style
19143 + */
19144 +static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
19145 +{
19146 +       unsigned long flags;
19147 +       WAKE_Q(wake_q);
19148 +       WAKE_Q(wake_sleeper_q);
19149 +
19150 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19151 +
19152 +       debug_rt_mutex_unlock(lock);
19153 +
19154 +       rt_mutex_deadlock_account_unlock(current);
19155 +
19156 +       if (!rt_mutex_has_waiters(lock)) {
19157 +               lock->owner = NULL;
19158 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19159 +               return 0;
19160 +       }
19161 +
19162 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
19163 +
19164 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19165 +       wake_up_q(&wake_q);
19166 +       wake_up_q_sleeper(&wake_sleeper_q);
19167 +
19168 +       /* Undo pi boosting when necessary */
19169 +       rt_mutex_adjust_prio(current);
19170 +       return 0;
19171 +}
19172 +
19173 +static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
19174 +{
19175 +       unsigned long flags;
19176 +       WAKE_Q(wake_q);
19177 +       WAKE_Q(wake_sleeper_q);
19178 +
19179 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19180 +
19181 +       debug_rt_mutex_unlock(lock);
19182 +
19183 +       rt_mutex_deadlock_account_unlock(current);
19184 +
19185 +       if (!rt_mutex_has_waiters(lock)) {
19186 +               lock->owner = NULL;
19187 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19188 +               return 0;
19189 +       }
19190 +
19191 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
19192 +
19193 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19194 +       wake_up_q(&wake_q);
19195 +       wake_up_q_sleeper(&wake_sleeper_q);
19196 +       return 1;
19197 +}
19198 +
19199 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
19200 +{
19201 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
19202 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19203 +}
19204 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
19205 +
19206 +void __lockfunc rt_spin_lock(spinlock_t *lock)
19207 +{
19208 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
19209 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19210 +}
19211 +EXPORT_SYMBOL(rt_spin_lock);
19212 +
19213 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
19214 +{
19215 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
19216 +}
19217 +EXPORT_SYMBOL(__rt_spin_lock);
19218 +
19219 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
19220 +{
19221 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
19222 +}
19223 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
19224 +
19225 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19226 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
19227 +{
19228 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
19229 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
19230 +}
19231 +EXPORT_SYMBOL(rt_spin_lock_nested);
19232 +#endif
19233 +
19234 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
19235 +{
19236 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19237 +       spin_release(&lock->dep_map, 1, _RET_IP_);
19238 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
19239 +}
19240 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
19241 +
19242 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
19243 +{
19244 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19245 +       spin_release(&lock->dep_map, 1, _RET_IP_);
19246 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
19247 +       migrate_enable();
19248 +}
19249 +EXPORT_SYMBOL(rt_spin_unlock);
19250 +
19251 +int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
19252 +{
19253 +       int ret;
19254 +
19255 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19256 +       spin_release(&lock->dep_map, 1, _RET_IP_);
19257 +       ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
19258 +       migrate_enable();
19259 +       return ret;
19260 +}
19261 +
19262 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
19263 +{
19264 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
19265 +}
19266 +EXPORT_SYMBOL(__rt_spin_unlock);
19267 +
19268 +/*
19269 + * Wait for the lock to get unlocked: instead of polling for an unlock
19270 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
19271 + * schedule if there's contention:
19272 + */
19273 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
19274 +{
19275 +       spin_lock(lock);
19276 +       spin_unlock(lock);
19277 +}
19278 +EXPORT_SYMBOL(rt_spin_unlock_wait);
19279 +
19280 +int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
19281 +{
19282 +       return rt_mutex_trylock(lock);
19283 +}
19284 +
19285 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
19286 +{
19287 +       int ret;
19288 +
19289 +       ret = rt_mutex_trylock(&lock->lock);
19290 +       if (ret)
19291 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19292 +       return ret;
19293 +}
19294 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
19295 +
19296 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
19297 +{
19298 +       int ret;
19299 +
19300 +       migrate_disable();
19301 +       ret = rt_mutex_trylock(&lock->lock);
19302 +       if (ret)
19303 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19304 +       else
19305 +               migrate_enable();
19306 +       return ret;
19307 +}
19308 +EXPORT_SYMBOL(rt_spin_trylock);
19309 +
19310 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
19311 +{
19312 +       int ret;
19313 +
19314 +       local_bh_disable();
19315 +       ret = rt_mutex_trylock(&lock->lock);
19316 +       if (ret) {
19317 +               migrate_disable();
19318 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19319 +       } else
19320 +               local_bh_enable();
19321 +       return ret;
19322 +}
19323 +EXPORT_SYMBOL(rt_spin_trylock_bh);
19324 +
19325 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
19326 +{
19327 +       int ret;
19328 +
19329 +       *flags = 0;
19330 +       ret = rt_mutex_trylock(&lock->lock);
19331 +       if (ret) {
19332 +               migrate_disable();
19333 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19334 +       }
19335 +       return ret;
19336 +}
19337 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
19338 +
19339 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
19340 +{
19341 +       /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
19342 +       if (atomic_add_unless(atomic, -1, 1))
19343 +               return 0;
19344 +       rt_spin_lock(lock);
19345 +       if (atomic_dec_and_test(atomic))
19346 +               return 1;
19347 +       rt_spin_unlock(lock);
19348 +       return 0;
19349 +}
19350 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
19351 +
19352 +       void
19353 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
19354 +{
19355 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19356 +       /*
19357 +        * Make sure we are not reinitializing a held lock:
19358 +        */
19359 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
19360 +       lockdep_init_map(&lock->dep_map, name, key, 0);
19361 +#endif
19362 +}
19363 +EXPORT_SYMBOL(__rt_spin_lock_init);
19364 +
19365 +#endif /* PREEMPT_RT_FULL */
19366 +
19367 +#ifdef CONFIG_PREEMPT_RT_FULL
19368 +       static inline int __sched
19369 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
19370 +{
19371 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
19372 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
19373 +
19374 +       if (!hold_ctx)
19375 +               return 0;
19376 +
19377 +       if (unlikely(ctx == hold_ctx))
19378 +               return -EALREADY;
19379 +
19380 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
19381 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
19382 +#ifdef CONFIG_DEBUG_MUTEXES
19383 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
19384 +               ctx->contending_lock = ww;
19385 +#endif
19386 +               return -EDEADLK;
19387 +       }
19388 +
19389 +       return 0;
19390 +}
19391 +#else
19392 +       static inline int __sched
19393 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
19394 +{
19395 +       BUG();
19396 +       return 0;
19397 +}
19398 +
19399 +#endif
19400 +
19401 +static inline int
19402 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19403 +                    struct rt_mutex_waiter *waiter)
19404 +{
19405 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
19406 +}
19407 +
19408  /*
19409   * Task blocks on lock.
19410   *
19411   * Prepare waiter and propagate pi chain
19412   *
19413 - * This must be called with lock->wait_lock held.
19414 + * This must be called with lock->wait_lock held and interrupts disabled
19415   */
19416  static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19417                                    struct rt_mutex_waiter *waiter,
19418 @@ -894,7 +1367,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19419         struct rt_mutex_waiter *top_waiter = waiter;
19420         struct rt_mutex *next_lock;
19421         int chain_walk = 0, res;
19422 -       unsigned long flags;
19423  
19424         /*
19425          * Early deadlock detection. We really don't want the task to
19426 @@ -908,7 +1380,24 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19427         if (owner == task)
19428                 return -EDEADLK;
19429  
19430 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
19431 +       raw_spin_lock(&task->pi_lock);
19432 +
19433 +       /*
19434 +        * In the case of futex requeue PI, this will be a proxy
19435 +        * lock. The task will wake unaware that it is enqueued on
19436 +        * this lock. Avoid blocking on two locks and corrupting
19437 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
19438 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
19439 +        * before requeue (due to a signal or timeout). Do not enqueue
19440 +        * the task if PI_WAKEUP_INPROGRESS is set.
19441 +        */
19442 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
19443 +               raw_spin_unlock(&task->pi_lock);
19444 +               return -EAGAIN;
19445 +       }
19446 +
19447 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
19448 +
19449         __rt_mutex_adjust_prio(task);
19450         waiter->task = task;
19451         waiter->lock = lock;
19452 @@ -921,18 +1410,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19453  
19454         task->pi_blocked_on = waiter;
19455  
19456 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19457 +       raw_spin_unlock(&task->pi_lock);
19458  
19459         if (!owner)
19460                 return 0;
19461  
19462 -       raw_spin_lock_irqsave(&owner->pi_lock, flags);
19463 +       raw_spin_lock(&owner->pi_lock);
19464         if (waiter == rt_mutex_top_waiter(lock)) {
19465                 rt_mutex_dequeue_pi(owner, top_waiter);
19466                 rt_mutex_enqueue_pi(owner, waiter);
19467  
19468                 __rt_mutex_adjust_prio(owner);
19469 -               if (owner->pi_blocked_on)
19470 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
19471                         chain_walk = 1;
19472         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
19473                 chain_walk = 1;
19474 @@ -941,7 +1430,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19475         /* Store the lock on which owner is blocked or NULL */
19476         next_lock = task_blocked_on_lock(owner);
19477  
19478 -       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
19479 +       raw_spin_unlock(&owner->pi_lock);
19480         /*
19481          * Even if full deadlock detection is on, if the owner is not
19482          * blocked itself, we can avoid finding this out in the chain
19483 @@ -957,12 +1446,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19484          */
19485         get_task_struct(owner);
19486  
19487 -       raw_spin_unlock(&lock->wait_lock);
19488 +       raw_spin_unlock_irq(&lock->wait_lock);
19489  
19490         res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
19491                                          next_lock, waiter, task);
19492  
19493 -       raw_spin_lock(&lock->wait_lock);
19494 +       raw_spin_lock_irq(&lock->wait_lock);
19495  
19496         return res;
19497  }
19498 @@ -971,15 +1460,15 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19499   * Remove the top waiter from the current tasks pi waiter tree and
19500   * queue it up.
19501   *
19502 - * Called with lock->wait_lock held.
19503 + * Called with lock->wait_lock held and interrupts disabled.
19504   */
19505  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
19506 +                                   struct wake_q_head *wake_sleeper_q,
19507                                     struct rt_mutex *lock)
19508  {
19509         struct rt_mutex_waiter *waiter;
19510 -       unsigned long flags;
19511  
19512 -       raw_spin_lock_irqsave(&current->pi_lock, flags);
19513 +       raw_spin_lock(&current->pi_lock);
19514  
19515         waiter = rt_mutex_top_waiter(lock);
19516  
19517 @@ -1001,15 +1490,18 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
19518          */
19519         lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
19520  
19521 -       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
19522 +       raw_spin_unlock(&current->pi_lock);
19523  
19524 -       wake_q_add(wake_q, waiter->task);
19525 +       if (waiter->savestate)
19526 +               wake_q_add(wake_sleeper_q, waiter->task);
19527 +       else
19528 +               wake_q_add(wake_q, waiter->task);
19529  }
19530  
19531  /*
19532   * Remove a waiter from a lock and give up
19533   *
19534 - * Must be called with lock->wait_lock held and
19535 + * Must be called with lock->wait_lock held and interrupts disabled. It must
19536   * have just failed to try_to_take_rt_mutex().
19537   */
19538  static void remove_waiter(struct rt_mutex *lock,
19539 @@ -1017,13 +1509,12 @@ static void remove_waiter(struct rt_mutex *lock,
19540  {
19541         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
19542         struct task_struct *owner = rt_mutex_owner(lock);
19543 -       struct rt_mutex *next_lock;
19544 -       unsigned long flags;
19545 +       struct rt_mutex *next_lock = NULL;
19546  
19547 -       raw_spin_lock_irqsave(&current->pi_lock, flags);
19548 +       raw_spin_lock(&current->pi_lock);
19549         rt_mutex_dequeue(lock, waiter);
19550         current->pi_blocked_on = NULL;
19551 -       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
19552 +       raw_spin_unlock(&current->pi_lock);
19553  
19554         /*
19555          * Only update priority if the waiter was the highest priority
19556 @@ -1032,7 +1523,7 @@ static void remove_waiter(struct rt_mutex *lock,
19557         if (!owner || !is_top_waiter)
19558                 return;
19559  
19560 -       raw_spin_lock_irqsave(&owner->pi_lock, flags);
19561 +       raw_spin_lock(&owner->pi_lock);
19562  
19563         rt_mutex_dequeue_pi(owner, waiter);
19564  
19565 @@ -1042,9 +1533,10 @@ static void remove_waiter(struct rt_mutex *lock,
19566         __rt_mutex_adjust_prio(owner);
19567  
19568         /* Store the lock on which owner is blocked or NULL */
19569 -       next_lock = task_blocked_on_lock(owner);
19570 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
19571 +               next_lock = task_blocked_on_lock(owner);
19572  
19573 -       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
19574 +       raw_spin_unlock(&owner->pi_lock);
19575  
19576         /*
19577          * Don't walk the chain, if the owner task is not blocked
19578 @@ -1056,12 +1548,12 @@ static void remove_waiter(struct rt_mutex *lock,
19579         /* gets dropped in rt_mutex_adjust_prio_chain()! */
19580         get_task_struct(owner);
19581  
19582 -       raw_spin_unlock(&lock->wait_lock);
19583 +       raw_spin_unlock_irq(&lock->wait_lock);
19584  
19585         rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
19586                                    next_lock, NULL, current);
19587  
19588 -       raw_spin_lock(&lock->wait_lock);
19589 +       raw_spin_lock_irq(&lock->wait_lock);
19590  }
19591  
19592  /*
19593 @@ -1078,17 +1570,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
19594         raw_spin_lock_irqsave(&task->pi_lock, flags);
19595  
19596         waiter = task->pi_blocked_on;
19597 -       if (!waiter || (waiter->prio == task->prio &&
19598 +       if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
19599                         !dl_prio(task->prio))) {
19600                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19601                 return;
19602         }
19603         next_lock = waiter->lock;
19604 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19605  
19606         /* gets dropped in rt_mutex_adjust_prio_chain()! */
19607         get_task_struct(task);
19608  
19609 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19610         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
19611                                    next_lock, NULL, task);
19612  }
19613 @@ -1097,16 +1589,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
19614   * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
19615   * @lock:               the rt_mutex to take
19616   * @state:              the state the task should block in (TASK_INTERRUPTIBLE
19617 - *                      or TASK_UNINTERRUPTIBLE)
19618 + *                      or TASK_UNINTERRUPTIBLE)
19619   * @timeout:            the pre-initialized and started timer, or NULL for none
19620   * @waiter:             the pre-initialized rt_mutex_waiter
19621   *
19622 - * lock->wait_lock must be held by the caller.
19623 + * Must be called with lock->wait_lock held and interrupts disabled
19624   */
19625  static int __sched
19626  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
19627                     struct hrtimer_sleeper *timeout,
19628 -                   struct rt_mutex_waiter *waiter)
19629 +                   struct rt_mutex_waiter *waiter,
19630 +                   struct ww_acquire_ctx *ww_ctx)
19631  {
19632         int ret = 0;
19633  
19634 @@ -1129,13 +1622,19 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
19635                                 break;
19636                 }
19637  
19638 -               raw_spin_unlock(&lock->wait_lock);
19639 +               if (ww_ctx && ww_ctx->acquired > 0) {
19640 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
19641 +                       if (ret)
19642 +                               break;
19643 +               }
19644 +
19645 +               raw_spin_unlock_irq(&lock->wait_lock);
19646  
19647                 debug_rt_mutex_print_deadlock(waiter);
19648  
19649                 schedule();
19650  
19651 -               raw_spin_lock(&lock->wait_lock);
19652 +               raw_spin_lock_irq(&lock->wait_lock);
19653                 set_current_state(state);
19654         }
19655  
19656 @@ -1163,26 +1662,112 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
19657         }
19658  }
19659  
19660 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
19661 +                                                  struct ww_acquire_ctx *ww_ctx)
19662 +{
19663 +#ifdef CONFIG_DEBUG_MUTEXES
19664 +       /*
19665 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
19666 +        * but released with a normal mutex_unlock in this call.
19667 +        *
19668 +        * This should never happen, always use ww_mutex_unlock.
19669 +        */
19670 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
19671 +
19672 +       /*
19673 +        * Not quite done after calling ww_acquire_done() ?
19674 +        */
19675 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
19676 +
19677 +       if (ww_ctx->contending_lock) {
19678 +               /*
19679 +                * After -EDEADLK you tried to
19680 +                * acquire a different ww_mutex? Bad!
19681 +                */
19682 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
19683 +
19684 +               /*
19685 +                * You called ww_mutex_lock after receiving -EDEADLK,
19686 +                * but 'forgot' to unlock everything else first?
19687 +                */
19688 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
19689 +               ww_ctx->contending_lock = NULL;
19690 +       }
19691 +
19692 +       /*
19693 +        * Naughty, using a different class will lead to undefined behavior!
19694 +        */
19695 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
19696 +#endif
19697 +       ww_ctx->acquired++;
19698 +}
19699 +
19700 +#ifdef CONFIG_PREEMPT_RT_FULL
19701 +static void ww_mutex_account_lock(struct rt_mutex *lock,
19702 +                                 struct ww_acquire_ctx *ww_ctx)
19703 +{
19704 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
19705 +       struct rt_mutex_waiter *waiter, *n;
19706 +
19707 +       /*
19708 +        * This branch gets optimized out for the common case,
19709 +        * and is only important for ww_mutex_lock.
19710 +        */
19711 +       ww_mutex_lock_acquired(ww, ww_ctx);
19712 +       ww->ctx = ww_ctx;
19713 +
19714 +       /*
19715 +        * Give any possible sleeping processes the chance to wake up,
19716 +        * so they can recheck if they have to back off.
19717 +        */
19718 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
19719 +                                            tree_entry) {
19720 +               /* XXX debug rt mutex waiter wakeup */
19721 +
19722 +               BUG_ON(waiter->lock != lock);
19723 +               rt_mutex_wake_waiter(waiter);
19724 +       }
19725 +}
19726 +
19727 +#else
19728 +
19729 +static void ww_mutex_account_lock(struct rt_mutex *lock,
19730 +                                 struct ww_acquire_ctx *ww_ctx)
19731 +{
19732 +       BUG();
19733 +}
19734 +#endif
19735 +
19736  /*
19737   * Slow path lock function:
19738   */
19739  static int __sched
19740  rt_mutex_slowlock(struct rt_mutex *lock, int state,
19741                   struct hrtimer_sleeper *timeout,
19742 -                 enum rtmutex_chainwalk chwalk)
19743 +                 enum rtmutex_chainwalk chwalk,
19744 +                 struct ww_acquire_ctx *ww_ctx)
19745  {
19746         struct rt_mutex_waiter waiter;
19747 +       unsigned long flags;
19748         int ret = 0;
19749  
19750 -       debug_rt_mutex_init_waiter(&waiter);
19751 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
19752 -       RB_CLEAR_NODE(&waiter.tree_entry);
19753 +       rt_mutex_init_waiter(&waiter, false);
19754  
19755 -       raw_spin_lock(&lock->wait_lock);
19756 +       /*
19757 +        * Technically we could use raw_spin_[un]lock_irq() here, but this can
19758 +        * be called in early boot if the cmpxchg() fast path is disabled
19759 +        * (debug, no architecture support). In this case we will acquire the
19760 +        * rtmutex with lock->wait_lock held. But we cannot unconditionally
19761 +        * enable interrupts in that early boot case. So we need to use the
19762 +        * irqsave/restore variants.
19763 +        */
19764 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19765  
19766         /* Try to acquire the lock again: */
19767         if (try_to_take_rt_mutex(lock, current, NULL)) {
19768 -               raw_spin_unlock(&lock->wait_lock);
19769 +               if (ww_ctx)
19770 +                       ww_mutex_account_lock(lock, ww_ctx);
19771 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19772                 return 0;
19773         }
19774  
19775 @@ -1196,13 +1781,23 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
19776  
19777         if (likely(!ret))
19778                 /* sleep on the mutex */
19779 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
19780 +               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
19781 +                                         ww_ctx);
19782 +       else if (ww_ctx) {
19783 +               /* ww_mutex received EDEADLK, let it become EALREADY */
19784 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
19785 +               BUG_ON(!ret);
19786 +       }
19787  
19788         if (unlikely(ret)) {
19789                 __set_current_state(TASK_RUNNING);
19790                 if (rt_mutex_has_waiters(lock))
19791                         remove_waiter(lock, &waiter);
19792 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
19793 +               /* ww_mutex wants to report EDEADLK/EALREADY, let it */
19794 +               if (!ww_ctx)
19795 +                       rt_mutex_handle_deadlock(ret, chwalk, &waiter);
19796 +       } else if (ww_ctx) {
19797 +               ww_mutex_account_lock(lock, ww_ctx);
19798         }
19799  
19800         /*
19801 @@ -1211,7 +1806,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
19802          */
19803         fixup_rt_mutex_waiters(lock);
19804  
19805 -       raw_spin_unlock(&lock->wait_lock);
19806 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19807  
19808         /* Remove pending timer: */
19809         if (unlikely(timeout))
19810 @@ -1227,6 +1822,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
19811   */
19812  static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19813  {
19814 +       unsigned long flags;
19815         int ret;
19816  
19817         /*
19818 @@ -1238,10 +1834,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19819                 return 0;
19820  
19821         /*
19822 -        * The mutex has currently no owner. Lock the wait lock and
19823 -        * try to acquire the lock.
19824 +        * The mutex has currently no owner. Lock the wait lock and try to
19825 +        * acquire the lock. We use irqsave here to support early boot calls.
19826          */
19827 -       raw_spin_lock(&lock->wait_lock);
19828 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19829  
19830         ret = try_to_take_rt_mutex(lock, current, NULL);
19831  
19832 @@ -1251,7 +1847,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19833          */
19834         fixup_rt_mutex_waiters(lock);
19835  
19836 -       raw_spin_unlock(&lock->wait_lock);
19837 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19838  
19839         return ret;
19840  }
19841 @@ -1261,9 +1857,13 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19842   * Return whether the current task needs to undo a potential priority boosting.
19843   */
19844  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19845 -                                       struct wake_q_head *wake_q)
19846 +                                       struct wake_q_head *wake_q,
19847 +                                       struct wake_q_head *wake_sleeper_q)
19848  {
19849 -       raw_spin_lock(&lock->wait_lock);
19850 +       unsigned long flags;
19851 +
19852 +       /* irqsave required to support early boot calls */
19853 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19854  
19855         debug_rt_mutex_unlock(lock);
19856  
19857 @@ -1302,10 +1902,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19858          */
19859         while (!rt_mutex_has_waiters(lock)) {
19860                 /* Drops lock->wait_lock ! */
19861 -               if (unlock_rt_mutex_safe(lock) == true)
19862 +               if (unlock_rt_mutex_safe(lock, flags) == true)
19863                         return false;
19864                 /* Relock the rtmutex and try again */
19865 -               raw_spin_lock(&lock->wait_lock);
19866 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
19867         }
19868  
19869         /*
19870 @@ -1314,9 +1914,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19871          *
19872          * Queue the next waiter for wakeup once we release the wait_lock.
19873          */
19874 -       mark_wakeup_next_waiter(wake_q, lock);
19875 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
19876  
19877 -       raw_spin_unlock(&lock->wait_lock);
19878 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19879  
19880         /* check PI boosting */
19881         return true;
19882 @@ -1330,31 +1930,36 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19883   */
19884  static inline int
19885  rt_mutex_fastlock(struct rt_mutex *lock, int state,
19886 +                 struct ww_acquire_ctx *ww_ctx,
19887                   int (*slowfn)(struct rt_mutex *lock, int state,
19888                                 struct hrtimer_sleeper *timeout,
19889 -                               enum rtmutex_chainwalk chwalk))
19890 +                               enum rtmutex_chainwalk chwalk,
19891 +                               struct ww_acquire_ctx *ww_ctx))
19892  {
19893         if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
19894                 rt_mutex_deadlock_account_lock(lock, current);
19895                 return 0;
19896         } else
19897 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
19898 +               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
19899 +                             ww_ctx);
19900  }
19901  
19902  static inline int
19903  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
19904                         struct hrtimer_sleeper *timeout,
19905                         enum rtmutex_chainwalk chwalk,
19906 +                       struct ww_acquire_ctx *ww_ctx,
19907                         int (*slowfn)(struct rt_mutex *lock, int state,
19908                                       struct hrtimer_sleeper *timeout,
19909 -                                     enum rtmutex_chainwalk chwalk))
19910 +                                     enum rtmutex_chainwalk chwalk,
19911 +                                     struct ww_acquire_ctx *ww_ctx))
19912  {
19913         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
19914             likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
19915                 rt_mutex_deadlock_account_lock(lock, current);
19916                 return 0;
19917         } else
19918 -               return slowfn(lock, state, timeout, chwalk);
19919 +               return slowfn(lock, state, timeout, chwalk, ww_ctx);
19920  }
19921  
19922  static inline int
19923 @@ -1371,17 +1976,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
19924  static inline void
19925  rt_mutex_fastunlock(struct rt_mutex *lock,
19926                     bool (*slowfn)(struct rt_mutex *lock,
19927 -                                  struct wake_q_head *wqh))
19928 +                                  struct wake_q_head *wqh,
19929 +                                  struct wake_q_head *wq_sleeper))
19930  {
19931         WAKE_Q(wake_q);
19932 +       WAKE_Q(wake_sleeper_q);
19933  
19934         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
19935                 rt_mutex_deadlock_account_unlock(current);
19936  
19937         } else {
19938 -               bool deboost = slowfn(lock, &wake_q);
19939 +               bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
19940  
19941                 wake_up_q(&wake_q);
19942 +               wake_up_q_sleeper(&wake_sleeper_q);
19943  
19944                 /* Undo pi boosting if necessary: */
19945                 if (deboost)
19946 @@ -1398,7 +2006,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
19947  {
19948         might_sleep();
19949  
19950 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
19951 +       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
19952  }
19953  EXPORT_SYMBOL_GPL(rt_mutex_lock);
19954  
19955 @@ -1415,7 +2023,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
19956  {
19957         might_sleep();
19958  
19959 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
19960 +       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
19961  }
19962  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
19963  
19964 @@ -1428,11 +2036,30 @@ int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
19965         might_sleep();
19966  
19967         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
19968 -                                      RT_MUTEX_FULL_CHAINWALK,
19969 +                                      RT_MUTEX_FULL_CHAINWALK, NULL,
19970                                        rt_mutex_slowlock);
19971  }
19972  
19973  /**
19974 + * rt_mutex_lock_killable - lock a rt_mutex killable
19975 + *
19976 + * @lock:              the rt_mutex to be locked
19977 + * @detect_deadlock:   deadlock detection on/off
19978 + *
19979 + * Returns:
19980 + *  0          on success
19981 + * -EINTR      when interrupted by a signal
19982 + * -EDEADLK    when the lock would deadlock (when deadlock detection is on)
19983 + */
19984 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
19985 +{
19986 +       might_sleep();
19987 +
19988 +       return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
19989 +}
19990 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
19991 +
19992 +/**
19993   * rt_mutex_timed_lock - lock a rt_mutex interruptible
19994   *                     the timeout structure is provided
19995   *                     by the caller
19996 @@ -1452,6 +2079,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
19997  
19998         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
19999                                        RT_MUTEX_MIN_CHAINWALK,
20000 +                                      NULL,
20001                                        rt_mutex_slowlock);
20002  }
20003  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
20004 @@ -1469,7 +2097,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
20005   */
20006  int __sched rt_mutex_trylock(struct rt_mutex *lock)
20007  {
20008 +#ifdef CONFIG_PREEMPT_RT_FULL
20009 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
20010 +#else
20011         if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
20012 +#endif
20013                 return 0;
20014  
20015         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
20016 @@ -1495,13 +2127,14 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
20017   * required or not.
20018   */
20019  bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
20020 -                                  struct wake_q_head *wqh)
20021 +                                  struct wake_q_head *wqh,
20022 +                                  struct wake_q_head *wq_sleeper)
20023  {
20024         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
20025                 rt_mutex_deadlock_account_unlock(current);
20026                 return false;
20027         }
20028 -       return rt_mutex_slowunlock(lock, wqh);
20029 +       return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
20030  }
20031  
20032  /**
20033 @@ -1534,13 +2167,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
20034  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
20035  {
20036         lock->owner = NULL;
20037 -       raw_spin_lock_init(&lock->wait_lock);
20038         lock->waiters = RB_ROOT;
20039         lock->waiters_leftmost = NULL;
20040  
20041         debug_rt_mutex_init(lock, name);
20042  }
20043 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
20044 +EXPORT_SYMBOL(__rt_mutex_init);
20045  
20046  /**
20047   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
20048 @@ -1555,7 +2187,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
20049  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
20050                                 struct task_struct *proxy_owner)
20051  {
20052 -       __rt_mutex_init(lock, NULL);
20053 +       rt_mutex_init(lock);
20054         debug_rt_mutex_proxy_lock(lock, proxy_owner);
20055         rt_mutex_set_owner(lock, proxy_owner);
20056         rt_mutex_deadlock_account_lock(lock, proxy_owner);
20057 @@ -1596,13 +2228,42 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
20058  {
20059         int ret;
20060  
20061 -       raw_spin_lock(&lock->wait_lock);
20062 +       raw_spin_lock_irq(&lock->wait_lock);
20063  
20064         if (try_to_take_rt_mutex(lock, task, NULL)) {
20065 -               raw_spin_unlock(&lock->wait_lock);
20066 +               raw_spin_unlock_irq(&lock->wait_lock);
20067                 return 1;
20068         }
20069  
20070 +#ifdef CONFIG_PREEMPT_RT_FULL
20071 +       /*
20072 +        * In PREEMPT_RT there's an added race.
20073 +        * If the task that we are about to requeue times out,
20074 +        * it can set PI_WAKEUP_INPROGRESS. This tells the requeue
20075 +        * to skip this task. But right after the task sets
20076 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
20077 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
20078 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
20079 +        * lock that it blocks on. We *must not* place this task
20080 +        * on this proxy lock in that case.
20081 +        *
20082 +        * To prevent this race, we first take the task's pi_lock
20083 +        * and check if it has updated its pi_blocked_on. If it has,
20084 +        * we assume that it woke up and we return -EAGAIN.
20085 +        * Otherwise, we set the task's pi_blocked_on to
20086 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
20087 +        * it will know that we are in the process of requeuing it.
20088 +        */
20089 +       raw_spin_lock(&task->pi_lock);
20090 +       if (task->pi_blocked_on) {
20091 +               raw_spin_unlock(&task->pi_lock);
20092 +               raw_spin_unlock_irq(&lock->wait_lock);
20093 +               return -EAGAIN;
20094 +       }
20095 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
20096 +       raw_spin_unlock(&task->pi_lock);
20097 +#endif
20098 +
20099         /* We enforce deadlock detection for futexes */
20100         ret = task_blocks_on_rt_mutex(lock, waiter, task,
20101                                       RT_MUTEX_FULL_CHAINWALK);
20102 @@ -1617,10 +2278,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
20103                 ret = 0;
20104         }
20105  
20106 -       if (unlikely(ret))
20107 +       if (ret && rt_mutex_has_waiters(lock))
20108                 remove_waiter(lock, waiter);
20109  
20110 -       raw_spin_unlock(&lock->wait_lock);
20111 +       raw_spin_unlock_irq(&lock->wait_lock);
20112  
20113         debug_rt_mutex_print_deadlock(waiter);
20114  
20115 @@ -1668,12 +2329,12 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20116  {
20117         int ret;
20118  
20119 -       raw_spin_lock(&lock->wait_lock);
20120 +       raw_spin_lock_irq(&lock->wait_lock);
20121  
20122         set_current_state(TASK_INTERRUPTIBLE);
20123  
20124         /* sleep on the mutex */
20125 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
20126 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
20127  
20128         if (unlikely(ret))
20129                 remove_waiter(lock, waiter);
20130 @@ -1684,7 +2345,93 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20131          */
20132         fixup_rt_mutex_waiters(lock);
20133  
20134 -       raw_spin_unlock(&lock->wait_lock);
20135 +       raw_spin_unlock_irq(&lock->wait_lock);
20136  
20137         return ret;
20138  }
20139 +
20140 +static inline int
20141 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
20142 +{
20143 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
20144 +       unsigned tmp;
20145 +
20146 +       if (ctx->deadlock_inject_countdown-- == 0) {
20147 +               tmp = ctx->deadlock_inject_interval;
20148 +               if (tmp > UINT_MAX/4)
20149 +                       tmp = UINT_MAX;
20150 +               else
20151 +                       tmp = tmp*2 + tmp + tmp/2;
20152 +
20153 +               ctx->deadlock_inject_interval = tmp;
20154 +               ctx->deadlock_inject_countdown = tmp;
20155 +               ctx->contending_lock = lock;
20156 +
20157 +               ww_mutex_unlock(lock);
20158 +
20159 +               return -EDEADLK;
20160 +       }
20161 +#endif
20162 +
20163 +       return 0;
20164 +}
20165 +
20166 +#ifdef CONFIG_PREEMPT_RT_FULL
20167 +int __sched
20168 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
20169 +{
20170 +       int ret;
20171 +
20172 +       might_sleep();
20173 +
20174 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
20175 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
20176 +       if (ret)
20177 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
20178 +       else if (!ret && ww_ctx->acquired > 1)
20179 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
20180 +
20181 +       return ret;
20182 +}
20183 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
20184 +
20185 +int __sched
20186 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
20187 +{
20188 +       int ret;
20189 +
20190 +       might_sleep();
20191 +
20192 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
20193 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
20194 +       if (ret)
20195 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
20196 +       else if (!ret && ww_ctx->acquired > 1)
20197 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
20198 +
20199 +       return ret;
20200 +}
20201 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
20202 +
20203 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
20204 +{
20205 +       int nest = !!lock->ctx;
20206 +
20207 +       /*
20208 +        * The unlocking fastpath is the 0->1 transition from 'locked'
20209 +        * into 'unlocked' state:
20210 +        */
20211 +       if (nest) {
20212 +#ifdef CONFIG_DEBUG_MUTEXES
20213 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
20214 +#endif
20215 +               if (lock->ctx->acquired > 0)
20216 +                       lock->ctx->acquired--;
20217 +               lock->ctx = NULL;
20218 +       }
20219 +
20220 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
20221 +       rt_mutex_unlock(&lock->base.lock);
20222 +}
20223 +EXPORT_SYMBOL(ww_mutex_unlock);
20224 +#endif
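The hunks above close out the rtmutex.c changes; one detail worth pulling out is the back-off arithmetic in ww_mutex_deadlock_injection(), which fires a fake -EDEADLK and then grows the injection interval by roughly 3.5x, saturating near UINT_MAX. A compilable userspace sketch of that growth, assuming arbitrary starting values (ww_acquire_init() seeds the real counters); the struct below only mimics the two counters and is not the kernel's ww_acquire_ctx:

#include <limits.h>
#include <stdio.h>

struct fake_ctx {
        unsigned int deadlock_inject_interval;
        unsigned int deadlock_inject_countdown;
};

/* returns 1 when a deadlock would be injected on this acquisition */
static int maybe_inject(struct fake_ctx *ctx)
{
        unsigned int tmp;

        if (ctx->deadlock_inject_countdown-- != 0)
                return 0;

        tmp = ctx->deadlock_inject_interval;
        if (tmp > UINT_MAX / 4)
                tmp = UINT_MAX;
        else
                tmp = tmp * 2 + tmp + tmp / 2;  /* ~3.5x growth, as above */

        ctx->deadlock_inject_interval = tmp;
        ctx->deadlock_inject_countdown = tmp;
        return 1;
}

int main(void)
{
        struct fake_ctx ctx = { .deadlock_inject_interval = 1,
                                .deadlock_inject_countdown = 1 };
        unsigned long acquisitions;

        for (acquisitions = 1; acquisitions <= 100; acquisitions++)
                if (maybe_inject(&ctx))
                        printf("inject at acquisition %lu, next interval %u\n",
                               acquisitions, ctx.deadlock_inject_interval);
        return 0;
}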
20225 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
20226 index 4f5f83c7d2d3..289f062f26cd 100644
20227 --- a/kernel/locking/rtmutex_common.h
20228 +++ b/kernel/locking/rtmutex_common.h
20229 @@ -27,6 +27,7 @@ struct rt_mutex_waiter {
20230         struct rb_node          pi_tree_entry;
20231         struct task_struct      *task;
20232         struct rt_mutex         *lock;
20233 +       bool                    savestate;
20234  #ifdef CONFIG_DEBUG_RT_MUTEXES
20235         unsigned long           ip;
20236         struct pid              *deadlock_task_pid;
20237 @@ -97,6 +98,9 @@ enum rtmutex_chainwalk {
20238  /*
20239   * PI-futex support (proxy locking functions, etc.):
20240   */
20241 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
20242 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
20243 +
20244  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
20245  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
20246                                        struct task_struct *proxy_owner);
20247 @@ -110,7 +114,8 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20248                                       struct rt_mutex_waiter *waiter);
20249  extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
20250  extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
20251 -                                 struct wake_q_head *wqh);
20252 +                                 struct wake_q_head *wqh,
20253 +                                 struct wake_q_head *wq_sleeper);
20254  extern void rt_mutex_adjust_prio(struct task_struct *task);
20255  
20256  #ifdef CONFIG_DEBUG_RT_MUTEXES
20257 @@ -119,4 +124,14 @@ extern void rt_mutex_adjust_prio(struct task_struct *task);
20258  # include "rtmutex.h"
20259  #endif
20260  
20261 +static inline void
20262 +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
20263 +{
20264 +       debug_rt_mutex_init_waiter(waiter);
20265 +       waiter->task = NULL;
20266 +       waiter->savestate = savestate;
20267 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
20268 +       RB_CLEAR_NODE(&waiter->tree_entry);
20269 +}
20270 +
20271  #endif
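PI_WAKEUP_INPROGRESS and PI_REQUEUE_INPROGRESS above are sentinel pointer values, not real waiters, so any code inspecting pi_blocked_on has to filter them out before dereferencing. A userspace sketch of that idiom; the real_waiter() helper is an assumption modelled on how rt_mutex_real_waiter() is used in the rtmutex.c hunks earlier (its definition is not part of this excerpt), and struct waiter is a stand-in, not the kernel's rt_mutex_waiter:

#include <stdbool.h>
#include <stdio.h>

struct waiter { int prio; };

#define WAKEUP_INPROGRESS  ((struct waiter *) 1)
#define REQUEUE_INPROGRESS ((struct waiter *) 2)

/* NULL and the two sentinels are all "not a real waiter" */
static bool real_waiter(struct waiter *w)
{
        return w && w != WAKEUP_INPROGRESS && w != REQUEUE_INPROGRESS;
}

int main(void)
{
        struct waiter w = { .prio = 10 };

        printf("NULL:     %d\n", real_waiter(NULL));
        printf("sentinel: %d\n", real_waiter(WAKEUP_INPROGRESS));
        printf("real:     %d\n", real_waiter(&w));
        return 0;
}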
20272 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
20273 index db3ccb1dd614..909779647bd1 100644
20274 --- a/kernel/locking/spinlock.c
20275 +++ b/kernel/locking/spinlock.c
20276 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
20277   *         __[spin|read|write]_lock_bh()
20278   */
20279  BUILD_LOCK_OPS(spin, raw_spinlock);
20280 +
20281 +#ifndef CONFIG_PREEMPT_RT_FULL
20282  BUILD_LOCK_OPS(read, rwlock);
20283  BUILD_LOCK_OPS(write, rwlock);
20284 +#endif
20285  
20286  #endif
20287  
20288 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
20289  EXPORT_SYMBOL(_raw_spin_unlock_bh);
20290  #endif
20291  
20292 +#ifndef CONFIG_PREEMPT_RT_FULL
20293 +
20294  #ifndef CONFIG_INLINE_READ_TRYLOCK
20295  int __lockfunc _raw_read_trylock(rwlock_t *lock)
20296  {
20297 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
20298  EXPORT_SYMBOL(_raw_write_unlock_bh);
20299  #endif
20300  
20301 +#endif /* !PREEMPT_RT_FULL */
20302 +
20303  #ifdef CONFIG_DEBUG_LOCK_ALLOC
20304  
20305  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
20306 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
20307 index 0374a596cffa..94970338d518 100644
20308 --- a/kernel/locking/spinlock_debug.c
20309 +++ b/kernel/locking/spinlock_debug.c
20310 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
20311  
20312  EXPORT_SYMBOL(__raw_spin_lock_init);
20313  
20314 +#ifndef CONFIG_PREEMPT_RT_FULL
20315  void __rwlock_init(rwlock_t *lock, const char *name,
20316                    struct lock_class_key *key)
20317  {
20318 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
20319  }
20320  
20321  EXPORT_SYMBOL(__rwlock_init);
20322 +#endif
20323  
20324  static void spin_dump(raw_spinlock_t *lock, const char *msg)
20325  {
20326 @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
20327         arch_spin_unlock(&lock->raw_lock);
20328  }
20329  
20330 +#ifndef CONFIG_PREEMPT_RT_FULL
20331  static void rwlock_bug(rwlock_t *lock, const char *msg)
20332  {
20333         if (!debug_locks_off())
20334 @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock)
20335         debug_write_unlock(lock);
20336         arch_write_unlock(&lock->raw_lock);
20337  }
20338 +
20339 +#endif
20340 diff --git a/kernel/panic.c b/kernel/panic.c
20341 index 41e2b54f36b5..3535f802953a 100644
20342 --- a/kernel/panic.c
20343 +++ b/kernel/panic.c
20344 @@ -61,6 +61,37 @@ void __weak panic_smp_self_stop(void)
20345                 cpu_relax();
20346  }
20347  
20348 +/*
20349 + * Stop ourselves in NMI context if another CPU has already panicked. Arch code
20350 + * may override this to prepare for crash dumping, e.g. save regs info.
20351 + */
20352 +void __weak nmi_panic_self_stop(struct pt_regs *regs)
20353 +{
20354 +       panic_smp_self_stop();
20355 +}
20356 +
20357 +atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
20358 +
20359 +/*
20360 + * A variant of panic() called from NMI context. We return if we've already
20361 + * panicked on this CPU. If another CPU already panicked, loop in
20362 + * nmi_panic_self_stop() which can provide architecture dependent code such
20363 + * as saving register state for crash dump.
20364 + */
20365 +void nmi_panic(struct pt_regs *regs, const char *msg)
20366 +{
20367 +       int old_cpu, cpu;
20368 +
20369 +       cpu = raw_smp_processor_id();
20370 +       old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
20371 +
20372 +       if (old_cpu == PANIC_CPU_INVALID)
20373 +               panic("%s", msg);
20374 +       else if (old_cpu != cpu)
20375 +               nmi_panic_self_stop(regs);
20376 +}
20377 +EXPORT_SYMBOL(nmi_panic);
20378 +
20379  /**
20380   *     panic - halt the system
20381   *     @fmt: The text string to print
20382 @@ -71,17 +102,17 @@ void __weak panic_smp_self_stop(void)
20383   */
20384  void panic(const char *fmt, ...)
20385  {
20386 -       static DEFINE_SPINLOCK(panic_lock);
20387         static char buf[1024];
20388         va_list args;
20389         long i, i_next = 0;
20390         int state = 0;
20391 +       int old_cpu, this_cpu;
20392  
20393         /*
20394          * Disable local interrupts. This will prevent panic_smp_self_stop
20395          * from deadlocking the first cpu that invokes the panic, since
20396          * there is nothing to prevent an interrupt handler (that runs
20397 -        * after the panic_lock is acquired) from invoking panic again.
20398 +        * after setting panic_cpu) from invoking panic() again.
20399          */
20400         local_irq_disable();
20401  
20402 @@ -94,8 +125,16 @@ void panic(const char *fmt, ...)
20403          * multiple parallel invocations of panic, all other CPUs either
20404          * stop themselves or wait until they are stopped by the 1st CPU
20405          * with smp_send_stop().
20406 +        *
20407 +        * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
20408 +        * comes here, so go ahead.
20409 +        * `old_cpu == this_cpu' means we came from nmi_panic() which sets
20410 +        * panic_cpu to this CPU.  In this case, this is also the 1st CPU.
20411          */
20412 -       if (!spin_trylock(&panic_lock))
20413 +       this_cpu = raw_smp_processor_id();
20414 +       old_cpu  = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
20415 +
20416 +       if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
20417                 panic_smp_self_stop();
20418  
20419         console_verbose();
20420 @@ -400,9 +439,11 @@ static u64 oops_id;
20421  
20422  static int init_oops_id(void)
20423  {
20424 +#ifndef CONFIG_PREEMPT_RT_FULL
20425         if (!oops_id)
20426                 get_random_bytes(&oops_id, sizeof(oops_id));
20427         else
20428 +#endif
20429                 oops_id++;
20430  
20431         return 0;
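The panic.c change above replaces panic_lock with an atomic panic_cpu hand-off so panic() can also be reached from NMI context: the first CPU to claim panic_cpu proceeds, the same CPU may re-enter (nmi_panic() followed by panic()), and every other CPU parks itself. A compilable userspace sketch of that protocol, with C11 atomics standing in for the kernel's atomic_cmpxchg() and the CPU id passed in as a plain argument:

#include <stdatomic.h>
#include <stdio.h>

#define PANIC_CPU_INVALID -1

static atomic_int panic_cpu = PANIC_CPU_INVALID;

enum outcome { FIRST_PANICKER, REENTRY_SAME_CPU, PARK_OTHER_CPU };

static enum outcome try_panic(int this_cpu)
{
        int old_cpu = PANIC_CPU_INVALID;

        if (atomic_compare_exchange_strong(&panic_cpu, &old_cpu, this_cpu))
                return FIRST_PANICKER;          /* we claimed it: go panic */
        if (old_cpu == this_cpu)
                return REENTRY_SAME_CPU;        /* nmi_panic() -> panic() */
        return PARK_OTHER_CPU;                  /* another CPU got there first */
}

int main(void)
{
        printf("cpu1: %d\n", try_panic(1));     /* FIRST_PANICKER   */
        printf("cpu1: %d\n", try_panic(1));     /* REENTRY_SAME_CPU */
        printf("cpu3: %d\n", try_panic(3));     /* PARK_OTHER_CPU   */
        return 0;
}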
20432 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
20433 index 3124cebaec31..c1b981521dd0 100644
20434 --- a/kernel/power/hibernate.c
20435 +++ b/kernel/power/hibernate.c
20436 @@ -285,6 +285,8 @@ static int create_image(int platform_mode)
20437  
20438         local_irq_disable();
20439  
20440 +       system_state = SYSTEM_SUSPEND;
20441 +
20442         error = syscore_suspend();
20443         if (error) {
20444                 printk(KERN_ERR "PM: Some system devices failed to power down, "
20445 @@ -314,6 +316,7 @@ static int create_image(int platform_mode)
20446         syscore_resume();
20447  
20448   Enable_irqs:
20449 +       system_state = SYSTEM_RUNNING;
20450         local_irq_enable();
20451  
20452   Enable_cpus:
20453 @@ -438,6 +441,7 @@ static int resume_target_kernel(bool platform_mode)
20454                 goto Enable_cpus;
20455  
20456         local_irq_disable();
20457 +       system_state = SYSTEM_SUSPEND;
20458  
20459         error = syscore_suspend();
20460         if (error)
20461 @@ -471,6 +475,7 @@ static int resume_target_kernel(bool platform_mode)
20462         syscore_resume();
20463  
20464   Enable_irqs:
20465 +       system_state = SYSTEM_RUNNING;
20466         local_irq_enable();
20467  
20468   Enable_cpus:
20469 @@ -556,6 +561,7 @@ int hibernation_platform_enter(void)
20470                 goto Enable_cpus;
20471  
20472         local_irq_disable();
20473 +       system_state = SYSTEM_SUSPEND;
20474         syscore_suspend();
20475         if (pm_wakeup_pending()) {
20476                 error = -EAGAIN;
20477 @@ -568,6 +574,7 @@ int hibernation_platform_enter(void)
20478  
20479   Power_up:
20480         syscore_resume();
20481 +       system_state = SYSTEM_RUNNING;
20482         local_irq_enable();
20483  
20484   Enable_cpus:
20485 @@ -642,6 +649,10 @@ static void power_down(void)
20486                 cpu_relax();
20487  }
20488  
20489 +#ifndef CONFIG_SUSPEND
20490 +bool pm_in_action;
20491 +#endif
20492 +
20493  /**
20494   * hibernate - Carry out system hibernation, including saving the image.
20495   */
20496 @@ -654,6 +665,8 @@ int hibernate(void)
20497                 return -EPERM;
20498         }
20499  
20500 +       pm_in_action = true;
20501 +
20502         lock_system_sleep();
20503         /* The snapshot device should not be opened while we're running */
20504         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
20505 @@ -719,6 +732,7 @@ int hibernate(void)
20506         atomic_inc(&snapshot_device_available);
20507   Unlock:
20508         unlock_system_sleep();
20509 +       pm_in_action = false;
20510         return error;
20511  }
20512  
20513 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
20514 index f9fe133c13e2..393bc342c586 100644
20515 --- a/kernel/power/suspend.c
20516 +++ b/kernel/power/suspend.c
20517 @@ -359,6 +359,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20518         arch_suspend_disable_irqs();
20519         BUG_ON(!irqs_disabled());
20520  
20521 +       system_state = SYSTEM_SUSPEND;
20522 +
20523         error = syscore_suspend();
20524         if (!error) {
20525                 *wakeup = pm_wakeup_pending();
20526 @@ -375,6 +377,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20527                 syscore_resume();
20528         }
20529  
20530 +       system_state = SYSTEM_RUNNING;
20531 +
20532         arch_suspend_enable_irqs();
20533         BUG_ON(irqs_disabled());
20534  
20535 @@ -518,6 +522,8 @@ static int enter_state(suspend_state_t state)
20536         return error;
20537  }
20538  
20539 +bool pm_in_action;
20540 +
20541  /**
20542   * pm_suspend - Externally visible function for suspending the system.
20543   * @state: System sleep state to enter.
20544 @@ -532,6 +538,8 @@ int pm_suspend(suspend_state_t state)
20545         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
20546                 return -EINVAL;
20547  
20548 +       pm_in_action = true;
20549 +
20550         error = enter_state(state);
20551         if (error) {
20552                 suspend_stats.fail++;
20553 @@ -539,6 +547,7 @@ int pm_suspend(suspend_state_t state)
20554         } else {
20555                 suspend_stats.success++;
20556         }
20557 +       pm_in_action = false;
20558         return error;
20559  }
20560  EXPORT_SYMBOL(pm_suspend);
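The suspend.c hunks above bracket the low-level suspend path with system_state = SYSTEM_SUSPEND and the whole pm_suspend() call with pm_in_action, so that unrelated code can cheaply ask whether a power transition is in flight (the readers of these flags are not shown in this excerpt). A minimal sketch of the bracketing pattern, with names local to the sketch:

#include <stdbool.h>
#include <stdio.h>

static bool pm_in_action;

/* stand-in for a diagnostic path that should stay quiet during suspend */
static void watchdog_check(void)
{
        if (pm_in_action) {
                printf("suspend in flight, skipping check\n");
                return;
        }
        printf("normal check\n");
}

static int pm_suspend_sketch(void)
{
        pm_in_action = true;
        watchdog_check();       /* anything running now sees the flag */
        pm_in_action = false;
        return 0;
}

int main(void)
{
        watchdog_check();
        return pm_suspend_sketch();
}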
20561 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
20562 index c048e34b177f..c747bdfa199e 100644
20563 --- a/kernel/printk/printk.c
20564 +++ b/kernel/printk/printk.c
20565 @@ -241,6 +241,65 @@ struct printk_log {
20566   */
20567  static DEFINE_RAW_SPINLOCK(logbuf_lock);
20568  
20569 +#ifdef CONFIG_EARLY_PRINTK
20570 +struct console *early_console;
20571 +
20572 +static void early_vprintk(const char *fmt, va_list ap)
20573 +{
20574 +       if (early_console) {
20575 +               char buf[512];
20576 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
20577 +
20578 +               early_console->write(early_console, buf, n);
20579 +       }
20580 +}
20581 +
20582 +asmlinkage void early_printk(const char *fmt, ...)
20583 +{
20584 +       va_list ap;
20585 +
20586 +       va_start(ap, fmt);
20587 +       early_vprintk(fmt, ap);
20588 +       va_end(ap);
20589 +}
20590 +
20591 +/*
20592 + * This is independent of any log levels - a global
20593 + * kill switch that turns off all of printk.
20594 + *
20595 + * Used by the NMI watchdog if early-printk is enabled.
20596 + */
20597 +static bool __read_mostly printk_killswitch;
20598 +
20599 +static int __init force_early_printk_setup(char *str)
20600 +{
20601 +       printk_killswitch = true;
20602 +       return 0;
20603 +}
20604 +early_param("force_early_printk", force_early_printk_setup);
20605 +
20606 +void printk_kill(void)
20607 +{
20608 +       printk_killswitch = true;
20609 +}
20610 +
20611 +#ifdef CONFIG_PRINTK
20612 +static int forced_early_printk(const char *fmt, va_list ap)
20613 +{
20614 +       if (!printk_killswitch)
20615 +               return 0;
20616 +       early_vprintk(fmt, ap);
20617 +       return 1;
20618 +}
20619 +#endif
20620 +
20621 +#else
20622 +static inline int forced_early_printk(const char *fmt, va_list ap)
20623 +{
20624 +       return 0;
20625 +}
20626 +#endif
20627 +
20628  #ifdef CONFIG_PRINTK
20629  DECLARE_WAIT_QUEUE_HEAD(log_wait);
20630  /* the next printk record to read by syslog(READ) or /proc/kmsg */
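The printk.c hunk above adds a killswitch: once a debugging subsystem calls printk_kill(), vprintk_emit() short-circuits every message straight to the early console, which just formats into a small buffer and writes it out synchronously. A userspace sketch of that fallback, with stderr playing the role of early_console->write() and the function names local to the sketch:

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

static bool printk_killswitch;

static void raw_console_write(const char *buf, int n)
{
        fwrite(buf, 1, (size_t)n, stderr);
}

/* returns 1 if the message was diverted to the raw console */
static int forced_early_printk_sketch(const char *fmt, va_list ap)
{
        char buf[512];
        int n;

        if (!printk_killswitch)
                return 0;
        n = vsnprintf(buf, sizeof(buf), fmt, ap);
        if (n > (int)sizeof(buf) - 1)
                n = (int)sizeof(buf) - 1;       /* mimic vscnprintf() truncation */
        raw_console_write(buf, n);
        return 1;
}

static int printk_sketch(const char *fmt, ...)
{
        va_list ap;
        int handled;

        va_start(ap, fmt);
        handled = forced_early_printk_sketch(fmt, ap);
        va_end(ap);

        if (!handled) {
                /* stand-in for the normal logbuf path */
                va_start(ap, fmt);
                vprintf(fmt, ap);
                va_end(ap);
        }
        return handled;
}

int main(void)
{
        printk_sketch("before kill\n");
        printk_killswitch = true;               /* what printk_kill() does */
        printk_sketch("after kill: %d\n", 42);
        return 0;
}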
20631 @@ -1203,6 +1262,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20632  {
20633         char *text;
20634         int len = 0;
20635 +       int attempts = 0;
20636  
20637         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
20638         if (!text)
20639 @@ -1214,7 +1274,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20640                 u64 seq;
20641                 u32 idx;
20642                 enum log_flags prev;
20643 -
20644 +               int num_msg;
20645 +try_again:
20646 +               attempts++;
20647 +               if (attempts > 10) {
20648 +                       len = -EBUSY;
20649 +                       goto out;
20650 +               }
20651 +               num_msg = 0;
20652                 if (clear_seq < log_first_seq) {
20653                         /* messages are gone, move to first available one */
20654                         clear_seq = log_first_seq;
20655 @@ -1235,6 +1302,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20656                         prev = msg->flags;
20657                         idx = log_next(idx);
20658                         seq++;
20659 +                       num_msg++;
20660 +                       if (num_msg > 5) {
20661 +                               num_msg = 0;
20662 +                               raw_spin_unlock_irq(&logbuf_lock);
20663 +                               raw_spin_lock_irq(&logbuf_lock);
20664 +                               if (clear_seq < log_first_seq)
20665 +                                       goto try_again;
20666 +                       }
20667                 }
20668  
20669                 /* move first record forward until length fits into the buffer */
20670 @@ -1248,6 +1323,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20671                         prev = msg->flags;
20672                         idx = log_next(idx);
20673                         seq++;
20674 +                       num_msg++;
20675 +                       if (num_msg > 5) {
20676 +                               num_msg = 0;
20677 +                               raw_spin_unlock_irq(&logbuf_lock);
20678 +                               raw_spin_lock_irq(&logbuf_lock);
20679 +                               if (clear_seq < log_first_seq)
20680 +                                       goto try_again;
20681 +                       }
20682                 }
20683  
20684                 /* last message fitting into this dump */
20685 @@ -1288,6 +1371,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20686                 clear_seq = log_next_seq;
20687                 clear_idx = log_next_idx;
20688         }
20689 +out:
20690         raw_spin_unlock_irq(&logbuf_lock);
20691  
20692         kfree(text);
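The syslog_print_all() changes above bound how long logbuf_lock is held: the walk drops and retakes the lock every few records so other CPUs can get at the log, restarts from the top if the ring was recycled underneath it, and gives up with -EBUSY after ten restarts. A userspace sketch of the same pattern with a pthread mutex and a fake ring standing in for logbuf_lock and the printk buffer (single-threaded here, so it always succeeds):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t logbuf_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long log_first_seq;     /* oldest record still in the ring */
static unsigned long log_next_seq = 40; /* one past the newest record */

static int dump_all(void)
{
        int attempts = 0;
        unsigned long seq;
        int num_msg;

        pthread_mutex_lock(&logbuf_lock);
try_again:
        if (++attempts > 10) {
                pthread_mutex_unlock(&logbuf_lock);
                return -EBUSY;
        }
        num_msg = 0;
        for (seq = log_first_seq; seq < log_next_seq; seq++) {
                /* "process" record seq here while holding the lock */
                if (++num_msg > 5) {
                        num_msg = 0;
                        pthread_mutex_unlock(&logbuf_lock);
                        /* writers may run here and recycle old records */
                        pthread_mutex_lock(&logbuf_lock);
                        if (seq < log_first_seq)
                                goto try_again; /* our position was overwritten */
                }
        }
        pthread_mutex_unlock(&logbuf_lock);
        return 0;
}

int main(void)
{
        printf("dump_all() = %d\n", dump_all());
        return 0;
}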
20693 @@ -1443,6 +1527,12 @@ static void call_console_drivers(int level,
20694         if (!console_drivers)
20695                 return;
20696  
20697 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20698 +               if (in_irq() || in_nmi())
20699 +                       return;
20700 +       }
20701 +
20702 +       migrate_disable();
20703         for_each_console(con) {
20704                 if (exclusive_console && con != exclusive_console)
20705                         continue;
20706 @@ -1458,6 +1548,7 @@ static void call_console_drivers(int level,
20707                 else
20708                         con->write(con, text, len);
20709         }
20710 +       migrate_enable();
20711  }
20712  
20713  /*
20714 @@ -1518,6 +1609,15 @@ static inline int can_use_console(unsigned int cpu)
20715  static int console_trylock_for_printk(void)
20716  {
20717         unsigned int cpu = smp_processor_id();
20718 +#ifdef CONFIG_PREEMPT_RT_FULL
20719 +       int lock = !early_boot_irqs_disabled && (preempt_count() == 0) &&
20720 +               !irqs_disabled();
20721 +#else
20722 +       int lock = 1;
20723 +#endif
20724 +
20725 +       if (!lock)
20726 +               return 0;
20727  
20728         if (!console_trylock())
20729                 return 0;
20730 @@ -1672,6 +1772,13 @@ asmlinkage int vprintk_emit(int facility, int level,
20731         /* cpu currently holding logbuf_lock in this function */
20732         static unsigned int logbuf_cpu = UINT_MAX;
20733  
20734 +       /*
20735 +        * Fall back to early_printk if a debugging subsystem has
20736 +        * killed printk output
20737 +        */
20738 +       if (unlikely(forced_early_printk(fmt, args)))
20739 +               return 1;
20740 +
20741         if (level == LOGLEVEL_SCHED) {
20742                 level = LOGLEVEL_DEFAULT;
20743                 in_sched = true;
20744 @@ -1813,8 +1920,7 @@ asmlinkage int vprintk_emit(int facility, int level,
20745                  * console_sem which would prevent anyone from printing to
20746                  * console
20747                  */
20748 -               preempt_disable();
20749 -
20750 +               migrate_disable();
20751                 /*
20752                  * Try to acquire and then immediately release the console
20753                  * semaphore.  The release will print out buffers and wake up
20754 @@ -1822,7 +1928,7 @@ asmlinkage int vprintk_emit(int facility, int level,
20755                  */
20756                 if (console_trylock_for_printk())
20757                         console_unlock();
20758 -               preempt_enable();
20759 +               migrate_enable();
20760                 lockdep_on();
20761         }
20762  
20763 @@ -1961,26 +2067,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
20764  
20765  #endif /* CONFIG_PRINTK */
20766  
20767 -#ifdef CONFIG_EARLY_PRINTK
20768 -struct console *early_console;
20769 -
20770 -asmlinkage __visible void early_printk(const char *fmt, ...)
20771 -{
20772 -       va_list ap;
20773 -       char buf[512];
20774 -       int n;
20775 -
20776 -       if (!early_console)
20777 -               return;
20778 -
20779 -       va_start(ap, fmt);
20780 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
20781 -       va_end(ap);
20782 -
20783 -       early_console->write(early_console, buf, n);
20784 -}
20785 -#endif
20786 -
20787  static int __add_preferred_console(char *name, int idx, char *options,
20788                                    char *brl_options)
20789  {
20790 @@ -2202,11 +2288,16 @@ static void console_cont_flush(char *text, size_t size)
20791                 goto out;
20792  
20793         len = cont_print_text(text, size);
20794 +#ifdef CONFIG_PREEMPT_RT_FULL
20795 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
20796 +       call_console_drivers(cont.level, NULL, 0, text, len);
20797 +#else
20798         raw_spin_unlock(&logbuf_lock);
20799         stop_critical_timings();
20800         call_console_drivers(cont.level, NULL, 0, text, len);
20801         start_critical_timings();
20802         local_irq_restore(flags);
20803 +#endif
20804         return;
20805  out:
20806         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
20807 @@ -2316,13 +2407,17 @@ skip:
20808                 console_idx = log_next(console_idx);
20809                 console_seq++;
20810                 console_prev = msg->flags;
20811 +#ifdef CONFIG_PREEMPT_RT_FULL
20812 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
20813 +               call_console_drivers(level, ext_text, ext_len, text, len);
20814 +#else
20815                 raw_spin_unlock(&logbuf_lock);
20816  
20817                 stop_critical_timings();        /* don't trace print latency */
20818                 call_console_drivers(level, ext_text, ext_len, text, len);
20819                 start_critical_timings();
20820                 local_irq_restore(flags);
20821 -
20822 +#endif
20823                 if (do_cond_resched)
20824                         cond_resched();
20825         }
20826 @@ -2374,6 +2469,11 @@ void console_unblank(void)
20827  {
20828         struct console *c;
20829  
20830 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20831 +               if (in_irq() || in_nmi())
20832 +                       return;
20833 +       }
20834 +
20835         /*
20836          * console_unblank can no longer be called in interrupt context unless
20837          * oops_in_progress is set to 1.
20838 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
20839 index 3189e51db7e8..1004af706be7 100644
20840 --- a/kernel/ptrace.c
20841 +++ b/kernel/ptrace.c
20842 @@ -129,7 +129,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
20843  
20844         spin_lock_irq(&task->sighand->siglock);
20845         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
20846 -               task->state = __TASK_TRACED;
20847 +               unsigned long flags;
20848 +
20849 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
20850 +               if (task->state & __TASK_TRACED)
20851 +                       task->state = __TASK_TRACED;
20852 +               else
20853 +                       task->saved_state = __TASK_TRACED;
20854 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20855                 ret = true;
20856         }
20857         spin_unlock_irq(&task->sighand->siglock);
20858 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
20859 index d89328e260df..5bb3364a6284 100644
20860 --- a/kernel/rcu/rcutorture.c
20861 +++ b/kernel/rcu/rcutorture.c
20862 @@ -390,6 +390,7 @@ static struct rcu_torture_ops rcu_ops = {
20863         .name           = "rcu"
20864  };
20865  
20866 +#ifndef CONFIG_PREEMPT_RT_FULL
20867  /*
20868   * Definitions for rcu_bh torture testing.
20869   */
20870 @@ -429,6 +430,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
20871         .name           = "rcu_bh"
20872  };
20873  
20874 +#else
20875 +static struct rcu_torture_ops rcu_bh_ops = {
20876 +       .ttype          = INVALID_RCU_FLAVOR,
20877 +};
20878 +#endif
20879 +
20880  /*
20881   * Don't even think about trying any of these in real life!!!
20882   * The names includes "busted", and they really means it!
20883 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
20884 index f07343b54fe5..d862a203fce0 100644
20885 --- a/kernel/rcu/tree.c
20886 +++ b/kernel/rcu/tree.c
20887 @@ -56,6 +56,11 @@
20888  #include <linux/random.h>
20889  #include <linux/trace_events.h>
20890  #include <linux/suspend.h>
20891 +#include <linux/delay.h>
20892 +#include <linux/gfp.h>
20893 +#include <linux/oom.h>
20894 +#include <linux/smpboot.h>
20895 +#include "../time/tick-internal.h"
20896  
20897  #include "tree.h"
20898  #include "rcu.h"
20899 @@ -266,6 +271,19 @@ void rcu_sched_qs(void)
20900         }
20901  }
20902  
20903 +#ifdef CONFIG_PREEMPT_RT_FULL
20904 +static void rcu_preempt_qs(void);
20905 +
20906 +void rcu_bh_qs(void)
20907 +{
20908 +       unsigned long flags;
20909 +
20910 +       /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
20911 +       local_irq_save(flags);
20912 +       rcu_preempt_qs();
20913 +       local_irq_restore(flags);
20914 +}
20915 +#else
20916  void rcu_bh_qs(void)
20917  {
20918         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
20919 @@ -275,6 +293,7 @@ void rcu_bh_qs(void)
20920                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
20921         }
20922  }
20923 +#endif
20924  
20925  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
20926  
20927 @@ -435,11 +454,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
20928  /*
20929   * Return the number of RCU BH batches started thus far for debug & stats.
20930   */
20931 +#ifndef CONFIG_PREEMPT_RT_FULL
20932  unsigned long rcu_batches_started_bh(void)
20933  {
20934         return rcu_bh_state.gpnum;
20935  }
20936  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
20937 +#endif
20938  
20939  /*
20940   * Return the number of RCU batches completed thus far for debug & stats.
20941 @@ -459,6 +480,7 @@ unsigned long rcu_batches_completed_sched(void)
20942  }
20943  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
20944  
20945 +#ifndef CONFIG_PREEMPT_RT_FULL
20946  /*
20947   * Return the number of RCU BH batches completed thus far for debug & stats.
20948   */
20949 @@ -486,6 +508,13 @@ void rcu_bh_force_quiescent_state(void)
20950  }
20951  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
20952  
20953 +#else
20954 +void rcu_force_quiescent_state(void)
20955 +{
20956 +}
20957 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
20958 +#endif
20959 +
20960  /*
20961   * Force a quiescent state for RCU-sched.
20962   */
20963 @@ -536,9 +565,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
20964         case RCU_FLAVOR:
20965                 rsp = rcu_state_p;
20966                 break;
20967 +#ifndef CONFIG_PREEMPT_RT_FULL
20968         case RCU_BH_FLAVOR:
20969                 rsp = &rcu_bh_state;
20970                 break;
20971 +#endif
20972         case RCU_SCHED_FLAVOR:
20973                 rsp = &rcu_sched_state;
20974                 break;
20975 @@ -1590,7 +1621,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
20976         int needmore;
20977         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
20978  
20979 -       rcu_nocb_gp_cleanup(rsp, rnp);
20980         rnp->need_future_gp[c & 0x1] = 0;
20981         needmore = rnp->need_future_gp[(c + 1) & 0x1];
20982         trace_rcu_future_gp(rnp, rdp, c,
20983 @@ -1611,7 +1641,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
20984             !READ_ONCE(rsp->gp_flags) ||
20985             !rsp->gp_kthread)
20986                 return;
20987 -       wake_up(&rsp->gp_wq);
20988 +       swake_up(&rsp->gp_wq);
20989  }
20990  
20991  /*
20992 @@ -1991,6 +2021,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
20993         int nocb = 0;
20994         struct rcu_data *rdp;
20995         struct rcu_node *rnp = rcu_get_root(rsp);
20996 +       struct swait_queue_head *sq;
20997  
20998         WRITE_ONCE(rsp->gp_activity, jiffies);
20999         raw_spin_lock_irq(&rnp->lock);
21000 @@ -2029,7 +2060,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
21001                         needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
21002                 /* smp_mb() provided by prior unlock-lock pair. */
21003                 nocb += rcu_future_gp_cleanup(rsp, rnp);
21004 +               sq = rcu_nocb_gp_get(rnp);
21005                 raw_spin_unlock_irq(&rnp->lock);
21006 +               rcu_nocb_gp_cleanup(sq);
21007                 cond_resched_rcu_qs();
21008                 WRITE_ONCE(rsp->gp_activity, jiffies);
21009                 rcu_gp_slow(rsp, gp_cleanup_delay);
21010 @@ -2076,7 +2109,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
21011                                                READ_ONCE(rsp->gpnum),
21012                                                TPS("reqwait"));
21013                         rsp->gp_state = RCU_GP_WAIT_GPS;
21014 -                       wait_event_interruptible(rsp->gp_wq,
21015 +                       swait_event_interruptible(rsp->gp_wq,
21016                                                  READ_ONCE(rsp->gp_flags) &
21017                                                  RCU_GP_FLAG_INIT);
21018                         rsp->gp_state = RCU_GP_DONE_GPS;
21019 @@ -2106,7 +2139,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
21020                                                READ_ONCE(rsp->gpnum),
21021                                                TPS("fqswait"));
21022                         rsp->gp_state = RCU_GP_WAIT_FQS;
21023 -                       ret = wait_event_interruptible_timeout(rsp->gp_wq,
21024 +                       ret = swait_event_interruptible_timeout(rsp->gp_wq,
21025                                         rcu_gp_fqs_check_wake(rsp, &gf), j);
21026                         rsp->gp_state = RCU_GP_DOING_FQS;
21027                         /* Locking provides needed memory barriers. */
21028 @@ -2230,7 +2263,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
21029         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
21030         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
21031         raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
21032 -       rcu_gp_kthread_wake(rsp);
21033 +       swake_up(&rsp->gp_wq);  /* Memory barrier implied by swake_up() path. */
21034  }
21035  
21036  /*
21037 @@ -2891,7 +2924,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
21038         }
21039         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
21040         raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
21041 -       rcu_gp_kthread_wake(rsp);
21042 +       swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
21043  }
21044  
21045  /*
21046 @@ -2934,18 +2967,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
21047  /*
21048   * Do RCU core processing for the current CPU.
21049   */
21050 -static void rcu_process_callbacks(struct softirq_action *unused)
21051 +static void rcu_process_callbacks(void)
21052  {
21053         struct rcu_state *rsp;
21054  
21055         if (cpu_is_offline(smp_processor_id()))
21056                 return;
21057 -       trace_rcu_utilization(TPS("Start RCU core"));
21058         for_each_rcu_flavor(rsp)
21059                 __rcu_process_callbacks(rsp);
21060 -       trace_rcu_utilization(TPS("End RCU core"));
21061  }
21062  
21063 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21064  /*
21065   * Schedule RCU callback invocation.  If the specified type of RCU
21066   * does not support RCU priority boosting, just do a direct call,
21067 @@ -2957,18 +2989,105 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
21068  {
21069         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
21070                 return;
21071 -       if (likely(!rsp->boost)) {
21072 -               rcu_do_batch(rsp, rdp);
21073 -               return;
21074 -       }
21075 -       invoke_rcu_callbacks_kthread();
21076 +       rcu_do_batch(rsp, rdp);
21077  }
21078  
21079 +static void rcu_wake_cond(struct task_struct *t, int status)
21080 +{
21081 +       /*
21082 +        * If the thread is yielding, only wake it when this
21083 +        * is invoked from idle
21084 +        */
21085 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
21086 +               wake_up_process(t);
21087 +}
21088 +
21089 +/*
21090 + * Wake up this CPU's rcuc kthread to do RCU core processing.
21091 + */
21092  static void invoke_rcu_core(void)
21093  {
21094 -       if (cpu_online(smp_processor_id()))
21095 -               raise_softirq(RCU_SOFTIRQ);
21096 +       unsigned long flags;
21097 +       struct task_struct *t;
21098 +
21099 +       if (!cpu_online(smp_processor_id()))
21100 +               return;
21101 +       local_irq_save(flags);
21102 +       __this_cpu_write(rcu_cpu_has_work, 1);
21103 +       t = __this_cpu_read(rcu_cpu_kthread_task);
21104 +       if (t != NULL && current != t)
21105 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
21106 +       local_irq_restore(flags);
21107 +}
21108 +
21109 +static void rcu_cpu_kthread_park(unsigned int cpu)
21110 +{
21111 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21112 +}
21113 +
21114 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
21115 +{
21116 +       return __this_cpu_read(rcu_cpu_has_work);
21117 +}
21118 +
21119 +/*
21120 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
21121 + * RCU softirq used in flavors and configurations of RCU that do not
21122 + * support RCU priority boosting.
21123 + */
21124 +static void rcu_cpu_kthread(unsigned int cpu)
21125 +{
21126 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21127 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21128 +       int spincnt;
21129 +
21130 +       for (spincnt = 0; spincnt < 10; spincnt++) {
21131 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21132 +               local_bh_disable();
21133 +               *statusp = RCU_KTHREAD_RUNNING;
21134 +               this_cpu_inc(rcu_cpu_kthread_loops);
21135 +               local_irq_disable();
21136 +               work = *workp;
21137 +               *workp = 0;
21138 +               local_irq_enable();
21139 +               if (work)
21140 +                       rcu_process_callbacks();
21141 +               local_bh_enable();
21142 +               if (*workp == 0) {
21143 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21144 +                       *statusp = RCU_KTHREAD_WAITING;
21145 +                       return;
21146 +               }
21147 +       }
21148 +       *statusp = RCU_KTHREAD_YIELDING;
21149 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21150 +       schedule_timeout_interruptible(2);
21151 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21152 +       *statusp = RCU_KTHREAD_WAITING;
21153 +}
21154 +
21155 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21156 +       .store                  = &rcu_cpu_kthread_task,
21157 +       .thread_should_run      = rcu_cpu_kthread_should_run,
21158 +       .thread_fn              = rcu_cpu_kthread,
21159 +       .thread_comm            = "rcuc/%u",
21160 +       .setup                  = rcu_cpu_kthread_setup,
21161 +       .park                   = rcu_cpu_kthread_park,
21162 +};
21163 +
21164 +/*
21165 + * Spawn per-CPU RCU core processing kthreads.
21166 + */
21167 +static int __init rcu_spawn_core_kthreads(void)
21168 +{
21169 +       int cpu;
21170 +
21171 +       for_each_possible_cpu(cpu)
21172 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
21173 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21174 +       return 0;
21175  }
21176 +early_initcall(rcu_spawn_core_kthreads);
21177  
21178  /*
21179   * Handle any core-RCU processing required by a call_rcu() invocation.
21180 @@ -3114,6 +3233,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
21181  }
21182  EXPORT_SYMBOL_GPL(call_rcu_sched);
21183  
21184 +#ifndef CONFIG_PREEMPT_RT_FULL
21185  /*
21186   * Queue an RCU callback for invocation after a quicker grace period.
21187   */
21188 @@ -3122,6 +3242,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
21189         __call_rcu(head, func, &rcu_bh_state, -1, 0);
21190  }
21191  EXPORT_SYMBOL_GPL(call_rcu_bh);
21192 +#endif
21193  
21194  /*
21195   * Queue an RCU callback for lazy invocation after a grace period.
21196 @@ -3213,6 +3334,7 @@ void synchronize_sched(void)
21197  }
21198  EXPORT_SYMBOL_GPL(synchronize_sched);
21199  
21200 +#ifndef CONFIG_PREEMPT_RT_FULL
21201  /**
21202   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
21203   *
21204 @@ -3239,6 +3361,7 @@ void synchronize_rcu_bh(void)
21205                 wait_rcu_gp(call_rcu_bh);
21206  }
21207  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
21208 +#endif
21209  
21210  /**
21211   * get_state_synchronize_rcu - Snapshot current RCU state
21212 @@ -3524,7 +3647,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
21213                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
21214                         if (wake) {
21215                                 smp_mb(); /* EGP done before wake_up(). */
21216 -                               wake_up(&rsp->expedited_wq);
21217 +                               swake_up(&rsp->expedited_wq);
21218                         }
21219                         break;
21220                 }
21221 @@ -3781,7 +3904,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
21222         jiffies_start = jiffies;
21223  
21224         for (;;) {
21225 -               ret = wait_event_interruptible_timeout(
21226 +               ret = swait_event_timeout(
21227                                 rsp->expedited_wq,
21228                                 sync_rcu_preempt_exp_done(rnp_root),
21229                                 jiffies_stall);
21230 @@ -3789,7 +3912,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
21231                         return;
21232                 if (ret < 0) {
21233                         /* Hit a signal, disable CPU stall warnings. */
21234 -                       wait_event(rsp->expedited_wq,
21235 +                       swait_event(rsp->expedited_wq,
21236                                    sync_rcu_preempt_exp_done(rnp_root));
21237                         return;
21238                 }
21239 @@ -4101,6 +4224,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
21240         mutex_unlock(&rsp->barrier_mutex);
21241  }
21242  
21243 +#ifndef CONFIG_PREEMPT_RT_FULL
21244  /**
21245   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
21246   */
21247 @@ -4109,6 +4233,7 @@ void rcu_barrier_bh(void)
21248         _rcu_barrier(&rcu_bh_state);
21249  }
21250  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
21251 +#endif
21252  
21253  /**
21254   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
21255 @@ -4455,8 +4580,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
21256                 }
21257         }
21258  
21259 -       init_waitqueue_head(&rsp->gp_wq);
21260 -       init_waitqueue_head(&rsp->expedited_wq);
21261 +       init_swait_queue_head(&rsp->gp_wq);
21262 +       init_swait_queue_head(&rsp->expedited_wq);
21263         rnp = rsp->level[rcu_num_lvls - 1];
21264         for_each_possible_cpu(i) {
21265                 while (i > rnp->grphi)
21266 @@ -4576,12 +4701,13 @@ void __init rcu_init(void)
21267  
21268         rcu_bootup_announce();
21269         rcu_init_geometry();
21270 +#ifndef CONFIG_PREEMPT_RT_FULL
21271         rcu_init_one(&rcu_bh_state, &rcu_bh_data);
21272 +#endif
21273         rcu_init_one(&rcu_sched_state, &rcu_sched_data);
21274         if (dump_tree)
21275                 rcu_dump_rcu_node_tree(&rcu_sched_state);
21276         __rcu_init_preempt();
21277 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
21278  
21279         /*
21280          * We don't need protection against CPU-hotplug here because
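
Two things happen in tree.c: the grace-period and expedited waits move from wait_queue_head_t to simple wait queues (swait), and RCU_SOFTIRQ processing is replaced by per-CPU rcuc/%u kthreads registered through the smpboot facility, with rcu_bh folded into preemptible RCU on RT. The registration pattern used by rcu_spawn_core_kthreads() looks like this in isolation; the demo_* names are made up for the sketch.

    #include <linux/init.h>
    #include <linux/percpu.h>
    #include <linux/sched.h>
    #include <linux/smpboot.h>

    static DEFINE_PER_CPU(struct task_struct *, demo_task);
    static DEFINE_PER_CPU(int, demo_has_work);

    static int demo_should_run(unsigned int cpu)
    {
            /* Called with preemption disabled on @cpu. */
            return __this_cpu_read(demo_has_work);
    }

    static void demo_thread_fn(unsigned int cpu)
    {
            /* Runs in task context and may sleep. */
            __this_cpu_write(demo_has_work, 0);
            /* ... do the per-CPU work here ... */
    }

    static struct smp_hotplug_thread demo_threads = {
            .store              = &demo_task,
            .thread_should_run  = demo_should_run,
            .thread_fn          = demo_thread_fn,
            .thread_comm        = "demo/%u",
    };

    static int __init demo_spawn_threads(void)
    {
            return smpboot_register_percpu_thread(&demo_threads);
    }
    early_initcall(demo_spawn_threads);
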
21281 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
21282 index 9fb4e238d4dc..c75834d8de24 100644
21283 --- a/kernel/rcu/tree.h
21284 +++ b/kernel/rcu/tree.h
21285 @@ -27,6 +27,7 @@
21286  #include <linux/threads.h>
21287  #include <linux/cpumask.h>
21288  #include <linux/seqlock.h>
21289 +#include <linux/swait.h>
21290  #include <linux/stop_machine.h>
21291  
21292  /*
21293 @@ -241,7 +242,7 @@ struct rcu_node {
21294                                 /* Refused to boost: not sure why, though. */
21295                                 /*  This can happen due to race conditions. */
21296  #ifdef CONFIG_RCU_NOCB_CPU
21297 -       wait_queue_head_t nocb_gp_wq[2];
21298 +       struct swait_queue_head nocb_gp_wq[2];
21299                                 /* Place for rcu_nocb_kthread() to wait GP. */
21300  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
21301         int need_future_gp[2];
21302 @@ -393,7 +394,7 @@ struct rcu_data {
21303         atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
21304         struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
21305         struct rcu_head **nocb_follower_tail;
21306 -       wait_queue_head_t nocb_wq;      /* For nocb kthreads to sleep on. */
21307 +       struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
21308         struct task_struct *nocb_kthread;
21309         int nocb_defer_wakeup;          /* Defer wakeup of nocb_kthread. */
21310  
21311 @@ -472,7 +473,7 @@ struct rcu_state {
21312         unsigned long gpnum;                    /* Current gp number. */
21313         unsigned long completed;                /* # of last completed gp. */
21314         struct task_struct *gp_kthread;         /* Task for grace periods. */
21315 -       wait_queue_head_t gp_wq;                /* Where GP task waits. */
21316 +       struct swait_queue_head gp_wq;          /* Where GP task waits. */
21317         short gp_flags;                         /* Commands for GP task. */
21318         short gp_state;                         /* GP kthread sleep state. */
21319  
21320 @@ -504,7 +505,7 @@ struct rcu_state {
21321         atomic_long_t expedited_workdone3;      /* # done by others #3. */
21322         atomic_long_t expedited_normal;         /* # fallbacks to normal. */
21323         atomic_t expedited_need_qs;             /* # CPUs left to check in. */
21324 -       wait_queue_head_t expedited_wq;         /* Wait for check-ins. */
21325 +       struct swait_queue_head expedited_wq;   /* Wait for check-ins. */
21326         int ncpus_snap;                         /* # CPUs seen last time. */
21327  
21328         unsigned long jiffies_force_qs;         /* Time at which to invoke */
21329 @@ -556,18 +557,18 @@ extern struct list_head rcu_struct_flavors;
21330   */
21331  extern struct rcu_state rcu_sched_state;
21332  
21333 +#ifndef CONFIG_PREEMPT_RT_FULL
21334  extern struct rcu_state rcu_bh_state;
21335 +#endif
21336  
21337  #ifdef CONFIG_PREEMPT_RCU
21338  extern struct rcu_state rcu_preempt_state;
21339  #endif /* #ifdef CONFIG_PREEMPT_RCU */
21340  
21341 -#ifdef CONFIG_RCU_BOOST
21342  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21343  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
21344  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21345  DECLARE_PER_CPU(char, rcu_cpu_has_work);
21346 -#endif /* #ifdef CONFIG_RCU_BOOST */
21347  
21348  #ifndef RCU_TREE_NONCORE
21349  
21350 @@ -587,10 +588,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
21351  static void __init __rcu_init_preempt(void);
21352  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
21353  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
21354 -static void invoke_rcu_callbacks_kthread(void);
21355  static bool rcu_is_callbacks_kthread(void);
21356 +static void rcu_cpu_kthread_setup(unsigned int cpu);
21357  #ifdef CONFIG_RCU_BOOST
21358 -static void rcu_preempt_do_callbacks(void);
21359  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21360                                                  struct rcu_node *rnp);
21361  #endif /* #ifdef CONFIG_RCU_BOOST */
21362 @@ -607,7 +607,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
21363  static void increment_cpu_stall_ticks(void);
21364  static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
21365  static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
21366 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
21367 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
21368 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
21369  static void rcu_init_one_nocb(struct rcu_node *rnp);
21370  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
21371                             bool lazy, unsigned long flags);
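
The header hunks only swap the wait-queue types: struct swait_queue_head is the simple waitqueue whose internal lock is a raw spinlock and whose wake side does a bounded amount of work, which is what makes it usable from the RCU fast paths on RT. A small usage sketch of the API the patch switches to, with invented demo_* names:

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
    static bool demo_ready;

    static void demo_signal(void)
    {
            demo_ready = true;          /* make the condition true first */
            swake_up(&demo_wq);         /* wakes at most one waiter */
    }

    static int demo_wait(void)
    {
            /* Sleeps until demo_ready is set; returns -ERESTARTSYS on a signal. */
            return swait_event_interruptible(demo_wq, demo_ready);
    }
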
21372 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
21373 index 630c19772630..8e119cf647ba 100644
21374 --- a/kernel/rcu/tree_plugin.h
21375 +++ b/kernel/rcu/tree_plugin.h
21376 @@ -24,25 +24,10 @@
21377   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21378   */
21379  
21380 -#include <linux/delay.h>
21381 -#include <linux/gfp.h>
21382 -#include <linux/oom.h>
21383 -#include <linux/smpboot.h>
21384 -#include "../time/tick-internal.h"
21385 -
21386  #ifdef CONFIG_RCU_BOOST
21387  
21388  #include "../locking/rtmutex_common.h"
21389  
21390 -/*
21391 - * Control variables for per-CPU and per-rcu_node kthreads.  These
21392 - * handle all flavors of RCU.
21393 - */
21394 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21395 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21396 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21397 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
21398 -
21399  #else /* #ifdef CONFIG_RCU_BOOST */
21400  
21401  /*
21402 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
21403  
21404  #endif /* #else #ifdef CONFIG_RCU_BOOST */
21405  
21406 +/*
21407 + * Control variables for per-CPU and per-rcu_node kthreads.  These
21408 + * handle all flavors of RCU.
21409 + */
21410 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21411 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21412 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
21413 +
21414  #ifdef CONFIG_RCU_NOCB_CPU
21415  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
21416  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
21417 @@ -432,7 +425,7 @@ void rcu_read_unlock_special(struct task_struct *t)
21418         }
21419  
21420         /* Hardware IRQ handlers cannot block, complain if they get here. */
21421 -       if (in_irq() || in_serving_softirq()) {
21422 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
21423                 lockdep_rcu_suspicious(__FILE__, __LINE__,
21424                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
21425                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
21426 @@ -645,15 +638,6 @@ static void rcu_preempt_check_callbacks(void)
21427                 t->rcu_read_unlock_special.b.need_qs = true;
21428  }
21429  
21430 -#ifdef CONFIG_RCU_BOOST
21431 -
21432 -static void rcu_preempt_do_callbacks(void)
21433 -{
21434 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
21435 -}
21436 -
21437 -#endif /* #ifdef CONFIG_RCU_BOOST */
21438 -
21439  /*
21440   * Queue a preemptible-RCU callback for invocation after a grace period.
21441   */
21442 @@ -930,6 +914,19 @@ void exit_rcu(void)
21443  
21444  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
21445  
21446 +/*
21447 + * If boosting, set rcuc kthreads to realtime priority.
21448 + */
21449 +static void rcu_cpu_kthread_setup(unsigned int cpu)
21450 +{
21451 +#ifdef CONFIG_RCU_BOOST
21452 +       struct sched_param sp;
21453 +
21454 +       sp.sched_priority = kthread_prio;
21455 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21456 +#endif /* #ifdef CONFIG_RCU_BOOST */
21457 +}
21458 +
21459  #ifdef CONFIG_RCU_BOOST
21460  
21461  #include "../locking/rtmutex_common.h"
21462 @@ -961,16 +958,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
21463  
21464  #endif /* #else #ifdef CONFIG_RCU_TRACE */
21465  
21466 -static void rcu_wake_cond(struct task_struct *t, int status)
21467 -{
21468 -       /*
21469 -        * If the thread is yielding, only wake it when this
21470 -        * is invoked from idle
21471 -        */
21472 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
21473 -               wake_up_process(t);
21474 -}
21475 -
21476  /*
21477   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
21478   * or ->boost_tasks, advancing the pointer to the next task in the
21479 @@ -1115,23 +1102,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
21480  }
21481  
21482  /*
21483 - * Wake up the per-CPU kthread to invoke RCU callbacks.
21484 - */
21485 -static void invoke_rcu_callbacks_kthread(void)
21486 -{
21487 -       unsigned long flags;
21488 -
21489 -       local_irq_save(flags);
21490 -       __this_cpu_write(rcu_cpu_has_work, 1);
21491 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
21492 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
21493 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
21494 -                             __this_cpu_read(rcu_cpu_kthread_status));
21495 -       }
21496 -       local_irq_restore(flags);
21497 -}
21498 -
21499 -/*
21500   * Is the current CPU running the RCU-callbacks kthread?
21501   * Caller must have preemption disabled.
21502   */
21503 @@ -1186,67 +1156,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21504         return 0;
21505  }
21506  
21507 -static void rcu_kthread_do_work(void)
21508 -{
21509 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
21510 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
21511 -       rcu_preempt_do_callbacks();
21512 -}
21513 -
21514 -static void rcu_cpu_kthread_setup(unsigned int cpu)
21515 -{
21516 -       struct sched_param sp;
21517 -
21518 -       sp.sched_priority = kthread_prio;
21519 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21520 -}
21521 -
21522 -static void rcu_cpu_kthread_park(unsigned int cpu)
21523 -{
21524 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21525 -}
21526 -
21527 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
21528 -{
21529 -       return __this_cpu_read(rcu_cpu_has_work);
21530 -}
21531 -
21532 -/*
21533 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
21534 - * RCU softirq used in flavors and configurations of RCU that do not
21535 - * support RCU priority boosting.
21536 - */
21537 -static void rcu_cpu_kthread(unsigned int cpu)
21538 -{
21539 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21540 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21541 -       int spincnt;
21542 -
21543 -       for (spincnt = 0; spincnt < 10; spincnt++) {
21544 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21545 -               local_bh_disable();
21546 -               *statusp = RCU_KTHREAD_RUNNING;
21547 -               this_cpu_inc(rcu_cpu_kthread_loops);
21548 -               local_irq_disable();
21549 -               work = *workp;
21550 -               *workp = 0;
21551 -               local_irq_enable();
21552 -               if (work)
21553 -                       rcu_kthread_do_work();
21554 -               local_bh_enable();
21555 -               if (*workp == 0) {
21556 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21557 -                       *statusp = RCU_KTHREAD_WAITING;
21558 -                       return;
21559 -               }
21560 -       }
21561 -       *statusp = RCU_KTHREAD_YIELDING;
21562 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21563 -       schedule_timeout_interruptible(2);
21564 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21565 -       *statusp = RCU_KTHREAD_WAITING;
21566 -}
21567 -
21568  /*
21569   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
21570   * served by the rcu_node in question.  The CPU hotplug lock is still
21571 @@ -1276,26 +1185,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
21572         free_cpumask_var(cm);
21573  }
21574  
21575 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21576 -       .store                  = &rcu_cpu_kthread_task,
21577 -       .thread_should_run      = rcu_cpu_kthread_should_run,
21578 -       .thread_fn              = rcu_cpu_kthread,
21579 -       .thread_comm            = "rcuc/%u",
21580 -       .setup                  = rcu_cpu_kthread_setup,
21581 -       .park                   = rcu_cpu_kthread_park,
21582 -};
21583 -
21584  /*
21585   * Spawn boost kthreads -- called as soon as the scheduler is running.
21586   */
21587  static void __init rcu_spawn_boost_kthreads(void)
21588  {
21589         struct rcu_node *rnp;
21590 -       int cpu;
21591 -
21592 -       for_each_possible_cpu(cpu)
21593 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
21594 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21595         rcu_for_each_leaf_node(rcu_state_p, rnp)
21596                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
21597  }
21598 @@ -1318,11 +1213,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
21599         raw_spin_unlock_irqrestore(&rnp->lock, flags);
21600  }
21601  
21602 -static void invoke_rcu_callbacks_kthread(void)
21603 -{
21604 -       WARN_ON_ONCE(1);
21605 -}
21606 -
21607  static bool rcu_is_callbacks_kthread(void)
21608  {
21609         return false;
21610 @@ -1346,7 +1236,7 @@ static void rcu_prepare_kthreads(int cpu)
21611  
21612  #endif /* #else #ifdef CONFIG_RCU_BOOST */
21613  
21614 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
21615 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
21616  
21617  /*
21618   * Check to see if any future RCU-related work will need to be done
21619 @@ -1363,7 +1253,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
21620         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
21621                ? 0 : rcu_cpu_has_callbacks(NULL);
21622  }
21623 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
21624  
21625 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
21626  /*
21627   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
21628   * after it.
21629 @@ -1459,6 +1351,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
21630         return cbs_ready;
21631  }
21632  
21633 +#ifndef CONFIG_PREEMPT_RT_FULL
21634 +
21635  /*
21636   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
21637   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
21638 @@ -1504,6 +1398,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
21639         *nextevt = basemono + dj * TICK_NSEC;
21640         return 0;
21641  }
21642 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
21643  
21644  /*
21645   * Prepare a CPU for idle from an RCU perspective.  The first major task
21646 @@ -1822,9 +1717,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
21647   * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
21648   * grace period.
21649   */
21650 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
21651 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
21652  {
21653 -       wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
21654 +       swake_up_all(sq);
21655  }
21656  
21657  /*
21658 @@ -1840,10 +1735,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
21659         rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
21660  }
21661  
21662 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
21663 +{
21664 +       return &rnp->nocb_gp_wq[rnp->completed & 0x1];
21665 +}
21666 +
21667  static void rcu_init_one_nocb(struct rcu_node *rnp)
21668  {
21669 -       init_waitqueue_head(&rnp->nocb_gp_wq[0]);
21670 -       init_waitqueue_head(&rnp->nocb_gp_wq[1]);
21671 +       init_swait_queue_head(&rnp->nocb_gp_wq[0]);
21672 +       init_swait_queue_head(&rnp->nocb_gp_wq[1]);
21673  }
21674  
21675  #ifndef CONFIG_RCU_NOCB_CPU_ALL
21676 @@ -1868,7 +1768,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
21677         if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
21678                 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
21679                 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
21680 -               wake_up(&rdp_leader->nocb_wq);
21681 +               swake_up(&rdp_leader->nocb_wq);
21682         }
21683  }
21684  
21685 @@ -2081,7 +1981,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
21686          */
21687         trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
21688         for (;;) {
21689 -               wait_event_interruptible(
21690 +               swait_event_interruptible(
21691                         rnp->nocb_gp_wq[c & 0x1],
21692                         (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
21693                 if (likely(d))
21694 @@ -2109,7 +2009,7 @@ wait_again:
21695         /* Wait for callbacks to appear. */
21696         if (!rcu_nocb_poll) {
21697                 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
21698 -               wait_event_interruptible(my_rdp->nocb_wq,
21699 +               swait_event_interruptible(my_rdp->nocb_wq,
21700                                 !READ_ONCE(my_rdp->nocb_leader_sleep));
21701                 /* Memory barrier handled by smp_mb() calls below and repoll. */
21702         } else if (firsttime) {
21703 @@ -2184,7 +2084,7 @@ wait_again:
21704                          * List was empty, wake up the follower.
21705                          * Memory barriers supplied by atomic_long_add().
21706                          */
21707 -                       wake_up(&rdp->nocb_wq);
21708 +                       swake_up(&rdp->nocb_wq);
21709                 }
21710         }
21711  
21712 @@ -2205,7 +2105,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
21713                 if (!rcu_nocb_poll) {
21714                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
21715                                             "FollowerSleep");
21716 -                       wait_event_interruptible(rdp->nocb_wq,
21717 +                       swait_event_interruptible(rdp->nocb_wq,
21718                                                  READ_ONCE(rdp->nocb_follower_head));
21719                 } else if (firsttime) {
21720                         /* Don't drown trace log with "Poll"! */
21721 @@ -2364,7 +2264,7 @@ void __init rcu_init_nohz(void)
21722  static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
21723  {
21724         rdp->nocb_tail = &rdp->nocb_head;
21725 -       init_waitqueue_head(&rdp->nocb_wq);
21726 +       init_swait_queue_head(&rdp->nocb_wq);
21727         rdp->nocb_follower_tail = &rdp->nocb_follower_head;
21728  }
21729  
21730 @@ -2514,7 +2414,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
21731         return false;
21732  }
21733  
21734 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
21735 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
21736  {
21737  }
21738  
21739 @@ -2522,6 +2422,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
21740  {
21741  }
21742  
21743 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
21744 +{
21745 +       return NULL;
21746 +}
21747 +
21748  static void rcu_init_one_nocb(struct rcu_node *rnp)
21749  {
21750  }
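
The nocb rework splits the old wake-up into rcu_nocb_gp_get(), which only records the queue while rnp->lock is held, and rcu_nocb_gp_cleanup(), which performs the swake_up_all() after the raw lock has been dropped, since waking an entire queue should not happen with interrupts off under a raw spinlock. The same split in isolation; the demo_node type and names are invented for the sketch:

    #include <linux/spinlock.h>
    #include <linux/swait.h>

    struct demo_node {
            raw_spinlock_t lock;
            unsigned long completed;
            struct swait_queue_head gp_wq[2];
    };

    /* Caller holds np->lock: only report which queue will need waking. */
    static struct swait_queue_head *demo_gp_get(struct demo_node *np)
    {
            return &np->gp_wq[np->completed & 0x1];
    }

    static void demo_gp_end(struct demo_node *np)
    {
            struct swait_queue_head *sq;
            unsigned long flags;

            raw_spin_lock_irqsave(&np->lock, flags);
            np->completed++;
            sq = demo_gp_get(np);
            raw_spin_unlock_irqrestore(&np->lock, flags);

            /* Now safe: no raw lock held, interrupts enabled again. */
            swake_up_all(sq);
    }
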
21751 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
21752 index 5f748c5a40f0..9a3904603ff6 100644
21753 --- a/kernel/rcu/update.c
21754 +++ b/kernel/rcu/update.c
21755 @@ -276,6 +276,7 @@ int rcu_read_lock_held(void)
21756  }
21757  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
21758  
21759 +#ifndef CONFIG_PREEMPT_RT_FULL
21760  /**
21761   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
21762   *
21763 @@ -302,6 +303,7 @@ int rcu_read_lock_bh_held(void)
21764         return in_softirq() || irqs_disabled();
21765  }
21766  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
21767 +#endif
21768  
21769  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
21770  
21771 diff --git a/kernel/relay.c b/kernel/relay.c
21772 index 0b4570cfacae..60684be39f22 100644
21773 --- a/kernel/relay.c
21774 +++ b/kernel/relay.c
21775 @@ -336,6 +336,10 @@ static void wakeup_readers(unsigned long data)
21776  {
21777         struct rchan_buf *buf = (struct rchan_buf *)data;
21778         wake_up_interruptible(&buf->read_wait);
21779 +       /*
21780 +        * Stupid polling for now:
21781 +        */
21782 +       mod_timer(&buf->timer, jiffies + 1);
21783  }
21784  
21785  /**
21786 @@ -353,6 +357,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
21787                 init_waitqueue_head(&buf->read_wait);
21788                 kref_init(&buf->kref);
21789                 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
21790 +               mod_timer(&buf->timer, jiffies + 1);
21791         } else
21792                 del_timer_sync(&buf->timer);
21793  
21794 @@ -736,15 +741,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
21795                 else
21796                         buf->early_bytes += buf->chan->subbuf_size -
21797                                             buf->padding[old_subbuf];
21798 -               smp_mb();
21799 -               if (waitqueue_active(&buf->read_wait))
21800 -                       /*
21801 -                        * Calling wake_up_interruptible() from here
21802 -                        * will deadlock if we happen to be logging
21803 -                        * from the scheduler (trying to re-grab
21804 -                        * rq->lock), so defer it.
21805 -                        */
21806 -                       mod_timer(&buf->timer, jiffies + 1);
21807         }
21808  
21809         old = buf->data;
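
relay_switch_subbuf() can run from scheduler context, so even the old deferred wake-up (arming the timer only when a reader was waiting) goes away; instead the timer simply re-arms itself every jiffy and the wake-up happens from timer context. The bare self-rearming pattern, using the same setup_timer()/mod_timer() calls as the hunk above and invented demo_* names:

    #include <linux/jiffies.h>
    #include <linux/timer.h>
    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(demo_read_wait);
    static struct timer_list demo_timer;

    static void demo_poll(unsigned long data)
    {
            /* Timer callback: waking the readers is safe in this context. */
            wake_up_interruptible(&demo_read_wait);
            /* "Stupid polling", as the patch comment says: retry next jiffy. */
            mod_timer(&demo_timer, jiffies + 1);
    }

    static void demo_start(void)
    {
            setup_timer(&demo_timer, demo_poll, 0UL);
            mod_timer(&demo_timer, jiffies + 1);
    }
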
21810 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
21811 index 67687973ce80..01b9994b367a 100644
21812 --- a/kernel/sched/Makefile
21813 +++ b/kernel/sched/Makefile
21814 @@ -13,7 +13,7 @@ endif
21815  
21816  obj-y += core.o loadavg.o clock.o cputime.o
21817  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
21818 -obj-y += wait.o completion.o idle.o
21819 +obj-y += wait.o swait.o swork.o completion.o idle.o
21820  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
21821  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
21822  obj-$(CONFIG_SCHEDSTATS) += stats.o
21823 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
21824 index 8d0f35debf35..b62cf6400fe0 100644
21825 --- a/kernel/sched/completion.c
21826 +++ b/kernel/sched/completion.c
21827 @@ -30,10 +30,10 @@ void complete(struct completion *x)
21828  {
21829         unsigned long flags;
21830  
21831 -       spin_lock_irqsave(&x->wait.lock, flags);
21832 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21833         x->done++;
21834 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
21835 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21836 +       swake_up_locked(&x->wait);
21837 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21838  }
21839  EXPORT_SYMBOL(complete);
21840  
21841 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
21842  {
21843         unsigned long flags;
21844  
21845 -       spin_lock_irqsave(&x->wait.lock, flags);
21846 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21847         x->done += UINT_MAX/2;
21848 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
21849 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21850 +       swake_up_all_locked(&x->wait);
21851 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21852  }
21853  EXPORT_SYMBOL(complete_all);
21854  
21855 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
21856                    long (*action)(long), long timeout, int state)
21857  {
21858         if (!x->done) {
21859 -               DECLARE_WAITQUEUE(wait, current);
21860 +               DECLARE_SWAITQUEUE(wait);
21861  
21862 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
21863 +               __prepare_to_swait(&x->wait, &wait);
21864                 do {
21865                         if (signal_pending_state(state, current)) {
21866                                 timeout = -ERESTARTSYS;
21867                                 break;
21868                         }
21869                         __set_current_state(state);
21870 -                       spin_unlock_irq(&x->wait.lock);
21871 +                       raw_spin_unlock_irq(&x->wait.lock);
21872                         timeout = action(timeout);
21873 -                       spin_lock_irq(&x->wait.lock);
21874 +                       raw_spin_lock_irq(&x->wait.lock);
21875                 } while (!x->done && timeout);
21876 -               __remove_wait_queue(&x->wait, &wait);
21877 +               __finish_swait(&x->wait, &wait);
21878                 if (!x->done)
21879                         return timeout;
21880         }
21881 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
21882  {
21883         might_sleep();
21884  
21885 -       spin_lock_irq(&x->wait.lock);
21886 +       raw_spin_lock_irq(&x->wait.lock);
21887         timeout = do_wait_for_common(x, action, timeout, state);
21888 -       spin_unlock_irq(&x->wait.lock);
21889 +       raw_spin_unlock_irq(&x->wait.lock);
21890         return timeout;
21891  }
21892  
21893 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
21894         if (!READ_ONCE(x->done))
21895                 return 0;
21896  
21897 -       spin_lock_irqsave(&x->wait.lock, flags);
21898 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21899         if (!x->done)
21900                 ret = 0;
21901         else
21902                 x->done--;
21903 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21904 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21905         return ret;
21906  }
21907  EXPORT_SYMBOL(try_wait_for_completion);
21908 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
21909          * after it's acquired the lock.
21910          */
21911         smp_rmb();
21912 -       spin_unlock_wait(&x->wait.lock);
21913 +       raw_spin_unlock_wait(&x->wait.lock);
21914         return true;
21915  }
21916  EXPORT_SYMBOL(completion_done);
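
Completion users are untouched by this conversion; the point is that ->wait is now a simple waitqueue protected by a raw spinlock, so complete() stays callable from truly atomic contexts on RT and wakes a single waiter there. Ordinary usage for context, with invented demo_* names that are not from the patch:

    #include <linux/completion.h>
    #include <linux/interrupt.h>

    static DECLARE_COMPLETION(demo_done);

    static irqreturn_t demo_irq_handler(int irq, void *dev_id)
    {
            /* Atomic context: complete() only takes the raw lock briefly. */
            complete(&demo_done);
            return IRQ_HANDLED;
    }

    static int demo_wait_for_device(void)
    {
            /* Task context: sleeps on the swait queue inside the completion. */
            return wait_for_completion_interruptible(&demo_done);
    }
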
21917 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
21918 index 20253dbc8610..e9b8d518202e 100644
21919 --- a/kernel/sched/core.c
21920 +++ b/kernel/sched/core.c
21921 @@ -260,7 +260,11 @@ late_initcall(sched_init_debug);
21922   * Number of tasks to iterate in a single balance run.
21923   * Limited because this is done with IRQs disabled.
21924   */
21925 +#ifndef CONFIG_PREEMPT_RT_FULL
21926  const_debug unsigned int sysctl_sched_nr_migrate = 32;
21927 +#else
21928 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
21929 +#endif
21930  
21931  /*
21932   * period over which we average the RT time consumption, measured
21933 @@ -438,6 +442,7 @@ static void init_rq_hrtick(struct rq *rq)
21934  
21935         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
21936         rq->hrtick_timer.function = hrtick;
21937 +       rq->hrtick_timer.irqsafe = 1;
21938  }
21939  #else  /* CONFIG_SCHED_HRTICK */
21940  static inline void hrtick_clear(struct rq *rq)
21941 @@ -542,7 +547,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
21942         head->lastp = &node->next;
21943  }
21944  
21945 -void wake_up_q(struct wake_q_head *head)
21946 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
21947  {
21948         struct wake_q_node *node = head->first;
21949  
21950 @@ -559,7 +564,10 @@ void wake_up_q(struct wake_q_head *head)
21951                  * wake_up_process() implies a wmb() to pair with the queueing
21952                  * in wake_q_add() so as not to miss wakeups.
21953                  */
21954 -               wake_up_process(task);
21955 +               if (sleeper)
21956 +                       wake_up_lock_sleeper(task);
21957 +               else
21958 +                       wake_up_process(task);
21959                 put_task_struct(task);
21960         }
21961  }
21962 @@ -595,6 +603,38 @@ void resched_curr(struct rq *rq)
21963                 trace_sched_wake_idle_without_ipi(cpu);
21964  }
21965  
21966 +#ifdef CONFIG_PREEMPT_LAZY
21967 +void resched_curr_lazy(struct rq *rq)
21968 +{
21969 +       struct task_struct *curr = rq->curr;
21970 +       int cpu;
21971 +
21972 +       if (!sched_feat(PREEMPT_LAZY)) {
21973 +               resched_curr(rq);
21974 +               return;
21975 +       }
21976 +
21977 +       lockdep_assert_held(&rq->lock);
21978 +
21979 +       if (test_tsk_need_resched(curr))
21980 +               return;
21981 +
21982 +       if (test_tsk_need_resched_lazy(curr))
21983 +               return;
21984 +
21985 +       set_tsk_need_resched_lazy(curr);
21986 +
21987 +       cpu = cpu_of(rq);
21988 +       if (cpu == smp_processor_id())
21989 +               return;
21990 +
21991 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
21992 +       smp_mb();
21993 +       if (!tsk_is_polling(curr))
21994 +               smp_send_reschedule(cpu);
21995 +}
21996 +#endif
21997 +
21998  void resched_cpu(int cpu)
21999  {
22000         struct rq *rq = cpu_rq(cpu);
22001 @@ -618,11 +658,14 @@ void resched_cpu(int cpu)
22002   */
22003  int get_nohz_timer_target(void)
22004  {
22005 -       int i, cpu = smp_processor_id();
22006 +       int i, cpu;
22007         struct sched_domain *sd;
22008  
22009 +       preempt_disable_rt();
22010 +       cpu = smp_processor_id();
22011 +
22012         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
22013 -               return cpu;
22014 +               goto preempt_en_rt;
22015  
22016         rcu_read_lock();
22017         for_each_domain(cpu, sd) {
22018 @@ -641,6 +684,8 @@ int get_nohz_timer_target(void)
22019                 cpu = housekeeping_any_cpu();
22020  unlock:
22021         rcu_read_unlock();
22022 +preempt_en_rt:
22023 +       preempt_enable_rt();
22024         return cpu;
22025  }
22026  /*
22027 @@ -1174,6 +1219,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22028  
22029         lockdep_assert_held(&p->pi_lock);
22030  
22031 +       if (__migrate_disabled(p)) {
22032 +               cpumask_copy(&p->cpus_allowed, new_mask);
22033 +               return;
22034 +       }
22035 +
22036         queued = task_on_rq_queued(p);
22037         running = task_current(rq, p);
22038  
22039 @@ -1196,6 +1246,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22040                 enqueue_task(rq, p, ENQUEUE_RESTORE);
22041  }
22042  
22043 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
22044 +static DEFINE_MUTEX(sched_down_mutex);
22045 +static cpumask_t sched_down_cpumask;
22046 +
22047 +void tell_sched_cpu_down_begin(int cpu)
22048 +{
22049 +       mutex_lock(&sched_down_mutex);
22050 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
22051 +       mutex_unlock(&sched_down_mutex);
22052 +}
22053 +
22054 +void tell_sched_cpu_down_done(int cpu)
22055 +{
22056 +       mutex_lock(&sched_down_mutex);
22057 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
22058 +       mutex_unlock(&sched_down_mutex);
22059 +}
22060 +
22061 +/**
22062 + * migrate_me - try to move the current task off this cpu
22063 + *
22064 + * Used by the pin_current_cpu() code to try to get tasks
22065 + * to move off the current CPU as it is going down.
22066 + * It will only move the task if the task isn't pinned to
22067 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
22068 + * and the task has to be in a RUNNING state. Otherwise the
22069 + * movement of the task will wake it up (change its state
22070 + * to running) when the task did not expect it.
22071 + *
22072 + * Returns 1 if it succeeded in moving the current task
22073 + *         0 otherwise.
22074 + */
22075 +int migrate_me(void)
22076 +{
22077 +       struct task_struct *p = current;
22078 +       struct migration_arg arg;
22079 +       struct cpumask *cpumask;
22080 +       struct cpumask *mask;
22081 +       unsigned long flags;
22082 +       unsigned int dest_cpu;
22083 +       struct rq *rq;
22084 +
22085 +       /*
22086 +        * We can not migrate tasks bounded to a CPU or tasks not
22087 +        * running. The movement of the task will wake it up.
22088 +        */
22089 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
22090 +               return 0;
22091 +
22092 +       mutex_lock(&sched_down_mutex);
22093 +       rq = task_rq_lock(p, &flags);
22094 +
22095 +       cpumask = this_cpu_ptr(&sched_cpumasks);
22096 +       mask = &p->cpus_allowed;
22097 +
22098 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
22099 +
22100 +       if (!cpumask_weight(cpumask)) {
22101 +               /* It's only on this CPU? */
22102 +               task_rq_unlock(rq, p, &flags);
22103 +               mutex_unlock(&sched_down_mutex);
22104 +               return 0;
22105 +       }
22106 +
22107 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
22108 +
22109 +       arg.task = p;
22110 +       arg.dest_cpu = dest_cpu;
22111 +
22112 +       task_rq_unlock(rq, p, &flags);
22113 +
22114 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
22115 +       tlb_migrate_finish(p->mm);
22116 +       mutex_unlock(&sched_down_mutex);
22117 +
22118 +       return 1;
22119 +}
22120 +
22121  /*
22122   * Change a given task's CPU affinity. Migrate the thread to a
22123   * proper CPU and schedule it away if the CPU it's executing on
22124 @@ -1235,7 +1363,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
22125         do_set_cpus_allowed(p, new_mask);
22126  
22127         /* Can the task run on the task's current CPU? If so, we're done */
22128 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
22129 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
22130                 goto out;
22131  
22132         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
22133 @@ -1411,6 +1539,18 @@ out:
22134         return ret;
22135  }
22136  
22137 +static bool check_task_state(struct task_struct *p, long match_state)
22138 +{
22139 +       bool match = false;
22140 +
22141 +       raw_spin_lock_irq(&p->pi_lock);
22142 +       if (p->state == match_state || p->saved_state == match_state)
22143 +               match = true;
22144 +       raw_spin_unlock_irq(&p->pi_lock);
22145 +
22146 +       return match;
22147 +}
22148 +
22149  /*
22150   * wait_task_inactive - wait for a thread to unschedule.
22151   *
22152 @@ -1455,7 +1595,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22153                  * is actually now running somewhere else!
22154                  */
22155                 while (task_running(rq, p)) {
22156 -                       if (match_state && unlikely(p->state != match_state))
22157 +                       if (match_state && !check_task_state(p, match_state))
22158                                 return 0;
22159                         cpu_relax();
22160                 }
22161 @@ -1470,7 +1610,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22162                 running = task_running(rq, p);
22163                 queued = task_on_rq_queued(p);
22164                 ncsw = 0;
22165 -               if (!match_state || p->state == match_state)
22166 +               if (!match_state || p->state == match_state ||
22167 +                   p->saved_state == match_state)
22168                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
22169                 task_rq_unlock(rq, p, &flags);
22170  
22171 @@ -1627,7 +1768,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22172  {
22173         lockdep_assert_held(&p->pi_lock);
22174  
22175 -       if (p->nr_cpus_allowed > 1)
22176 +       if (tsk_nr_cpus_allowed(p) > 1)
22177                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22178  
22179         /*
22180 @@ -1707,10 +1848,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
22181  {
22182         activate_task(rq, p, en_flags);
22183         p->on_rq = TASK_ON_RQ_QUEUED;
22184 -
22185 -       /* if a worker is waking up, notify workqueue */
22186 -       if (p->flags & PF_WQ_WORKER)
22187 -               wq_worker_waking_up(p, cpu_of(rq));
22188  }
22189  
22190  /*
22191 @@ -1937,8 +2074,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
22192          */
22193         smp_mb__before_spinlock();
22194         raw_spin_lock_irqsave(&p->pi_lock, flags);
22195 -       if (!(p->state & state))
22196 +       if (!(p->state & state)) {
22197 +               /*
22198 +                * The task might be running due to a spinlock sleeper
22199 +                * wakeup. Check the saved state and set it to running
22200 +                * if the wakeup condition is true.
22201 +                */
22202 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
22203 +                       if (p->saved_state & state) {
22204 +                               p->saved_state = TASK_RUNNING;
22205 +                               success = 1;
22206 +                       }
22207 +               }
22208                 goto out;
22209 +       }
22210 +
22211 +       /*
22212 +        * If this is a regular wakeup, then we can unconditionally
22213 +        * clear the saved state of a "lock sleeper".
22214 +        */
22215 +       if (!(wake_flags & WF_LOCK_SLEEPER))
22216 +               p->saved_state = TASK_RUNNING;
22217  
22218         trace_sched_waking(p);
22219  
22220 @@ -2030,52 +2186,6 @@ out:
22221  }
22222  
22223  /**
22224 - * try_to_wake_up_local - try to wake up a local task with rq lock held
22225 - * @p: the thread to be awakened
22226 - *
22227 - * Put @p on the run-queue if it's not already there. The caller must
22228 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
22229 - * the current task.
22230 - */
22231 -static void try_to_wake_up_local(struct task_struct *p)
22232 -{
22233 -       struct rq *rq = task_rq(p);
22234 -
22235 -       if (WARN_ON_ONCE(rq != this_rq()) ||
22236 -           WARN_ON_ONCE(p == current))
22237 -               return;
22238 -
22239 -       lockdep_assert_held(&rq->lock);
22240 -
22241 -       if (!raw_spin_trylock(&p->pi_lock)) {
22242 -               /*
22243 -                * This is OK, because current is on_cpu, which avoids it being
22244 -                * picked for load-balance and preemption/IRQs are still
22245 -                * disabled avoiding further scheduler activity on it and we've
22246 -                * not yet picked a replacement task.
22247 -                */
22248 -               lockdep_unpin_lock(&rq->lock);
22249 -               raw_spin_unlock(&rq->lock);
22250 -               raw_spin_lock(&p->pi_lock);
22251 -               raw_spin_lock(&rq->lock);
22252 -               lockdep_pin_lock(&rq->lock);
22253 -       }
22254 -
22255 -       if (!(p->state & TASK_NORMAL))
22256 -               goto out;
22257 -
22258 -       trace_sched_waking(p);
22259 -
22260 -       if (!task_on_rq_queued(p))
22261 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
22262 -
22263 -       ttwu_do_wakeup(rq, p, 0);
22264 -       ttwu_stat(p, smp_processor_id(), 0);
22265 -out:
22266 -       raw_spin_unlock(&p->pi_lock);
22267 -}
22268 -
22269 -/**
22270   * wake_up_process - Wake up a specific process
22271   * @p: The process to be woken up.
22272   *
22273 @@ -2093,6 +2203,18 @@ int wake_up_process(struct task_struct *p)
22274  }
22275  EXPORT_SYMBOL(wake_up_process);
22276  
22277 +/**
22278 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
22279 + * @p: The process to be woken up.
22280 + *
22281 + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
22282 + * the nature of the wakeup.
22283 + */
22284 +int wake_up_lock_sleeper(struct task_struct *p)
22285 +{
22286 +       return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
22287 +}
22288 +
22289  int wake_up_state(struct task_struct *p, unsigned int state)
22290  {
22291         return try_to_wake_up(p, state, 0);
22292 @@ -2279,6 +2401,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
22293         p->on_cpu = 0;
22294  #endif
22295         init_task_preempt_count(p);
22296 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
22297 +       task_thread_info(p)->preempt_lazy_count = 0;
22298 +#endif
22299  #ifdef CONFIG_SMP
22300         plist_node_init(&p->pushable_tasks, MAX_PRIO);
22301         RB_CLEAR_NODE(&p->pushable_dl_tasks);
22302 @@ -2603,8 +2728,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
22303         finish_arch_post_lock_switch();
22304  
22305         fire_sched_in_preempt_notifiers(current);
22306 +       /*
22307 +        * We use mmdrop_delayed() here so we don't have to do the
22308 +        * full __mmdrop() when we are the last user.
22309 +        */
22310         if (mm)
22311 -               mmdrop(mm);
22312 +               mmdrop_delayed(mm);
22313         if (unlikely(prev_state == TASK_DEAD)) {
22314                 if (prev->sched_class->task_dead)
22315                         prev->sched_class->task_dead(prev);
22316 @@ -2935,16 +3064,6 @@ u64 scheduler_tick_max_deferment(void)
22317  }
22318  #endif
22319  
22320 -notrace unsigned long get_parent_ip(unsigned long addr)
22321 -{
22322 -       if (in_lock_functions(addr)) {
22323 -               addr = CALLER_ADDR2;
22324 -               if (in_lock_functions(addr))
22325 -                       addr = CALLER_ADDR3;
22326 -       }
22327 -       return addr;
22328 -}
22329 -
22330  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
22331                                 defined(CONFIG_PREEMPT_TRACER))
22332  
22333 @@ -2966,7 +3085,7 @@ void preempt_count_add(int val)
22334                                 PREEMPT_MASK - 10);
22335  #endif
22336         if (preempt_count() == val) {
22337 -               unsigned long ip = get_parent_ip(CALLER_ADDR1);
22338 +               unsigned long ip = get_lock_parent_ip();
22339  #ifdef CONFIG_DEBUG_PREEMPT
22340                 current->preempt_disable_ip = ip;
22341  #endif
22342 @@ -2993,7 +3112,7 @@ void preempt_count_sub(int val)
22343  #endif
22344  
22345         if (preempt_count() == val)
22346 -               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
22347 +               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
22348         __preempt_count_sub(val);
22349  }
22350  EXPORT_SYMBOL(preempt_count_sub);
22351 @@ -3048,6 +3167,77 @@ static inline void schedule_debug(struct task_struct *prev)
22352         schedstat_inc(this_rq(), sched_count);
22353  }
22354  
22355 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
22356 +
22357 +void migrate_disable(void)
22358 +{
22359 +       struct task_struct *p = current;
22360 +
22361 +       if (in_atomic() || irqs_disabled()) {
22362 +#ifdef CONFIG_SCHED_DEBUG
22363 +               p->migrate_disable_atomic++;
22364 +#endif
22365 +               return;
22366 +       }
22367 +
22368 +#ifdef CONFIG_SCHED_DEBUG
22369 +       if (unlikely(p->migrate_disable_atomic)) {
22370 +               tracing_off();
22371 +               WARN_ON_ONCE(1);
22372 +       }
22373 +#endif
22374 +
22375 +       if (p->migrate_disable) {
22376 +               p->migrate_disable++;
22377 +               return;
22378 +       }
22379 +
22380 +       preempt_disable();
22381 +       preempt_lazy_disable();
22382 +       pin_current_cpu();
22383 +       p->migrate_disable = 1;
22384 +       preempt_enable();
22385 +}
22386 +EXPORT_SYMBOL(migrate_disable);
22387 +
22388 +void migrate_enable(void)
22389 +{
22390 +       struct task_struct *p = current;
22391 +
22392 +       if (in_atomic() || irqs_disabled()) {
22393 +#ifdef CONFIG_SCHED_DEBUG
22394 +               p->migrate_disable_atomic--;
22395 +#endif
22396 +               return;
22397 +       }
22398 +
22399 +#ifdef CONFIG_SCHED_DEBUG
22400 +       if (unlikely(p->migrate_disable_atomic)) {
22401 +               tracing_off();
22402 +               WARN_ON_ONCE(1);
22403 +       }
22404 +#endif
22405 +       WARN_ON_ONCE(p->migrate_disable <= 0);
22406 +
22407 +       if (p->migrate_disable > 1) {
22408 +               p->migrate_disable--;
22409 +               return;
22410 +       }
22411 +
22412 +       preempt_disable();
22413 +       /*
22414 +        * Clearing migrate_disable causes tsk_cpus_allowed to
22415 +        * show the task's original cpu affinity.
22416 +        */
22417 +       p->migrate_disable = 0;
22418 +
22419 +       unpin_current_cpu();
22420 +       preempt_enable();
22421 +       preempt_lazy_enable();
22422 +}
22423 +EXPORT_SYMBOL(migrate_enable);
22424 +#endif
22425 +
22426  /*
22427   * Pick up the highest-prio task:
22428   */
22429 @@ -3172,19 +3362,6 @@ static void __sched notrace __schedule(bool preempt)
22430                 } else {
22431                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
22432                         prev->on_rq = 0;
22433 -
22434 -                       /*
22435 -                        * If a worker went to sleep, notify and ask workqueue
22436 -                        * whether it wants to wake up a task to maintain
22437 -                        * concurrency.
22438 -                        */
22439 -                       if (prev->flags & PF_WQ_WORKER) {
22440 -                               struct task_struct *to_wakeup;
22441 -
22442 -                               to_wakeup = wq_worker_sleeping(prev, cpu);
22443 -                               if (to_wakeup)
22444 -                                       try_to_wake_up_local(to_wakeup);
22445 -                       }
22446                 }
22447                 switch_count = &prev->nvcsw;
22448         }
22449 @@ -3194,6 +3371,7 @@ static void __sched notrace __schedule(bool preempt)
22450  
22451         next = pick_next_task(rq, prev);
22452         clear_tsk_need_resched(prev);
22453 +       clear_tsk_need_resched_lazy(prev);
22454         clear_preempt_need_resched();
22455         rq->clock_skip_update = 0;
22456  
22457 @@ -3215,9 +3393,20 @@ static void __sched notrace __schedule(bool preempt)
22458  
22459  static inline void sched_submit_work(struct task_struct *tsk)
22460  {
22461 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
22462 +       if (!tsk->state)
22463                 return;
22464         /*
22465 +        * If a worker went to sleep, notify and ask workqueue whether
22466 +        * it wants to wake up a task to maintain concurrency.
22467 +        */
22468 +       if (tsk->flags & PF_WQ_WORKER)
22469 +               wq_worker_sleeping(tsk);
22470 +
22471 +
22472 +       if (tsk_is_pi_blocked(tsk))
22473 +               return;
22474 +
22475 +       /*
22476          * If we are going to sleep and we have plugged IO queued,
22477          * make sure to submit it to avoid deadlocks.
22478          */
22479 @@ -3225,6 +3414,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
22480                 blk_schedule_flush_plug(tsk);
22481  }
22482  
22483 +static void sched_update_worker(struct task_struct *tsk)
22484 +{
22485 +       if (tsk->flags & PF_WQ_WORKER)
22486 +               wq_worker_running(tsk);
22487 +}
22488 +
22489  asmlinkage __visible void __sched schedule(void)
22490  {
22491         struct task_struct *tsk = current;
22492 @@ -3235,6 +3430,7 @@ asmlinkage __visible void __sched schedule(void)
22493                 __schedule(false);
22494                 sched_preempt_enable_no_resched();
22495         } while (need_resched());
22496 +       sched_update_worker(tsk);
22497  }
22498  EXPORT_SYMBOL(schedule);
22499  
22500 @@ -3283,6 +3479,30 @@ static void __sched notrace preempt_schedule_common(void)
22501         } while (need_resched());
22502  }
22503  
22504 +#ifdef CONFIG_PREEMPT_LAZY
22505 +/*
22506 + * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is
22507 + * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as
22508 + * preempt_lazy_count counter >0.
22509 + */
22510 +static __always_inline int preemptible_lazy(void)
22511 +{
22512 +       if (test_thread_flag(TIF_NEED_RESCHED))
22513 +               return 1;
22514 +       if (current_thread_info()->preempt_lazy_count)
22515 +               return 0;
22516 +       return 1;
22517 +}
22518 +
22519 +#else
22520 +
22521 +static inline int preemptible_lazy(void)
22522 +{
22523 +       return 1;
22524 +}
22525 +
22526 +#endif
22527 +
22528  #ifdef CONFIG_PREEMPT
22529  /*
22530   * this is the entry point to schedule() from in-kernel preemption
22531 @@ -3297,6 +3517,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
22532          */
22533         if (likely(!preemptible()))
22534                 return;
22535 +       if (!preemptible_lazy())
22536 +               return;
22537  
22538         preempt_schedule_common();
22539  }
22540 @@ -3323,6 +3545,8 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
22541  
22542         if (likely(!preemptible()))
22543                 return;
22544 +       if (!preemptible_lazy())
22545 +               return;
22546  
22547         do {
22548                 preempt_disable_notrace();
22549 @@ -3332,7 +3556,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
22550                  * an infinite recursion.
22551                  */
22552                 prev_ctx = exception_enter();
22553 +               /*
22554 +                * The add/subtract must not be traced by the function
22555 +                * tracer. But we still want to account for the
22556 +                * preempt off latency tracer. Since the _notrace versions
22557 +                * of add/subtract skip the accounting for latency tracer
22558 +                * we must force it manually.
22559 +                */
22560 +               start_critical_timings();
22561                 __schedule(true);
22562 +               stop_critical_timings();
22563                 exception_exit(prev_ctx);
22564  
22565                 preempt_enable_no_resched_notrace();
22566 @@ -4676,6 +4909,7 @@ int __cond_resched_lock(spinlock_t *lock)
22567  }
22568  EXPORT_SYMBOL(__cond_resched_lock);
22569  
22570 +#ifndef CONFIG_PREEMPT_RT_FULL
22571  int __sched __cond_resched_softirq(void)
22572  {
22573         BUG_ON(!in_softirq());
22574 @@ -4689,6 +4923,7 @@ int __sched __cond_resched_softirq(void)
22575         return 0;
22576  }
22577  EXPORT_SYMBOL(__cond_resched_softirq);
22578 +#endif
22579  
22580  /**
22581   * yield - yield the current processor to other threads.
22582 @@ -5055,7 +5290,9 @@ void init_idle(struct task_struct *idle, int cpu)
22583  
22584         /* Set the preempt count _outside_ the spinlocks! */
22585         init_idle_preempt_count(idle, cpu);
22586 -
22587 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
22588 +       task_thread_info(idle)->preempt_lazy_count = 0;
22589 +#endif
22590         /*
22591          * The idle tasks have their own, simple scheduling class:
22592          */
22593 @@ -5196,6 +5433,8 @@ void sched_setnuma(struct task_struct *p, int nid)
22594  #endif /* CONFIG_NUMA_BALANCING */
22595  
22596  #ifdef CONFIG_HOTPLUG_CPU
22597 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
22598 +
22599  /*
22600   * Ensures that the idle task is using init_mm right before its cpu goes
22601   * offline.
22602 @@ -5210,7 +5449,11 @@ void idle_task_exit(void)
22603                 switch_mm(mm, &init_mm, current);
22604                 finish_arch_post_lock_switch();
22605         }
22606 -       mmdrop(mm);
22607 +       /*
22608 +        * Defer the cleanup to an alive cpu. On RT we can neither
22609 +        * call mmdrop() nor mmdrop_delayed() from here.
22610 +        */
22611 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
22612  }
22613  
22614  /*
22615 @@ -5583,6 +5826,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
22616  
22617         case CPU_DEAD:
22618                 calc_load_migrate(rq);
22619 +               if (per_cpu(idle_last_mm, cpu)) {
22620 +                       mmdrop(per_cpu(idle_last_mm, cpu));
22621 +                       per_cpu(idle_last_mm, cpu) = NULL;
22622 +               }
22623                 break;
22624  #endif
22625         }
22626 @@ -7566,7 +7813,7 @@ void __init sched_init(void)
22627  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
22628  static inline int preempt_count_equals(int preempt_offset)
22629  {
22630 -       int nested = preempt_count() + rcu_preempt_depth();
22631 +       int nested = preempt_count() + sched_rcu_preempt_depth();
22632  
22633         return (nested == preempt_offset);
22634  }
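The WF_LOCK_SLEEPER / saved_state handling added to try_to_wake_up() earlier in this kernel/sched/core.c diff is easier to see in isolation. Below is a minimal userspace C model of that decision (hypothetical names and simplified state values; it only illustrates the logic of the hunk above, it is not kernel code):

#include <stdio.h>

#define TASK_RUNNING          0x0000
#define TASK_INTERRUPTIBLE    0x0001
#define TASK_UNINTERRUPTIBLE  0x0002
#define WF_LOCK_SLEEPER       0x08

struct task {
	unsigned int state;        /* state the scheduler currently sees */
	unsigned int saved_state;  /* state saved while sleeping on an rtmutex-based spinlock */
};

/* Simplified model of the wakeup decision added by the patch. */
static int model_wake_up(struct task *p, unsigned int state, int wake_flags)
{
	if (!(p->state & state)) {
		/*
		 * The task is blocked on a "sleeping spinlock". A regular
		 * wakeup may still satisfy the saved state, so record that
		 * and let the lock code make it runnable later.
		 */
		if (!(wake_flags & WF_LOCK_SLEEPER) && (p->saved_state & state)) {
			p->saved_state = TASK_RUNNING;
			return 1;
		}
		return 0;
	}

	/* A regular wakeup unconditionally clears the saved lock-sleeper state. */
	if (!(wake_flags & WF_LOCK_SLEEPER))
		p->saved_state = TASK_RUNNING;

	p->state = TASK_RUNNING;
	return 1;
}

int main(void)
{
	struct task p = {
		.state       = TASK_UNINTERRUPTIBLE,  /* sleeping on a converted spinlock */
		.saved_state = TASK_INTERRUPTIBLE,    /* what it was doing before that */
	};

	/* A signal-style wakeup targets TASK_INTERRUPTIBLE: only saved_state matches. */
	printf("success=%d saved_state=%u\n",
	       model_wake_up(&p, TASK_INTERRUPTIBLE, 0), p.saved_state);
	return 0;
}

wake_up_lock_sleeper() above corresponds to calling the real function with WF_LOCK_SLEEPER set, which wakes the task without disturbing its saved_state.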
22635 diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
22636 index 5a75b08cfd85..5be58820465c 100644
22637 --- a/kernel/sched/cpudeadline.c
22638 +++ b/kernel/sched/cpudeadline.c
22639 @@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
22640         const struct sched_dl_entity *dl_se = &p->dl;
22641  
22642         if (later_mask &&
22643 -           cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
22644 +           cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) {
22645                 best_cpu = cpumask_any(later_mask);
22646                 goto out;
22647 -       } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
22648 +       } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) &&
22649                         dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
22650                 best_cpu = cpudl_maximum(cp);
22651                 if (later_mask)
22652 diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
22653 index 981fcd7dc394..11e9705bf937 100644
22654 --- a/kernel/sched/cpupri.c
22655 +++ b/kernel/sched/cpupri.c
22656 @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
22657                 if (skip)
22658                         continue;
22659  
22660 -               if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
22661 +               if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
22662                         continue;
22663  
22664                 if (lowest_mask) {
22665 -                       cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
22666 +                       cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
22667  
22668                         /*
22669                          * We have to ensure that we have at least one bit
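All of the p->cpus_allowed / p->nr_cpus_allowed conversions in these scheduler files go through two accessors whose names appear at the call sites above. As a hedged sketch of their intent (a migrate-disabled task is reported as pinned to a single CPU); the exact definitions live in the patch's <linux/sched.h> hunk and may differ in detail:

#ifdef CONFIG_PREEMPT_RT_FULL
static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
{
	/* A migrate-disabled task is temporarily pinned to its current CPU. */
	if (p->migrate_disable)
		return cpumask_of(task_cpu(p));
	return &p->cpus_allowed;
}

static inline int tsk_nr_cpus_allowed(struct task_struct *p)
{
	return p->migrate_disable ? 1 : p->nr_cpus_allowed;
}
#else
static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
{
	return &p->cpus_allowed;
}

static inline int tsk_nr_cpus_allowed(struct task_struct *p)
{
	return p->nr_cpus_allowed;
}
#endif

Using the accessors lets the RT push/pull and cpupri/cpudl code respect migrate_disable() without touching the task's real affinity mask.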
22670 diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
22671 index a1aecbedf5b1..558b98af241d 100644
22672 --- a/kernel/sched/cputime.c
22673 +++ b/kernel/sched/cputime.c
22674 @@ -685,7 +685,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
22675  {
22676         unsigned long long delta = vtime_delta(tsk);
22677  
22678 -       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
22679 +       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
22680         tsk->vtime_snap += delta;
22681  
22682         /* CHECKME: always safe to convert nsecs to cputime? */
22683 @@ -701,37 +701,37 @@ static void __vtime_account_system(struct task_struct *tsk)
22684  
22685  void vtime_account_system(struct task_struct *tsk)
22686  {
22687 -       write_seqlock(&tsk->vtime_seqlock);
22688 +       write_seqcount_begin(&tsk->vtime_seqcount);
22689         __vtime_account_system(tsk);
22690 -       write_sequnlock(&tsk->vtime_seqlock);
22691 +       write_seqcount_end(&tsk->vtime_seqcount);
22692  }
22693  
22694  void vtime_gen_account_irq_exit(struct task_struct *tsk)
22695  {
22696 -       write_seqlock(&tsk->vtime_seqlock);
22697 +       write_seqcount_begin(&tsk->vtime_seqcount);
22698         __vtime_account_system(tsk);
22699         if (context_tracking_in_user())
22700                 tsk->vtime_snap_whence = VTIME_USER;
22701 -       write_sequnlock(&tsk->vtime_seqlock);
22702 +       write_seqcount_end(&tsk->vtime_seqcount);
22703  }
22704  
22705  void vtime_account_user(struct task_struct *tsk)
22706  {
22707         cputime_t delta_cpu;
22708  
22709 -       write_seqlock(&tsk->vtime_seqlock);
22710 +       write_seqcount_begin(&tsk->vtime_seqcount);
22711         delta_cpu = get_vtime_delta(tsk);
22712         tsk->vtime_snap_whence = VTIME_SYS;
22713         account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
22714 -       write_sequnlock(&tsk->vtime_seqlock);
22715 +       write_seqcount_end(&tsk->vtime_seqcount);
22716  }
22717  
22718  void vtime_user_enter(struct task_struct *tsk)
22719  {
22720 -       write_seqlock(&tsk->vtime_seqlock);
22721 +       write_seqcount_begin(&tsk->vtime_seqcount);
22722         __vtime_account_system(tsk);
22723         tsk->vtime_snap_whence = VTIME_USER;
22724 -       write_sequnlock(&tsk->vtime_seqlock);
22725 +       write_seqcount_end(&tsk->vtime_seqcount);
22726  }
22727  
22728  void vtime_guest_enter(struct task_struct *tsk)
22729 @@ -743,19 +743,19 @@ void vtime_guest_enter(struct task_struct *tsk)
22730          * synchronization against the reader (task_gtime())
22731          * that can thus safely catch up with a tickless delta.
22732          */
22733 -       write_seqlock(&tsk->vtime_seqlock);
22734 +       write_seqcount_begin(&tsk->vtime_seqcount);
22735         __vtime_account_system(tsk);
22736         current->flags |= PF_VCPU;
22737 -       write_sequnlock(&tsk->vtime_seqlock);
22738 +       write_seqcount_end(&tsk->vtime_seqcount);
22739  }
22740  EXPORT_SYMBOL_GPL(vtime_guest_enter);
22741  
22742  void vtime_guest_exit(struct task_struct *tsk)
22743  {
22744 -       write_seqlock(&tsk->vtime_seqlock);
22745 +       write_seqcount_begin(&tsk->vtime_seqcount);
22746         __vtime_account_system(tsk);
22747         current->flags &= ~PF_VCPU;
22748 -       write_sequnlock(&tsk->vtime_seqlock);
22749 +       write_seqcount_end(&tsk->vtime_seqcount);
22750  }
22751  EXPORT_SYMBOL_GPL(vtime_guest_exit);
22752  
22753 @@ -768,24 +768,26 @@ void vtime_account_idle(struct task_struct *tsk)
22754  
22755  void arch_vtime_task_switch(struct task_struct *prev)
22756  {
22757 -       write_seqlock(&prev->vtime_seqlock);
22758 -       prev->vtime_snap_whence = VTIME_SLEEPING;
22759 -       write_sequnlock(&prev->vtime_seqlock);
22760 +       write_seqcount_begin(&prev->vtime_seqcount);
22761 +       prev->vtime_snap_whence = VTIME_INACTIVE;
22762 +       write_seqcount_end(&prev->vtime_seqcount);
22763  
22764 -       write_seqlock(&current->vtime_seqlock);
22765 +       write_seqcount_begin(&current->vtime_seqcount);
22766         current->vtime_snap_whence = VTIME_SYS;
22767         current->vtime_snap = sched_clock_cpu(smp_processor_id());
22768 -       write_sequnlock(&current->vtime_seqlock);
22769 +       write_seqcount_end(&current->vtime_seqcount);
22770  }
22771  
22772  void vtime_init_idle(struct task_struct *t, int cpu)
22773  {
22774         unsigned long flags;
22775  
22776 -       write_seqlock_irqsave(&t->vtime_seqlock, flags);
22777 +       local_irq_save(flags);
22778 +       write_seqcount_begin(&t->vtime_seqcount);
22779         t->vtime_snap_whence = VTIME_SYS;
22780         t->vtime_snap = sched_clock_cpu(cpu);
22781 -       write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
22782 +       write_seqcount_end(&t->vtime_seqcount);
22783 +       local_irq_restore(flags);
22784  }
22785  
22786  cputime_t task_gtime(struct task_struct *t)
22787 @@ -797,13 +799,13 @@ cputime_t task_gtime(struct task_struct *t)
22788                 return t->gtime;
22789  
22790         do {
22791 -               seq = read_seqbegin(&t->vtime_seqlock);
22792 +               seq = read_seqcount_begin(&t->vtime_seqcount);
22793  
22794                 gtime = t->gtime;
22795                 if (t->flags & PF_VCPU)
22796                         gtime += vtime_delta(t);
22797  
22798 -       } while (read_seqretry(&t->vtime_seqlock, seq));
22799 +       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
22800  
22801         return gtime;
22802  }
22803 @@ -826,7 +828,7 @@ fetch_task_cputime(struct task_struct *t,
22804                 *udelta = 0;
22805                 *sdelta = 0;
22806  
22807 -               seq = read_seqbegin(&t->vtime_seqlock);
22808 +               seq = read_seqcount_begin(&t->vtime_seqcount);
22809  
22810                 if (u_dst)
22811                         *u_dst = *u_src;
22812 @@ -834,7 +836,7 @@ fetch_task_cputime(struct task_struct *t,
22813                         *s_dst = *s_src;
22814  
22815                 /* Task is sleeping, nothing to add */
22816 -               if (t->vtime_snap_whence == VTIME_SLEEPING ||
22817 +               if (t->vtime_snap_whence == VTIME_INACTIVE ||
22818                     is_idle_task(t))
22819                         continue;
22820  
22821 @@ -850,7 +852,7 @@ fetch_task_cputime(struct task_struct *t,
22822                         if (t->vtime_snap_whence == VTIME_SYS)
22823                                 *sdelta = delta;
22824                 }
22825 -       } while (read_seqretry(&t->vtime_seqlock, seq));
22826 +       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
22827  }
22828  
22829  
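The vtime changes above replace the per-task seqlock with a raw seqcount plus explicit local_irq_save(), because the spinlock embedded in a seqlock becomes a sleeping lock on RT; the writers are naturally serialized since only the task's own CPU updates its vtime, so the lock half of the seqlock can be dropped. The reader side keeps the usual begin/retry loop; here is a small userspace C model of that pattern (illustrative names, and deliberately simplified memory ordering compared to the real seqcount primitives):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint vtime_seq;            /* even: no writer, odd: write in progress */
static unsigned long long vtime_snap;    /* datum protected by the sequence count */

static void writer_update(unsigned long long delta)
{
	atomic_fetch_add(&vtime_seq, 1);     /* write_seqcount_begin(): count goes odd */
	vtime_snap += delta;
	atomic_fetch_add(&vtime_seq, 1);     /* write_seqcount_end(): count goes even */
}

static unsigned long long reader_snapshot(void)
{
	unsigned int start;
	unsigned long long v;

	do {
		start = atomic_load(&vtime_seq);   /* read_seqcount_begin() */
		v = vtime_snap;
		/* retry if a writer was active or completed meanwhile */
	} while ((start & 1) || atomic_load(&vtime_seq) != start);

	return v;
}

int main(void)
{
	writer_update(1000);
	printf("snapshot: %llu\n", reader_snapshot());
	return 0;
}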
22830 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
22831 index 8b0a15e285f9..7a72e69fcf65 100644
22832 --- a/kernel/sched/deadline.c
22833 +++ b/kernel/sched/deadline.c
22834 @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
22835  {
22836         struct task_struct *p = dl_task_of(dl_se);
22837  
22838 -       if (p->nr_cpus_allowed > 1)
22839 +       if (tsk_nr_cpus_allowed(p) > 1)
22840                 dl_rq->dl_nr_migratory++;
22841  
22842         update_dl_migration(dl_rq);
22843 @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
22844  {
22845         struct task_struct *p = dl_task_of(dl_se);
22846  
22847 -       if (p->nr_cpus_allowed > 1)
22848 +       if (tsk_nr_cpus_allowed(p) > 1)
22849                 dl_rq->dl_nr_migratory--;
22850  
22851         update_dl_migration(dl_rq);
22852 @@ -697,6 +697,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
22853  
22854         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22855         timer->function = dl_task_timer;
22856 +       timer->irqsafe = 1;
22857  }
22858  
22859  static
22860 @@ -989,7 +990,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
22861  
22862         enqueue_dl_entity(&p->dl, pi_se, flags);
22863  
22864 -       if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
22865 +       if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
22866                 enqueue_pushable_dl_task(rq, p);
22867  }
22868  
22869 @@ -1067,9 +1068,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
22870          * try to make it stay here, it might be important.
22871          */
22872         if (unlikely(dl_task(curr)) &&
22873 -           (curr->nr_cpus_allowed < 2 ||
22874 +           (tsk_nr_cpus_allowed(curr) < 2 ||
22875              !dl_entity_preempt(&p->dl, &curr->dl)) &&
22876 -           (p->nr_cpus_allowed > 1)) {
22877 +           (tsk_nr_cpus_allowed(p) > 1)) {
22878                 int target = find_later_rq(p);
22879  
22880                 if (target != -1 &&
22881 @@ -1090,7 +1091,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
22882          * Current can't be migrated, useless to reschedule,
22883          * let's hope p can move out.
22884          */
22885 -       if (rq->curr->nr_cpus_allowed == 1 ||
22886 +       if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
22887             cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
22888                 return;
22889  
22890 @@ -1098,7 +1099,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
22891          * p is migratable, so let's not schedule it and
22892          * see if it is pushed or pulled somewhere else.
22893          */
22894 -       if (p->nr_cpus_allowed != 1 &&
22895 +       if (tsk_nr_cpus_allowed(p) != 1 &&
22896             cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
22897                 return;
22898  
22899 @@ -1212,7 +1213,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
22900  {
22901         update_curr_dl(rq);
22902  
22903 -       if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
22904 +       if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
22905                 enqueue_pushable_dl_task(rq, p);
22906  }
22907  
22908 @@ -1335,7 +1336,7 @@ static int find_later_rq(struct task_struct *task)
22909         if (unlikely(!later_mask))
22910                 return -1;
22911  
22912 -       if (task->nr_cpus_allowed == 1)
22913 +       if (tsk_nr_cpus_allowed(task) == 1)
22914                 return -1;
22915  
22916         /*
22917 @@ -1441,7 +1442,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
22918                 if (double_lock_balance(rq, later_rq)) {
22919                         if (unlikely(task_rq(task) != rq ||
22920                                      !cpumask_test_cpu(later_rq->cpu,
22921 -                                                      &task->cpus_allowed) ||
22922 +                                                      tsk_cpus_allowed(task)) ||
22923                                      task_running(rq, task) ||
22924                                      !task_on_rq_queued(task))) {
22925                                 double_unlock_balance(rq, later_rq);
22926 @@ -1480,7 +1481,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
22927  
22928         BUG_ON(rq->cpu != task_cpu(p));
22929         BUG_ON(task_current(rq, p));
22930 -       BUG_ON(p->nr_cpus_allowed <= 1);
22931 +       BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
22932  
22933         BUG_ON(!task_on_rq_queued(p));
22934         BUG_ON(!dl_task(p));
22935 @@ -1519,7 +1520,7 @@ retry:
22936          */
22937         if (dl_task(rq->curr) &&
22938             dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
22939 -           rq->curr->nr_cpus_allowed > 1) {
22940 +           tsk_nr_cpus_allowed(rq->curr) > 1) {
22941                 resched_curr(rq);
22942                 return 0;
22943         }
22944 @@ -1666,9 +1667,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
22945  {
22946         if (!task_running(rq, p) &&
22947             !test_tsk_need_resched(rq->curr) &&
22948 -           p->nr_cpus_allowed > 1 &&
22949 +           tsk_nr_cpus_allowed(p) > 1 &&
22950             dl_task(rq->curr) &&
22951 -           (rq->curr->nr_cpus_allowed < 2 ||
22952 +           (tsk_nr_cpus_allowed(rq->curr) < 2 ||
22953              !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
22954                 push_dl_tasks(rq);
22955         }
22956 @@ -1769,7 +1770,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
22957  {
22958         if (task_on_rq_queued(p) && rq->curr != p) {
22959  #ifdef CONFIG_SMP
22960 -               if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
22961 +               if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
22962                         queue_push_tasks(rq);
22963  #else
22964                 if (dl_task(rq->curr))
22965 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
22966 index 641511771ae6..a2d69b883623 100644
22967 --- a/kernel/sched/debug.c
22968 +++ b/kernel/sched/debug.c
22969 @@ -251,6 +251,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
22970         P(rt_throttled);
22971         PN(rt_time);
22972         PN(rt_runtime);
22973 +#ifdef CONFIG_SMP
22974 +       P(rt_nr_migratory);
22975 +#endif
22976  
22977  #undef PN
22978  #undef P
22979 @@ -635,6 +638,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
22980  #endif
22981         P(policy);
22982         P(prio);
22983 +#ifdef CONFIG_PREEMPT_RT_FULL
22984 +       P(migrate_disable);
22985 +#endif
22986 +       P(nr_cpus_allowed);
22987  #undef PN
22988  #undef __PN
22989  #undef P
22990 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
22991 index 8f258f437ac2..cf0a1adba6c6 100644
22992 --- a/kernel/sched/fair.c
22993 +++ b/kernel/sched/fair.c
22994 @@ -3166,7 +3166,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
22995         ideal_runtime = sched_slice(cfs_rq, curr);
22996         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
22997         if (delta_exec > ideal_runtime) {
22998 -               resched_curr(rq_of(cfs_rq));
22999 +               resched_curr_lazy(rq_of(cfs_rq));
23000                 /*
23001                  * The current task ran long enough, ensure it doesn't get
23002                  * re-elected due to buddy favours.
23003 @@ -3190,7 +3190,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
23004                 return;
23005  
23006         if (delta > ideal_runtime)
23007 -               resched_curr(rq_of(cfs_rq));
23008 +               resched_curr_lazy(rq_of(cfs_rq));
23009  }
23010  
23011  static void
23012 @@ -3330,7 +3330,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
23013          * validating it and just reschedule.
23014          */
23015         if (queued) {
23016 -               resched_curr(rq_of(cfs_rq));
23017 +               resched_curr_lazy(rq_of(cfs_rq));
23018                 return;
23019         }
23020         /*
23021 @@ -3512,7 +3512,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
23022          * hierarchy can be throttled
23023          */
23024         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
23025 -               resched_curr(rq_of(cfs_rq));
23026 +               resched_curr_lazy(rq_of(cfs_rq));
23027  }
23028  
23029  static __always_inline
23030 @@ -4124,7 +4124,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
23031  
23032                 if (delta < 0) {
23033                         if (rq->curr == p)
23034 -                               resched_curr(rq);
23035 +                               resched_curr_lazy(rq);
23036                         return;
23037                 }
23038                 hrtick_start(rq, delta);
23039 @@ -5213,7 +5213,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
23040         return;
23041  
23042  preempt:
23043 -       resched_curr(rq);
23044 +       resched_curr_lazy(rq);
23045         /*
23046          * Only set the backward buddy when the current task is still
23047          * on the rq. This can happen when a wakeup gets interleaved
23048 @@ -7964,7 +7964,7 @@ static void task_fork_fair(struct task_struct *p)
23049                  * 'current' within the tree based on its new key value.
23050                  */
23051                 swap(curr->vruntime, se->vruntime);
23052 -               resched_curr(rq);
23053 +               resched_curr_lazy(rq);
23054         }
23055  
23056         se->vruntime -= cfs_rq->min_vruntime;
23057 @@ -7989,7 +7989,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
23058          */
23059         if (rq->curr == p) {
23060                 if (p->prio > oldprio)
23061 -                       resched_curr(rq);
23062 +                       resched_curr_lazy(rq);
23063         } else
23064                 check_preempt_curr(rq, p, 0);
23065  }
23066 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
23067 index 69631fa46c2f..6d28fcd08872 100644
23068 --- a/kernel/sched/features.h
23069 +++ b/kernel/sched/features.h
23070 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
23071   */
23072  SCHED_FEAT(NONTASK_CAPACITY, true)
23073  
23074 +#ifdef CONFIG_PREEMPT_RT_FULL
23075 +SCHED_FEAT(TTWU_QUEUE, false)
23076 +# ifdef CONFIG_PREEMPT_LAZY
23077 +SCHED_FEAT(PREEMPT_LAZY, true)
23078 +# endif
23079 +#else
23080 +
23081  /*
23082   * Queue remote wakeups on the target CPU and process them
23083   * using the scheduler IPI. Reduces rq->lock contention/bounces.
23084   */
23085  SCHED_FEAT(TTWU_QUEUE, true)
23086 +#endif
23087  
23088  #ifdef HAVE_RT_PUSH_IPI
23089  /*
23090 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
23091 index 8ec86abe0ea1..8cf360d309ec 100644
23092 --- a/kernel/sched/rt.c
23093 +++ b/kernel/sched/rt.c
23094 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
23095  
23096         hrtimer_init(&rt_b->rt_period_timer,
23097                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23098 +       rt_b->rt_period_timer.irqsafe = 1;
23099         rt_b->rt_period_timer.function = sched_rt_period_timer;
23100  }
23101  
23102 @@ -93,6 +94,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
23103         rt_rq->push_cpu = nr_cpu_ids;
23104         raw_spin_lock_init(&rt_rq->push_lock);
23105         init_irq_work(&rt_rq->push_work, push_irq_work_func);
23106 +       rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
23107  #endif
23108  #endif /* CONFIG_SMP */
23109         /* We start in dequeued state, because no RT tasks are queued */
23110 @@ -326,7 +328,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
23111         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
23112  
23113         rt_rq->rt_nr_total++;
23114 -       if (p->nr_cpus_allowed > 1)
23115 +       if (tsk_nr_cpus_allowed(p) > 1)
23116                 rt_rq->rt_nr_migratory++;
23117  
23118         update_rt_migration(rt_rq);
23119 @@ -343,7 +345,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
23120         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
23121  
23122         rt_rq->rt_nr_total--;
23123 -       if (p->nr_cpus_allowed > 1)
23124 +       if (tsk_nr_cpus_allowed(p) > 1)
23125                 rt_rq->rt_nr_migratory--;
23126  
23127         update_rt_migration(rt_rq);
23128 @@ -1262,7 +1264,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
23129  
23130         enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
23131  
23132 -       if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
23133 +       if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
23134                 enqueue_pushable_task(rq, p);
23135  }
23136  
23137 @@ -1351,7 +1353,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
23138          * will have to sort it out.
23139          */
23140         if (curr && unlikely(rt_task(curr)) &&
23141 -           (curr->nr_cpus_allowed < 2 ||
23142 +           (tsk_nr_cpus_allowed(curr) < 2 ||
23143              curr->prio <= p->prio)) {
23144                 int target = find_lowest_rq(p);
23145  
23146 @@ -1375,7 +1377,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
23147          * Current can't be migrated, useless to reschedule,
23148          * let's hope p can move out.
23149          */
23150 -       if (rq->curr->nr_cpus_allowed == 1 ||
23151 +       if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
23152             !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
23153                 return;
23154  
23155 @@ -1383,7 +1385,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
23156          * p is migratable, so let's not schedule it and
23157          * see if it is pushed or pulled somewhere else.
23158          */
23159 -       if (p->nr_cpus_allowed != 1
23160 +       if (tsk_nr_cpus_allowed(p) != 1
23161             && cpupri_find(&rq->rd->cpupri, p, NULL))
23162                 return;
23163  
23164 @@ -1517,7 +1519,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
23165          * The previous task needs to be made eligible for pushing
23166          * if it is still active
23167          */
23168 -       if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
23169 +       if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
23170                 enqueue_pushable_task(rq, p);
23171  }
23172  
23173 @@ -1567,7 +1569,7 @@ static int find_lowest_rq(struct task_struct *task)
23174         if (unlikely(!lowest_mask))
23175                 return -1;
23176  
23177 -       if (task->nr_cpus_allowed == 1)
23178 +       if (tsk_nr_cpus_allowed(task) == 1)
23179                 return -1; /* No other targets possible */
23180  
23181         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
23182 @@ -1699,7 +1701,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
23183  
23184         BUG_ON(rq->cpu != task_cpu(p));
23185         BUG_ON(task_current(rq, p));
23186 -       BUG_ON(p->nr_cpus_allowed <= 1);
23187 +       BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
23188  
23189         BUG_ON(!task_on_rq_queued(p));
23190         BUG_ON(!rt_task(p));
23191 @@ -2059,9 +2061,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
23192  {
23193         if (!task_running(rq, p) &&
23194             !test_tsk_need_resched(rq->curr) &&
23195 -           p->nr_cpus_allowed > 1 &&
23196 +           tsk_nr_cpus_allowed(p) > 1 &&
23197             (dl_task(rq->curr) || rt_task(rq->curr)) &&
23198 -           (rq->curr->nr_cpus_allowed < 2 ||
23199 +           (tsk_nr_cpus_allowed(rq->curr) < 2 ||
23200              rq->curr->prio <= p->prio))
23201                 push_rt_tasks(rq);
23202  }
23203 @@ -2134,7 +2136,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
23204          */
23205         if (task_on_rq_queued(p) && rq->curr != p) {
23206  #ifdef CONFIG_SMP
23207 -               if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
23208 +               if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
23209                         queue_push_tasks(rq);
23210  #else
23211                 if (p->prio < rq->curr->prio)
23212 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
23213 index 0517abd7dd73..a8a9b156ea15 100644
23214 --- a/kernel/sched/sched.h
23215 +++ b/kernel/sched/sched.h
23216 @@ -1100,6 +1100,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
23217  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
23218  #define WF_FORK                0x02            /* child wakeup after fork */
23219  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
23220 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
23221  
23222  /*
23223   * To aid in avoiding the subversion of "niceness" due to uneven distribution
23224 @@ -1299,6 +1300,15 @@ extern void init_sched_fair_class(void);
23225  extern void resched_curr(struct rq *rq);
23226  extern void resched_cpu(int cpu);
23227  
23228 +#ifdef CONFIG_PREEMPT_LAZY
23229 +extern void resched_curr_lazy(struct rq *rq);
23230 +#else
23231 +static inline void resched_curr_lazy(struct rq *rq)
23232 +{
23233 +       resched_curr(rq);
23234 +}
23235 +#endif
23236 +
23237  extern struct rt_bandwidth def_rt_bandwidth;
23238  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
23239  
23240 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
23241 new file mode 100644
23242 index 000000000000..205fe36868f9
23243 --- /dev/null
23244 +++ b/kernel/sched/swait.c
23245 @@ -0,0 +1,143 @@
23246 +#include <linux/sched.h>
23247 +#include <linux/swait.h>
23248 +#include <linux/suspend.h>
23249 +
23250 +void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
23251 +                            struct lock_class_key *key)
23252 +{
23253 +       raw_spin_lock_init(&q->lock);
23254 +       lockdep_set_class_and_name(&q->lock, key, name);
23255 +       INIT_LIST_HEAD(&q->task_list);
23256 +}
23257 +EXPORT_SYMBOL(__init_swait_queue_head);
23258 +
23259 +/*
23260 + * The thing about the wake_up_state() return value; I think we can ignore it.
23261 + *
23262 + * If for some reason it would return 0, that means the previously waiting
23263 + * task is already running, so it will observe condition true (or has already).
23264 + */
23265 +void swake_up_locked(struct swait_queue_head *q)
23266 +{
23267 +       struct swait_queue *curr;
23268 +
23269 +       if (list_empty(&q->task_list))
23270 +               return;
23271 +
23272 +       curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
23273 +       wake_up_process(curr->task);
23274 +       list_del_init(&curr->task_list);
23275 +}
23276 +EXPORT_SYMBOL(swake_up_locked);
23277 +
23278 +void swake_up_all_locked(struct swait_queue_head *q)
23279 +{
23280 +       struct swait_queue *curr;
23281 +       int wakes = 0;
23282 +
23283 +       while (!list_empty(&q->task_list)) {
23284 +
23285 +               curr = list_first_entry(&q->task_list, typeof(*curr),
23286 +                                       task_list);
23287 +               wake_up_process(curr->task);
23288 +               list_del_init(&curr->task_list);
23289 +               wakes++;
23290 +       }
23291 +       if (pm_in_action)
23292 +               return;
23293 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
23294 +}
23295 +EXPORT_SYMBOL(swake_up_all_locked);
23296 +
23297 +void swake_up(struct swait_queue_head *q)
23298 +{
23299 +       unsigned long flags;
23300 +
23301 +       if (!swait_active(q))
23302 +               return;
23303 +
23304 +       raw_spin_lock_irqsave(&q->lock, flags);
23305 +       swake_up_locked(q);
23306 +       raw_spin_unlock_irqrestore(&q->lock, flags);
23307 +}
23308 +EXPORT_SYMBOL(swake_up);
23309 +
23310 +/*
23311 + * Does not allow usage from IRQ disabled, since we must be able to
23312 + * release IRQs to guarantee bounded hold time.
23313 + */
23314 +void swake_up_all(struct swait_queue_head *q)
23315 +{
23316 +       struct swait_queue *curr;
23317 +       LIST_HEAD(tmp);
23318 +
23319 +       if (!swait_active(q))
23320 +               return;
23321 +
23322 +       raw_spin_lock_irq(&q->lock);
23323 +       list_splice_init(&q->task_list, &tmp);
23324 +       while (!list_empty(&tmp)) {
23325 +               curr = list_first_entry(&tmp, typeof(*curr), task_list);
23326 +
23327 +               wake_up_state(curr->task, TASK_NORMAL);
23328 +               list_del_init(&curr->task_list);
23329 +
23330 +               if (list_empty(&tmp))
23331 +                       break;
23332 +
23333 +               raw_spin_unlock_irq(&q->lock);
23334 +               raw_spin_lock_irq(&q->lock);
23335 +       }
23336 +       raw_spin_unlock_irq(&q->lock);
23337 +}
23338 +EXPORT_SYMBOL(swake_up_all);
23339 +
23340 +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
23341 +{
23342 +       wait->task = current;
23343 +       if (list_empty(&wait->task_list))
23344 +               list_add(&wait->task_list, &q->task_list);
23345 +}
23346 +
23347 +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
23348 +{
23349 +       unsigned long flags;
23350 +
23351 +       raw_spin_lock_irqsave(&q->lock, flags);
23352 +       __prepare_to_swait(q, wait);
23353 +       set_current_state(state);
23354 +       raw_spin_unlock_irqrestore(&q->lock, flags);
23355 +}
23356 +EXPORT_SYMBOL(prepare_to_swait);
23357 +
23358 +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
23359 +{
23360 +       if (signal_pending_state(state, current))
23361 +               return -ERESTARTSYS;
23362 +
23363 +       prepare_to_swait(q, wait, state);
23364 +
23365 +       return 0;
23366 +}
23367 +EXPORT_SYMBOL(prepare_to_swait_event);
23368 +
23369 +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
23370 +{
23371 +       __set_current_state(TASK_RUNNING);
23372 +       if (!list_empty(&wait->task_list))
23373 +               list_del_init(&wait->task_list);
23374 +}
23375 +
23376 +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
23377 +{
23378 +       unsigned long flags;
23379 +
23380 +       __set_current_state(TASK_RUNNING);
23381 +
23382 +       if (!list_empty_careful(&wait->task_list)) {
23383 +               raw_spin_lock_irqsave(&q->lock, flags);
23384 +               list_del_init(&wait->task_list);
23385 +               raw_spin_unlock_irqrestore(&q->lock, flags);
23386 +       }
23387 +}
23388 +EXPORT_SYMBOL(finish_swait);
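A typical waiter/waker pair built on the primitives defined above would look like the sketch below (kernel-context sketch; the 'done' flag is illustrative, and the DECLARE_SWAIT_QUEUE_HEAD/DECLARE_SWAITQUEUE initializers are assumed to come from the <linux/swait.h> header added elsewhere in this patch):

static DECLARE_SWAIT_QUEUE_HEAD(my_wq);    /* assumed <linux/swait.h> helper */
static bool done;

static void waiter(void)
{
	DECLARE_SWAITQUEUE(wait);          /* assumed on-stack swait_queue initializer */

	for (;;) {
		prepare_to_swait(&my_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(done))
			break;
		schedule();                /* woken by swake_up() below */
	}
	finish_swait(&my_wq, &wait);
}

static void waker(void)
{
	WRITE_ONCE(done, true);
	swake_up(&my_wq);                  /* wakes at most one waiter */
}

Unlike the regular waitqueues, swake_up() and swake_up_locked() wake a single task while holding only a raw spinlock, and swake_up_all() drops and retakes the lock between wakeups, which keeps lock hold times bounded on PREEMPT_RT.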
23389 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
23390 new file mode 100644
23391 index 000000000000..1950f40ca725
23392 --- /dev/null
23393 +++ b/kernel/sched/swork.c
23394 @@ -0,0 +1,173 @@
23395 +/*
23396 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
23397 + *
23398 + * Provides a framework for enqueuing callbacks from irq context
23399 + * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context.
23400 + */
23401 +
23402 +#include <linux/swait.h>
23403 +#include <linux/swork.h>
23404 +#include <linux/kthread.h>
23405 +#include <linux/slab.h>
23406 +#include <linux/spinlock.h>
23407 +#include <linux/export.h>
23408 +
23409 +#define SWORK_EVENT_PENDING     (1 << 0)
23410 +
23411 +static DEFINE_MUTEX(worker_mutex);
23412 +static struct sworker *glob_worker;
23413 +
23414 +struct sworker {
23415 +       struct list_head events;
23416 +       struct swait_queue_head wq;
23417 +
23418 +       raw_spinlock_t lock;
23419 +
23420 +       struct task_struct *task;
23421 +       int refs;
23422 +};
23423 +
23424 +static bool swork_readable(struct sworker *worker)
23425 +{
23426 +       bool r;
23427 +
23428 +       if (kthread_should_stop())
23429 +               return true;
23430 +
23431 +       raw_spin_lock_irq(&worker->lock);
23432 +       r = !list_empty(&worker->events);
23433 +       raw_spin_unlock_irq(&worker->lock);
23434 +
23435 +       return r;
23436 +}
23437 +
23438 +static int swork_kthread(void *arg)
23439 +{
23440 +       struct sworker *worker = arg;
23441 +
23442 +       for (;;) {
23443 +               swait_event_interruptible(worker->wq,
23444 +                                       swork_readable(worker));
23445 +               if (kthread_should_stop())
23446 +                       break;
23447 +
23448 +               raw_spin_lock_irq(&worker->lock);
23449 +               while (!list_empty(&worker->events)) {
23450 +                       struct swork_event *sev;
23451 +
23452 +                       sev = list_first_entry(&worker->events,
23453 +                                       struct swork_event, item);
23454 +                       list_del(&sev->item);
23455 +                       raw_spin_unlock_irq(&worker->lock);
23456 +
23457 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
23458 +                                                        &sev->flags));
23459 +                       sev->func(sev);
23460 +                       raw_spin_lock_irq(&worker->lock);
23461 +               }
23462 +               raw_spin_unlock_irq(&worker->lock);
23463 +       }
23464 +       return 0;
23465 +}
23466 +
23467 +static struct sworker *swork_create(void)
23468 +{
23469 +       struct sworker *worker;
23470 +
23471 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
23472 +       if (!worker)
23473 +               return ERR_PTR(-ENOMEM);
23474 +
23475 +       INIT_LIST_HEAD(&worker->events);
23476 +       raw_spin_lock_init(&worker->lock);
23477 +       init_swait_queue_head(&worker->wq);
23478 +
23479 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
23480 +       if (IS_ERR(worker->task)) {
23481 +               kfree(worker);
23482 +               return ERR_PTR(-ENOMEM);
23483 +       }
23484 +
23485 +       return worker;
23486 +}
23487 +
23488 +static void swork_destroy(struct sworker *worker)
23489 +{
23490 +       kthread_stop(worker->task);
23491 +
23492 +       WARN_ON(!list_empty(&worker->events));
23493 +       kfree(worker);
23494 +}
23495 +
23496 +/**
23497 + * swork_queue - queue swork
23498 + *
23499 + * Returns %false if @sev was already on a queue, %true otherwise.
23500 + *
23501 + * The work is queued and processed on a random CPU
23502 + */
23503 +bool swork_queue(struct swork_event *sev)
23504 +{
23505 +       unsigned long flags;
23506 +
23507 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
23508 +               return false;
23509 +
23510 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
23511 +       list_add_tail(&sev->item, &glob_worker->events);
23512 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
23513 +
23514 +       swake_up(&glob_worker->wq);
23515 +       return true;
23516 +}
23517 +EXPORT_SYMBOL_GPL(swork_queue);
23518 +
23519 +/**
23520 + * swork_get - get an instance of the sworker
23521 + *
23522 + * Returns a negative error code if the initialization of the worker
23523 + * failed, %0 otherwise.
23524 + *
23525 + */
23526 +int swork_get(void)
23527 +{
23528 +       struct sworker *worker;
23529 +
23530 +       mutex_lock(&worker_mutex);
23531 +       if (!glob_worker) {
23532 +               worker = swork_create();
23533 +               if (IS_ERR(worker)) {
23534 +                       mutex_unlock(&worker_mutex);
23535 +                       return -ENOMEM;
23536 +               }
23537 +
23538 +               glob_worker = worker;
23539 +       }
23540 +
23541 +       glob_worker->refs++;
23542 +       mutex_unlock(&worker_mutex);
23543 +
23544 +       return 0;
23545 +}
23546 +EXPORT_SYMBOL_GPL(swork_get);
23547 +
23548 +/**
23549 + * swork_put - puts an instance of the sworker
23550 + *
23551 + * Will destroy the sworker thread. This function must not be called until all
23552 + * queued events have been completed.
23553 + */
23554 +void swork_put(void)
23555 +{
23556 +       mutex_lock(&worker_mutex);
23557 +
23558 +       glob_worker->refs--;
23559 +       if (glob_worker->refs > 0)
23560 +               goto out;
23561 +
23562 +       swork_destroy(glob_worker);
23563 +       glob_worker = NULL;
23564 +out:
23565 +       mutex_unlock(&worker_mutex);
23566 +}
23567 +EXPORT_SYMBOL_GPL(swork_put);
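A hedged sketch of how a driver could use this framework to defer a callback from hard-IRQ context into the kswork thread (the struct swork_event fields are initialized by hand here, matching the fields used in swork.c above; a real user would presumably use an init helper from <linux/swork.h>):

static struct swork_event my_event;

static void my_deferred_func(struct swork_event *sev)
{
	/* Runs in kthread context, so it may sleep even on PREEMPT_RT_FULL. */
	pr_info("deferred work executed\n");
}

static int my_probe(void)
{
	int ret;

	ret = swork_get();                 /* create or take a reference on the kswork thread */
	if (ret)
		return ret;

	INIT_LIST_HEAD(&my_event.item);
	my_event.flags = 0;
	my_event.func = my_deferred_func;
	return 0;
}

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	swork_queue(&my_event);            /* IRQ-safe: raw spinlock plus swake_up() */
	return IRQ_HANDLED;
}

static void my_remove(void)
{
	swork_put();                       /* drop the reference; the thread is destroyed when unused */
}

swork_queue() returns false if the event is still pending from an earlier call, so a single static event can safely be queued from a fast-firing interrupt.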
23568 diff --git a/kernel/signal.c b/kernel/signal.c
23569 index f3f1f7a972fd..bc2c990f3f63 100644
23570 --- a/kernel/signal.c
23571 +++ b/kernel/signal.c
23572 @@ -14,6 +14,7 @@
23573  #include <linux/export.h>
23574  #include <linux/init.h>
23575  #include <linux/sched.h>
23576 +#include <linux/sched/rt.h>
23577  #include <linux/fs.h>
23578  #include <linux/tty.h>
23579  #include <linux/binfmts.h>
23580 @@ -352,13 +353,30 @@ static bool task_participate_group_stop(struct task_struct *task)
23581         return false;
23582  }
23583  
23584 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
23585 +{
23586 +       struct sigqueue *q = t->sigqueue_cache;
23587 +
23588 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
23589 +               return NULL;
23590 +       return q;
23591 +}
23592 +
23593 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
23594 +{
23595 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
23596 +               return 0;
23597 +       return 1;
23598 +}
23599 +
23600  /*
23601   * allocate a new signal queue record
23602   * - this may be called without locks if and only if t == current, otherwise an
23603   *   appropriate lock must be held to stop the target task from exiting
23604   */
23605  static struct sigqueue *
23606 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
23607 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
23608 +                   int override_rlimit, int fromslab)
23609  {
23610         struct sigqueue *q = NULL;
23611         struct user_struct *user;
23612 @@ -375,7 +393,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
23613         if (override_rlimit ||
23614             atomic_read(&user->sigpending) <=
23615                         task_rlimit(t, RLIMIT_SIGPENDING)) {
23616 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
23617 +               if (!fromslab)
23618 +                       q = get_task_cache(t);
23619 +               if (!q)
23620 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
23621         } else {
23622                 print_dropped_signal(sig);
23623         }
23624 @@ -392,6 +413,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
23625         return q;
23626  }
23627  
23628 +static struct sigqueue *
23629 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
23630 +                int override_rlimit)
23631 +{
23632 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
23633 +}
23634 +
23635  static void __sigqueue_free(struct sigqueue *q)
23636  {
23637         if (q->flags & SIGQUEUE_PREALLOC)
23638 @@ -401,6 +429,21 @@ static void __sigqueue_free(struct sigqueue *q)
23639         kmem_cache_free(sigqueue_cachep, q);
23640  }
23641  
23642 +static void sigqueue_free_current(struct sigqueue *q)
23643 +{
23644 +       struct user_struct *up;
23645 +
23646 +       if (q->flags & SIGQUEUE_PREALLOC)
23647 +               return;
23648 +
23649 +       up = q->user;
23650 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
23651 +               atomic_dec(&up->sigpending);
23652 +               free_uid(up);
23653 +       } else
23654 +                 __sigqueue_free(q);
23655 +}
23656 +
23657  void flush_sigqueue(struct sigpending *queue)
23658  {
23659         struct sigqueue *q;
23660 @@ -414,6 +457,21 @@ void flush_sigqueue(struct sigpending *queue)
23661  }
23662  
23663  /*
23664 + * Called from __exit_signal. Flush tsk->pending and
23665 + * tsk->sigqueue_cache
23666 + */
23667 +void flush_task_sigqueue(struct task_struct *tsk)
23668 +{
23669 +       struct sigqueue *q;
23670 +
23671 +       flush_sigqueue(&tsk->pending);
23672 +
23673 +       q = get_task_cache(tsk);
23674 +       if (q)
23675 +               kmem_cache_free(sigqueue_cachep, q);
23676 +}
23677 +
23678 +/*
23679   * Flush all pending signals for this kthread.
23680   */
23681  void flush_signals(struct task_struct *t)
23682 @@ -525,7 +583,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
23683  still_pending:
23684                 list_del_init(&first->list);
23685                 copy_siginfo(info, &first->info);
23686 -               __sigqueue_free(first);
23687 +               sigqueue_free_current(first);
23688         } else {
23689                 /*
23690                  * Ok, it wasn't in the queue.  This must be
23691 @@ -560,6 +618,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
23692  {
23693         int signr;
23694  
23695 +       WARN_ON_ONCE(tsk != current);
23696 +
23697         /* We only dequeue private signals from ourselves, we don't let
23698          * signalfd steal them
23699          */
23700 @@ -1156,8 +1216,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
23701   * We don't want to have recursive SIGSEGV's etc, for example,
23702   * that is why we also clear SIGNAL_UNKILLABLE.
23703   */
23704 -int
23705 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23706 +static int
23707 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23708  {
23709         unsigned long int flags;
23710         int ret, blocked, ignored;
23711 @@ -1182,6 +1242,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23712         return ret;
23713  }
23714  
23715 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23716 +{
23717 +/*
23718 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
23719 + * since it cannot enable preemption, and the signal code's spin_locks
23720 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME, which will
23721 + * send the signal on exit of the trap.
23722 + */
23723 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
23724 +       if (in_atomic()) {
23725 +               if (WARN_ON_ONCE(t != current))
23726 +                       return 0;
23727 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
23728 +                       return 0;
23729 +
23730 +               if (is_si_special(info)) {
23731 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
23732 +                       t->forced_info.si_signo = sig;
23733 +                       t->forced_info.si_errno = 0;
23734 +                       t->forced_info.si_code = SI_KERNEL;
23735 +                       t->forced_info.si_pid = 0;
23736 +                       t->forced_info.si_uid = 0;
23737 +               } else {
23738 +                       t->forced_info = *info;
23739 +               }
23740 +
23741 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
23742 +               return 0;
23743 +       }
23744 +#endif
23745 +       return do_force_sig_info(sig, info, t);
23746 +}
23747 +
23748  /*
23749   * Nuke all other threads in the group.
23750   */
23751 @@ -1216,12 +1309,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
23752                  * Disable interrupts early to avoid deadlocks.
23753                  * See rcu_read_unlock() comment header for details.
23754                  */
23755 -               local_irq_save(*flags);
23756 +               local_irq_save_nort(*flags);
23757                 rcu_read_lock();
23758                 sighand = rcu_dereference(tsk->sighand);
23759                 if (unlikely(sighand == NULL)) {
23760                         rcu_read_unlock();
23761 -                       local_irq_restore(*flags);
23762 +                       local_irq_restore_nort(*flags);
23763                         break;
23764                 }
23765                 /*
23766 @@ -1242,7 +1335,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
23767                 }
23768                 spin_unlock(&sighand->siglock);
23769                 rcu_read_unlock();
23770 -               local_irq_restore(*flags);
23771 +               local_irq_restore_nort(*flags);
23772         }
23773  
23774         return sighand;
23775 @@ -1485,7 +1578,8 @@ EXPORT_SYMBOL(kill_pid);
23776   */
23777  struct sigqueue *sigqueue_alloc(void)
23778  {
23779 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
23780 +       /* Preallocated sigqueue objects always come from the slabcache! */
23781 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
23782  
23783         if (q)
23784                 q->flags |= SIGQUEUE_PREALLOC;
23785 @@ -1846,15 +1940,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
23786                 if (gstop_done && ptrace_reparented(current))
23787                         do_notify_parent_cldstop(current, false, why);
23788  
23789 -               /*
23790 -                * Don't want to allow preemption here, because
23791 -                * sys_ptrace() needs this task to be inactive.
23792 -                *
23793 -                * XXX: implement read_unlock_no_resched().
23794 -                */
23795 -               preempt_disable();
23796                 read_unlock(&tasklist_lock);
23797 -               preempt_enable_no_resched();
23798                 freezable_schedule();
23799         } else {
23800                 /*
23801 diff --git a/kernel/softirq.c b/kernel/softirq.c
23802 index 479e4436f787..cb9c1d5dee10 100644
23803 --- a/kernel/softirq.c
23804 +++ b/kernel/softirq.c
23805 @@ -21,10 +21,12 @@
23806  #include <linux/freezer.h>
23807  #include <linux/kthread.h>
23808  #include <linux/rcupdate.h>
23809 +#include <linux/delay.h>
23810  #include <linux/ftrace.h>
23811  #include <linux/smp.h>
23812  #include <linux/smpboot.h>
23813  #include <linux/tick.h>
23814 +#include <linux/locallock.h>
23815  #include <linux/irq.h>
23816  
23817  #define CREATE_TRACE_POINTS
23818 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
23819  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
23820  
23821  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
23822 +#ifdef CONFIG_PREEMPT_RT_FULL
23823 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
23824 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
23825 +#endif
23826  
23827  const char * const softirq_to_name[NR_SOFTIRQS] = {
23828         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
23829         "TASKLET", "SCHED", "HRTIMER", "RCU"
23830  };
23831  
23832 +#ifdef CONFIG_NO_HZ_COMMON
23833 +# ifdef CONFIG_PREEMPT_RT_FULL
23834 +
23835 +struct softirq_runner {
23836 +       struct task_struct *runner[NR_SOFTIRQS];
23837 +};
23838 +
23839 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
23840 +
23841 +static inline void softirq_set_runner(unsigned int sirq)
23842 +{
23843 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23844 +
23845 +       sr->runner[sirq] = current;
23846 +}
23847 +
23848 +static inline void softirq_clr_runner(unsigned int sirq)
23849 +{
23850 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23851 +
23852 +       sr->runner[sirq] = NULL;
23853 +}
23854 +
23855 +/*
23856 + * On preempt-rt a softirq running context might be blocked on a
23857 + * lock. There might be no other runnable task on this CPU because the
23858 + * lock owner runs on some other CPU. So we have to go into idle with
23859 + * the pending bit set. Therefore we need to check this, otherwise we
23860 + * warn about false positives, which confuses users and defeats the
23861 + * whole purpose of this test.
23862 + *
23863 + * This code is called with interrupts disabled.
23864 + */
23865 +void softirq_check_pending_idle(void)
23866 +{
23867 +       static int rate_limit;
23868 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23869 +       u32 warnpending;
23870 +       int i;
23871 +
23872 +       if (rate_limit >= 10)
23873 +               return;
23874 +
23875 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
23876 +       for (i = 0; i < NR_SOFTIRQS; i++) {
23877 +               struct task_struct *tsk = sr->runner[i];
23878 +
23879 +               /*
23880 +                * The wakeup code in rtmutex.c wakes up the task
23881 +                * _before_ it sets pi_blocked_on to NULL under
23882 +                * tsk->pi_lock. So we need to check for both: state
23883 +                * and pi_blocked_on.
23884 +                */
23885 +               if (tsk) {
23886 +                       raw_spin_lock(&tsk->pi_lock);
23887 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
23888 +                               /* Clear all bits pending in that task */
23889 +                               warnpending &= ~(tsk->softirqs_raised);
23890 +                               warnpending &= ~(1 << i);
23891 +                       }
23892 +                       raw_spin_unlock(&tsk->pi_lock);
23893 +               }
23894 +       }
23895 +
23896 +       if (warnpending) {
23897 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23898 +                      warnpending);
23899 +               rate_limit++;
23900 +       }
23901 +}
23902 +# else
23903 +/*
23904 + * On !PREEMPT_RT we just printk rate limited:
23905 + */
23906 +void softirq_check_pending_idle(void)
23907 +{
23908 +       static int rate_limit;
23909 +
23910 +       if (rate_limit < 10 &&
23911 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
23912 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23913 +                      local_softirq_pending());
23914 +               rate_limit++;
23915 +       }
23916 +}
23917 +# endif
23918 +
23919 +#else /* !CONFIG_NO_HZ_COMMON */
23920 +static inline void softirq_set_runner(unsigned int sirq) { }
23921 +static inline void softirq_clr_runner(unsigned int sirq) { }
23922 +#endif
23923 +
23924  /*
23925   * we cannot loop indefinitely here to avoid userspace starvation,
23926   * but we also don't want to introduce a worst case 1/HZ latency
23927 @@ -77,6 +175,79 @@ static void wakeup_softirqd(void)
23928                 wake_up_process(tsk);
23929  }
23930  
23931 +#ifdef CONFIG_PREEMPT_RT_FULL
23932 +static void wakeup_timer_softirqd(void)
23933 +{
23934 +       /* Interrupts are disabled: no need to stop preemption */
23935 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
23936 +
23937 +       if (tsk && tsk->state != TASK_RUNNING)
23938 +               wake_up_process(tsk);
23939 +}
23940 +#endif
23941 +
23942 +static void handle_softirq(unsigned int vec_nr)
23943 +{
23944 +       struct softirq_action *h = softirq_vec + vec_nr;
23945 +       int prev_count;
23946 +
23947 +       prev_count = preempt_count();
23948 +
23949 +       kstat_incr_softirqs_this_cpu(vec_nr);
23950 +
23951 +       trace_softirq_entry(vec_nr);
23952 +       h->action(h);
23953 +       trace_softirq_exit(vec_nr);
23954 +       if (unlikely(prev_count != preempt_count())) {
23955 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
23956 +                      vec_nr, softirq_to_name[vec_nr], h->action,
23957 +                      prev_count, preempt_count());
23958 +               preempt_count_set(prev_count);
23959 +       }
23960 +}
23961 +
23962 +#ifndef CONFIG_PREEMPT_RT_FULL
23963 +static inline int ksoftirqd_softirq_pending(void)
23964 +{
23965 +       return local_softirq_pending();
23966 +}
23967 +
23968 +static void handle_pending_softirqs(u32 pending)
23969 +{
23970 +       struct softirq_action *h = softirq_vec;
23971 +       int softirq_bit;
23972 +
23973 +       local_irq_enable();
23974 +
23975 +       h = softirq_vec;
23976 +
23977 +       while ((softirq_bit = ffs(pending))) {
23978 +               unsigned int vec_nr;
23979 +
23980 +               h += softirq_bit - 1;
23981 +               vec_nr = h - softirq_vec;
23982 +               handle_softirq(vec_nr);
23983 +
23984 +               h++;
23985 +               pending >>= softirq_bit;
23986 +       }
23987 +
23988 +       rcu_bh_qs();
23989 +       local_irq_disable();
23990 +}
23991 +
23992 +static void run_ksoftirqd(unsigned int cpu)
23993 +{
23994 +       local_irq_disable();
23995 +       if (ksoftirqd_softirq_pending()) {
23996 +               __do_softirq();
23997 +               local_irq_enable();
23998 +               cond_resched_rcu_qs();
23999 +               return;
24000 +       }
24001 +       local_irq_enable();
24002 +}
24003 +
24004  /*
24005   * preempt_count and SOFTIRQ_OFFSET usage:
24006   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
24007 @@ -116,9 +287,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
24008  
24009         if (preempt_count() == cnt) {
24010  #ifdef CONFIG_DEBUG_PREEMPT
24011 -               current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
24012 +               current->preempt_disable_ip = get_lock_parent_ip();
24013  #endif
24014 -               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
24015 +               trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
24016         }
24017  }
24018  EXPORT_SYMBOL(__local_bh_disable_ip);
24019 @@ -232,10 +403,8 @@ asmlinkage __visible void __do_softirq(void)
24020         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
24021         unsigned long old_flags = current->flags;
24022         int max_restart = MAX_SOFTIRQ_RESTART;
24023 -       struct softirq_action *h;
24024         bool in_hardirq;
24025         __u32 pending;
24026 -       int softirq_bit;
24027  
24028         /*
24029          * Mask out PF_MEMALLOC s current task context is borrowed for the
24030 @@ -254,36 +423,7 @@ restart:
24031         /* Reset the pending bitmask before enabling irqs */
24032         set_softirq_pending(0);
24033  
24034 -       local_irq_enable();
24035 -
24036 -       h = softirq_vec;
24037 -
24038 -       while ((softirq_bit = ffs(pending))) {
24039 -               unsigned int vec_nr;
24040 -               int prev_count;
24041 -
24042 -               h += softirq_bit - 1;
24043 -
24044 -               vec_nr = h - softirq_vec;
24045 -               prev_count = preempt_count();
24046 -
24047 -               kstat_incr_softirqs_this_cpu(vec_nr);
24048 -
24049 -               trace_softirq_entry(vec_nr);
24050 -               h->action(h);
24051 -               trace_softirq_exit(vec_nr);
24052 -               if (unlikely(prev_count != preempt_count())) {
24053 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
24054 -                              vec_nr, softirq_to_name[vec_nr], h->action,
24055 -                              prev_count, preempt_count());
24056 -                       preempt_count_set(prev_count);
24057 -               }
24058 -               h++;
24059 -               pending >>= softirq_bit;
24060 -       }
24061 -
24062 -       rcu_bh_qs();
24063 -       local_irq_disable();
24064 +       handle_pending_softirqs(pending);
24065  
24066         pending = local_softirq_pending();
24067         if (pending) {
24068 @@ -320,6 +460,310 @@ asmlinkage __visible void do_softirq(void)
24069  }
24070  
24071  /*
24072 + * This function must run with irqs disabled!
24073 + */
24074 +void raise_softirq_irqoff(unsigned int nr)
24075 +{
24076 +       __raise_softirq_irqoff(nr);
24077 +
24078 +       /*
24079 +        * If we're in an interrupt or softirq, we're done
24080 +        * (this also catches softirq-disabled code). We will
24081 +        * actually run the softirq once we return from
24082 +        * the irq or softirq.
24083 +        *
24084 +        * Otherwise we wake up ksoftirqd to make sure we
24085 +        * schedule the softirq soon.
24086 +        */
24087 +       if (!in_interrupt())
24088 +               wakeup_softirqd();
24089 +}
24090 +
24091 +void __raise_softirq_irqoff(unsigned int nr)
24092 +{
24093 +       trace_softirq_raise(nr);
24094 +       or_softirq_pending(1UL << nr);
24095 +}
24096 +
24097 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
24098 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
24099 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
24100 +
24101 +#else /* !PREEMPT_RT_FULL */
24102 +
24103 +/*
24104 + * On RT we serialize softirq execution with a cpu local lock per softirq
24105 + */
24106 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
24107 +
24108 +void __init softirq_early_init(void)
24109 +{
24110 +       int i;
24111 +
24112 +       for (i = 0; i < NR_SOFTIRQS; i++)
24113 +               local_irq_lock_init(local_softirq_locks[i]);
24114 +}
24115 +
24116 +static void lock_softirq(int which)
24117 +{
24118 +       local_lock(local_softirq_locks[which]);
24119 +}
24120 +
24121 +static void unlock_softirq(int which)
24122 +{
24123 +       local_unlock(local_softirq_locks[which]);
24124 +}
24125 +
24126 +static void do_single_softirq(int which)
24127 +{
24128 +       unsigned long old_flags = current->flags;
24129 +
24130 +       current->flags &= ~PF_MEMALLOC;
24131 +       vtime_account_irq_enter(current);
24132 +       current->flags |= PF_IN_SOFTIRQ;
24133 +       lockdep_softirq_enter();
24134 +       local_irq_enable();
24135 +       handle_softirq(which);
24136 +       local_irq_disable();
24137 +       lockdep_softirq_exit();
24138 +       current->flags &= ~PF_IN_SOFTIRQ;
24139 +       vtime_account_irq_enter(current);
24140 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
24141 +}
24142 +
24143 +/*
24144 + * Called with interrupts disabled. Process softirqs which were raised
24145 + * in current context (or on behalf of ksoftirqd).
24146 + */
24147 +static void do_current_softirqs(void)
24148 +{
24149 +       while (current->softirqs_raised) {
24150 +               int i = __ffs(current->softirqs_raised);
24151 +               unsigned int pending, mask = (1U << i);
24152 +
24153 +               current->softirqs_raised &= ~mask;
24154 +               local_irq_enable();
24155 +
24156 +               /*
24157 +                * If the lock is contended, we boost the owner to
24158 +                * process the softirq or leave the critical section
24159 +                * now.
24160 +                */
24161 +               lock_softirq(i);
24162 +               local_irq_disable();
24163 +               softirq_set_runner(i);
24164 +               /*
24165 +                * Check against the local_softirq_pending() bits
24166 +                * whether we still need to process this or whether someone
24167 +                * else has taken care of it.
24168 +                */
24169 +               pending = local_softirq_pending();
24170 +               if (pending & mask) {
24171 +                       set_softirq_pending(pending & ~mask);
24172 +                       do_single_softirq(i);
24173 +               }
24174 +               softirq_clr_runner(i);
24175 +               WARN_ON(current->softirq_nestcnt != 1);
24176 +               local_irq_enable();
24177 +               unlock_softirq(i);
24178 +               local_irq_disable();
24179 +       }
24180 +}
24181 +
24182 +void __local_bh_disable(void)
24183 +{
24184 +       if (++current->softirq_nestcnt == 1)
24185 +               migrate_disable();
24186 +}
24187 +EXPORT_SYMBOL(__local_bh_disable);
24188 +
24189 +void __local_bh_enable(void)
24190 +{
24191 +       if (WARN_ON(current->softirq_nestcnt == 0))
24192 +               return;
24193 +
24194 +       local_irq_disable();
24195 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
24196 +               do_current_softirqs();
24197 +       local_irq_enable();
24198 +
24199 +       if (--current->softirq_nestcnt == 0)
24200 +               migrate_enable();
24201 +}
24202 +EXPORT_SYMBOL(__local_bh_enable);
24203 +
24204 +void _local_bh_enable(void)
24205 +{
24206 +       if (WARN_ON(current->softirq_nestcnt == 0))
24207 +               return;
24208 +       if (--current->softirq_nestcnt == 0)
24209 +               migrate_enable();
24210 +}
24211 +EXPORT_SYMBOL(_local_bh_enable);
24212 +
24213 +int in_serving_softirq(void)
24214 +{
24215 +       return current->flags & PF_IN_SOFTIRQ;
24216 +}
24217 +EXPORT_SYMBOL(in_serving_softirq);
24218 +
24219 +/* Called with preemption disabled */
24220 +static void run_ksoftirqd(unsigned int cpu)
24221 +{
24222 +       local_irq_disable();
24223 +       current->softirq_nestcnt++;
24224 +
24225 +       do_current_softirqs();
24226 +       current->softirq_nestcnt--;
24227 +       local_irq_enable();
24228 +       cond_resched_rcu_qs();
24229 +}
24230 +
24231 +/*
24232 + * Called from netif_rx_ni(). Preemption enabled, but migration
24233 + * disabled. So the cpu can't go away under us.
24234 + */
24235 +void thread_do_softirq(void)
24236 +{
24237 +       if (!in_serving_softirq() && current->softirqs_raised) {
24238 +               current->softirq_nestcnt++;
24239 +               do_current_softirqs();
24240 +               current->softirq_nestcnt--;
24241 +       }
24242 +}
24243 +
24244 +static void do_raise_softirq_irqoff(unsigned int nr)
24245 +{
24246 +       unsigned int mask;
24247 +
24248 +       mask = 1UL << nr;
24249 +
24250 +       trace_softirq_raise(nr);
24251 +       or_softirq_pending(mask);
24252 +
24253 +       /*
24254 +        * If we are not in a hard interrupt and inside a bh disabled
24255 +        * region, we simply raise the flag on current. local_bh_enable()
24256 +        * will make sure that the softirq is executed. Otherwise we
24257 +        * delegate it to ksoftirqd.
24258 +        */
24259 +       if (!in_irq() && current->softirq_nestcnt)
24260 +               current->softirqs_raised |= mask;
24261 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
24262 +               return;
24263 +
24264 +       if (mask & TIMER_SOFTIRQS)
24265 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24266 +       else
24267 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24268 +}
24269 +
24270 +static void wakeup_proper_softirq(unsigned int nr)
24271 +{
24272 +       if ((1UL << nr) & TIMER_SOFTIRQS)
24273 +               wakeup_timer_softirqd();
24274 +       else
24275 +               wakeup_softirqd();
24276 +}
24277 +
24278 +
24279 +void __raise_softirq_irqoff(unsigned int nr)
24280 +{
24281 +       do_raise_softirq_irqoff(nr);
24282 +       if (!in_irq() && !current->softirq_nestcnt)
24283 +               wakeup_proper_softirq(nr);
24284 +}
24285 +
24286 +/*
24287 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
24288 + */
24289 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
24290 +{
24291 +       unsigned int mask;
24292 +
24293 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
24294 +                        !__this_cpu_read(ktimer_softirqd)))
24295 +               return;
24296 +       mask = 1UL << nr;
24297 +
24298 +       trace_softirq_raise(nr);
24299 +       or_softirq_pending(mask);
24300 +       if (mask & TIMER_SOFTIRQS)
24301 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24302 +       else
24303 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24304 +       wakeup_proper_softirq(nr);
24305 +}
24306 +
24307 +/*
24308 + * This function must run with irqs disabled!
24309 + */
24310 +void raise_softirq_irqoff(unsigned int nr)
24311 +{
24312 +       do_raise_softirq_irqoff(nr);
24313 +
24314 +       /*
24315 +        * If we're in a hard interrupt we let the irq return code deal
24316 +        * with the wakeup of ksoftirqd.
24317 +        */
24318 +       if (in_irq())
24319 +               return;
24320 +       /*
24321 +        * If we are in thread context but outside of a bh disabled
24322 +        * region, we need to wake ksoftirqd as well.
24323 +        *
24324 +        * CHECKME: Some of the places which do that could be wrapped
24325 +        * into local_bh_disable/enable pairs. Though it's unclear
24326 +        * whether this is worth the effort. To find those places just
24327 +        * raise a WARN() if the condition is met.
24328 +        */
24329 +       if (!current->softirq_nestcnt)
24330 +               wakeup_proper_softirq(nr);
24331 +}
24332 +
24333 +static inline int ksoftirqd_softirq_pending(void)
24334 +{
24335 +       return current->softirqs_raised;
24336 +}
24337 +
24338 +static inline void local_bh_disable_nort(void) { }
24339 +static inline void _local_bh_enable_nort(void) { }
24340 +
24341 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
24342 +{
24343 +       /* Take over all but timer pending softirqs when starting */
24344 +       local_irq_disable();
24345 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
24346 +       local_irq_enable();
24347 +}
24348 +
24349 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
24350 +{
24351 +       struct sched_param param = { .sched_priority = 1 };
24352 +
24353 +       sched_setscheduler(current, SCHED_FIFO, &param);
24354 +
24355 +       /* Take over timer pending softirqs when starting */
24356 +       local_irq_disable();
24357 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
24358 +       local_irq_enable();
24359 +}
24360 +
24361 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
24362 +                                                   bool online)
24363 +{
24364 +       struct sched_param param = { .sched_priority = 0 };
24365 +
24366 +       sched_setscheduler(current, SCHED_NORMAL, &param);
24367 +}
24368 +
24369 +static int ktimer_softirqd_should_run(unsigned int cpu)
24370 +{
24371 +       return current->softirqs_raised;
24372 +}
24373 +
24374 +#endif /* PREEMPT_RT_FULL */
24375 +/*
24376   * Enter an interrupt context.
24377   */
24378  void irq_enter(void)
24379 @@ -330,9 +774,9 @@ void irq_enter(void)
24380                  * Prevent raise_softirq from needlessly waking up ksoftirqd
24381                  * here, as softirq will be serviced on return from interrupt.
24382                  */
24383 -               local_bh_disable();
24384 +               local_bh_disable_nort();
24385                 tick_irq_enter();
24386 -               _local_bh_enable();
24387 +               _local_bh_enable_nort();
24388         }
24389  
24390         __irq_enter();
24391 @@ -340,6 +784,7 @@ void irq_enter(void)
24392  
24393  static inline void invoke_softirq(void)
24394  {
24395 +#ifndef CONFIG_PREEMPT_RT_FULL
24396         if (!force_irqthreads) {
24397  #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
24398                 /*
24399 @@ -359,6 +804,18 @@ static inline void invoke_softirq(void)
24400         } else {
24401                 wakeup_softirqd();
24402         }
24403 +#else /* PREEMPT_RT_FULL */
24404 +       unsigned long flags;
24405 +
24406 +       local_irq_save(flags);
24407 +       if (__this_cpu_read(ksoftirqd) &&
24408 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
24409 +               wakeup_softirqd();
24410 +       if (__this_cpu_read(ktimer_softirqd) &&
24411 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
24412 +               wakeup_timer_softirqd();
24413 +       local_irq_restore(flags);
24414 +#endif
24415  }
24416  
24417  static inline void tick_irq_exit(void)
24418 @@ -395,26 +852,6 @@ void irq_exit(void)
24419         trace_hardirq_exit(); /* must be last! */
24420  }
24421  
24422 -/*
24423 - * This function must run with irqs disabled!
24424 - */
24425 -inline void raise_softirq_irqoff(unsigned int nr)
24426 -{
24427 -       __raise_softirq_irqoff(nr);
24428 -
24429 -       /*
24430 -        * If we're in an interrupt or softirq, we're done
24431 -        * (this also catches softirq-disabled code). We will
24432 -        * actually run the softirq once we return from
24433 -        * the irq or softirq.
24434 -        *
24435 -        * Otherwise we wake up ksoftirqd to make sure we
24436 -        * schedule the softirq soon.
24437 -        */
24438 -       if (!in_interrupt())
24439 -               wakeup_softirqd();
24440 -}
24441 -
24442  void raise_softirq(unsigned int nr)
24443  {
24444         unsigned long flags;
24445 @@ -424,12 +861,6 @@ void raise_softirq(unsigned int nr)
24446         local_irq_restore(flags);
24447  }
24448  
24449 -void __raise_softirq_irqoff(unsigned int nr)
24450 -{
24451 -       trace_softirq_raise(nr);
24452 -       or_softirq_pending(1UL << nr);
24453 -}
24454 -
24455  void open_softirq(int nr, void (*action)(struct softirq_action *))
24456  {
24457         softirq_vec[nr].action = action;
24458 @@ -446,15 +877,45 @@ struct tasklet_head {
24459  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
24460  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
24461  
24462 +static void inline
24463 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
24464 +{
24465 +       if (tasklet_trylock(t)) {
24466 +again:
24467 +               /* We may have been preempted before tasklet_trylock
24468 +                * and __tasklet_action may have already run.
24469 +                * So double check the sched bit while the tasklet
24470 +                * is locked before adding it to the list.
24471 +                */
24472 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
24473 +                       t->next = NULL;
24474 +                       *head->tail = t;
24475 +                       head->tail = &(t->next);
24476 +                       raise_softirq_irqoff(nr);
24477 +                       tasklet_unlock(t);
24478 +               } else {
24479 +                       /* This is subtle. If we hit the corner case above,
24480 +                        * it is possible that we get preempted right here,
24481 +                        * and another task has successfully called
24482 +                        * tasklet_schedule(), then this function, and
24483 +                        * failed on the trylock. Thus we must be sure,
24484 +                        * before releasing the tasklet lock, that the
24485 +                        * SCHED_BIT is clear. Otherwise the tasklet
24486 +                        * may get its SCHED_BIT set but not be added to
24487 +                        * the list.
24488 +                        */
24489 +                       if (!tasklet_tryunlock(t))
24490 +                               goto again;
24491 +               }
24492 +       }
24493 +}
24494 +
24495  void __tasklet_schedule(struct tasklet_struct *t)
24496  {
24497         unsigned long flags;
24498  
24499         local_irq_save(flags);
24500 -       t->next = NULL;
24501 -       *__this_cpu_read(tasklet_vec.tail) = t;
24502 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
24503 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
24504 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
24505         local_irq_restore(flags);
24506  }
24507  EXPORT_SYMBOL(__tasklet_schedule);
24508 @@ -464,10 +925,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
24509         unsigned long flags;
24510  
24511         local_irq_save(flags);
24512 -       t->next = NULL;
24513 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
24514 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
24515 -       raise_softirq_irqoff(HI_SOFTIRQ);
24516 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
24517         local_irq_restore(flags);
24518  }
24519  EXPORT_SYMBOL(__tasklet_hi_schedule);
24520 @@ -476,82 +934,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
24521  {
24522         BUG_ON(!irqs_disabled());
24523  
24524 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
24525 -       __this_cpu_write(tasklet_hi_vec.head, t);
24526 -       __raise_softirq_irqoff(HI_SOFTIRQ);
24527 +       __tasklet_hi_schedule(t);
24528  }
24529  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
24530  
24531 -static void tasklet_action(struct softirq_action *a)
24532 +void  tasklet_enable(struct tasklet_struct *t)
24533  {
24534 -       struct tasklet_struct *list;
24535 +       if (!atomic_dec_and_test(&t->count))
24536 +               return;
24537 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
24538 +               tasklet_schedule(t);
24539 +}
24540 +EXPORT_SYMBOL(tasklet_enable);
24541  
24542 -       local_irq_disable();
24543 -       list = __this_cpu_read(tasklet_vec.head);
24544 -       __this_cpu_write(tasklet_vec.head, NULL);
24545 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24546 -       local_irq_enable();
24547 +static void __tasklet_action(struct softirq_action *a,
24548 +                            struct tasklet_struct *list)
24549 +{
24550 +       int loops = 1000000;
24551  
24552         while (list) {
24553                 struct tasklet_struct *t = list;
24554  
24555                 list = list->next;
24556  
24557 -               if (tasklet_trylock(t)) {
24558 -                       if (!atomic_read(&t->count)) {
24559 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24560 -                                                       &t->state))
24561 -                                       BUG();
24562 -                               t->func(t->data);
24563 -                               tasklet_unlock(t);
24564 -                               continue;
24565 -                       }
24566 -                       tasklet_unlock(t);
24567 +               /*
24568 +                * Should always succeed - after a tasklist got on the
24569 +                * Should always succeed - after a tasklet got on the
24570 +                * nothing but the tasklet softirq it got queued to can
24571 +                * lock it:
24572 +                */
24573 +               if (!tasklet_trylock(t)) {
24574 +                       WARN_ON(1);
24575 +                       continue;
24576                 }
24577  
24578 -               local_irq_disable();
24579                 t->next = NULL;
24580 -               *__this_cpu_read(tasklet_vec.tail) = t;
24581 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
24582 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
24583 -               local_irq_enable();
24584 +
24585 +               /*
24586 +                * If we cannot handle the tasklet because it's disabled,
24587 +                * mark it as pending. tasklet_enable() will later
24588 +                * re-schedule the tasklet.
24589 +                */
24590 +               if (unlikely(atomic_read(&t->count))) {
24591 +out_disabled:
24592 +                       /* implicit unlock: */
24593 +                       wmb();
24594 +                       t->state = TASKLET_STATEF_PENDING;
24595 +                       continue;
24596 +               }
24597 +
24598 +               /*
24599 +                * From this point on, the tasklet might be rescheduled
24600 +                * on another CPU, but it can only be added to another
24601 +                * CPU's tasklet list if we unlock the tasklet (which we
24602 +                * don't do yet).
24603 +                */
24604 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24605 +                       WARN_ON(1);
24606 +
24607 +again:
24608 +               t->func(t->data);
24609 +
24610 +               /*
24611 +                * Try to unlock the tasklet. We must use cmpxchg, because
24612 +                * another CPU might have scheduled or disabled the tasklet.
24613 +                * We only allow the STATE_RUN -> 0 transition here.
24614 +                */
24615 +               while (!tasklet_tryunlock(t)) {
24616 +                       /*
24617 +                        * If it got disabled meanwhile, bail out:
24618 +                        */
24619 +                       if (atomic_read(&t->count))
24620 +                               goto out_disabled;
24621 +                       /*
24622 +                        * If it got scheduled meanwhile, re-execute
24623 +                        * the tasklet function:
24624 +                        */
24625 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24626 +                               goto again;
24627 +                       if (!--loops) {
24628 +                               printk("hm, tasklet state: %08lx\n", t->state);
24629 +                               WARN_ON(1);
24630 +                               tasklet_unlock(t);
24631 +                               break;
24632 +                       }
24633 +               }
24634         }
24635  }
24636  
24637 +static void tasklet_action(struct softirq_action *a)
24638 +{
24639 +       struct tasklet_struct *list;
24640 +
24641 +       local_irq_disable();
24642 +
24643 +       list = __this_cpu_read(tasklet_vec.head);
24644 +       __this_cpu_write(tasklet_vec.head, NULL);
24645 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24646 +
24647 +       local_irq_enable();
24648 +
24649 +       __tasklet_action(a, list);
24650 +}
24651 +
24652  static void tasklet_hi_action(struct softirq_action *a)
24653  {
24654         struct tasklet_struct *list;
24655  
24656         local_irq_disable();
24657 +
24658         list = __this_cpu_read(tasklet_hi_vec.head);
24659         __this_cpu_write(tasklet_hi_vec.head, NULL);
24660         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
24661 -       local_irq_enable();
24662  
24663 -       while (list) {
24664 -               struct tasklet_struct *t = list;
24665 -
24666 -               list = list->next;
24667 -
24668 -               if (tasklet_trylock(t)) {
24669 -                       if (!atomic_read(&t->count)) {
24670 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24671 -                                                       &t->state))
24672 -                                       BUG();
24673 -                               t->func(t->data);
24674 -                               tasklet_unlock(t);
24675 -                               continue;
24676 -                       }
24677 -                       tasklet_unlock(t);
24678 -               }
24679 +       local_irq_enable();
24680  
24681 -               local_irq_disable();
24682 -               t->next = NULL;
24683 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
24684 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24685 -               __raise_softirq_irqoff(HI_SOFTIRQ);
24686 -               local_irq_enable();
24687 -       }
24688 +       __tasklet_action(a, list);
24689  }
24690  
24691  void tasklet_init(struct tasklet_struct *t,
24692 @@ -572,7 +1070,7 @@ void tasklet_kill(struct tasklet_struct *t)
24693  
24694         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
24695                 do {
24696 -                       yield();
24697 +                       msleep(1);
24698                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
24699         }
24700         tasklet_unlock_wait(t);
24701 @@ -646,25 +1144,26 @@ void __init softirq_init(void)
24702         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
24703  }
24704  
24705 -static int ksoftirqd_should_run(unsigned int cpu)
24706 -{
24707 -       return local_softirq_pending();
24708 -}
24709 -
24710 -static void run_ksoftirqd(unsigned int cpu)
24711 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
24712 +void tasklet_unlock_wait(struct tasklet_struct *t)
24713  {
24714 -       local_irq_disable();
24715 -       if (local_softirq_pending()) {
24716 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
24717                 /*
24718 -                * We can safely run softirq on inline stack, as we are not deep
24719 -                * in the task stack here.
24720 +                * Hack for now to avoid this busy-loop:
24721                  */
24722 -               __do_softirq();
24723 -               local_irq_enable();
24724 -               cond_resched_rcu_qs();
24725 -               return;
24726 +#ifdef CONFIG_PREEMPT_RT_FULL
24727 +               msleep(1);
24728 +#else
24729 +               barrier();
24730 +#endif
24731         }
24732 -       local_irq_enable();
24733 +}
24734 +EXPORT_SYMBOL(tasklet_unlock_wait);
24735 +#endif
24736 +
24737 +static int ksoftirqd_should_run(unsigned int cpu)
24738 +{
24739 +       return ksoftirqd_softirq_pending();
24740  }
24741  
24742  #ifdef CONFIG_HOTPLUG_CPU
24743 @@ -746,16 +1245,31 @@ static struct notifier_block cpu_nfb = {
24744  
24745  static struct smp_hotplug_thread softirq_threads = {
24746         .store                  = &ksoftirqd,
24747 +       .setup                  = ksoftirqd_set_sched_params,
24748         .thread_should_run      = ksoftirqd_should_run,
24749         .thread_fn              = run_ksoftirqd,
24750         .thread_comm            = "ksoftirqd/%u",
24751  };
24752  
24753 +#ifdef CONFIG_PREEMPT_RT_FULL
24754 +static struct smp_hotplug_thread softirq_timer_threads = {
24755 +       .store                  = &ktimer_softirqd,
24756 +       .setup                  = ktimer_softirqd_set_sched_params,
24757 +       .cleanup                = ktimer_softirqd_clr_sched_params,
24758 +       .thread_should_run      = ktimer_softirqd_should_run,
24759 +       .thread_fn              = run_ksoftirqd,
24760 +       .thread_comm            = "ktimersoftd/%u",
24761 +};
24762 +#endif
24763 +
24764  static __init int spawn_ksoftirqd(void)
24765  {
24766         register_cpu_notifier(&cpu_nfb);
24767  
24768         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
24769 +#ifdef CONFIG_PREEMPT_RT_FULL
24770 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
24771 +#endif
24772  
24773         return 0;
24774  }
24775 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
24776 index a3bbaee77c58..f84d3b45cda7 100644
24777 --- a/kernel/stop_machine.c
24778 +++ b/kernel/stop_machine.c
24779 @@ -37,7 +37,7 @@ struct cpu_stop_done {
24780  struct cpu_stopper {
24781         struct task_struct      *thread;
24782  
24783 -       spinlock_t              lock;
24784 +       raw_spinlock_t          lock;
24785         bool                    enabled;        /* is this stopper enabled? */
24786         struct list_head        works;          /* list of pending works */
24787  
24788 @@ -86,12 +86,12 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
24789         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
24790         unsigned long flags;
24791  
24792 -       spin_lock_irqsave(&stopper->lock, flags);
24793 +       raw_spin_lock_irqsave(&stopper->lock, flags);
24794         if (stopper->enabled)
24795                 __cpu_stop_queue_work(stopper, work);
24796         else
24797                 cpu_stop_signal_done(work->done, false);
24798 -       spin_unlock_irqrestore(&stopper->lock, flags);
24799 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
24800  }
24801  
24802  /**
24803 @@ -224,8 +224,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
24804         int err;
24805  
24806         lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
24807 -       spin_lock_irq(&stopper1->lock);
24808 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
24809 +       raw_spin_lock_irq(&stopper1->lock);
24810 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
24811  
24812         err = -ENOENT;
24813         if (!stopper1->enabled || !stopper2->enabled)
24814 @@ -235,8 +235,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
24815         __cpu_stop_queue_work(stopper1, work1);
24816         __cpu_stop_queue_work(stopper2, work2);
24817  unlock:
24818 -       spin_unlock(&stopper2->lock);
24819 -       spin_unlock_irq(&stopper1->lock);
24820 +       raw_spin_unlock(&stopper2->lock);
24821 +       raw_spin_unlock_irq(&stopper1->lock);
24822         lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
24823  
24824         return err;
24825 @@ -258,7 +258,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
24826         struct cpu_stop_work work1, work2;
24827         struct multi_stop_data msdata;
24828  
24829 -       preempt_disable();
24830 +       preempt_disable_nort();
24831         msdata = (struct multi_stop_data){
24832                 .fn = fn,
24833                 .data = arg,
24834 @@ -278,11 +278,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
24835         if (cpu1 > cpu2)
24836                 swap(cpu1, cpu2);
24837         if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
24838 -               preempt_enable();
24839 +               preempt_enable_nort();
24840                 return -ENOENT;
24841         }
24842  
24843 -       preempt_enable();
24844 +       preempt_enable_nort();
24845  
24846         wait_for_completion(&done.completion);
24847  
24848 @@ -315,17 +315,20 @@ static DEFINE_MUTEX(stop_cpus_mutex);
24849  
24850  static void queue_stop_cpus_work(const struct cpumask *cpumask,
24851                                  cpu_stop_fn_t fn, void *arg,
24852 -                                struct cpu_stop_done *done)
24853 +                                struct cpu_stop_done *done, bool inactive)
24854  {
24855         struct cpu_stop_work *work;
24856         unsigned int cpu;
24857  
24858         /*
24859 -        * Disable preemption while queueing to avoid getting
24860 -        * preempted by a stopper which might wait for other stoppers
24861 -        * to enter @fn which can lead to deadlock.
24862 +        * Make sure that all work is queued on all cpus before
24863 +        * any of the cpus can execute it.
24864          */
24865 -       lg_global_lock(&stop_cpus_lock);
24866 +       if (!inactive)
24867 +               lg_global_lock(&stop_cpus_lock);
24868 +       else
24869 +               lg_global_trylock_relax(&stop_cpus_lock);
24870 +
24871         for_each_cpu(cpu, cpumask) {
24872                 work = &per_cpu(cpu_stopper.stop_work, cpu);
24873                 work->fn = fn;
24874 @@ -342,7 +345,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
24875         struct cpu_stop_done done;
24876  
24877         cpu_stop_init_done(&done, cpumask_weight(cpumask));
24878 -       queue_stop_cpus_work(cpumask, fn, arg, &done);
24879 +       queue_stop_cpus_work(cpumask, fn, arg, &done, false);
24880         wait_for_completion(&done.completion);
24881         return done.executed ? done.ret : -ENOENT;
24882  }
24883 @@ -422,9 +425,9 @@ static int cpu_stop_should_run(unsigned int cpu)
24884         unsigned long flags;
24885         int run;
24886  
24887 -       spin_lock_irqsave(&stopper->lock, flags);
24888 +       raw_spin_lock_irqsave(&stopper->lock, flags);
24889         run = !list_empty(&stopper->works);
24890 -       spin_unlock_irqrestore(&stopper->lock, flags);
24891 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
24892         return run;
24893  }
24894  
24895 @@ -436,13 +439,13 @@ static void cpu_stopper_thread(unsigned int cpu)
24896  
24897  repeat:
24898         work = NULL;
24899 -       spin_lock_irq(&stopper->lock);
24900 +       raw_spin_lock_irq(&stopper->lock);
24901         if (!list_empty(&stopper->works)) {
24902                 work = list_first_entry(&stopper->works,
24903                                         struct cpu_stop_work, list);
24904                 list_del_init(&work->list);
24905         }
24906 -       spin_unlock_irq(&stopper->lock);
24907 +       raw_spin_unlock_irq(&stopper->lock);
24908  
24909         if (work) {
24910                 cpu_stop_fn_t fn = work->fn;
24911 @@ -450,6 +453,16 @@ repeat:
24912                 struct cpu_stop_done *done = work->done;
24913                 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
24914  
24915 +               /*
24916 +                * Wait until the stopper has finished scheduling on all
24917 +                * cpus.
24918 +                */
24919 +               lg_global_lock(&stop_cpus_lock);
24920 +               /*
24921 +                * Let other cpu threads continue as well
24922 +                */
24923 +               lg_global_unlock(&stop_cpus_lock);
24924 +
24925                 /* cpu stop callbacks are not allowed to sleep */
24926                 preempt_disable();
24927  
24928 @@ -520,10 +533,12 @@ static int __init cpu_stop_init(void)
24929         for_each_possible_cpu(cpu) {
24930                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
24931  
24932 -               spin_lock_init(&stopper->lock);
24933 +               raw_spin_lock_init(&stopper->lock);
24934                 INIT_LIST_HEAD(&stopper->works);
24935         }
24936  
24937 +       lg_lock_init(&stop_cpus_lock, "stop_cpus_lock");
24938 +
24939         BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
24940         stop_machine_unpark(raw_smp_processor_id());
24941         stop_machine_initialized = true;
24942 @@ -620,7 +635,7 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
24943         set_state(&msdata, MULTI_STOP_PREPARE);
24944         cpu_stop_init_done(&done, num_active_cpus());
24945         queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
24946 -                            &done);
24947 +                            &done, true);
24948         ret = multi_cpu_stop(&msdata);
24949  
24950         /* Busy wait for completion. */
24951 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
24952 index 17f7bcff1e02..ba3d60144838 100644
24953 --- a/kernel/time/hrtimer.c
24954 +++ b/kernel/time/hrtimer.c
24955 @@ -48,11 +48,13 @@
24956  #include <linux/sched/rt.h>
24957  #include <linux/sched/deadline.h>
24958  #include <linux/timer.h>
24959 +#include <linux/kthread.h>
24960  #include <linux/freezer.h>
24961  
24962  #include <asm/uaccess.h>
24963  
24964  #include <trace/events/timer.h>
24965 +#include <trace/events/hist.h>
24966  
24967  #include "tick-internal.h"
24968  
24969 @@ -717,6 +719,44 @@ static void clock_was_set_work(struct work_struct *work)
24970  
24971  static DECLARE_WORK(hrtimer_work, clock_was_set_work);
24972  
24973 +#ifdef CONFIG_PREEMPT_RT_FULL
24974 +/*
24975 + * RT cannot call schedule_work from real interrupt context.
24976 + * Need to make a thread to do the real work.
24977 + */
24978 +static struct task_struct *clock_set_delay_thread;
24979 +static bool do_clock_set_delay;
24980 +
24981 +static int run_clock_set_delay(void *ignore)
24982 +{
24983 +       while (!kthread_should_stop()) {
24984 +               set_current_state(TASK_INTERRUPTIBLE);
24985 +               if (do_clock_set_delay) {
24986 +                       do_clock_set_delay = false;
24987 +                       schedule_work(&hrtimer_work);
24988 +               }
24989 +               schedule();
24990 +       }
24991 +       __set_current_state(TASK_RUNNING);
24992 +       return 0;
24993 +}
24994 +
24995 +void clock_was_set_delayed(void)
24996 +{
24997 +       do_clock_set_delay = true;
24998 +       /* Make visible before waking up process */
24999 +       smp_wmb();
25000 +       wake_up_process(clock_set_delay_thread);
25001 +}
25002 +
25003 +static __init int create_clock_set_delay_thread(void)
25004 +{
25005 +       clock_set_delay_thread = kthread_run(run_clock_set_delay, NULL, "kclksetdelayd");
25006 +       BUG_ON(!clock_set_delay_thread);
25007 +       return 0;
25008 +}
25009 +early_initcall(create_clock_set_delay_thread);
25010 +#else /* PREEMPT_RT_FULL */
25011  /*
25012   * Called from timekeeping and resume code to reprogramm the hrtimer
25013   * interrupt device on all cpus.
25014 @@ -725,6 +765,7 @@ void clock_was_set_delayed(void)
25015  {
25016         schedule_work(&hrtimer_work);
25017  }
25018 +#endif
25019  
25020  #else
25021  
25022 @@ -734,11 +775,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
25023  static inline void hrtimer_switch_to_hres(void) { }
25024  static inline void
25025  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
25026 -static inline int hrtimer_reprogram(struct hrtimer *timer,
25027 -                                   struct hrtimer_clock_base *base)
25028 -{
25029 -       return 0;
25030 -}
25031 +static inline void hrtimer_reprogram(struct hrtimer *timer,
25032 +                                    struct hrtimer_clock_base *base) { }
25033  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
25034  static inline void retrigger_next_event(void *arg) { }
25035  
25036 @@ -870,6 +908,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
25037  }
25038  EXPORT_SYMBOL_GPL(hrtimer_forward);
25039  
25040 +#ifdef CONFIG_PREEMPT_RT_BASE
25041 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
25042 +
25043 +/**
25044 + * hrtimer_wait_for_timer - Wait for a running timer
25045 + *
25046 + * @timer:     timer to wait for
25047 + *
25048 + * The function waits on the waitqueue of the timer base in case
25049 + * the timer's callback function is currently executing. The
25050 + * waitqueue is woken up after the timer callback function has
25051 + * finished execution.
25052 + */
25053 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
25054 +{
25055 +       struct hrtimer_clock_base *base = timer->base;
25056 +
25057 +       if (base && base->cpu_base && !timer->irqsafe)
25058 +               wait_event(base->cpu_base->wait,
25059 +                               !(hrtimer_callback_running(timer)));
25060 +}
25061 +
25062 +#else
25063 +# define wake_up_timer_waiters(b)      do { } while (0)
25064 +#endif
25065 +
25066  /*
25067   * enqueue_hrtimer - internal function to (re)start a timer
25068   *
25069 @@ -911,6 +975,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
25070         if (!(state & HRTIMER_STATE_ENQUEUED))
25071                 return;
25072  
25073 +       if (unlikely(!list_empty(&timer->cb_entry))) {
25074 +               list_del_init(&timer->cb_entry);
25075 +               return;
25076 +       }
25077 +
25078         if (!timerqueue_del(&base->active, &timer->node))
25079                 cpu_base->active_bases &= ~(1 << base->index);
25080  
25081 @@ -1006,7 +1075,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25082         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
25083  
25084         timer_stats_hrtimer_set_start_info(timer);
25085 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
25086 +       {
25087 +               ktime_t now = new_base->get_time();
25088  
25089 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
25090 +                       timer->praecox = now;
25091 +               else
25092 +                       timer->praecox = ktime_set(0, 0);
25093 +       }
25094 +#endif
25095         leftmost = enqueue_hrtimer(timer, new_base);
25096         if (!leftmost)
25097                 goto unlock;
25098 @@ -1078,7 +1156,7 @@ int hrtimer_cancel(struct hrtimer *timer)
25099  
25100                 if (ret >= 0)
25101                         return ret;
25102 -               cpu_relax();
25103 +               hrtimer_wait_for_timer(timer);
25104         }
25105  }
25106  EXPORT_SYMBOL_GPL(hrtimer_cancel);
25107 @@ -1142,6 +1220,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25108  
25109         base = hrtimer_clockid_to_base(clock_id);
25110         timer->base = &cpu_base->clock_base[base];
25111 +       INIT_LIST_HEAD(&timer->cb_entry);
25112         timerqueue_init(&timer->node);
25113  
25114  #ifdef CONFIG_TIMER_STATS
25115 @@ -1182,6 +1261,7 @@ bool hrtimer_active(const struct hrtimer *timer)
25116                 seq = raw_read_seqcount_begin(&cpu_base->seq);
25117  
25118                 if (timer->state != HRTIMER_STATE_INACTIVE ||
25119 +                   cpu_base->running_soft == timer ||
25120                     cpu_base->running == timer)
25121                         return true;
25122  
25123 @@ -1280,10 +1360,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25124         cpu_base->running = NULL;
25125  }
25126  
25127 +#ifdef CONFIG_PREEMPT_RT_BASE
25128 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
25129 +                                struct hrtimer_clock_base *base)
25130 +{
25131 +       int leftmost;
25132 +
25133 +       if (restart != HRTIMER_NORESTART &&
25134 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
25135 +
25136 +               leftmost = enqueue_hrtimer(timer, base);
25137 +               if (!leftmost)
25138 +                       return;
25139 +#ifdef CONFIG_HIGH_RES_TIMERS
25140 +               if (!hrtimer_is_hres_active(timer)) {
25141 +                       /*
25142 +                        * Kick to reschedule the next tick to handle the new timer
25143 +                        * on dynticks target.
25144 +                        */
25145 +                       if (base->cpu_base->nohz_active)
25146 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
25147 +               } else {
25148 +
25149 +                       hrtimer_reprogram(timer, base);
25150 +               }
25151 +#endif
25152 +       }
25153 +}
25154 +
25155 +/*
25156 + * The changes in mainline which removed the callback modes from
25157 + * hrtimer are not yet working with -rt. The non wakeup_process()
25158 + * based callbacks which involve sleeping locks need to be treated
25159 + * seperately.
25160 + */
25161 +static void hrtimer_rt_run_pending(void)
25162 +{
25163 +       enum hrtimer_restart (*fn)(struct hrtimer *);
25164 +       struct hrtimer_cpu_base *cpu_base;
25165 +       struct hrtimer_clock_base *base;
25166 +       struct hrtimer *timer;
25167 +       int index, restart;
25168 +
25169 +       local_irq_disable();
25170 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
25171 +
25172 +       raw_spin_lock(&cpu_base->lock);
25173 +
25174 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
25175 +               base = &cpu_base->clock_base[index];
25176 +
25177 +               while (!list_empty(&base->expired)) {
25178 +                       timer = list_first_entry(&base->expired,
25179 +                                                struct hrtimer, cb_entry);
25180 +
25181 +                       /*
25182 +                        * Same as the __run_hrtimer() function above,
25183 +                        * except that we run with interrupts enabled.
25184 +                        */
25185 +                       debug_deactivate(timer);
25186 +                       cpu_base->running_soft = timer;
25187 +                       raw_write_seqcount_barrier(&cpu_base->seq);
25188 +
25189 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
25190 +                       timer_stats_account_hrtimer(timer);
25191 +                       fn = timer->function;
25192 +
25193 +                       raw_spin_unlock_irq(&cpu_base->lock);
25194 +                       restart = fn(timer);
25195 +                       raw_spin_lock_irq(&cpu_base->lock);
25196 +
25197 +                       hrtimer_rt_reprogram(restart, timer, base);
25198 +                       raw_write_seqcount_barrier(&cpu_base->seq);
25199 +
25200 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
25201 +                       cpu_base->running_soft = NULL;
25202 +               }
25203 +       }
25204 +
25205 +       raw_spin_unlock_irq(&cpu_base->lock);
25206 +
25207 +       wake_up_timer_waiters(cpu_base);
25208 +}
25209 +
25210 +static int hrtimer_rt_defer(struct hrtimer *timer)
25211 +{
25212 +       if (timer->irqsafe)
25213 +               return 0;
25214 +
25215 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
25216 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
25217 +       return 1;
25218 +}
25219 +
25220 +#else
25221 +
25222 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
25223 +
25224 +#endif
25225 +
25226 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
25227 +
25228  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25229  {
25230         struct hrtimer_clock_base *base = cpu_base->clock_base;
25231         unsigned int active = cpu_base->active_bases;
25232 +       int raise = 0;
25233  
25234         for (; active; base++, active >>= 1) {
25235                 struct timerqueue_node *node;
25236 @@ -1299,6 +1481,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25237  
25238                         timer = container_of(node, struct hrtimer, node);
25239  
25240 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
25241 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
25242 +                               timer->praecox : hrtimer_get_expires(timer),
25243 +                               basenow)),
25244 +                           current,
25245 +                           timer->function == hrtimer_wakeup ?
25246 +                           container_of(timer, struct hrtimer_sleeper,
25247 +                               timer)->task : NULL);
25248 +
25249                         /*
25250                          * The immediate goal for using the softexpires is
25251                          * minimizing wakeups, not running timers at the
25252 @@ -1314,9 +1505,14 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25253                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
25254                                 break;
25255  
25256 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
25257 +                       if (!hrtimer_rt_defer(timer))
25258 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
25259 +                       else
25260 +                               raise = 1;
25261                 }
25262         }
25263 +       if (raise)
25264 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
25265  }
25266  
25267  #ifdef CONFIG_HIGH_RES_TIMERS
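
    A minimal illustrative sketch, not part of the patch, of what the softirq
    deferral above means for timer users on -rt: callbacks that may take sleeping
    locks are handed to HRTIMER_SOFTIRQ by hrtimer_rt_defer(), while a timer
    explicitly marked irqsafe (as the patch does for the tick and broadcast timers
    below) keeps running in hard interrupt context. my_timer, my_timer_fn() and
    my_timer_setup() are hypothetical names.

    static struct hrtimer my_timer;                 /* hypothetical */

    static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
    {
            /* must not take sleeping locks: runs in hard irq context on -rt */
            return HRTIMER_NORESTART;
    }

    static void my_timer_setup(void)
    {
            hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            my_timer.function = my_timer_fn;
            my_timer.irqsafe = 1;   /* skip the HRTIMER_SOFTIRQ deferral */
            hrtimer_start(&my_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
    }
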
25268 @@ -1479,16 +1675,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
25269  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
25270  {
25271         sl->timer.function = hrtimer_wakeup;
25272 +       sl->timer.irqsafe = 1;
25273         sl->task = task;
25274  }
25275  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
25276  
25277 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
25278 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
25279 +                               unsigned long state)
25280  {
25281         hrtimer_init_sleeper(t, current);
25282  
25283         do {
25284 -               set_current_state(TASK_INTERRUPTIBLE);
25285 +               set_current_state(state);
25286                 hrtimer_start_expires(&t->timer, mode);
25287  
25288                 if (likely(t->task))
25289 @@ -1530,7 +1728,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
25290                                 HRTIMER_MODE_ABS);
25291         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
25292  
25293 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
25294 +       /* cpu_chill() does not care about restart state. */
25295 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
25296                 goto out;
25297  
25298         rmtp = restart->nanosleep.rmtp;
25299 @@ -1547,8 +1746,10 @@ out:
25300         return ret;
25301  }
25302  
25303 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25304 -                      const enum hrtimer_mode mode, const clockid_t clockid)
25305 +static long
25306 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25307 +                   const enum hrtimer_mode mode, const clockid_t clockid,
25308 +                   unsigned long state)
25309  {
25310         struct restart_block *restart;
25311         struct hrtimer_sleeper t;
25312 @@ -1561,7 +1762,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25313  
25314         hrtimer_init_on_stack(&t.timer, clockid, mode);
25315         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
25316 -       if (do_nanosleep(&t, mode))
25317 +       if (do_nanosleep(&t, mode, state))
25318                 goto out;
25319  
25320         /* Absolute timers do not update the rmtp value and restart: */
25321 @@ -1588,6 +1789,12 @@ out:
25322         return ret;
25323  }
25324  
25325 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25326 +                      const enum hrtimer_mode mode, const clockid_t clockid)
25327 +{
25328 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
25329 +}
25330 +
25331  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
25332                 struct timespec __user *, rmtp)
25333  {
25334 @@ -1602,6 +1809,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
25335         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
25336  }
25337  
25338 +#ifdef CONFIG_PREEMPT_RT_FULL
25339 +/*
25340 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
25341 + */
25342 +void cpu_chill(void)
25343 +{
25344 +       struct timespec tu = {
25345 +               .tv_nsec = NSEC_PER_MSEC,
25346 +       };
25347 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
25348 +
25349 +       current->flags |= PF_NOFREEZE;
25350 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
25351 +                           TASK_UNINTERRUPTIBLE);
25352 +       if (!freeze_flag)
25353 +               current->flags &= ~PF_NOFREEZE;
25354 +}
25355 +EXPORT_SYMBOL(cpu_chill);
25356 +#endif
25357 +
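
    A minimal usage sketch, not part of the patch, assuming a hypothetical
    try_grab_thing() helper: a retry loop that would otherwise spin with
    cpu_relax(), and on -rt starve the preempted owner, can back off with
    cpu_chill() instead (cpu_chill() is only provided by the -rt patch).

    extern bool try_grab_thing(void);       /* hypothetical */

    static void wait_for_thing(void)
    {
            while (!try_grab_thing()) {
    #ifdef CONFIG_PREEMPT_RT_FULL
                    cpu_chill();            /* sleep ~1 ms, uninterruptible */
    #else
                    cpu_relax();
    #endif
            }
    }
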
25358  /*
25359   * Functions related to boot-time initialization:
25360   */
25361 @@ -1613,10 +1840,14 @@ static void init_hrtimers_cpu(int cpu)
25362         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
25363                 cpu_base->clock_base[i].cpu_base = cpu_base;
25364                 timerqueue_init_head(&cpu_base->clock_base[i].active);
25365 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
25366         }
25367  
25368         cpu_base->cpu = cpu;
25369         hrtimer_init_hres(cpu_base);
25370 +#ifdef CONFIG_PREEMPT_RT_BASE
25371 +       init_waitqueue_head(&cpu_base->wait);
25372 +#endif
25373  }
25374  
25375  #ifdef CONFIG_HOTPLUG_CPU
25376 @@ -1714,11 +1945,21 @@ static struct notifier_block hrtimers_nb = {
25377         .notifier_call = hrtimer_cpu_notify,
25378  };
25379  
25380 +#ifdef CONFIG_PREEMPT_RT_BASE
25381 +static void run_hrtimer_softirq(struct softirq_action *h)
25382 +{
25383 +       hrtimer_rt_run_pending();
25384 +}
25385 +#endif
25386 +
25387  void __init hrtimers_init(void)
25388  {
25389         hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
25390                           (void *)(long)smp_processor_id());
25391         register_cpu_notifier(&hrtimers_nb);
25392 +#ifdef CONFIG_PREEMPT_RT_BASE
25393 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
25394 +#endif
25395  }
25396  
25397  /**
25398 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
25399 index 1d5c7204ddc9..184de6751180 100644
25400 --- a/kernel/time/itimer.c
25401 +++ b/kernel/time/itimer.c
25402 @@ -213,6 +213,7 @@ again:
25403                 /* We are sharing ->siglock with it_real_fn() */
25404                 if (hrtimer_try_to_cancel(timer) < 0) {
25405                         spin_unlock_irq(&tsk->sighand->siglock);
25406 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
25407                         goto again;
25408                 }
25409                 expires = timeval_to_ktime(value->it_value);
25410 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
25411 index 347fecf86a3f..2ede47408a3e 100644
25412 --- a/kernel/time/jiffies.c
25413 +++ b/kernel/time/jiffies.c
25414 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
25415         .max_cycles     = 10,
25416  };
25417  
25418 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
25419 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
25420 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
25421  
25422  #if (BITS_PER_LONG < 64)
25423  u64 get_jiffies_64(void)
25424 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
25425         u64 ret;
25426  
25427         do {
25428 -               seq = read_seqbegin(&jiffies_lock);
25429 +               seq = read_seqcount_begin(&jiffies_seq);
25430                 ret = jiffies_64;
25431 -       } while (read_seqretry(&jiffies_lock, seq));
25432 +       } while (read_seqcount_retry(&jiffies_seq, seq));
25433         return ret;
25434  }
25435  EXPORT_SYMBOL(get_jiffies_64);
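
    A minimal sketch, not part of the patch, of the locking pattern once the
    jiffies seqlock is split into a raw spinlock plus a seqcount: writers
    serialize on jiffies_lock and bracket the update with jiffies_seq, while
    readers such as get_jiffies_64() above only retry on the seqcount. It mirrors
    the xtime_update() and tick_periodic() hunks further down;
    example_jiffies_writer() is a hypothetical name.

    static void example_jiffies_writer(unsigned long ticks)
    {
            raw_spin_lock(&jiffies_lock);           /* raw lock: still spins on -rt */
            write_seqcount_begin(&jiffies_seq);     /* makes readers retry */
            do_timer(ticks);
            write_seqcount_end(&jiffies_seq);
            raw_spin_unlock(&jiffies_lock);
    }
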
25436 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
25437 index ab861771e37f..0f6868fd2de6 100644
25438 --- a/kernel/time/ntp.c
25439 +++ b/kernel/time/ntp.c
25440 @@ -10,6 +10,7 @@
25441  #include <linux/workqueue.h>
25442  #include <linux/hrtimer.h>
25443  #include <linux/jiffies.h>
25444 +#include <linux/kthread.h>
25445  #include <linux/math64.h>
25446  #include <linux/timex.h>
25447  #include <linux/time.h>
25448 @@ -562,10 +563,52 @@ static void sync_cmos_clock(struct work_struct *work)
25449                            &sync_cmos_work, timespec64_to_jiffies(&next));
25450  }
25451  
25452 +#ifdef CONFIG_PREEMPT_RT_FULL
25453 +/*
25454 + * RT cannot call schedule_delayed_work() from hard interrupt context,
25455 + * so a kernel thread is used to do the real work.
25456 + */
25457 +static struct task_struct *cmos_delay_thread;
25458 +static bool do_cmos_delay;
25459 +
25460 +static int run_cmos_delay(void *ignore)
25461 +{
25462 +       while (!kthread_should_stop()) {
25463 +               set_current_state(TASK_INTERRUPTIBLE);
25464 +               if (do_cmos_delay) {
25465 +                       do_cmos_delay = false;
25466 +                       queue_delayed_work(system_power_efficient_wq,
25467 +                                          &sync_cmos_work, 0);
25468 +               }
25469 +               schedule();
25470 +       }
25471 +       __set_current_state(TASK_RUNNING);
25472 +       return 0;
25473 +}
25474 +
25475 +void ntp_notify_cmos_timer(void)
25476 +{
25477 +       do_cmos_delay = true;
25478 +       /* Make visible before waking up process */
25479 +       smp_wmb();
25480 +       wake_up_process(cmos_delay_thread);
25481 +}
25482 +
25483 +static __init int create_cmos_delay_thread(void)
25484 +{
25485 +       cmos_delay_thread = kthread_run(run_cmos_delay, NULL, "kcmosdelayd");
25486 +       BUG_ON(!cmos_delay_thread);
25487 +       return 0;
25488 +}
25489 +early_initcall(create_cmos_delay_thread);
25490 +
25491 +#else
25492 +
25493  void ntp_notify_cmos_timer(void)
25494  {
25495         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
25496  }
25497 +#endif /* CONFIG_PREEMPT_RT_FULL */
25498  
25499  #else
25500  void ntp_notify_cmos_timer(void) { }
25501 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
25502 index 80016b329d94..b7342b6e6a5a 100644
25503 --- a/kernel/time/posix-cpu-timers.c
25504 +++ b/kernel/time/posix-cpu-timers.c
25505 @@ -3,6 +3,7 @@
25506   */
25507  
25508  #include <linux/sched.h>
25509 +#include <linux/sched/rt.h>
25510  #include <linux/posix-timers.h>
25511  #include <linux/errno.h>
25512  #include <linux/math64.h>
25513 @@ -650,7 +651,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
25514         /*
25515          * Disarm any old timer after extracting its expiry time.
25516          */
25517 -       WARN_ON_ONCE(!irqs_disabled());
25518 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25519  
25520         ret = 0;
25521         old_incr = timer->it.cpu.incr;
25522 @@ -1092,7 +1093,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
25523         /*
25524          * Now re-arm for the new expiry time.
25525          */
25526 -       WARN_ON_ONCE(!irqs_disabled());
25527 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25528         arm_timer(timer);
25529         unlock_task_sighand(p, &flags);
25530  
25531 @@ -1183,13 +1184,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
25532   * already updated our counts.  We need to check if any timers fire now.
25533   * Interrupts are disabled.
25534   */
25535 -void run_posix_cpu_timers(struct task_struct *tsk)
25536 +static void __run_posix_cpu_timers(struct task_struct *tsk)
25537  {
25538         LIST_HEAD(firing);
25539         struct k_itimer *timer, *next;
25540         unsigned long flags;
25541  
25542 -       WARN_ON_ONCE(!irqs_disabled());
25543 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25544  
25545         /*
25546          * The fast path checks that there are no expired thread or thread
25547 @@ -1243,6 +1244,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
25548         }
25549  }
25550  
25551 +#ifdef CONFIG_PREEMPT_RT_BASE
25552 +#include <linux/kthread.h>
25553 +#include <linux/cpu.h>
25554 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
25555 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
25556 +
25557 +static int posix_cpu_timers_thread(void *data)
25558 +{
25559 +       int cpu = (long)data;
25560 +
25561 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
25562 +
25563 +       while (!kthread_should_stop()) {
25564 +               struct task_struct *tsk = NULL;
25565 +               struct task_struct *next = NULL;
25566 +
25567 +               if (cpu_is_offline(cpu))
25568 +                       goto wait_to_die;
25569 +
25570 +               /* grab task list */
25571 +               raw_local_irq_disable();
25572 +               tsk = per_cpu(posix_timer_tasklist, cpu);
25573 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
25574 +               raw_local_irq_enable();
25575 +
25576 +               /* it's possible the list is empty; just return */
25577 +               if (!tsk) {
25578 +                       set_current_state(TASK_INTERRUPTIBLE);
25579 +                       schedule();
25580 +                       __set_current_state(TASK_RUNNING);
25581 +                       continue;
25582 +               }
25583 +
25584 +               /* Process task list */
25585 +               while (1) {
25586 +                       /* save next */
25587 +                       next = tsk->posix_timer_list;
25588 +
25589 +                       /* run the task's timers, clear its list
25590 +                        * pointer and drop the reference
25591 +                        */
25592 +                       __run_posix_cpu_timers(tsk);
25593 +                       tsk->posix_timer_list = NULL;
25594 +                       put_task_struct(tsk);
25595 +
25596 +                       /* check if this is the last on the list */
25597 +                       if (next == tsk)
25598 +                               break;
25599 +                       tsk = next;
25600 +               }
25601 +       }
25602 +       return 0;
25603 +
25604 +wait_to_die:
25605 +       /* Wait for kthread_stop */
25606 +       set_current_state(TASK_INTERRUPTIBLE);
25607 +       while (!kthread_should_stop()) {
25608 +               schedule();
25609 +               set_current_state(TASK_INTERRUPTIBLE);
25610 +       }
25611 +       __set_current_state(TASK_RUNNING);
25612 +       return 0;
25613 +}
25614 +
25615 +static inline int __fastpath_timer_check(struct task_struct *tsk)
25616 +{
25617 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
25618 +       if (unlikely(tsk->exit_state))
25619 +               return 0;
25620 +
25621 +       if (!task_cputime_zero(&tsk->cputime_expires))
25622 +               return 1;
25623 +
25624 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
25625 +               return 1;
25626 +
25627 +       return 0;
25628 +}
25629 +
25630 +void run_posix_cpu_timers(struct task_struct *tsk)
25631 +{
25632 +       unsigned long cpu = smp_processor_id();
25633 +       struct task_struct *tasklist;
25634 +
25635 +       BUG_ON(!irqs_disabled());
25636 +       if (!per_cpu(posix_timer_task, cpu))
25637 +               return;
25638 +       /* get per-cpu references */
25639 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
25640 +
25641 +       /* check to see if we're already queued */
25642 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
25643 +               get_task_struct(tsk);
25644 +               if (tasklist) {
25645 +                       tsk->posix_timer_list = tasklist;
25646 +               } else {
25647 +                       /*
25648 +                        * The list is terminated by a self-pointing
25649 +                        * task_struct
25650 +                        */
25651 +                       tsk->posix_timer_list = tsk;
25652 +               }
25653 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
25654 +
25655 +               wake_up_process(per_cpu(posix_timer_task, cpu));
25656 +       }
25657 +}
25658 +
25659 +/*
25660 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
25661 + * Here we can start up the necessary posix timer thread for the new CPU.
25662 + */
25663 +static int posix_cpu_thread_call(struct notifier_block *nfb,
25664 +                                unsigned long action, void *hcpu)
25665 +{
25666 +       int cpu = (long)hcpu;
25667 +       struct task_struct *p;
25668 +       struct sched_param param;
25669 +
25670 +       switch (action) {
25671 +       case CPU_UP_PREPARE:
25672 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
25673 +                                       "posixcputmr/%d",cpu);
25674 +               if (IS_ERR(p))
25675 +                       return NOTIFY_BAD;
25676 +               p->flags |= PF_NOFREEZE;
25677 +               kthread_bind(p, cpu);
25678 +               /* Must be high prio to avoid getting starved */
25679 +               param.sched_priority = MAX_RT_PRIO-1;
25680 +               sched_setscheduler(p, SCHED_FIFO, &param);
25681 +               per_cpu(posix_timer_task,cpu) = p;
25682 +               break;
25683 +       case CPU_ONLINE:
25684 +               /* Strictly unnecessary, as the first user will wake it. */
25685 +               wake_up_process(per_cpu(posix_timer_task,cpu));
25686 +               break;
25687 +#ifdef CONFIG_HOTPLUG_CPU
25688 +       case CPU_UP_CANCELED:
25689 +               /* Unbind it from offline cpu so it can run.  Fall thru. */
25690 +               kthread_bind(per_cpu(posix_timer_task, cpu),
25691 +                            cpumask_any(cpu_online_mask));
25692 +               kthread_stop(per_cpu(posix_timer_task,cpu));
25693 +               per_cpu(posix_timer_task,cpu) = NULL;
25694 +               break;
25695 +       case CPU_DEAD:
25696 +               kthread_stop(per_cpu(posix_timer_task,cpu));
25697 +               per_cpu(posix_timer_task,cpu) = NULL;
25698 +               break;
25699 +#endif
25700 +       }
25701 +       return NOTIFY_OK;
25702 +}
25703 +
25704 +/* Register at highest priority so that task migration (migrate_all_tasks)
25705 + * happens before everything else.
25706 + */
25707 +static struct notifier_block posix_cpu_thread_notifier = {
25708 +       .notifier_call = posix_cpu_thread_call,
25709 +       .priority = 10
25710 +};
25711 +
25712 +static int __init posix_cpu_thread_init(void)
25713 +{
25714 +       void *hcpu = (void *)(long)smp_processor_id();
25715 +       /* Start one for boot CPU. */
25716 +       unsigned long cpu;
25717 +
25718 +       /* init the per-cpu posix_timer_tasklist pointers */
25719 +       for_each_possible_cpu(cpu)
25720 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
25721 +
25722 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
25723 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
25724 +       register_cpu_notifier(&posix_cpu_thread_notifier);
25725 +       return 0;
25726 +}
25727 +early_initcall(posix_cpu_thread_init);
25728 +#else /* CONFIG_PREEMPT_RT_BASE */
25729 +void run_posix_cpu_timers(struct task_struct *tsk)
25730 +{
25731 +       __run_posix_cpu_timers(tsk);
25732 +}
25733 +#endif /* CONFIG_PREEMPT_RT_BASE */
25734 +
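
    A minimal sketch, not part of the patch, of the list convention used above:
    the per-CPU tasklist is threaded through task_struct::posix_timer_list and
    terminated by an entry that points to itself, so NULL means empty and the
    last element points back at itself. walk_tasklist() and handle_one() are
    hypothetical names.

    extern void handle_one(struct task_struct *tsk);        /* hypothetical */

    static void walk_tasklist(struct task_struct *tsk)
    {
            while (tsk) {
                    struct task_struct *next = tsk->posix_timer_list;

                    handle_one(tsk);
                    if (next == tsk)        /* self-pointing terminator */
                            break;
                    tsk = next;
            }
    }
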
25735  /*
25736   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
25737   * The tsk->sighand->siglock must be held by the caller.
25738 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
25739 index f2826c35e918..464a98155a0e 100644
25740 --- a/kernel/time/posix-timers.c
25741 +++ b/kernel/time/posix-timers.c
25742 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
25743  static struct pid *good_sigevent(sigevent_t * event)
25744  {
25745         struct task_struct *rtn = current->group_leader;
25746 +       int sig = event->sigev_signo;
25747  
25748         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
25749                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
25750 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
25751                 return NULL;
25752  
25753         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
25754 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
25755 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
25756 +            sig_kernel_coredump(sig)))
25757                 return NULL;
25758  
25759         return task_pid(rtn);
25760 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
25761         return overrun;
25762  }
25763  
25764 +/*
25765 + * Protected by RCU!
25766 + */
25767 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
25768 +{
25769 +#ifdef CONFIG_PREEMPT_RT_FULL
25770 +       if (kc->timer_set == common_timer_set)
25771 +               hrtimer_wait_for_timer(&timr->it.real.timer);
25772 +       else
25773 +               /* FIXME: Whacky hack for posix-cpu-timers */
25774 +               schedule_timeout(1);
25775 +#endif
25776 +}
25777 +
25778  /* Set a POSIX.1b interval timer. */
25779  /* timr->it_lock is taken. */
25780  static int
25781 @@ -903,6 +919,7 @@ retry:
25782         if (!timr)
25783                 return -EINVAL;
25784  
25785 +       rcu_read_lock();
25786         kc = clockid_to_kclock(timr->it_clock);
25787         if (WARN_ON_ONCE(!kc || !kc->timer_set))
25788                 error = -EINVAL;
25789 @@ -911,9 +928,12 @@ retry:
25790  
25791         unlock_timer(timr, flag);
25792         if (error == TIMER_RETRY) {
25793 +               timer_wait_for_callback(kc, timr);
25794                 rtn = NULL;     // We already got the old time...
25795 +               rcu_read_unlock();
25796                 goto retry;
25797         }
25798 +       rcu_read_unlock();
25799  
25800         if (old_setting && !error &&
25801             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
25802 @@ -951,10 +971,15 @@ retry_delete:
25803         if (!timer)
25804                 return -EINVAL;
25805  
25806 +       rcu_read_lock();
25807         if (timer_delete_hook(timer) == TIMER_RETRY) {
25808                 unlock_timer(timer, flags);
25809 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25810 +                                       timer);
25811 +               rcu_read_unlock();
25812                 goto retry_delete;
25813         }
25814 +       rcu_read_unlock();
25815  
25816         spin_lock(&current->sighand->siglock);
25817         list_del(&timer->list);
25818 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
25819  retry_delete:
25820         spin_lock_irqsave(&timer->it_lock, flags);
25821  
25822 +       /* On RT we can race with a deletion */
25823 +       if (!timer->it_signal) {
25824 +               unlock_timer(timer, flags);
25825 +               return;
25826 +       }
25827 +
25828         if (timer_delete_hook(timer) == TIMER_RETRY) {
25829 +               rcu_read_lock();
25830                 unlock_timer(timer, flags);
25831 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25832 +                                       timer);
25833 +               rcu_read_unlock();
25834                 goto retry_delete;
25835         }
25836         list_del(&timer->list);
25837 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
25838 index 53d7184da0be..1b4ac3361c3f 100644
25839 --- a/kernel/time/tick-broadcast-hrtimer.c
25840 +++ b/kernel/time/tick-broadcast-hrtimer.c
25841 @@ -106,5 +106,6 @@ void tick_setup_hrtimer_broadcast(void)
25842  {
25843         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
25844         bctimer.function = bc_handler;
25845 +       bctimer.irqsafe = true;
25846         clockevents_register_device(&ce_broadcast_hrtimer);
25847  }
25848 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
25849 index 4fcd99e12aa0..5a47f2e98faf 100644
25850 --- a/kernel/time/tick-common.c
25851 +++ b/kernel/time/tick-common.c
25852 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
25853  static void tick_periodic(int cpu)
25854  {
25855         if (tick_do_timer_cpu == cpu) {
25856 -               write_seqlock(&jiffies_lock);
25857 +               raw_spin_lock(&jiffies_lock);
25858 +               write_seqcount_begin(&jiffies_seq);
25859  
25860                 /* Keep track of the next tick event */
25861                 tick_next_period = ktime_add(tick_next_period, tick_period);
25862  
25863                 do_timer(1);
25864 -               write_sequnlock(&jiffies_lock);
25865 +               write_seqcount_end(&jiffies_seq);
25866 +               raw_spin_unlock(&jiffies_lock);
25867                 update_wall_time();
25868         }
25869  
25870 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
25871                 ktime_t next;
25872  
25873                 do {
25874 -                       seq = read_seqbegin(&jiffies_lock);
25875 +                       seq = read_seqcount_begin(&jiffies_seq);
25876                         next = tick_next_period;
25877 -               } while (read_seqretry(&jiffies_lock, seq));
25878 +               } while (read_seqcount_retry(&jiffies_seq, seq));
25879  
25880                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
25881  
25882 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
25883 index 22c57e191a23..d536824cbd36 100644
25884 --- a/kernel/time/tick-sched.c
25885 +++ b/kernel/time/tick-sched.c
25886 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
25887                 return;
25888  
25889         /* Reevalute with jiffies_lock held */
25890 -       write_seqlock(&jiffies_lock);
25891 +       raw_spin_lock(&jiffies_lock);
25892 +       write_seqcount_begin(&jiffies_seq);
25893  
25894         delta = ktime_sub(now, last_jiffies_update);
25895         if (delta.tv64 >= tick_period.tv64) {
25896 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
25897                 /* Keep the tick_next_period variable up to date */
25898                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
25899         } else {
25900 -               write_sequnlock(&jiffies_lock);
25901 +               write_seqcount_end(&jiffies_seq);
25902 +               raw_spin_unlock(&jiffies_lock);
25903                 return;
25904         }
25905 -       write_sequnlock(&jiffies_lock);
25906 +       write_seqcount_end(&jiffies_seq);
25907 +       raw_spin_unlock(&jiffies_lock);
25908         update_wall_time();
25909  }
25910  
25911 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
25912  {
25913         ktime_t period;
25914  
25915 -       write_seqlock(&jiffies_lock);
25916 +       raw_spin_lock(&jiffies_lock);
25917 +       write_seqcount_begin(&jiffies_seq);
25918         /* Did we start the jiffies update yet ? */
25919         if (last_jiffies_update.tv64 == 0)
25920                 last_jiffies_update = tick_next_period;
25921         period = last_jiffies_update;
25922 -       write_sequnlock(&jiffies_lock);
25923 +       write_seqcount_end(&jiffies_seq);
25924 +       raw_spin_unlock(&jiffies_lock);
25925         return period;
25926  }
25927  
25928 @@ -176,6 +181,11 @@ static bool can_stop_full_tick(void)
25929                 return false;
25930         }
25931  
25932 +       if (!arch_irq_work_has_interrupt()) {
25933 +               trace_tick_stop(0, "missing irq work interrupt\n");
25934 +               return false;
25935 +       }
25936 +
25937         /* sched_clock_tick() needs us? */
25938  #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
25939         /*
25940 @@ -204,6 +214,7 @@ static void nohz_full_kick_work_func(struct irq_work *work)
25941  
25942  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
25943         .func = nohz_full_kick_work_func,
25944 +       .flags = IRQ_WORK_HARD_IRQ,
25945  };
25946  
25947  /*
25948 @@ -578,10 +589,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
25949  
25950         /* Read jiffies and the time when jiffies were updated last */
25951         do {
25952 -               seq = read_seqbegin(&jiffies_lock);
25953 +               seq = read_seqcount_begin(&jiffies_seq);
25954                 basemono = last_jiffies_update.tv64;
25955                 basejiff = jiffies;
25956 -       } while (read_seqretry(&jiffies_lock, seq));
25957 +       } while (read_seqcount_retry(&jiffies_seq, seq));
25958         ts->last_jiffies = basejiff;
25959  
25960         if (rcu_needs_cpu(basemono, &next_rcu) ||
25961 @@ -753,14 +764,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
25962                 return false;
25963  
25964         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
25965 -               static int ratelimit;
25966 -
25967 -               if (ratelimit < 10 &&
25968 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
25969 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
25970 -                               (unsigned int) local_softirq_pending());
25971 -                       ratelimit++;
25972 -               }
25973 +               softirq_check_pending_idle();
25974                 return false;
25975         }
25976  
25977 @@ -1100,6 +1104,7 @@ void tick_setup_sched_timer(void)
25978          * Emulate tick processing via per-CPU hrtimers:
25979          */
25980         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
25981 +       ts->sched_timer.irqsafe = 1;
25982         ts->sched_timer.function = tick_sched_timer;
25983  
25984         /* Get the next period (per cpu) */
25985 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
25986 index 445601c580d6..8744b0d87479 100644
25987 --- a/kernel/time/timekeeping.c
25988 +++ b/kernel/time/timekeeping.c
25989 @@ -2070,8 +2070,10 @@ EXPORT_SYMBOL(hardpps);
25990   */
25991  void xtime_update(unsigned long ticks)
25992  {
25993 -       write_seqlock(&jiffies_lock);
25994 +       raw_spin_lock(&jiffies_lock);
25995 +       write_seqcount_begin(&jiffies_seq);
25996         do_timer(ticks);
25997 -       write_sequnlock(&jiffies_lock);
25998 +       write_seqcount_end(&jiffies_seq);
25999 +       raw_spin_unlock(&jiffies_lock);
26000         update_wall_time();
26001  }
26002 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
26003 index 704f595ce83f..763a3e5121ff 100644
26004 --- a/kernel/time/timekeeping.h
26005 +++ b/kernel/time/timekeeping.h
26006 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
26007  extern void do_timer(unsigned long ticks);
26008  extern void update_wall_time(void);
26009  
26010 -extern seqlock_t jiffies_lock;
26011 +extern raw_spinlock_t jiffies_lock;
26012 +extern seqcount_t jiffies_seq;
26013  
26014  #define CS_NAME_LEN    32
26015  
26016 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
26017 index bbc5d1114583..603699ff9411 100644
26018 --- a/kernel/time/timer.c
26019 +++ b/kernel/time/timer.c
26020 @@ -80,6 +80,9 @@ struct tvec_root {
26021  struct tvec_base {
26022         spinlock_t lock;
26023         struct timer_list *running_timer;
26024 +#ifdef CONFIG_PREEMPT_RT_FULL
26025 +       wait_queue_head_t wait_for_running_timer;
26026 +#endif
26027         unsigned long timer_jiffies;
26028         unsigned long next_timer;
26029         unsigned long active_timers;
26030 @@ -777,6 +780,39 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
26031                 cpu_relax();
26032         }
26033  }
26034 +#ifdef CONFIG_PREEMPT_RT_FULL
26035 +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
26036 +                                                 struct tvec_base *old,
26037 +                                                 struct tvec_base *new)
26038 +{
26039 +       /*
26040 +        * We cannot set TIMER_MIGRATING and drop the old lock here: a
26041 +        * preempter spinning in lock_timer_base() would then loop forever.
26042 +        */
26043 +       if (spin_trylock(&new->lock)) {
26044 +               WRITE_ONCE(timer->flags,
26045 +                          (timer->flags & ~TIMER_BASEMASK) | new->cpu);
26046 +               spin_unlock(&old->lock);
26047 +               return new;
26048 +       }
26049 +       return old;
26050 +}
26051 +
26052 +#else
26053 +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
26054 +                                                 struct tvec_base *old,
26055 +                                                 struct tvec_base *new)
26056 +{
26057 +       /* See the comment in lock_timer_base() */
26058 +       timer->flags |= TIMER_MIGRATING;
26059 +
26060 +       spin_unlock(&old->lock);
26061 +       spin_lock(&new->lock);
26062 +       WRITE_ONCE(timer->flags,
26063 +                  (timer->flags & ~TIMER_BASEMASK) | new->cpu);
26064 +       return new;
26065 +}
26066 +#endif
26067  
26068  static inline int
26069  __mod_timer(struct timer_list *timer, unsigned long expires,
26070 @@ -807,16 +843,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
26071                  * handler yet has not finished. This also guarantees that
26072                  * the timer is serialized wrt itself.
26073                  */
26074 -               if (likely(base->running_timer != timer)) {
26075 -                       /* See the comment in lock_timer_base() */
26076 -                       timer->flags |= TIMER_MIGRATING;
26077 -
26078 -                       spin_unlock(&base->lock);
26079 -                       base = new_base;
26080 -                       spin_lock(&base->lock);
26081 -                       WRITE_ONCE(timer->flags,
26082 -                                  (timer->flags & ~TIMER_BASEMASK) | base->cpu);
26083 -               }
26084 +               if (likely(base->running_timer != timer))
26085 +                       base = switch_timer_base(timer, base, new_base);
26086         }
26087  
26088         timer->expires = expires;
26089 @@ -1006,6 +1034,33 @@ void add_timer_on(struct timer_list *timer, int cpu)
26090  }
26091  EXPORT_SYMBOL_GPL(add_timer_on);
26092  
26093 +#ifdef CONFIG_PREEMPT_RT_FULL
26094 +/*
26095 + * Wait for a running timer
26096 + */
26097 +static void wait_for_running_timer(struct timer_list *timer)
26098 +{
26099 +       struct tvec_base *base;
26100 +       u32 tf = timer->flags;
26101 +
26102 +       if (tf & TIMER_MIGRATING)
26103 +               return;
26104 +
26105 +       base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
26106 +       wait_event(base->wait_for_running_timer,
26107 +                  base->running_timer != timer);
26108 +}
26109 +
26110 +# define wakeup_timer_waiters(b)       wake_up_all(&(b)->wait_for_running_timer)
26111 +#else
26112 +static inline void wait_for_running_timer(struct timer_list *timer)
26113 +{
26114 +       cpu_relax();
26115 +}
26116 +
26117 +# define wakeup_timer_waiters(b)       do { } while (0)
26118 +#endif
26119 +
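
    A minimal sketch, not part of the patch, of what the wait queue above changes
    for callers: on PREEMPT_RT_FULL, del_timer_sync() (see the hunk further down)
    sleeps on wait_for_running_timer instead of spinning with cpu_relax() until
    the softirq thread has finished the callback. struct my_dev, its fields and
    my_dev_shutdown() are hypothetical.

    struct my_dev {
            struct timer_list timeout;
            int irq;
            void *cookie;
    };

    static void my_dev_shutdown(struct my_dev *dev)
    {
            /* may now sleep on -rt until the timer callback has returned */
            del_timer_sync(&dev->timeout);
            free_irq(dev->irq, dev->cookie);
    }
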
26120  /**
26121   * del_timer - deactive a timer.
26122   * @timer: the timer to be deactivated
26123 @@ -1063,7 +1118,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
26124  }
26125  EXPORT_SYMBOL(try_to_del_timer_sync);
26126  
26127 -#ifdef CONFIG_SMP
26128 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
26129  /**
26130   * del_timer_sync - deactivate a timer and wait for the handler to finish.
26131   * @timer: the timer to be deactivated
26132 @@ -1123,7 +1178,7 @@ int del_timer_sync(struct timer_list *timer)
26133                 int ret = try_to_del_timer_sync(timer);
26134                 if (ret >= 0)
26135                         return ret;
26136 -               cpu_relax();
26137 +               wait_for_running_timer(timer);
26138         }
26139  }
26140  EXPORT_SYMBOL(del_timer_sync);
26141 @@ -1248,16 +1303,18 @@ static inline void __run_timers(struct tvec_base *base)
26142                         if (irqsafe) {
26143                                 spin_unlock(&base->lock);
26144                                 call_timer_fn(timer, fn, data);
26145 +                               base->running_timer = NULL;
26146                                 spin_lock(&base->lock);
26147                         } else {
26148                                 spin_unlock_irq(&base->lock);
26149                                 call_timer_fn(timer, fn, data);
26150 +                               base->running_timer = NULL;
26151                                 spin_lock_irq(&base->lock);
26152                         }
26153                 }
26154         }
26155 -       base->running_timer = NULL;
26156         spin_unlock_irq(&base->lock);
26157 +       wakeup_timer_waiters(base);
26158  }
26159  
26160  #ifdef CONFIG_NO_HZ_COMMON
26161 @@ -1390,6 +1447,14 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
26162         if (cpu_is_offline(smp_processor_id()))
26163                 return expires;
26164  
26165 +#ifdef CONFIG_PREEMPT_RT_FULL
26166 +       /*
26167 +        * On PREEMPT_RT we cannot sleep here. As a result we can't take
26168 +        * the base lock to check when the next timer is pending and so
26169 +        * we assume the next jiffy.
26170 +        */
26171 +       return basem + TICK_NSEC;
26172 +#endif
26173         spin_lock(&base->lock);
26174         if (base->active_timers) {
26175                 if (time_before_eq(base->next_timer, base->timer_jiffies))
26176 @@ -1416,13 +1481,13 @@ void update_process_times(int user_tick)
26177  
26178         /* Note: this timer irq context must be accounted for as well. */
26179         account_process_tick(p, user_tick);
26180 +       scheduler_tick();
26181         run_local_timers();
26182         rcu_check_callbacks(user_tick);
26183 -#ifdef CONFIG_IRQ_WORK
26184 +#if defined(CONFIG_IRQ_WORK)
26185         if (in_irq())
26186                 irq_work_tick();
26187  #endif
26188 -       scheduler_tick();
26189         run_posix_cpu_timers(p);
26190  }
26191  
26192 @@ -1433,6 +1498,8 @@ static void run_timer_softirq(struct softirq_action *h)
26193  {
26194         struct tvec_base *base = this_cpu_ptr(&tvec_bases);
26195  
26196 +       irq_work_tick_soft();
26197 +
26198         if (time_after_eq(jiffies, base->timer_jiffies))
26199                 __run_timers(base);
26200  }
26201 @@ -1589,7 +1656,7 @@ static void migrate_timers(int cpu)
26202  
26203         BUG_ON(cpu_online(cpu));
26204         old_base = per_cpu_ptr(&tvec_bases, cpu);
26205 -       new_base = get_cpu_ptr(&tvec_bases);
26206 +       new_base = get_local_ptr(&tvec_bases);
26207         /*
26208          * The caller is globally serialized and nobody else
26209          * takes two locks at once, deadlock is not possible.
26210 @@ -1613,7 +1680,7 @@ static void migrate_timers(int cpu)
26211  
26212         spin_unlock(&old_base->lock);
26213         spin_unlock_irq(&new_base->lock);
26214 -       put_cpu_ptr(&tvec_bases);
26215 +       put_local_ptr(&tvec_bases);
26216  }
26217  
26218  static int timer_cpu_notify(struct notifier_block *self,
26219 @@ -1645,6 +1712,9 @@ static void __init init_timer_cpu(int cpu)
26220  
26221         base->cpu = cpu;
26222         spin_lock_init(&base->lock);
26223 +#ifdef CONFIG_PREEMPT_RT_FULL
26224 +       init_waitqueue_head(&base->wait_for_running_timer);
26225 +#endif
26226  
26227         base->timer_jiffies = jiffies;
26228         base->next_timer = base->timer_jiffies;
26229 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
26230 index e45db6b0d878..364ccd0eb57b 100644
26231 --- a/kernel/trace/Kconfig
26232 +++ b/kernel/trace/Kconfig
26233 @@ -187,6 +187,24 @@ config IRQSOFF_TRACER
26234           enabled. This option and the preempt-off timing option can be
26235           used together or separately.)
26236  
26237 +config INTERRUPT_OFF_HIST
26238 +       bool "Interrupts-off Latency Histogram"
26239 +       depends on IRQSOFF_TRACER
26240 +       help
26241 +         This option generates continuously updated histograms (one per cpu)
26242 +         of the duration of time periods with interrupts disabled. The
26243 +         histograms are disabled by default. To enable them, write a non-zero
26244 +         number to
26245 +
26246 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
26247 +
26248 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
26249 +         per cpu) are generated that accumulate the duration of time periods
26250 +         when both interrupts and preemption are disabled. The histogram data
26251 +         will be located in the debug file system at
26252 +
26253 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
26254 +
26255  config PREEMPT_TRACER
26256         bool "Preemption-off Latency Tracer"
26257         default n
26258 @@ -211,6 +229,24 @@ config PREEMPT_TRACER
26259           enabled. This option and the irqs-off timing option can be
26260           used together or separately.)
26261  
26262 +config PREEMPT_OFF_HIST
26263 +       bool "Preemption-off Latency Histogram"
26264 +       depends on PREEMPT_TRACER
26265 +       help
26266 +         This option generates continuously updated histograms (one per cpu)
26267 +         of the duration of time periods with preemption disabled. The
26268 +         histograms are disabled by default. To enable them, write a non-zero
26269 +         number to
26270 +
26271 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
26272 +
26273 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
26274 +         per cpu) are generated that accumulate the duration of time periods
26275 +         when both interrupts and preemption are disabled. The histogram data
26276 +         will be located in the debug file system at
26277 +
26278 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
26279 +
26280  config SCHED_TRACER
26281         bool "Scheduling Latency Tracer"
26282         select GENERIC_TRACER
26283 @@ -221,6 +257,74 @@ config SCHED_TRACER
26284           This tracer tracks the latency of the highest priority task
26285           to be scheduled in, starting from the point it has woken up.
26286  
26287 +config WAKEUP_LATENCY_HIST
26288 +       bool "Scheduling Latency Histogram"
26289 +       depends on SCHED_TRACER
26290 +       help
26291 +         This option generates continuously updated histograms (one per cpu)
26292 +         of the scheduling latency of the highest priority task.
26293 +         The histograms are disabled by default. To enable them, write a
26294 +         non-zero number to
26295 +
26296 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
26297 +
26298 +         Two different algorithms are used, one to determine the latency of
26299 +         processes that exclusively use the highest priority of the system and
26300 +         another one to determine the latency of processes that share the
26301 +         highest system priority with other processes. The former is used to
26302 +         improve hardware and system software, the latter to optimize the
26303 +         priority design of a given system. The histogram data will be
26304 +         located in the debug file system at
26305 +
26306 +             /sys/kernel/debug/tracing/latency_hist/wakeup
26307 +
26308 +         and
26309 +
26310 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
26311 +
26312 +         If both Scheduling Latency Histogram and Missed Timer Offsets
26313 +         Histogram are selected, additional histogram data will be collected
26314 +         that contain, in addition to the wakeup latency, the timer latency, in
26315 +         case the wakeup was triggered by an expired timer. These histograms
26316 +         are available in the
26317 +
26318 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
26319 +
26320 +         directory. They reflect the apparent interrupt and scheduling latency
26321 +         and are best suited to determine the worst-case latency of a given
26322 +         system. To enable these histograms, write a non-zero number to
26323 +
26324 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
26325 +
26326 +config MISSED_TIMER_OFFSETS_HIST
26327 +       depends on HIGH_RES_TIMERS
26328 +       select GENERIC_TRACER
26329 +       bool "Missed Timer Offsets Histogram"
26330 +       help
26331 +         Generate a histogram of missed timer offsets in microseconds. The
26332 +         histograms are disabled by default. To enable them, write a non-zero
26333 +         number to
26334 +
26335 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
26336 +
26337 +         The histogram data will be located in the debug file system at
26338 +
26339 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
26340 +
26341 +         If both Scheduling Latency Histogram and Missed Timer Offsets
26342 +         Histogram are selected, additional histogram data will be collected
26343 +         that contain, in addition to the wakeup latency, the timer latency, in
26344 +         case the wakeup was triggered by an expired timer. These histograms
26345 +         are available in the
26346 +
26347 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
26348 +
26349 +         directory. They reflect the apparent interrupt and scheduling latency
26350 +         and are best suited to determine the worst-case latency of a given
26351 +         system. To enable these histograms, write a non-zero number to
26352 +
26353 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
26354 +
26355  config ENABLE_DEFAULT_TRACERS
26356         bool "Trace process context switches and events"
26357         depends on !GENERIC_TRACER
26358 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
26359 index 05ea5167e6bb..bc08c67301ae 100644
26360 --- a/kernel/trace/Makefile
26361 +++ b/kernel/trace/Makefile
26362 @@ -40,6 +40,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
26363  obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
26364  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
26365  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
26366 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
26367 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
26368 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
26369 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
26370  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
26371  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
26372  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
26373 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
26374 new file mode 100644
26375 index 000000000000..7f6ee70dea41
26376 --- /dev/null
26377 +++ b/kernel/trace/latency_hist.c
26378 @@ -0,0 +1,1178 @@
26379 +/*
26380 + * kernel/trace/latency_hist.c
26381 + *
26382 + * Add support for histograms of preemption-off latency and
26383 + * interrupt-off latency and wakeup latency; it depends on
26384 + * Real-Time Preemption Support.
26385 + *
26386 + *  Copyright (C) 2005 MontaVista Software, Inc.
26387 + *  Yi Yang <yyang@ch.mvista.com>
26388 + *
26389 + *  Converted to work with the new latency tracer.
26390 + *  Copyright (C) 2008 Red Hat, Inc.
26391 + *    Steven Rostedt <srostedt@redhat.com>
26392 + *
26393 + */
26394 +#include <linux/module.h>
26395 +#include <linux/debugfs.h>
26396 +#include <linux/seq_file.h>
26397 +#include <linux/percpu.h>
26398 +#include <linux/kallsyms.h>
26399 +#include <linux/uaccess.h>
26400 +#include <linux/sched.h>
26401 +#include <linux/sched/rt.h>
26402 +#include <linux/slab.h>
26403 +#include <linux/atomic.h>
26404 +#include <asm/div64.h>
26405 +
26406 +#include "trace.h"
26407 +#include <trace/events/sched.h>
26408 +
26409 +#define NSECS_PER_USECS 1000L
26410 +
26411 +#define CREATE_TRACE_POINTS
26412 +#include <trace/events/hist.h>
26413 +
26414 +enum {
26415 +       IRQSOFF_LATENCY = 0,
26416 +       PREEMPTOFF_LATENCY,
26417 +       PREEMPTIRQSOFF_LATENCY,
26418 +       WAKEUP_LATENCY,
26419 +       WAKEUP_LATENCY_SHAREDPRIO,
26420 +       MISSED_TIMER_OFFSETS,
26421 +       TIMERANDWAKEUP_LATENCY,
26422 +       MAX_LATENCY_TYPE,
26423 +};
26424 +
26425 +#define MAX_ENTRY_NUM 10240
26426 +
26427 +struct hist_data {
26428 +       atomic_t hist_mode; /* 0 log, 1 don't log */
26429 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
26430 +       long min_lat;
26431 +       long max_lat;
26432 +       unsigned long long below_hist_bound_samples;
26433 +       unsigned long long above_hist_bound_samples;
26434 +       long long accumulate_lat;
26435 +       unsigned long long total_samples;
26436 +       unsigned long long hist_array[MAX_ENTRY_NUM];
26437 +};
26438 +
26439 +struct enable_data {
26440 +       int latency_type;
26441 +       int enabled;
26442 +};
26443 +
26444 +static char *latency_hist_dir_root = "latency_hist";
26445 +
26446 +#ifdef CONFIG_INTERRUPT_OFF_HIST
26447 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
26448 +static char *irqsoff_hist_dir = "irqsoff";
26449 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
26450 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
26451 +#endif
26452 +
26453 +#ifdef CONFIG_PREEMPT_OFF_HIST
26454 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
26455 +static char *preemptoff_hist_dir = "preemptoff";
26456 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
26457 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
26458 +#endif
26459 +
26460 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
26461 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
26462 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
26463 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
26464 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
26465 +#endif
26466 +
26467 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
26468 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
26469 +static struct enable_data preemptirqsoff_enabled_data = {
26470 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
26471 +       .enabled = 0,
26472 +};
26473 +#endif
26474 +
26475 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26476 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26477 +struct maxlatproc_data {
26478 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
26479 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
26480 +       int pid;
26481 +       int current_pid;
26482 +       int prio;
26483 +       int current_prio;
26484 +       long latency;
26485 +       long timeroffset;
26486 +       cycle_t timestamp;
26487 +};
26488 +#endif
26489 +
26490 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26491 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
26492 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
26493 +static char *wakeup_latency_hist_dir = "wakeup";
26494 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
26495 +static notrace void probe_wakeup_latency_hist_start(void *v,
26496 +       struct task_struct *p);
26497 +static notrace void probe_wakeup_latency_hist_stop(void *v,
26498 +       bool preempt, struct task_struct *prev, struct task_struct *next);
26499 +static notrace void probe_sched_migrate_task(void *,
26500 +       struct task_struct *task, int cpu);
26501 +static struct enable_data wakeup_latency_enabled_data = {
26502 +       .latency_type = WAKEUP_LATENCY,
26503 +       .enabled = 0,
26504 +};
26505 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
26506 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
26507 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
26508 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
26509 +static unsigned long wakeup_pid;
26510 +#endif
26511 +
26512 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26513 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
26514 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
26515 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
26516 +       long long offset, struct task_struct *curr, struct task_struct *task);
26517 +static struct enable_data missed_timer_offsets_enabled_data = {
26518 +       .latency_type = MISSED_TIMER_OFFSETS,
26519 +       .enabled = 0,
26520 +};
26521 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
26522 +static unsigned long missed_timer_offsets_pid;
26523 +#endif
26524 +
26525 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
26526 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26527 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
26528 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
26529 +static struct enable_data timerandwakeup_enabled_data = {
26530 +       .latency_type = TIMERANDWAKEUP_LATENCY,
26531 +       .enabled = 0,
26532 +};
26533 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
26534 +#endif
26535 +
26536 +void notrace latency_hist(int latency_type, int cpu, long latency,
26537 +                         long timeroffset, cycle_t stop,
26538 +                         struct task_struct *p)
26539 +{
26540 +       struct hist_data *my_hist;
26541 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26542 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26543 +       struct maxlatproc_data *mp = NULL;
26544 +#endif
26545 +
26546 +       if (!cpu_possible(cpu) || latency_type < 0 ||
26547 +           latency_type >= MAX_LATENCY_TYPE)
26548 +               return;
26549 +
26550 +       switch (latency_type) {
26551 +#ifdef CONFIG_INTERRUPT_OFF_HIST
26552 +       case IRQSOFF_LATENCY:
26553 +               my_hist = &per_cpu(irqsoff_hist, cpu);
26554 +               break;
26555 +#endif
26556 +#ifdef CONFIG_PREEMPT_OFF_HIST
26557 +       case PREEMPTOFF_LATENCY:
26558 +               my_hist = &per_cpu(preemptoff_hist, cpu);
26559 +               break;
26560 +#endif
26561 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
26562 +       case PREEMPTIRQSOFF_LATENCY:
26563 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
26564 +               break;
26565 +#endif
26566 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26567 +       case WAKEUP_LATENCY:
26568 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
26569 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
26570 +               break;
26571 +       case WAKEUP_LATENCY_SHAREDPRIO:
26572 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
26573 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
26574 +               break;
26575 +#endif
26576 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26577 +       case MISSED_TIMER_OFFSETS:
26578 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
26579 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
26580 +               break;
26581 +#endif
26582 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
26583 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26584 +       case TIMERANDWAKEUP_LATENCY:
26585 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
26586 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
26587 +               break;
26588 +#endif
26589 +
26590 +       default:
26591 +               return;
26592 +       }
26593 +
26594 +       latency += my_hist->offset;
26595 +
26596 +       if (atomic_read(&my_hist->hist_mode) == 0)
26597 +               return;
26598 +
26599 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
26600 +               if (latency < 0)
26601 +                       my_hist->below_hist_bound_samples++;
26602 +               else
26603 +                       my_hist->above_hist_bound_samples++;
26604 +       } else
26605 +               my_hist->hist_array[latency]++;
26606 +
26607 +       if (unlikely(latency > my_hist->max_lat ||
26608 +           my_hist->min_lat == LONG_MAX)) {
26609 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26610 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26611 +               if (latency_type == WAKEUP_LATENCY ||
26612 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
26613 +                   latency_type == MISSED_TIMER_OFFSETS ||
26614 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
26615 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
26616 +                       strncpy(mp->current_comm, current->comm,
26617 +                           sizeof(mp->current_comm));
26618 +                       mp->pid = task_pid_nr(p);
26619 +                       mp->current_pid = task_pid_nr(current);
26620 +                       mp->prio = p->prio;
26621 +                       mp->current_prio = current->prio;
26622 +                       mp->latency = latency;
26623 +                       mp->timeroffset = timeroffset;
26624 +                       mp->timestamp = stop;
26625 +               }
26626 +#endif
26627 +               my_hist->max_lat = latency;
26628 +       }
26629 +       if (unlikely(latency < my_hist->min_lat))
26630 +               my_hist->min_lat = latency;
26631 +       my_hist->total_samples++;
26632 +       my_hist->accumulate_lat += latency;
26633 +}
26634 +
26635 +static void *l_start(struct seq_file *m, loff_t *pos)
26636 +{
26637 +       loff_t *index_ptr = NULL;
26638 +       loff_t index = *pos;
26639 +       struct hist_data *my_hist = m->private;
26640 +
26641 +       if (index == 0) {
26642 +               char minstr[32], avgstr[32], maxstr[32];
26643 +
26644 +               atomic_dec(&my_hist->hist_mode);
26645 +
26646 +               if (likely(my_hist->total_samples)) {
26647 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
26648 +                           my_hist->total_samples);
26649 +                       snprintf(minstr, sizeof(minstr), "%ld",
26650 +                           my_hist->min_lat - my_hist->offset);
26651 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
26652 +                           avg - my_hist->offset);
26653 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
26654 +                           my_hist->max_lat - my_hist->offset);
26655 +               } else {
26656 +                       strcpy(minstr, "<undef>");
26657 +                       strcpy(avgstr, minstr);
26658 +                       strcpy(maxstr, minstr);
26659 +               }
26660 +
26661 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
26662 +                          "#Average latency: %s microseconds\n"
26663 +                          "#Maximum latency: %s microseconds\n"
26664 +                          "#Total samples: %llu\n"
26665 +                          "#There are %llu samples lower than %ld"
26666 +                          " microseconds.\n"
26667 +                          "#There are %llu samples greater or equal"
26668 +                          " than %ld microseconds.\n"
26669 +                          "#usecs\t%16s\n",
26670 +                          minstr, avgstr, maxstr,
26671 +                          my_hist->total_samples,
26672 +                          my_hist->below_hist_bound_samples,
26673 +                          -my_hist->offset,
26674 +                          my_hist->above_hist_bound_samples,
26675 +                          MAX_ENTRY_NUM - my_hist->offset,
26676 +                          "samples");
26677 +       }
26678 +       if (index < MAX_ENTRY_NUM) {
26679 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
26680 +               if (index_ptr)
26681 +                       *index_ptr = index;
26682 +       }
26683 +
26684 +       return index_ptr;
26685 +}
26686 +
26687 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
26688 +{
26689 +       loff_t *index_ptr = p;
26690 +       struct hist_data *my_hist = m->private;
26691 +
26692 +       if (++*pos >= MAX_ENTRY_NUM) {
26693 +               atomic_inc(&my_hist->hist_mode);
26694 +               return NULL;
26695 +       }
26696 +       *index_ptr = *pos;
26697 +       return index_ptr;
26698 +}
26699 +
26700 +static void l_stop(struct seq_file *m, void *p)
26701 +{
26702 +       kfree(p);
26703 +}
26704 +
26705 +static int l_show(struct seq_file *m, void *p)
26706 +{
26707 +       int index = *(loff_t *) p;
26708 +       struct hist_data *my_hist = m->private;
26709 +
26710 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
26711 +           my_hist->hist_array[index]);
26712 +       return 0;
26713 +}
26714 +
26715 +static const struct seq_operations latency_hist_seq_op = {
26716 +       .start = l_start,
26717 +       .next  = l_next,
26718 +       .stop  = l_stop,
26719 +       .show  = l_show
26720 +};
26721 +
26722 +static int latency_hist_open(struct inode *inode, struct file *file)
26723 +{
26724 +       int ret;
26725 +
26726 +       ret = seq_open(file, &latency_hist_seq_op);
26727 +       if (!ret) {
26728 +               struct seq_file *seq = file->private_data;
26729 +               seq->private = inode->i_private;
26730 +       }
26731 +       return ret;
26732 +}
26733 +
26734 +static const struct file_operations latency_hist_fops = {
26735 +       .open = latency_hist_open,
26736 +       .read = seq_read,
26737 +       .llseek = seq_lseek,
26738 +       .release = seq_release,
26739 +};
26740 +
26741 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26742 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26743 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
26744 +{
26745 +       mp->comm[0] = mp->current_comm[0] = '\0';
26746 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
26747 +           mp->latency = mp->timeroffset = -1;
26748 +       mp->timestamp = 0;
26749 +}
26750 +#endif
26751 +
26752 +static void hist_reset(struct hist_data *hist)
26753 +{
26754 +       atomic_dec(&hist->hist_mode);
26755 +
26756 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
26757 +       hist->below_hist_bound_samples = 0ULL;
26758 +       hist->above_hist_bound_samples = 0ULL;
26759 +       hist->min_lat = LONG_MAX;
26760 +       hist->max_lat = LONG_MIN;
26761 +       hist->total_samples = 0ULL;
26762 +       hist->accumulate_lat = 0LL;
26763 +
26764 +       atomic_inc(&hist->hist_mode);
26765 +}
26766 +
26767 +static ssize_t
26768 +latency_hist_reset(struct file *file, const char __user *a,
26769 +                  size_t size, loff_t *off)
26770 +{
26771 +       int cpu;
26772 +       struct hist_data *hist = NULL;
26773 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26774 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26775 +       struct maxlatproc_data *mp = NULL;
26776 +#endif
26777 +       off_t latency_type = (off_t) file->private_data;
26778 +
26779 +       for_each_online_cpu(cpu) {
26780 +
26781 +               switch (latency_type) {
26782 +#ifdef CONFIG_PREEMPT_OFF_HIST
26783 +               case PREEMPTOFF_LATENCY:
26784 +                       hist = &per_cpu(preemptoff_hist, cpu);
26785 +                       break;
26786 +#endif
26787 +#ifdef CONFIG_INTERRUPT_OFF_HIST
26788 +               case IRQSOFF_LATENCY:
26789 +                       hist = &per_cpu(irqsoff_hist, cpu);
26790 +                       break;
26791 +#endif
26792 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
26793 +               case PREEMPTIRQSOFF_LATENCY:
26794 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
26795 +                       break;
26796 +#endif
26797 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26798 +               case WAKEUP_LATENCY:
26799 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
26800 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
26801 +                       break;
26802 +               case WAKEUP_LATENCY_SHAREDPRIO:
26803 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
26804 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
26805 +                       break;
26806 +#endif
26807 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26808 +               case MISSED_TIMER_OFFSETS:
26809 +                       hist = &per_cpu(missed_timer_offsets, cpu);
26810 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
26811 +                       break;
26812 +#endif
26813 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
26814 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26815 +               case TIMERANDWAKEUP_LATENCY:
26816 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
26817 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
26818 +                       break;
26819 +#endif
26820 +               }
26821 +
26822 +               hist_reset(hist);
26823 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26824 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26825 +               if (latency_type == WAKEUP_LATENCY ||
26826 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
26827 +                   latency_type == MISSED_TIMER_OFFSETS ||
26828 +                   latency_type == TIMERANDWAKEUP_LATENCY)
26829 +                       clear_maxlatprocdata(mp);
26830 +#endif
26831 +       }
26832 +
26833 +       return size;
26834 +}
26835 +
26836 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26837 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26838 +static ssize_t
26839 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
26840 +{
26841 +       char buf[64];
26842 +       int r;
26843 +       unsigned long *this_pid = file->private_data;
26844 +
26845 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
26846 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
26847 +}
26848 +
26849 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
26850 +                     size_t cnt, loff_t *ppos)
26851 +{
26852 +       char buf[64];
26853 +       unsigned long pid;
26854 +       unsigned long *this_pid = file->private_data;
26855 +
26856 +       if (cnt >= sizeof(buf))
26857 +               return -EINVAL;
26858 +
26859 +       if (copy_from_user(&buf, ubuf, cnt))
26860 +               return -EFAULT;
26861 +
26862 +       buf[cnt] = '\0';
26863 +
26864 +       if (kstrtoul(buf, 10, &pid))
26865 +               return -EINVAL;
26866 +
26867 +       *this_pid = pid;
26868 +
26869 +       return cnt;
26870 +}
26871 +#endif
26872 +
26873 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26874 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26875 +static ssize_t
26876 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
26877 +{
26878 +       int r;
26879 +       struct maxlatproc_data *mp = file->private_data;
26880 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
26881 +       unsigned long long t;
26882 +       unsigned long usecs, secs;
26883 +       char *buf;
26884 +
26885 +       if (mp->pid == -1 || mp->current_pid == -1) {
26886 +               buf = "(none)\n";
26887 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
26888 +                   strlen(buf));
26889 +       }
26890 +
26891 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
26892 +       if (buf == NULL)
26893 +               return -ENOMEM;
26894 +
26895 +       t = ns2usecs(mp->timestamp);
26896 +       usecs = do_div(t, USEC_PER_SEC);
26897 +       secs = (unsigned long) t;
26898 +       r = snprintf(buf, strmaxlen,
26899 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
26900 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
26901 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
26902 +           secs, usecs);
26903 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
26904 +       kfree(buf);
26905 +       return r;
26906 +}
26907 +#endif
26908 +
26909 +static ssize_t
26910 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
26911 +{
26912 +       char buf[64];
26913 +       struct enable_data *ed = file->private_data;
26914 +       int r;
26915 +
26916 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
26917 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
26918 +}
26919 +
26920 +static ssize_t
26921 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
26922 +{
26923 +       char buf[64];
26924 +       long enable;
26925 +       struct enable_data *ed = file->private_data;
26926 +
26927 +       if (cnt >= sizeof(buf))
26928 +               return -EINVAL;
26929 +
26930 +       if (copy_from_user(&buf, ubuf, cnt))
26931 +               return -EFAULT;
26932 +
26933 +       buf[cnt] = 0;
26934 +
26935 +       if (kstrtoul(buf, 10, &enable))
26936 +               return -EINVAL;
26937 +
26938 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
26939 +               return cnt;
26940 +
26941 +       if (enable) {
26942 +               int ret;
26943 +
26944 +               switch (ed->latency_type) {
26945 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
26946 +               case PREEMPTIRQSOFF_LATENCY:
26947 +                       ret = register_trace_preemptirqsoff_hist(
26948 +                           probe_preemptirqsoff_hist, NULL);
26949 +                       if (ret) {
26950 +                               pr_info("wakeup trace: Couldn't assign "
26951 +                                   "probe_preemptirqsoff_hist "
26952 +                                   "to trace_preemptirqsoff_hist\n");
26953 +                               return ret;
26954 +                       }
26955 +                       break;
26956 +#endif
26957 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26958 +               case WAKEUP_LATENCY:
26959 +                       ret = register_trace_sched_wakeup(
26960 +                           probe_wakeup_latency_hist_start, NULL);
26961 +                       if (ret) {
26962 +                               pr_info("wakeup trace: Couldn't assign "
26963 +                                   "probe_wakeup_latency_hist_start "
26964 +                                   "to trace_sched_wakeup\n");
26965 +                               return ret;
26966 +                       }
26967 +                       ret = register_trace_sched_wakeup_new(
26968 +                           probe_wakeup_latency_hist_start, NULL);
26969 +                       if (ret) {
26970 +                               pr_info("wakeup trace: Couldn't assign "
26971 +                                   "probe_wakeup_latency_hist_start "
26972 +                                   "to trace_sched_wakeup_new\n");
26973 +                               unregister_trace_sched_wakeup(
26974 +                                   probe_wakeup_latency_hist_start, NULL);
26975 +                               return ret;
26976 +                       }
26977 +                       ret = register_trace_sched_switch(
26978 +                           probe_wakeup_latency_hist_stop, NULL);
26979 +                       if (ret) {
26980 +                               pr_info("wakeup trace: Couldn't assign "
26981 +                                   "probe_wakeup_latency_hist_stop "
26982 +                                   "to trace_sched_switch\n");
26983 +                               unregister_trace_sched_wakeup(
26984 +                                   probe_wakeup_latency_hist_start, NULL);
26985 +                               unregister_trace_sched_wakeup_new(
26986 +                                   probe_wakeup_latency_hist_start, NULL);
26987 +                               return ret;
26988 +                       }
26989 +                       ret = register_trace_sched_migrate_task(
26990 +                           probe_sched_migrate_task, NULL);
26991 +                       if (ret) {
26992 +                               pr_info("wakeup trace: Couldn't assign "
26993 +                                   "probe_sched_migrate_task "
26994 +                                   "to trace_sched_migrate_task\n");
26995 +                               unregister_trace_sched_wakeup(
26996 +                                   probe_wakeup_latency_hist_start, NULL);
26997 +                               unregister_trace_sched_wakeup_new(
26998 +                                   probe_wakeup_latency_hist_start, NULL);
26999 +                               unregister_trace_sched_switch(
27000 +                                   probe_wakeup_latency_hist_stop, NULL);
27001 +                               return ret;
27002 +                       }
27003 +                       break;
27004 +#endif
27005 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27006 +               case MISSED_TIMER_OFFSETS:
27007 +                       ret = register_trace_hrtimer_interrupt(
27008 +                           probe_hrtimer_interrupt, NULL);
27009 +                       if (ret) {
27010 +                               pr_info("wakeup trace: Couldn't assign "
27011 +                                   "probe_hrtimer_interrupt "
27012 +                                   "to trace_hrtimer_interrupt\n");
27013 +                               return ret;
27014 +                       }
27015 +                       break;
27016 +#endif
27017 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27018 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27019 +               case TIMERANDWAKEUP_LATENCY:
27020 +                       if (!wakeup_latency_enabled_data.enabled ||
27021 +                           !missed_timer_offsets_enabled_data.enabled)
27022 +                               return -EINVAL;
27023 +                       break;
27024 +#endif
27025 +               default:
27026 +                       break;
27027 +               }
27028 +       } else {
27029 +               switch (ed->latency_type) {
27030 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
27031 +               case PREEMPTIRQSOFF_LATENCY:
27032 +                       {
27033 +                               int cpu;
27034 +
27035 +                               unregister_trace_preemptirqsoff_hist(
27036 +                                   probe_preemptirqsoff_hist, NULL);
27037 +                               for_each_online_cpu(cpu) {
27038 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27039 +                                       per_cpu(hist_irqsoff_counting,
27040 +                                           cpu) = 0;
27041 +#endif
27042 +#ifdef CONFIG_PREEMPT_OFF_HIST
27043 +                                       per_cpu(hist_preemptoff_counting,
27044 +                                           cpu) = 0;
27045 +#endif
27046 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27047 +                                       per_cpu(hist_preemptirqsoff_counting,
27048 +                                           cpu) = 0;
27049 +#endif
27050 +                               }
27051 +                       }
27052 +                       break;
27053 +#endif
27054 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27055 +               case WAKEUP_LATENCY:
27056 +                       {
27057 +                               int cpu;
27058 +
27059 +                               unregister_trace_sched_wakeup(
27060 +                                   probe_wakeup_latency_hist_start, NULL);
27061 +                               unregister_trace_sched_wakeup_new(
27062 +                                   probe_wakeup_latency_hist_start, NULL);
27063 +                               unregister_trace_sched_switch(
27064 +                                   probe_wakeup_latency_hist_stop, NULL);
27065 +                               unregister_trace_sched_migrate_task(
27066 +                                   probe_sched_migrate_task, NULL);
27067 +
27068 +                               for_each_online_cpu(cpu) {
27069 +                                       per_cpu(wakeup_task, cpu) = NULL;
27070 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
27071 +                               }
27072 +                       }
27073 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27074 +                       timerandwakeup_enabled_data.enabled = 0;
27075 +#endif
27076 +                       break;
27077 +#endif
27078 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27079 +               case MISSED_TIMER_OFFSETS:
27080 +                       unregister_trace_hrtimer_interrupt(
27081 +                           probe_hrtimer_interrupt, NULL);
27082 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27083 +                       timerandwakeup_enabled_data.enabled = 0;
27084 +#endif
27085 +                       break;
27086 +#endif
27087 +               default:
27088 +                       break;
27089 +               }
27090 +       }
27091 +       ed->enabled = enable;
27092 +       return cnt;
27093 +}
27094 +
27095 +static const struct file_operations latency_hist_reset_fops = {
27096 +       .open = tracing_open_generic,
27097 +       .write = latency_hist_reset,
27098 +};
27099 +
27100 +static const struct file_operations enable_fops = {
27101 +       .open = tracing_open_generic,
27102 +       .read = show_enable,
27103 +       .write = do_enable,
27104 +};
27105 +
27106 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27107 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27108 +static const struct file_operations pid_fops = {
27109 +       .open = tracing_open_generic,
27110 +       .read = show_pid,
27111 +       .write = do_pid,
27112 +};
27113 +
27114 +static const struct file_operations maxlatproc_fops = {
27115 +       .open = tracing_open_generic,
27116 +       .read = show_maxlatproc,
27117 +};
27118 +#endif
27119 +
27120 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
27121 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
27122 +       int starthist)
27123 +{
27124 +       int cpu = raw_smp_processor_id();
27125 +       int time_set = 0;
27126 +
27127 +       if (starthist) {
27128 +               cycle_t uninitialized_var(start);
27129 +
27130 +               if (!preempt_count() && !irqs_disabled())
27131 +                       return;
27132 +
27133 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27134 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
27135 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
27136 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
27137 +                       start = ftrace_now(cpu);
27138 +                       time_set++;
27139 +                       per_cpu(hist_irqsoff_start, cpu) = start;
27140 +               }
27141 +#endif
27142 +
27143 +#ifdef CONFIG_PREEMPT_OFF_HIST
27144 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
27145 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
27146 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
27147 +                       if (!(time_set++))
27148 +                               start = ftrace_now(cpu);
27149 +                       per_cpu(hist_preemptoff_start, cpu) = start;
27150 +               }
27151 +#endif
27152 +
27153 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27154 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
27155 +                   per_cpu(hist_preemptoff_counting, cpu) &&
27156 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
27157 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
27158 +                       if (!time_set)
27159 +                               start = ftrace_now(cpu);
27160 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
27161 +               }
27162 +#endif
27163 +       } else {
27164 +               cycle_t uninitialized_var(stop);
27165 +
27166 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27167 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
27168 +                   per_cpu(hist_irqsoff_counting, cpu)) {
27169 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
27170 +
27171 +                       stop = ftrace_now(cpu);
27172 +                       time_set++;
27173 +                       if (start) {
27174 +                               long latency = ((long) (stop - start)) /
27175 +                                   NSECS_PER_USECS;
27176 +
27177 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
27178 +                                   stop, NULL);
27179 +                       }
27180 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
27181 +               }
27182 +#endif
27183 +
27184 +#ifdef CONFIG_PREEMPT_OFF_HIST
27185 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
27186 +                   per_cpu(hist_preemptoff_counting, cpu)) {
27187 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
27188 +
27189 +                       if (!(time_set++))
27190 +                               stop = ftrace_now(cpu);
27191 +                       if (start) {
27192 +                               long latency = ((long) (stop - start)) /
27193 +                                   NSECS_PER_USECS;
27194 +
27195 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
27196 +                                   0, stop, NULL);
27197 +                       }
27198 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
27199 +               }
27200 +#endif
27201 +
27202 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27203 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
27204 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
27205 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
27206 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
27207 +
27208 +                       if (!time_set)
27209 +                               stop = ftrace_now(cpu);
27210 +                       if (start) {
27211 +                               long latency = ((long) (stop - start)) /
27212 +                                   NSECS_PER_USECS;
27213 +
27214 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
27215 +                                   latency, 0, stop, NULL);
27216 +                       }
27217 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
27218 +               }
27219 +#endif
27220 +       }
27221 +}
27222 +#endif
27223 +
27224 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27225 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
27226 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
27227 +       int cpu)
27228 +{
27229 +       int old_cpu = task_cpu(task);
27230 +
27231 +       if (cpu != old_cpu) {
27232 +               unsigned long flags;
27233 +               struct task_struct *cpu_wakeup_task;
27234 +
27235 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
27236 +
27237 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
27238 +               if (task == cpu_wakeup_task) {
27239 +                       put_task_struct(cpu_wakeup_task);
27240 +                       per_cpu(wakeup_task, old_cpu) = NULL;
27241 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
27242 +                       get_task_struct(cpu_wakeup_task);
27243 +               }
27244 +
27245 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
27246 +       }
27247 +}
27248 +
27249 +static notrace void probe_wakeup_latency_hist_start(void *v,
27250 +       struct task_struct *p)
27251 +{
27252 +       unsigned long flags;
27253 +       struct task_struct *curr = current;
27254 +       int cpu = task_cpu(p);
27255 +       struct task_struct *cpu_wakeup_task;
27256 +
27257 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
27258 +
27259 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
27260 +
27261 +       if (wakeup_pid) {
27262 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
27263 +                   p->prio == curr->prio)
27264 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
27265 +               if (likely(wakeup_pid != task_pid_nr(p)))
27266 +                       goto out;
27267 +       } else {
27268 +               if (likely(!rt_task(p)) ||
27269 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
27270 +                   p->prio > curr->prio)
27271 +                       goto out;
27272 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
27273 +                   p->prio == curr->prio)
27274 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
27275 +       }
27276 +
27277 +       if (cpu_wakeup_task)
27278 +               put_task_struct(cpu_wakeup_task);
27279 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
27280 +       get_task_struct(cpu_wakeup_task);
27281 +       cpu_wakeup_task->preempt_timestamp_hist =
27282 +               ftrace_now(raw_smp_processor_id());
27283 +out:
27284 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
27285 +}
27286 +
27287 +static notrace void probe_wakeup_latency_hist_stop(void *v,
27288 +       bool preempt, struct task_struct *prev, struct task_struct *next)
27289 +{
27290 +       unsigned long flags;
27291 +       int cpu = task_cpu(next);
27292 +       long latency;
27293 +       cycle_t stop;
27294 +       struct task_struct *cpu_wakeup_task;
27295 +
27296 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
27297 +
27298 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
27299 +
27300 +       if (cpu_wakeup_task == NULL)
27301 +               goto out;
27302 +
27303 +       /* Already running? */
27304 +       if (unlikely(current == cpu_wakeup_task))
27305 +               goto out_reset;
27306 +
27307 +       if (next != cpu_wakeup_task) {
27308 +               if (next->prio < cpu_wakeup_task->prio)
27309 +                       goto out_reset;
27310 +
27311 +               if (next->prio == cpu_wakeup_task->prio)
27312 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
27313 +
27314 +               goto out;
27315 +       }
27316 +
27317 +       if (current->prio == cpu_wakeup_task->prio)
27318 +               per_cpu(wakeup_sharedprio, cpu) = 1;
27319 +
27320 +       /*
27321 +        * The task we are waiting for is about to be switched to.
27322 +        * Calculate latency and store it in histogram.
27323 +        */
27324 +       stop = ftrace_now(raw_smp_processor_id());
27325 +
27326 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
27327 +           NSECS_PER_USECS;
27328 +
27329 +       if (per_cpu(wakeup_sharedprio, cpu)) {
27330 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
27331 +                   next);
27332 +               per_cpu(wakeup_sharedprio, cpu) = 0;
27333 +       } else {
27334 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
27335 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27336 +               if (timerandwakeup_enabled_data.enabled) {
27337 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
27338 +                           next->timer_offset + latency, next->timer_offset,
27339 +                           stop, next);
27340 +               }
27341 +#endif
27342 +       }
27343 +
27344 +out_reset:
27345 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27346 +       next->timer_offset = 0;
27347 +#endif
27348 +       put_task_struct(cpu_wakeup_task);
27349 +       per_cpu(wakeup_task, cpu) = NULL;
27350 +out:
27351 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
27352 +}
27353 +#endif
27354 +
27355 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27356 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
27357 +       long long latency_ns, struct task_struct *curr,
27358 +       struct task_struct *task)
27359 +{
27360 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
27361 +           (task->prio < curr->prio ||
27362 +           (task->prio == curr->prio &&
27363 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
27364 +               long latency;
27365 +               cycle_t now;
27366 +
27367 +               if (missed_timer_offsets_pid) {
27368 +                       if (likely(missed_timer_offsets_pid !=
27369 +                           task_pid_nr(task)))
27370 +                               return;
27371 +               }
27372 +
27373 +               now = ftrace_now(cpu);
27374 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
27375 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
27376 +                   task);
27377 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27378 +               task->timer_offset = latency;
27379 +#endif
27380 +       }
27381 +}
27382 +#endif
27383 +
27384 +static __init int latency_hist_init(void)
27385 +{
27386 +       struct dentry *latency_hist_root = NULL;
27387 +       struct dentry *dentry;
27388 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27389 +       struct dentry *dentry_sharedprio;
27390 +#endif
27391 +       struct dentry *entry;
27392 +       struct dentry *enable_root;
27393 +       int i = 0;
27394 +       struct hist_data *my_hist;
27395 +       char name[64];
27396 +       char *cpufmt = "CPU%d";
27397 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27398 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27399 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
27400 +       struct maxlatproc_data *mp = NULL;
27401 +#endif
27402 +
27403 +       dentry = tracing_init_dentry();
27404 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
27405 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
27406 +
27407 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27408 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
27409 +       for_each_possible_cpu(i) {
27410 +               sprintf(name, cpufmt, i);
27411 +               entry = debugfs_create_file(name, 0444, dentry,
27412 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
27413 +               my_hist = &per_cpu(irqsoff_hist, i);
27414 +               atomic_set(&my_hist->hist_mode, 1);
27415 +               my_hist->min_lat = LONG_MAX;
27416 +       }
27417 +       entry = debugfs_create_file("reset", 0644, dentry,
27418 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
27419 +#endif
27420 +
27421 +#ifdef CONFIG_PREEMPT_OFF_HIST
27422 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
27423 +           latency_hist_root);
27424 +       for_each_possible_cpu(i) {
27425 +               sprintf(name, cpufmt, i);
27426 +               entry = debugfs_create_file(name, 0444, dentry,
27427 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
27428 +               my_hist = &per_cpu(preemptoff_hist, i);
27429 +               atomic_set(&my_hist->hist_mode, 1);
27430 +               my_hist->min_lat = LONG_MAX;
27431 +       }
27432 +       entry = debugfs_create_file("reset", 0644, dentry,
27433 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
27434 +#endif
27435 +
27436 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27437 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
27438 +           latency_hist_root);
27439 +       for_each_possible_cpu(i) {
27440 +               sprintf(name, cpufmt, i);
27441 +               entry = debugfs_create_file(name, 0444, dentry,
27442 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
27443 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
27444 +               atomic_set(&my_hist->hist_mode, 1);
27445 +               my_hist->min_lat = LONG_MAX;
27446 +       }
27447 +       entry = debugfs_create_file("reset", 0644, dentry,
27448 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
27449 +#endif
27450 +
27451 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
27452 +       entry = debugfs_create_file("preemptirqsoff", 0644,
27453 +           enable_root, (void *)&preemptirqsoff_enabled_data,
27454 +           &enable_fops);
27455 +#endif
27456 +
27457 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27458 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
27459 +           latency_hist_root);
27460 +       dentry_sharedprio = debugfs_create_dir(
27461 +           wakeup_latency_hist_dir_sharedprio, dentry);
27462 +       for_each_possible_cpu(i) {
27463 +               sprintf(name, cpufmt, i);
27464 +
27465 +               entry = debugfs_create_file(name, 0444, dentry,
27466 +                   &per_cpu(wakeup_latency_hist, i),
27467 +                   &latency_hist_fops);
27468 +               my_hist = &per_cpu(wakeup_latency_hist, i);
27469 +               atomic_set(&my_hist->hist_mode, 1);
27470 +               my_hist->min_lat = LONG_MAX;
27471 +
27472 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
27473 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
27474 +                   &latency_hist_fops);
27475 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
27476 +               atomic_set(&my_hist->hist_mode, 1);
27477 +               my_hist->min_lat = LONG_MAX;
27478 +
27479 +               sprintf(name, cpufmt_maxlatproc, i);
27480 +
27481 +               mp = &per_cpu(wakeup_maxlatproc, i);
27482 +               entry = debugfs_create_file(name, 0444, dentry, mp,
27483 +                   &maxlatproc_fops);
27484 +               clear_maxlatprocdata(mp);
27485 +
27486 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
27487 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
27488 +                   &maxlatproc_fops);
27489 +               clear_maxlatprocdata(mp);
27490 +       }
27491 +       entry = debugfs_create_file("pid", 0644, dentry,
27492 +           (void *)&wakeup_pid, &pid_fops);
27493 +       entry = debugfs_create_file("reset", 0644, dentry,
27494 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
27495 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
27496 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
27497 +       entry = debugfs_create_file("wakeup", 0644,
27498 +           enable_root, (void *)&wakeup_latency_enabled_data,
27499 +           &enable_fops);
27500 +#endif
27501 +
27502 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27503 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
27504 +           latency_hist_root);
27505 +       for_each_possible_cpu(i) {
27506 +               sprintf(name, cpufmt, i);
27507 +               entry = debugfs_create_file(name, 0444, dentry,
27508 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
27509 +               my_hist = &per_cpu(missed_timer_offsets, i);
27510 +               atomic_set(&my_hist->hist_mode, 1);
27511 +               my_hist->min_lat = LONG_MAX;
27512 +
27513 +               sprintf(name, cpufmt_maxlatproc, i);
27514 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
27515 +               entry = debugfs_create_file(name, 0444, dentry, mp,
27516 +                   &maxlatproc_fops);
27517 +               clear_maxlatprocdata(mp);
27518 +       }
27519 +       entry = debugfs_create_file("pid", 0644, dentry,
27520 +           (void *)&missed_timer_offsets_pid, &pid_fops);
27521 +       entry = debugfs_create_file("reset", 0644, dentry,
27522 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
27523 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
27524 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
27525 +           &enable_fops);
27526 +#endif
27527 +
27528 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27529 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27530 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
27531 +           latency_hist_root);
27532 +       for_each_possible_cpu(i) {
27533 +               sprintf(name, cpufmt, i);
27534 +               entry = debugfs_create_file(name, 0444, dentry,
27535 +                   &per_cpu(timerandwakeup_latency_hist, i),
27536 +                   &latency_hist_fops);
27537 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
27538 +               atomic_set(&my_hist->hist_mode, 1);
27539 +               my_hist->min_lat = LONG_MAX;
27540 +
27541 +               sprintf(name, cpufmt_maxlatproc, i);
27542 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
27543 +               entry = debugfs_create_file(name, 0444, dentry, mp,
27544 +                   &maxlatproc_fops);
27545 +               clear_maxlatprocdata(mp);
27546 +       }
27547 +       entry = debugfs_create_file("reset", 0644, dentry,
27548 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
27549 +       entry = debugfs_create_file("timerandwakeup", 0644,
27550 +           enable_root, (void *)&timerandwakeup_enabled_data,
27551 +           &enable_fops);
27552 +#endif
27553 +       return 0;
27554 +}
27555 +
27556 +device_initcall(latency_hist_init);
27557 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
27558 index 059233abcfcf..cad1a28bfbe2 100644
27559 --- a/kernel/trace/trace.c
27560 +++ b/kernel/trace/trace.c
27561 @@ -1652,6 +1652,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
27562         struct task_struct *tsk = current;
27563  
27564         entry->preempt_count            = pc & 0xff;
27565 +       entry->preempt_lazy_count       = preempt_lazy_count();
27566         entry->pid                      = (tsk) ? tsk->pid : 0;
27567         entry->flags =
27568  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
27569 @@ -1661,8 +1662,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
27570  #endif
27571                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
27572                 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
27573 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
27574 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
27575 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
27576                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
27577 +
27578 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
27579  }
27580  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
27581  
27582 @@ -2555,14 +2559,17 @@ get_total_entries(struct trace_buffer *buf,
27583  
27584  static void print_lat_help_header(struct seq_file *m)
27585  {
27586 -       seq_puts(m, "#                  _------=> CPU#            \n"
27587 -                   "#                 / _-----=> irqs-off        \n"
27588 -                   "#                | / _----=> need-resched    \n"
27589 -                   "#                || / _---=> hardirq/softirq \n"
27590 -                   "#                ||| / _--=> preempt-depth   \n"
27591 -                   "#                |||| /     delay            \n"
27592 -                   "#  cmd     pid   ||||| time  |   caller      \n"
27593 -                   "#     \\   /      |||||  \\    |   /         \n");
27594 +       seq_puts(m, "#                  _--------=> CPU#              \n"
27595 +                   "#                 / _-------=> irqs-off          \n"
27596 +                   "#                | / _------=> need-resched      \n"
27597 +                   "#                || / _-----=> need-resched_lazy \n"
27598 +                   "#                ||| / _----=> hardirq/softirq   \n"
27599 +                   "#                |||| / _---=> preempt-depth     \n"
27600 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
27601 +                   "#                |||||| / _-=> migrate-disable   \n"
27602 +                   "#                ||||||| /     delay             \n"
27603 +                   "# cmd     pid    |||||||| time   |  caller       \n"
27604 +                   "#     \\   /      ||||||||   \\    |  /            \n");
27605  }
27606  
27607  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
27608 @@ -2588,11 +2595,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
27609         print_event_info(buf, m);
27610         seq_puts(m, "#                              _-----=> irqs-off\n"
27611                     "#                             / _----=> need-resched\n"
27612 -                   "#                            | / _---=> hardirq/softirq\n"
27613 -                   "#                            || / _--=> preempt-depth\n"
27614 -                   "#                            ||| /     delay\n"
27615 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
27616 -                   "#              | |       |   ||||       |         |\n");
27617 +                   "#                            |/  _-----=> need-resched_lazy\n"
27618 +                   "#                            || / _---=> hardirq/softirq\n"
27619 +                   "#                            ||| / _--=> preempt-depth\n"
27620 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
27621 +                   "#                            ||||| / _-=> migrate-disable   \n"
27622 +                   "#                            |||||| /    delay\n"
27623 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
27624 +                   "#              | |       |   |||||||      |         |\n");
27625  }
27626  
27627  void
27628 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
27629 index 919d9d07686f..3bf86ece683c 100644
27630 --- a/kernel/trace/trace.h
27631 +++ b/kernel/trace/trace.h
27632 @@ -117,6 +117,7 @@ struct kretprobe_trace_entry_head {
27633   *  NEED_RESCHED       - reschedule is requested
27634   *  HARDIRQ            - inside an interrupt handler
27635   *  SOFTIRQ            - inside a softirq handler
27636 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
27637   */
27638  enum trace_flag_type {
27639         TRACE_FLAG_IRQS_OFF             = 0x01,
27640 @@ -125,6 +126,7 @@ enum trace_flag_type {
27641         TRACE_FLAG_HARDIRQ              = 0x08,
27642         TRACE_FLAG_SOFTIRQ              = 0x10,
27643         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
27644 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x40,
27645  };
27646  
27647  #define TRACE_BUF_SIZE         1024
27648 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
27649 index 996f0fd34312..5bd79b347398 100644
27650 --- a/kernel/trace/trace_events.c
27651 +++ b/kernel/trace/trace_events.c
27652 @@ -188,6 +188,8 @@ static int trace_define_common_fields(void)
27653         __common_field(unsigned char, flags);
27654         __common_field(unsigned char, preempt_count);
27655         __common_field(int, pid);
27656 +       __common_field(unsigned short, migrate_disable);
27657 +       __common_field(unsigned short, padding);
27658  
27659         return ret;
27660  }
27661 @@ -244,6 +246,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
27662  
27663         local_save_flags(fbuffer->flags);
27664         fbuffer->pc = preempt_count();
27665 +       /*
27666 +        * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
27667 +        * preemption (adding one to the preempt_count). Since we are
27668 +        * interested in the preempt_count at the time the tracepoint was
27669 +        * hit, we need to subtract one to offset the increment.
27670 +        */
27671 +       if (IS_ENABLED(CONFIG_PREEMPT))
27672 +               fbuffer->pc--;
27673         fbuffer->trace_file = trace_file;
27674  
27675         fbuffer->event =
27676 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
27677 index be3222b7d72e..553e71254ad6 100644
27678 --- a/kernel/trace/trace_irqsoff.c
27679 +++ b/kernel/trace/trace_irqsoff.c
27680 @@ -13,6 +13,7 @@
27681  #include <linux/uaccess.h>
27682  #include <linux/module.h>
27683  #include <linux/ftrace.h>
27684 +#include <trace/events/hist.h>
27685  
27686  #include "trace.h"
27687  
27688 @@ -424,11 +425,13 @@ void start_critical_timings(void)
27689  {
27690         if (preempt_trace() || irq_trace())
27691                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27692 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
27693  }
27694  EXPORT_SYMBOL_GPL(start_critical_timings);
27695  
27696  void stop_critical_timings(void)
27697  {
27698 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
27699         if (preempt_trace() || irq_trace())
27700                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27701  }
27702 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
27703  #ifdef CONFIG_PROVE_LOCKING
27704  void time_hardirqs_on(unsigned long a0, unsigned long a1)
27705  {
27706 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
27707         if (!preempt_trace() && irq_trace())
27708                 stop_critical_timing(a0, a1);
27709  }
27710 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
27711  {
27712         if (!preempt_trace() && irq_trace())
27713                 start_critical_timing(a0, a1);
27714 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
27715  }
27716  
27717  #else /* !CONFIG_PROVE_LOCKING */
27718 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
27719   */
27720  void trace_hardirqs_on(void)
27721  {
27722 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
27723         if (!preempt_trace() && irq_trace())
27724                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27725  }
27726 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
27727  {
27728         if (!preempt_trace() && irq_trace())
27729                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27730 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
27731  }
27732  EXPORT_SYMBOL(trace_hardirqs_off);
27733  
27734  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
27735  {
27736 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
27737         if (!preempt_trace() && irq_trace())
27738                 stop_critical_timing(CALLER_ADDR0, caller_addr);
27739  }
27740 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
27741  {
27742         if (!preempt_trace() && irq_trace())
27743                 start_critical_timing(CALLER_ADDR0, caller_addr);
27744 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
27745  }
27746  EXPORT_SYMBOL(trace_hardirqs_off_caller);
27747  
27748 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
27749  #ifdef CONFIG_PREEMPT_TRACER
27750  void trace_preempt_on(unsigned long a0, unsigned long a1)
27751  {
27752 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
27753         if (preempt_trace() && !irq_trace())
27754                 stop_critical_timing(a0, a1);
27755  }
27756  
27757  void trace_preempt_off(unsigned long a0, unsigned long a1)
27758  {
27759 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
27760         if (preempt_trace() && !irq_trace())
27761                 start_critical_timing(a0, a1);
27762  }
27763 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
27764 index 282982195e09..9f19d839a756 100644
27765 --- a/kernel/trace/trace_output.c
27766 +++ b/kernel/trace/trace_output.c
27767 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
27768  {
27769         char hardsoft_irq;
27770         char need_resched;
27771 +       char need_resched_lazy;
27772         char irqs_off;
27773         int hardirq;
27774         int softirq;
27775 @@ -413,6 +414,8 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
27776                 need_resched = '.';
27777                 break;
27778         }
27779 +       need_resched_lazy =
27780 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
27781  
27782         hardsoft_irq =
27783                 (hardirq && softirq) ? 'H' :
27784 @@ -420,14 +423,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
27785                 softirq ? 's' :
27786                 '.';
27787  
27788 -       trace_seq_printf(s, "%c%c%c",
27789 -                        irqs_off, need_resched, hardsoft_irq);
27790 +       trace_seq_printf(s, "%c%c%c%c",
27791 +                        irqs_off, need_resched, need_resched_lazy,
27792 +                        hardsoft_irq);
27793  
27794         if (entry->preempt_count)
27795                 trace_seq_printf(s, "%x", entry->preempt_count);
27796         else
27797                 trace_seq_putc(s, '.');
27798  
27799 +       if (entry->preempt_lazy_count)
27800 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
27801 +       else
27802 +               trace_seq_putc(s, '.');
27803 +
27804 +       if (entry->migrate_disable)
27805 +               trace_seq_printf(s, "%x", entry->migrate_disable);
27806 +       else
27807 +               trace_seq_putc(s, '.');
27808 +
27809         return !trace_seq_has_overflowed(s);
27810  }
27811  
27812 diff --git a/kernel/user.c b/kernel/user.c
27813 index b069ccbfb0b0..1a2e88e98b5e 100644
27814 --- a/kernel/user.c
27815 +++ b/kernel/user.c
27816 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
27817         if (!up)
27818                 return;
27819  
27820 -       local_irq_save(flags);
27821 +       local_irq_save_nort(flags);
27822         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
27823                 free_user(up, flags);
27824         else
27825 -               local_irq_restore(flags);
27826 +               local_irq_restore_nort(flags);
27827  }
27828  
27829  struct user_struct *alloc_uid(kuid_t uid)
27830 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
27831 index 198137b1cadc..47d143740774 100644
27832 --- a/kernel/watchdog.c
27833 +++ b/kernel/watchdog.c
27834 @@ -299,6 +299,8 @@ static int is_softlockup(unsigned long touch_ts)
27835  
27836  #ifdef CONFIG_HARDLOCKUP_DETECTOR
27837  
27838 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
27839 +
27840  static struct perf_event_attr wd_hw_attr = {
27841         .type           = PERF_TYPE_HARDWARE,
27842         .config         = PERF_COUNT_HW_CPU_CYCLES,
27843 @@ -333,6 +335,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
27844                 /* only print hardlockups once */
27845                 if (__this_cpu_read(hard_watchdog_warn) == true)
27846                         return;
27847 +               /*
27848 +                * If early-printk is enabled then make sure we do not
27849 +                * lock up in printk() and kill console logging:
27850 +                */
27851 +               printk_kill();
27852 +
27853 +               raw_spin_lock(&watchdog_output_lock);
27854  
27855                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
27856                 print_modules();
27857 @@ -350,8 +359,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
27858                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
27859                         trigger_allbutself_cpu_backtrace();
27860  
27861 +               raw_spin_unlock(&watchdog_output_lock);
27862                 if (hardlockup_panic)
27863 -                       panic("Hard LOCKUP");
27864 +                       nmi_panic(regs, "Hard LOCKUP");
27865  
27866                 __this_cpu_write(hard_watchdog_warn, true);
27867                 return;
27868 @@ -497,6 +507,7 @@ static void watchdog_enable(unsigned int cpu)
27869         /* kick off the timer for the hardlockup detector */
27870         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
27871         hrtimer->function = watchdog_timer_fn;
27872 +       hrtimer->irqsafe = 1;
27873  
27874         /* Enable the perf event */
27875         watchdog_nmi_enable(cpu);
27876 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
27877 index 2c2f971f3e75..965d5f65e847 100644
27878 --- a/kernel/workqueue.c
27879 +++ b/kernel/workqueue.c
27880 @@ -48,6 +48,8 @@
27881  #include <linux/nodemask.h>
27882  #include <linux/moduleparam.h>
27883  #include <linux/uaccess.h>
27884 +#include <linux/locallock.h>
27885 +#include <linux/delay.h>
27886  
27887  #include "workqueue_internal.h"
27888  
27889 @@ -121,11 +123,16 @@ enum {
27890   *    cpu or grabbing pool->lock is enough for read access.  If
27891   *    POOL_DISASSOCIATED is set, it's identical to L.
27892   *
27893 + *    On RT we need the extra protection via rt_lock_idle_list() for
27894 + *    the list manipulations against read access from
27895 + *    wq_worker_sleeping(). All other places are nicely serialized via
27896 + *    pool->lock.
27897 + *
27898   * A: pool->attach_mutex protected.
27899   *
27900   * PL: wq_pool_mutex protected.
27901   *
27902 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
27903 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
27904   *
27905   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
27906   *
27907 @@ -134,7 +141,7 @@ enum {
27908   *
27909   * WQ: wq->mutex protected.
27910   *
27911 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
27912 + * WR: wq->mutex protected for writes.  RCU protected for reads.
27913   *
27914   * MD: wq_mayday_lock protected.
27915   */
27916 @@ -183,7 +190,7 @@ struct worker_pool {
27917         atomic_t                nr_running ____cacheline_aligned_in_smp;
27918  
27919         /*
27920 -        * Destruction of pool is sched-RCU protected to allow dereferences
27921 +        * Destruction of pool is RCU protected to allow dereferences
27922          * from get_work_pool().
27923          */
27924         struct rcu_head         rcu;
27925 @@ -212,7 +219,7 @@ struct pool_workqueue {
27926         /*
27927          * Release of unbound pwq is punted to system_wq.  See put_pwq()
27928          * and pwq_unbound_release_workfn() for details.  pool_workqueue
27929 -        * itself is also sched-RCU protected so that the first pwq can be
27930 +        * itself is also RCU protected so that the first pwq can be
27931          * determined without grabbing wq->mutex.
27932          */
27933         struct work_struct      unbound_release_work;
27934 @@ -331,6 +338,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
27935  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
27936  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
27937  
27938 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
27939 +
27940  static int worker_thread(void *__worker);
27941  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27942  
27943 @@ -338,20 +347,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27944  #include <trace/events/workqueue.h>
27945  
27946  #define assert_rcu_or_pool_mutex()                                     \
27947 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
27948 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
27949                          !lockdep_is_held(&wq_pool_mutex),              \
27950 -                        "sched RCU or wq_pool_mutex should be held")
27951 +                        "RCU or wq_pool_mutex should be held")
27952  
27953  #define assert_rcu_or_wq_mutex(wq)                                     \
27954 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
27955 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
27956                          !lockdep_is_held(&wq->mutex),                  \
27957 -                        "sched RCU or wq->mutex should be held")
27958 +                        "RCU or wq->mutex should be held")
27959  
27960  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
27961 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
27962 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
27963                          !lockdep_is_held(&wq->mutex) &&                \
27964                          !lockdep_is_held(&wq_pool_mutex),              \
27965 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
27966 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
27967  
27968  #define for_each_cpu_worker_pool(pool, cpu)                            \
27969         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
27970 @@ -363,7 +372,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27971   * @pool: iteration cursor
27972   * @pi: integer used for iteration
27973   *
27974 - * This must be called either with wq_pool_mutex held or sched RCU read
27975 + * This must be called either with wq_pool_mutex held or RCU read
27976   * locked.  If the pool needs to be used beyond the locking in effect, the
27977   * caller is responsible for guaranteeing that the pool stays online.
27978   *
27979 @@ -395,7 +404,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27980   * @pwq: iteration cursor
27981   * @wq: the target workqueue
27982   *
27983 - * This must be called either with wq->mutex held or sched RCU read locked.
27984 + * This must be called either with wq->mutex held or RCU read locked.
27985   * If the pwq needs to be used beyond the locking in effect, the caller is
27986   * responsible for guaranteeing that the pwq stays online.
27987   *
27988 @@ -407,6 +416,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27989                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
27990                 else
27991  
27992 +#ifdef CONFIG_PREEMPT_RT_BASE
27993 +static inline void rt_lock_idle_list(struct worker_pool *pool)
27994 +{
27995 +       preempt_disable();
27996 +}
27997 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
27998 +{
27999 +       preempt_enable();
28000 +}
28001 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
28002 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
28003 +#else
28004 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
28005 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
28006 +static inline void sched_lock_idle_list(struct worker_pool *pool)
28007 +{
28008 +       spin_lock_irq(&pool->lock);
28009 +}
28010 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
28011 +{
28012 +       spin_unlock_irq(&pool->lock);
28013 +}
28014 +#endif
28015 +
28016 +
28017  #ifdef CONFIG_DEBUG_OBJECTS_WORK
28018  
28019  static struct debug_obj_descr work_debug_descr;
28020 @@ -557,7 +591,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
28021   * @wq: the target workqueue
28022   * @node: the node ID
28023   *
28024 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
28025 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
28026   * read locked.
28027   * If the pwq needs to be used beyond the locking in effect, the caller is
28028   * responsible for guaranteeing that the pwq stays online.
28029 @@ -701,8 +735,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
28030   * @work: the work item of interest
28031   *
28032   * Pools are created and destroyed under wq_pool_mutex, and allows read
28033 - * access under sched-RCU read lock.  As such, this function should be
28034 - * called under wq_pool_mutex or with preemption disabled.
28035 + * access under RCU read lock.  As such, this function should be
28036 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
28037   *
28038   * All fields of the returned pool are accessible as long as the above
28039   * mentioned locking is in effect.  If the returned pool needs to be used
28040 @@ -839,51 +873,44 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
28041   */
28042  static void wake_up_worker(struct worker_pool *pool)
28043  {
28044 -       struct worker *worker = first_idle_worker(pool);
28045 +       struct worker *worker;
28046 +
28047 +       rt_lock_idle_list(pool);
28048 +
28049 +       worker = first_idle_worker(pool);
28050  
28051         if (likely(worker))
28052                 wake_up_process(worker->task);
28053 +
28054 +       rt_unlock_idle_list(pool);
28055  }
28056  
28057  /**
28058 - * wq_worker_waking_up - a worker is waking up
28059 - * @task: task waking up
28060 - * @cpu: CPU @task is waking up to
28061 + * wq_worker_running - a worker is running again
28062 + * @task: task returning from sleep
28063   *
28064 - * This function is called during try_to_wake_up() when a worker is
28065 - * being awoken.
28066 - *
28067 - * CONTEXT:
28068 - * spin_lock_irq(rq->lock)
28069 + * This function is called when a worker returns from schedule()
28070   */
28071 -void wq_worker_waking_up(struct task_struct *task, int cpu)
28072 +void wq_worker_running(struct task_struct *task)
28073  {
28074         struct worker *worker = kthread_data(task);
28075  
28076 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
28077 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
28078 +       if (!worker->sleeping)
28079 +               return;
28080 +       if (!(worker->flags & WORKER_NOT_RUNNING))
28081                 atomic_inc(&worker->pool->nr_running);
28082 -       }
28083 +       worker->sleeping = 0;
28084  }
28085  
28086  /**
28087   * wq_worker_sleeping - a worker is going to sleep
28088   * @task: task going to sleep
28089 - * @cpu: CPU in question, must be the current CPU number
28090 - *
28091 - * This function is called during schedule() when a busy worker is
28092 - * going to sleep.  Worker on the same cpu can be woken up by
28093 - * returning pointer to its task.
28094 - *
28095 - * CONTEXT:
28096 - * spin_lock_irq(rq->lock)
28097 - *
28098 - * Return:
28099 - * Worker task on @cpu to wake up, %NULL if none.
28100 + * This function is called from schedule() when a busy worker is
28101 + * going to sleep.
28102   */
28103 -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
28104 +void wq_worker_sleeping(struct task_struct *task)
28105  {
28106 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
28107 +       struct worker *worker = kthread_data(task);
28108         struct worker_pool *pool;
28109  
28110         /*
28111 @@ -892,29 +919,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
28112          * checking NOT_RUNNING.
28113          */
28114         if (worker->flags & WORKER_NOT_RUNNING)
28115 -               return NULL;
28116 +               return;
28117  
28118         pool = worker->pool;
28119  
28120 -       /* this can only happen on the local cpu */
28121 -       if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
28122 -               return NULL;
28123 +       if (WARN_ON_ONCE(worker->sleeping))
28124 +               return;
28125 +
28126 +       worker->sleeping = 1;
28127  
28128         /*
28129          * The counterpart of the following dec_and_test, implied mb,
28130          * worklist not empty test sequence is in insert_work().
28131          * Please read comment there.
28132 -        *
28133 -        * NOT_RUNNING is clear.  This means that we're bound to and
28134 -        * running on the local cpu w/ rq lock held and preemption
28135 -        * disabled, which in turn means that none else could be
28136 -        * manipulating idle_list, so dereferencing idle_list without pool
28137 -        * lock is safe.
28138          */
28139         if (atomic_dec_and_test(&pool->nr_running) &&
28140 -           !list_empty(&pool->worklist))
28141 -               to_wakeup = first_idle_worker(pool);
28142 -       return to_wakeup ? to_wakeup->task : NULL;
28143 +           !list_empty(&pool->worklist)) {
28144 +               sched_lock_idle_list(pool);
28145 +               wake_up_worker(pool);
28146 +               sched_unlock_idle_list(pool);
28147 +       }
28148  }
28149  
28150  /**
28151 @@ -1108,12 +1132,12 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
28152  {
28153         if (pwq) {
28154                 /*
28155 -                * As both pwqs and pools are sched-RCU protected, the
28156 +                * As both pwqs and pools are RCU protected, the
28157                  * following lock operations are safe.
28158                  */
28159 -               spin_lock_irq(&pwq->pool->lock);
28160 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
28161                 put_pwq(pwq);
28162 -               spin_unlock_irq(&pwq->pool->lock);
28163 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
28164         }
28165  }
28166  
28167 @@ -1215,7 +1239,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28168         struct worker_pool *pool;
28169         struct pool_workqueue *pwq;
28170  
28171 -       local_irq_save(*flags);
28172 +       local_lock_irqsave(pendingb_lock, *flags);
28173  
28174         /* try to steal the timer if it exists */
28175         if (is_dwork) {
28176 @@ -1234,6 +1258,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28177         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
28178                 return 0;
28179  
28180 +       rcu_read_lock();
28181         /*
28182          * The queueing is in progress, or it is already queued. Try to
28183          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
28184 @@ -1272,14 +1297,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28185                 set_work_pool_and_keep_pending(work, pool->id);
28186  
28187                 spin_unlock(&pool->lock);
28188 +               rcu_read_unlock();
28189                 return 1;
28190         }
28191         spin_unlock(&pool->lock);
28192  fail:
28193 -       local_irq_restore(*flags);
28194 +       rcu_read_unlock();
28195 +       local_unlock_irqrestore(pendingb_lock, *flags);
28196         if (work_is_canceling(work))
28197                 return -ENOENT;
28198 -       cpu_relax();
28199 +       cpu_chill();
28200         return -EAGAIN;
28201  }
28202  
28203 @@ -1348,7 +1375,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
28204          * queued or lose PENDING.  Grabbing PENDING and queueing should
28205          * happen with IRQ disabled.
28206          */
28207 -       WARN_ON_ONCE(!irqs_disabled());
28208 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
28209  
28210         debug_work_activate(work);
28211  
28212 @@ -1356,6 +1383,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
28213         if (unlikely(wq->flags & __WQ_DRAINING) &&
28214             WARN_ON_ONCE(!is_chained_work(wq)))
28215                 return;
28216 +
28217 +       rcu_read_lock();
28218  retry:
28219         if (req_cpu == WORK_CPU_UNBOUND)
28220                 cpu = raw_smp_processor_id();
28221 @@ -1412,10 +1441,8 @@ retry:
28222         /* pwq determined, queue */
28223         trace_workqueue_queue_work(req_cpu, pwq, work);
28224  
28225 -       if (WARN_ON(!list_empty(&work->entry))) {
28226 -               spin_unlock(&pwq->pool->lock);
28227 -               return;
28228 -       }
28229 +       if (WARN_ON(!list_empty(&work->entry)))
28230 +               goto out;
28231  
28232         pwq->nr_in_flight[pwq->work_color]++;
28233         work_flags = work_color_to_flags(pwq->work_color);
28234 @@ -1431,7 +1458,9 @@ retry:
28235  
28236         insert_work(pwq, work, worklist, work_flags);
28237  
28238 +out:
28239         spin_unlock(&pwq->pool->lock);
28240 +       rcu_read_unlock();
28241  }
28242  
28243  /**
28244 @@ -1451,14 +1480,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
28245         bool ret = false;
28246         unsigned long flags;
28247  
28248 -       local_irq_save(flags);
28249 +       local_lock_irqsave(pendingb_lock,flags);
28250  
28251         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
28252                 __queue_work(cpu, wq, work);
28253                 ret = true;
28254         }
28255  
28256 -       local_irq_restore(flags);
28257 +       local_unlock_irqrestore(pendingb_lock, flags);
28258         return ret;
28259  }
28260  EXPORT_SYMBOL(queue_work_on);
28261 @@ -1525,14 +1554,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
28262         unsigned long flags;
28263  
28264         /* read the comment in __queue_work() */
28265 -       local_irq_save(flags);
28266 +       local_lock_irqsave(pendingb_lock, flags);
28267  
28268         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
28269                 __queue_delayed_work(cpu, wq, dwork, delay);
28270                 ret = true;
28271         }
28272  
28273 -       local_irq_restore(flags);
28274 +       local_unlock_irqrestore(pendingb_lock, flags);
28275         return ret;
28276  }
28277  EXPORT_SYMBOL(queue_delayed_work_on);
28278 @@ -1567,7 +1596,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
28279  
28280         if (likely(ret >= 0)) {
28281                 __queue_delayed_work(cpu, wq, dwork, delay);
28282 -               local_irq_restore(flags);
28283 +               local_unlock_irqrestore(pendingb_lock, flags);
28284         }
28285  
28286         /* -ENOENT from try_to_grab_pending() becomes %true */
28287 @@ -1600,7 +1629,9 @@ static void worker_enter_idle(struct worker *worker)
28288         worker->last_active = jiffies;
28289  
28290         /* idle_list is LIFO */
28291 +       rt_lock_idle_list(pool);
28292         list_add(&worker->entry, &pool->idle_list);
28293 +       rt_unlock_idle_list(pool);
28294  
28295         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
28296                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
28297 @@ -1633,7 +1664,9 @@ static void worker_leave_idle(struct worker *worker)
28298                 return;
28299         worker_clr_flags(worker, WORKER_IDLE);
28300         pool->nr_idle--;
28301 +       rt_lock_idle_list(pool);
28302         list_del_init(&worker->entry);
28303 +       rt_unlock_idle_list(pool);
28304  }
28305  
28306  static struct worker *alloc_worker(int node)
28307 @@ -1799,7 +1832,9 @@ static void destroy_worker(struct worker *worker)
28308         pool->nr_workers--;
28309         pool->nr_idle--;
28310  
28311 +       rt_lock_idle_list(pool);
28312         list_del_init(&worker->entry);
28313 +       rt_unlock_idle_list(pool);
28314         worker->flags |= WORKER_DIE;
28315         wake_up_process(worker->task);
28316  }
28317 @@ -2716,14 +2751,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
28318  
28319         might_sleep();
28320  
28321 -       local_irq_disable();
28322 +       rcu_read_lock();
28323         pool = get_work_pool(work);
28324         if (!pool) {
28325 -               local_irq_enable();
28326 +               rcu_read_unlock();
28327                 return false;
28328         }
28329  
28330 -       spin_lock(&pool->lock);
28331 +       spin_lock_irq(&pool->lock);
28332         /* see the comment in try_to_grab_pending() with the same code */
28333         pwq = get_work_pwq(work);
28334         if (pwq) {
28335 @@ -2750,10 +2785,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
28336         else
28337                 lock_map_acquire_read(&pwq->wq->lockdep_map);
28338         lock_map_release(&pwq->wq->lockdep_map);
28339 -
28340 +       rcu_read_unlock();
28341         return true;
28342  already_gone:
28343         spin_unlock_irq(&pool->lock);
28344 +       rcu_read_unlock();
28345         return false;
28346  }
28347  
28348 @@ -2840,7 +2876,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
28349  
28350         /* tell other tasks trying to grab @work to back off */
28351         mark_work_canceling(work);
28352 -       local_irq_restore(flags);
28353 +       local_unlock_irqrestore(pendingb_lock, flags);
28354  
28355         flush_work(work);
28356         clear_work_data(work);
28357 @@ -2895,10 +2931,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
28358   */
28359  bool flush_delayed_work(struct delayed_work *dwork)
28360  {
28361 -       local_irq_disable();
28362 +       local_lock_irq(pendingb_lock);
28363         if (del_timer_sync(&dwork->timer))
28364                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
28365 -       local_irq_enable();
28366 +       local_unlock_irq(pendingb_lock);
28367         return flush_work(&dwork->work);
28368  }
28369  EXPORT_SYMBOL(flush_delayed_work);
28370 @@ -2933,7 +2969,7 @@ bool cancel_delayed_work(struct delayed_work *dwork)
28371  
28372         set_work_pool_and_clear_pending(&dwork->work,
28373                                         get_work_pool_id(&dwork->work));
28374 -       local_irq_restore(flags);
28375 +       local_unlock_irqrestore(pendingb_lock, flags);
28376         return ret;
28377  }
28378  EXPORT_SYMBOL(cancel_delayed_work);
28379 @@ -3161,7 +3197,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
28380   * put_unbound_pool - put a worker_pool
28381   * @pool: worker_pool to put
28382   *
28383 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
28384 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
28385   * safe manner.  get_unbound_pool() calls this function on its failure path
28386   * and this function should be able to release pools which went through,
28387   * successfully or not, init_worker_pool().
28388 @@ -3215,8 +3251,8 @@ static void put_unbound_pool(struct worker_pool *pool)
28389         del_timer_sync(&pool->idle_timer);
28390         del_timer_sync(&pool->mayday_timer);
28391  
28392 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
28393 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
28394 +       /* RCU protected to allow dereferences from get_work_pool() */
28395 +       call_rcu(&pool->rcu, rcu_free_pool);
28396  }
28397  
28398  /**
28399 @@ -3323,14 +3359,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
28400         put_unbound_pool(pool);
28401         mutex_unlock(&wq_pool_mutex);
28402  
28403 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
28404 +       call_rcu(&pwq->rcu, rcu_free_pwq);
28405  
28406         /*
28407          * If we're the last pwq going away, @wq is already dead and no one
28408          * is gonna access it anymore.  Schedule RCU free.
28409          */
28410         if (is_last)
28411 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
28412 +               call_rcu(&wq->rcu, rcu_free_wq);
28413  }
28414  
28415  /**
28416 @@ -3983,7 +4019,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
28417                  * The base ref is never dropped on per-cpu pwqs.  Directly
28418                  * schedule RCU free.
28419                  */
28420 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
28421 +               call_rcu(&wq->rcu, rcu_free_wq);
28422         } else {
28423                 /*
28424                  * We're the sole accessor of @wq at this point.  Directly
28425 @@ -4076,7 +4112,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
28426         struct pool_workqueue *pwq;
28427         bool ret;
28428  
28429 -       rcu_read_lock_sched();
28430 +       rcu_read_lock();
28431 +       preempt_disable();
28432  
28433         if (cpu == WORK_CPU_UNBOUND)
28434                 cpu = smp_processor_id();
28435 @@ -4087,7 +4124,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
28436                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
28437  
28438         ret = !list_empty(&pwq->delayed_works);
28439 -       rcu_read_unlock_sched();
28440 +       preempt_enable();
28441 +       rcu_read_unlock();
28442  
28443         return ret;
28444  }
28445 @@ -4113,15 +4151,15 @@ unsigned int work_busy(struct work_struct *work)
28446         if (work_pending(work))
28447                 ret |= WORK_BUSY_PENDING;
28448  
28449 -       local_irq_save(flags);
28450 +       rcu_read_lock();
28451         pool = get_work_pool(work);
28452         if (pool) {
28453 -               spin_lock(&pool->lock);
28454 +               spin_lock_irqsave(&pool->lock, flags);
28455                 if (find_worker_executing_work(pool, work))
28456                         ret |= WORK_BUSY_RUNNING;
28457 -               spin_unlock(&pool->lock);
28458 +               spin_unlock_irqrestore(&pool->lock, flags);
28459         }
28460 -       local_irq_restore(flags);
28461 +       rcu_read_unlock();
28462  
28463         return ret;
28464  }
28465 @@ -4310,7 +4348,7 @@ void show_workqueue_state(void)
28466         unsigned long flags;
28467         int pi;
28468  
28469 -       rcu_read_lock_sched();
28470 +       rcu_read_lock();
28471  
28472         pr_info("Showing busy workqueues and worker pools:\n");
28473  
28474 @@ -4361,7 +4399,7 @@ void show_workqueue_state(void)
28475                 spin_unlock_irqrestore(&pool->lock, flags);
28476         }
28477  
28478 -       rcu_read_unlock_sched();
28479 +       rcu_read_unlock();
28480  }
28481  
28482  /*
28483 @@ -4722,16 +4760,16 @@ bool freeze_workqueues_busy(void)
28484                  * nr_active is monotonically decreasing.  It's safe
28485                  * to peek without lock.
28486                  */
28487 -               rcu_read_lock_sched();
28488 +               rcu_read_lock();
28489                 for_each_pwq(pwq, wq) {
28490                         WARN_ON_ONCE(pwq->nr_active < 0);
28491                         if (pwq->nr_active) {
28492                                 busy = true;
28493 -                               rcu_read_unlock_sched();
28494 +                               rcu_read_unlock();
28495                                 goto out_unlock;
28496                         }
28497                 }
28498 -               rcu_read_unlock_sched();
28499 +               rcu_read_unlock();
28500         }
28501  out_unlock:
28502         mutex_unlock(&wq_pool_mutex);
28503 @@ -4921,7 +4959,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
28504         const char *delim = "";
28505         int node, written = 0;
28506  
28507 -       rcu_read_lock_sched();
28508 +       get_online_cpus();
28509 +       rcu_read_lock();
28510         for_each_node(node) {
28511                 written += scnprintf(buf + written, PAGE_SIZE - written,
28512                                      "%s%d:%d", delim, node,
28513 @@ -4929,7 +4968,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
28514                 delim = " ";
28515         }
28516         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
28517 -       rcu_read_unlock_sched();
28518 +       rcu_read_unlock();
28519 +       put_online_cpus();
28520  
28521         return written;
28522  }
28523 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
28524 index 45215870ac6c..f000c4d6917e 100644
28525 --- a/kernel/workqueue_internal.h
28526 +++ b/kernel/workqueue_internal.h
28527 @@ -43,6 +43,7 @@ struct worker {
28528         unsigned long           last_active;    /* L: last active timestamp */
28529         unsigned int            flags;          /* X: flags */
28530         int                     id;             /* I: worker id */
28531 +       int                     sleeping;       /* None */
28532  
28533         /*
28534          * Opaque string set with work_set_desc().  Printed out with task
28535 @@ -68,7 +69,7 @@ static inline struct worker *current_wq_worker(void)
28536   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
28537   * sched/core.c and workqueue.c.
28538   */
28539 -void wq_worker_waking_up(struct task_struct *task, int cpu);
28540 -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
28541 +void wq_worker_running(struct task_struct *task);
28542 +void wq_worker_sleeping(struct task_struct *task);
28543  
28544  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
28545 diff --git a/lib/Kconfig b/lib/Kconfig
28546 index 1a48744253d7..f75de578cca8 100644
28547 --- a/lib/Kconfig
28548 +++ b/lib/Kconfig
28549 @@ -397,6 +397,7 @@ config CHECK_SIGNATURE
28550  
28551  config CPUMASK_OFFSTACK
28552         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
28553 +       depends on !PREEMPT_RT_FULL
28554         help
28555           Use dynamic allocation for cpumask_var_t, instead of putting
28556           them on the stack.  This is a bit more expensive, but avoids
28557 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
28558 index 547f7f923dbc..8fcdbc2fc6d0 100644
28559 --- a/lib/debugobjects.c
28560 +++ b/lib/debugobjects.c
28561 @@ -309,7 +309,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
28562         struct debug_obj *obj;
28563         unsigned long flags;
28564  
28565 -       fill_pool();
28566 +#ifdef CONFIG_PREEMPT_RT_FULL
28567 +       if (preempt_count() == 0 && !irqs_disabled())
28568 +#endif
28569 +               fill_pool();
28570  
28571         db = get_bucket((unsigned long) addr);
28572  
28573 diff --git a/lib/idr.c b/lib/idr.c
28574 index 6098336df267..9decbe914595 100644
28575 --- a/lib/idr.c
28576 +++ b/lib/idr.c
28577 @@ -30,6 +30,7 @@
28578  #include <linux/idr.h>
28579  #include <linux/spinlock.h>
28580  #include <linux/percpu.h>
28581 +#include <linux/locallock.h>
28582  
28583  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
28584  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
28585 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
28586  static DEFINE_PER_CPU(int, idr_preload_cnt);
28587  static DEFINE_SPINLOCK(simple_ida_lock);
28588  
28589 +#ifdef CONFIG_PREEMPT_RT_FULL
28590 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
28591 +
28592 +static inline void idr_preload_lock(void)
28593 +{
28594 +       local_lock(idr_lock);
28595 +}
28596 +
28597 +static inline void idr_preload_unlock(void)
28598 +{
28599 +       local_unlock(idr_lock);
28600 +}
28601 +
28602 +void idr_preload_end(void)
28603 +{
28604 +       idr_preload_unlock();
28605 +}
28606 +EXPORT_SYMBOL(idr_preload_end);
28607 +#else
28608 +static inline void idr_preload_lock(void)
28609 +{
28610 +       preempt_disable();
28611 +}
28612 +
28613 +static inline void idr_preload_unlock(void)
28614 +{
28615 +       preempt_enable();
28616 +}
28617 +#endif
28618 +
28619 +
28620  /* the maximum ID which can be allocated given idr->layers */
28621  static int idr_max(int layers)
28622  {
28623 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
28624          * context.  See idr_preload() for details.
28625          */
28626         if (!in_interrupt()) {
28627 -               preempt_disable();
28628 +               idr_preload_lock();
28629                 new = __this_cpu_read(idr_preload_head);
28630                 if (new) {
28631                         __this_cpu_write(idr_preload_head, new->ary[0]);
28632                         __this_cpu_dec(idr_preload_cnt);
28633                         new->ary[0] = NULL;
28634                 }
28635 -               preempt_enable();
28636 +               idr_preload_unlock();
28637                 if (new)
28638                         return new;
28639         }
28640 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
28641         idr_mark_full(pa, id);
28642  }
28643  
28644 -
28645  /**
28646   * idr_preload - preload for idr_alloc()
28647   * @gfp_mask: allocation mask to use for preloading
28648 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
28649         WARN_ON_ONCE(in_interrupt());
28650         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
28651  
28652 -       preempt_disable();
28653 +       idr_preload_lock();
28654  
28655         /*
28656          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
28657 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
28658         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
28659                 struct idr_layer *new;
28660  
28661 -               preempt_enable();
28662 +               idr_preload_unlock();
28663                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
28664 -               preempt_disable();
28665 +               idr_preload_lock();
28666                 if (!new)
28667                         break;
28668  
28669 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
28670 index 872a15a2a637..b93a6103fa4d 100644
28671 --- a/lib/locking-selftest.c
28672 +++ b/lib/locking-selftest.c
28673 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
28674  #include "locking-selftest-spin-hardirq.h"
28675  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
28676  
28677 +#ifndef CONFIG_PREEMPT_RT_FULL
28678 +
28679  #include "locking-selftest-rlock-hardirq.h"
28680  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
28681  
28682 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
28683  #include "locking-selftest-wlock-softirq.h"
28684  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
28685  
28686 +#endif
28687 +
28688  #undef E1
28689  #undef E2
28690  
28691 +#ifndef CONFIG_PREEMPT_RT_FULL
28692  /*
28693   * Enabling hardirqs with a softirq-safe lock held:
28694   */
28695 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
28696  #undef E1
28697  #undef E2
28698  
28699 +#endif
28700 +
28701  /*
28702   * Enabling irqs with an irq-safe lock held:
28703   */
28704 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
28705  #include "locking-selftest-spin-hardirq.h"
28706  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
28707  
28708 +#ifndef CONFIG_PREEMPT_RT_FULL
28709 +
28710  #include "locking-selftest-rlock-hardirq.h"
28711  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
28712  
28713 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
28714  #include "locking-selftest-wlock-softirq.h"
28715  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
28716  
28717 +#endif
28718 +
28719  #undef E1
28720  #undef E2
28721  
28722 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
28723  #include "locking-selftest-spin-hardirq.h"
28724  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
28725  
28726 +#ifndef CONFIG_PREEMPT_RT_FULL
28727 +
28728  #include "locking-selftest-rlock-hardirq.h"
28729  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
28730  
28731 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
28732  #include "locking-selftest-wlock-softirq.h"
28733  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
28734  
28735 +#endif
28736 +
28737  #undef E1
28738  #undef E2
28739  #undef E3
28740 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
28741  #include "locking-selftest-spin-hardirq.h"
28742  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
28743  
28744 +#ifndef CONFIG_PREEMPT_RT_FULL
28745 +
28746  #include "locking-selftest-rlock-hardirq.h"
28747  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
28748  
28749 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
28750  #include "locking-selftest-wlock-softirq.h"
28751  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
28752  
28753 +#endif
28754 +
28755  #undef E1
28756  #undef E2
28757  #undef E3
28758  
28759 +#ifndef CONFIG_PREEMPT_RT_FULL
28760 +
28761  /*
28762   * read-lock / write-lock irq inversion.
28763   *
28764 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
28765  #undef E2
28766  #undef E3
28767  
28768 +#endif
28769 +
28770 +#ifndef CONFIG_PREEMPT_RT_FULL
28771 +
28772  /*
28773   * read-lock / write-lock recursion that is actually safe.
28774   */
28775 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
28776  #undef E2
28777  #undef E3
28778  
28779 +#endif
28780 +
28781  /*
28782   * read-lock / write-lock recursion that is unsafe.
28783   */
28784 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
28785  
28786         printk("  --------------------------------------------------------------------------\n");
28787  
28788 +#ifndef CONFIG_PREEMPT_RT_FULL
28789         /*
28790          * irq-context testcases:
28791          */
28792 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
28793  
28794         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
28795  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
28796 +#else
28797 +       /* On -rt, we only do hardirq context test for raw spinlock */
28798 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
28799 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
28800 +
28801 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
28802 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
28803 +
28804 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
28805 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
28806 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
28807 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
28808 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
28809 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
28810 +
28811 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
28812 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
28813 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
28814 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
28815 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
28816 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
28817 +#endif
28818  
28819         ww_tests();
28820  
28821 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
28822 index 6d40944960de..822a2c027e72 100644
28823 --- a/lib/percpu_ida.c
28824 +++ b/lib/percpu_ida.c
28825 @@ -26,6 +26,9 @@
28826  #include <linux/string.h>
28827  #include <linux/spinlock.h>
28828  #include <linux/percpu_ida.h>
28829 +#include <linux/locallock.h>
28830 +
28831 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
28832  
28833  struct percpu_ida_cpu {
28834         /*
28835 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28836         unsigned long flags;
28837         int tag;
28838  
28839 -       local_irq_save(flags);
28840 +       local_lock_irqsave(irq_off_lock, flags);
28841         tags = this_cpu_ptr(pool->tag_cpu);
28842  
28843         /* Fastpath */
28844         tag = alloc_local_tag(tags);
28845         if (likely(tag >= 0)) {
28846 -               local_irq_restore(flags);
28847 +               local_unlock_irqrestore(irq_off_lock, flags);
28848                 return tag;
28849         }
28850  
28851 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28852  
28853                 if (!tags->nr_free)
28854                         alloc_global_tags(pool, tags);
28855 +
28856                 if (!tags->nr_free)
28857                         steal_tags(pool, tags);
28858  
28859 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28860                 }
28861  
28862                 spin_unlock(&pool->lock);
28863 -               local_irq_restore(flags);
28864 +               local_unlock_irqrestore(irq_off_lock, flags);
28865  
28866                 if (tag >= 0 || state == TASK_RUNNING)
28867                         break;
28868 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28869  
28870                 schedule();
28871  
28872 -               local_irq_save(flags);
28873 +               local_lock_irqsave(irq_off_lock, flags);
28874                 tags = this_cpu_ptr(pool->tag_cpu);
28875         }
28876         if (state != TASK_RUNNING)
28877 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
28878  
28879         BUG_ON(tag >= pool->nr_tags);
28880  
28881 -       local_irq_save(flags);
28882 +       local_lock_irqsave(irq_off_lock, flags);
28883         tags = this_cpu_ptr(pool->tag_cpu);
28884  
28885         spin_lock(&tags->lock);
28886 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
28887                 spin_unlock(&pool->lock);
28888         }
28889  
28890 -       local_irq_restore(flags);
28891 +       local_unlock_irqrestore(irq_off_lock, flags);
28892  }
28893  EXPORT_SYMBOL_GPL(percpu_ida_free);
28894  
28895 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
28896         struct percpu_ida_cpu *remote;
28897         unsigned cpu, i, err = 0;
28898  
28899 -       local_irq_save(flags);
28900 +       local_lock_irqsave(irq_off_lock, flags);
28901         for_each_possible_cpu(cpu) {
28902                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
28903                 spin_lock(&remote->lock);
28904 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
28905         }
28906         spin_unlock(&pool->lock);
28907  out:
28908 -       local_irq_restore(flags);
28909 +       local_unlock_irqrestore(irq_off_lock, flags);
28910         return err;
28911  }
28912  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
28913 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
28914 index 6b79e9026e24..f27e0bcb74f7 100644
28915 --- a/lib/radix-tree.c
28916 +++ b/lib/radix-tree.c
28917 @@ -196,13 +196,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
28918                  * succeed in getting a node here (and never reach
28919                  * kmem_cache_alloc)
28920                  */
28921 -               rtp = this_cpu_ptr(&radix_tree_preloads);
28922 +               rtp = &get_cpu_var(radix_tree_preloads);
28923                 if (rtp->nr) {
28924                         ret = rtp->nodes;
28925                         rtp->nodes = ret->private_data;
28926                         ret->private_data = NULL;
28927                         rtp->nr--;
28928                 }
28929 +               put_cpu_var(radix_tree_preloads);
28930                 /*
28931                  * Update the allocation stack trace as this is more useful
28932                  * for debugging.
28933 @@ -242,6 +243,7 @@ radix_tree_node_free(struct radix_tree_node *node)
28934         call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
28935  }
28936  
28937 +#ifndef CONFIG_PREEMPT_RT_FULL
28938  /*
28939   * Load up this CPU's radix_tree_node buffer with sufficient objects to
28940   * ensure that the addition of a single element in the tree cannot fail.  On
28941 @@ -310,6 +312,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
28942         return 0;
28943  }
28944  EXPORT_SYMBOL(radix_tree_maybe_preload);
28945 +#endif
28946  
28947  /*
28948   *     Return the maximum key which can be store into a
28949 diff --git a/lib/rbtree.c b/lib/rbtree.c
28950 index 1356454e36de..d15d6c4327f1 100644
28951 --- a/lib/rbtree.c
28952 +++ b/lib/rbtree.c
28953 @@ -23,6 +23,7 @@
28954  
28955  #include <linux/rbtree_augmented.h>
28956  #include <linux/export.h>
28957 +#include <linux/rcupdate.h>
28958  
28959  /*
28960   * red-black trees properties:  http://en.wikipedia.org/wiki/Rbtree
28961 @@ -590,3 +591,13 @@ struct rb_node *rb_first_postorder(const struct rb_root *root)
28962         return rb_left_deepest_node(root->rb_node);
28963  }
28964  EXPORT_SYMBOL(rb_first_postorder);
28965 +
28966 +void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
28967 +                                   struct rb_node **rb_link)
28968 +{
28969 +       node->__rb_parent_color = (unsigned long)parent;
28970 +       node->rb_left = node->rb_right = NULL;
28971 +
28972 +       rcu_assign_pointer(*rb_link, node);
28973 +}
28974 +EXPORT_SYMBOL(rb_link_node_rcu);
28975 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
28976 index bafa9933fa76..ebe3b7edd086 100644
28977 --- a/lib/scatterlist.c
28978 +++ b/lib/scatterlist.c
28979 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
28980                         flush_kernel_dcache_page(miter->page);
28981  
28982                 if (miter->__flags & SG_MITER_ATOMIC) {
28983 -                       WARN_ON_ONCE(preemptible());
28984 +                       WARN_ON_ONCE(!pagefault_disabled());
28985                         kunmap_atomic(miter->addr);
28986                 } else
28987                         kunmap(miter->page);
28988 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
28989         if (!sg_miter_skip(&miter, skip))
28990                 return false;
28991  
28992 -       local_irq_save(flags);
28993 +       local_irq_save_nort(flags);
28994  
28995         while (sg_miter_next(&miter) && offset < buflen) {
28996                 unsigned int len;
28997 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
28998  
28999         sg_miter_stop(&miter);
29000  
29001 -       local_irq_restore(flags);
29002 +       local_irq_restore_nort(flags);
29003         return offset;
29004  }
29005  EXPORT_SYMBOL(sg_copy_buffer);
29006 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
29007 index 1afec32de6f2..11fa431046a8 100644
29008 --- a/lib/smp_processor_id.c
29009 +++ b/lib/smp_processor_id.c
29010 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
29011         if (!printk_ratelimit())
29012                 goto out_enable;
29013  
29014 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
29015 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
29016 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
29017 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
29018 +               current->comm, current->pid);
29019  
29020         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
29021         dump_stack();
29022 diff --git a/localversion-rt b/localversion-rt
29023 new file mode 100644
29024 index 000000000000..629e0b4384b8
29025 --- /dev/null
29026 +++ b/localversion-rt
29027 @@ -0,0 +1 @@
29028 +-rt41
29029 diff --git a/mm/Kconfig b/mm/Kconfig
29030 index 97a4e06b15c0..9614351e68b8 100644
29031 --- a/mm/Kconfig
29032 +++ b/mm/Kconfig
29033 @@ -392,7 +392,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
29034  
29035  config TRANSPARENT_HUGEPAGE
29036         bool "Transparent Hugepage Support"
29037 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
29038 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
29039         select COMPACTION
29040         help
29041           Transparent Hugepages allows the kernel to use huge pages and
29042 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
29043 index 9ef80bf441b3..826fed55c1cc 100644
29044 --- a/mm/backing-dev.c
29045 +++ b/mm/backing-dev.c
29046 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
29047  {
29048         unsigned long flags;
29049  
29050 -       local_irq_save(flags);
29051 +       local_irq_save_nort(flags);
29052         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
29053 -               local_irq_restore(flags);
29054 +               local_irq_restore_nort(flags);
29055                 return;
29056         }
29057  
29058 diff --git a/mm/compaction.c b/mm/compaction.c
29059 index dba02dec7195..51963f58a29b 100644
29060 --- a/mm/compaction.c
29061 +++ b/mm/compaction.c
29062 @@ -1430,10 +1430,12 @@ check_drain:
29063                                 cc->migrate_pfn & ~((1UL << cc->order) - 1);
29064  
29065                         if (cc->last_migrated_pfn < current_block_start) {
29066 -                               cpu = get_cpu();
29067 +                               cpu = get_cpu_light();
29068 +                               local_lock_irq(swapvec_lock);
29069                                 lru_add_drain_cpu(cpu);
29070 +                               local_unlock_irq(swapvec_lock);
29071                                 drain_local_pages(zone);
29072 -                               put_cpu();
29073 +                               put_cpu_light();
29074                                 /* No more flushing until we migrate again */
29075                                 cc->last_migrated_pfn = 0;
29076                         }
29077 diff --git a/mm/filemap.c b/mm/filemap.c
29078 index c588d1222b2a..da6a5fbfadd2 100644
29079 --- a/mm/filemap.c
29080 +++ b/mm/filemap.c
29081 @@ -144,9 +144,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
29082                  * node->private_list is protected by
29083                  * mapping->tree_lock.
29084                  */
29085 -               if (!list_empty(&node->private_list))
29086 -                       list_lru_del(&workingset_shadow_nodes,
29087 +               if (!list_empty(&node->private_list)) {
29088 +                       local_lock(workingset_shadow_lock);
29089 +                       list_lru_del(&__workingset_shadow_nodes,
29090                                      &node->private_list);
29091 +                       local_unlock(workingset_shadow_lock);
29092 +               }
29093         }
29094         return 0;
29095  }
29096 @@ -218,7 +221,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
29097         if (!workingset_node_pages(node) &&
29098             list_empty(&node->private_list)) {
29099                 node->private_data = mapping;
29100 -               list_lru_add(&workingset_shadow_nodes, &node->private_list);
29101 +               local_lock(workingset_shadow_lock);
29102 +               list_lru_add(&__workingset_shadow_nodes, &node->private_list);
29103 +               local_unlock(workingset_shadow_lock);
29104         }
29105  }
29106  
29107 diff --git a/mm/highmem.c b/mm/highmem.c
29108 index 123bcd3ed4f2..16e8cf26d38a 100644
29109 --- a/mm/highmem.c
29110 +++ b/mm/highmem.c
29111 @@ -29,10 +29,11 @@
29112  #include <linux/kgdb.h>
29113  #include <asm/tlbflush.h>
29114  
29115 -
29116 +#ifndef CONFIG_PREEMPT_RT_FULL
29117  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
29118  DEFINE_PER_CPU(int, __kmap_atomic_idx);
29119  #endif
29120 +#endif
29121  
29122  /*
29123   * Virtual_count is not a pure "count".
29124 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
29125  unsigned long totalhigh_pages __read_mostly;
29126  EXPORT_SYMBOL(totalhigh_pages);
29127  
29128 -
29129 +#ifndef CONFIG_PREEMPT_RT_FULL
29130  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
29131 +#endif
29132  
29133  unsigned int nr_free_highpages (void)
29134  {
29135 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
29136 index 6b90d184e9c0..ed7aa011ad70 100644
29137 --- a/mm/memcontrol.c
29138 +++ b/mm/memcontrol.c
29139 @@ -67,6 +67,8 @@
29140  #include <net/sock.h>
29141  #include <net/ip.h>
29142  #include <net/tcp_memcontrol.h>
29143 +#include <linux/locallock.h>
29144 +
29145  #include "slab.h"
29146  
29147  #include <asm/uaccess.h>
29148 @@ -87,6 +89,7 @@ int do_swap_account __read_mostly;
29149  #define do_swap_account                0
29150  #endif
29151  
29152 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
29153  static const char * const mem_cgroup_stat_names[] = {
29154         "cache",
29155         "rss",
29156 @@ -1922,14 +1925,17 @@ static void drain_local_stock(struct work_struct *dummy)
29157   */
29158  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
29159  {
29160 -       struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
29161 +       struct memcg_stock_pcp *stock;
29162 +       int cpu = get_cpu_light();
29163 +
29164 +       stock = &per_cpu(memcg_stock, cpu);
29165  
29166         if (stock->cached != memcg) { /* reset if necessary */
29167                 drain_stock(stock);
29168                 stock->cached = memcg;
29169         }
29170         stock->nr_pages += nr_pages;
29171 -       put_cpu_var(memcg_stock);
29172 +       put_cpu_light();
29173  }
29174  
29175  /*
29176 @@ -1945,7 +1951,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
29177                 return;
29178         /* Notify other cpus that system-wide "drain" is running */
29179         get_online_cpus();
29180 -       curcpu = get_cpu();
29181 +       curcpu = get_cpu_light();
29182         for_each_online_cpu(cpu) {
29183                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
29184                 struct mem_cgroup *memcg;
29185 @@ -1962,7 +1968,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
29186                                 schedule_work_on(cpu, &stock->work);
29187                 }
29188         }
29189 -       put_cpu();
29190 +       put_cpu_light();
29191         put_online_cpus();
29192         mutex_unlock(&percpu_charge_mutex);
29193  }
29194 @@ -4700,12 +4706,12 @@ static int mem_cgroup_move_account(struct page *page,
29195  
29196         ret = 0;
29197  
29198 -       local_irq_disable();
29199 +       local_lock_irq(event_lock);
29200         mem_cgroup_charge_statistics(to, page, nr_pages);
29201         memcg_check_events(to, page);
29202         mem_cgroup_charge_statistics(from, page, -nr_pages);
29203         memcg_check_events(from, page);
29204 -       local_irq_enable();
29205 +       local_unlock_irq(event_lock);
29206  out_unlock:
29207         unlock_page(page);
29208  out:
29209 @@ -5495,10 +5501,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
29210                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
29211         }
29212  
29213 -       local_irq_disable();
29214 +       local_lock_irq(event_lock);
29215         mem_cgroup_charge_statistics(memcg, page, nr_pages);
29216         memcg_check_events(memcg, page);
29217 -       local_irq_enable();
29218 +       local_unlock_irq(event_lock);
29219  
29220         if (do_swap_account && PageSwapCache(page)) {
29221                 swp_entry_t entry = { .val = page_private(page) };
29222 @@ -5554,14 +5560,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
29223                 memcg_oom_recover(memcg);
29224         }
29225  
29226 -       local_irq_save(flags);
29227 +       local_lock_irqsave(event_lock, flags);
29228         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
29229         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
29230         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
29231         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
29232         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
29233         memcg_check_events(memcg, dummy_page);
29234 -       local_irq_restore(flags);
29235 +       local_unlock_irqrestore(event_lock, flags);
29236  
29237         if (!mem_cgroup_is_root(memcg))
29238                 css_put_many(&memcg->css, nr_pages);
29239 @@ -5753,6 +5759,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
29240  {
29241         struct mem_cgroup *memcg, *swap_memcg;
29242         unsigned short oldid;
29243 +       unsigned long flags;
29244  
29245         VM_BUG_ON_PAGE(PageLRU(page), page);
29246         VM_BUG_ON_PAGE(page_count(page), page);
29247 @@ -5793,12 +5800,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
29248          * important here to have the interrupts disabled because it is the
29249          * only synchronisation we have for udpating the per-CPU variables.
29250          */
29251 +       local_lock_irqsave(event_lock, flags);
29252 +#ifndef CONFIG_PREEMPT_RT_BASE
29253         VM_BUG_ON(!irqs_disabled());
29254 +#endif
29255         mem_cgroup_charge_statistics(memcg, page, -1);
29256         memcg_check_events(memcg, page);
29257  
29258         if (!mem_cgroup_is_root(memcg))
29259                 css_put(&memcg->css);
29260 +       local_unlock_irqrestore(event_lock, flags);
29261  }
29262  
29263  /**
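
The mm/memcontrol.c hunks above replace local_irq_disable()/local_irq_enable() and local_irq_save()/restore() pairs with a named per-CPU local lock (event_lock), and replace get_cpu()/put_cpu() with get_cpu_light()/put_cpu_light(). On mainline the locallock macros fall back to plain interrupt disabling, while on PREEMPT_RT they take a per-CPU sleeping lock, so the statistics updates stay preemptible. The following is only a minimal sketch of that pattern, assuming the DEFINE_LOCAL_IRQ_LOCK()/local_lock_irqsave() and get_cpu_light() primitives introduced elsewhere in this patch (linux/locallock.h and the smp.h changes); the example_* names are illustrative and not taken from the kernel tree.

#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/locallock.h>    /* added by this patch */

static DEFINE_LOCAL_IRQ_LOCK(example_event_lock);
static DEFINE_PER_CPU(unsigned long, example_events);

/*
 * Update a per-CPU counter.  On !RT local_lock_irqsave() disables
 * interrupts exactly like local_irq_save(); on RT it takes a per-CPU
 * sleeping lock instead, so the section remains preemptible.
 */
static void example_account_event(void)
{
        unsigned long flags;

        local_lock_irqsave(example_event_lock, flags);
        __this_cpu_inc(example_events);
        local_unlock_irqrestore(example_event_lock, flags);
}

/*
 * Pin to a CPU without disabling preemption on RT; get_cpu_light() is
 * the migrate_disable() based replacement for get_cpu() used above in
 * refill_stock() and drain_all_stock().
 */
static void example_touch_this_cpu(void)
{
        int cpu = get_cpu_light();

        per_cpu(example_events, cpu)++;
        put_cpu_light();
}
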
29264 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
29265 index f802c2d216a7..b1b6f238e42d 100644
29266 --- a/mm/mmu_context.c
29267 +++ b/mm/mmu_context.c
29268 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
29269         struct task_struct *tsk = current;
29270  
29271         task_lock(tsk);
29272 +       preempt_disable_rt();
29273         active_mm = tsk->active_mm;
29274         if (active_mm != mm) {
29275                 atomic_inc(&mm->mm_count);
29276 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
29277         }
29278         tsk->mm = mm;
29279         switch_mm(active_mm, mm, tsk);
29280 +       preempt_enable_rt();
29281         task_unlock(tsk);
29282  #ifdef finish_arch_post_lock_switch
29283         finish_arch_post_lock_switch();
29284 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
29285 index 2bcdfbf8c36d..a500c9e740dd 100644
29286 --- a/mm/page_alloc.c
29287 +++ b/mm/page_alloc.c
29288 @@ -60,6 +60,7 @@
29289  #include <linux/page_ext.h>
29290  #include <linux/hugetlb.h>
29291  #include <linux/sched/rt.h>
29292 +#include <linux/locallock.h>
29293  #include <linux/page_owner.h>
29294  #include <linux/kthread.h>
29295  
29296 @@ -264,6 +265,18 @@ EXPORT_SYMBOL(nr_node_ids);
29297  EXPORT_SYMBOL(nr_online_nodes);
29298  #endif
29299  
29300 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
29301 +
29302 +#ifdef CONFIG_PREEMPT_RT_BASE
29303 +# define cpu_lock_irqsave(cpu, flags)          \
29304 +       local_lock_irqsave_on(pa_lock, flags, cpu)
29305 +# define cpu_unlock_irqrestore(cpu, flags)     \
29306 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
29307 +#else
29308 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
29309 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
29310 +#endif
29311 +
29312  int page_group_by_mobility_disabled __read_mostly;
29313  
29314  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
29315 @@ -786,7 +799,7 @@ static inline int free_pages_check(struct page *page)
29316  }
29317  
29318  /*
29319 - * Frees a number of pages from the PCP lists
29320 + * Frees a number of pages which have been collected from the pcp lists.
29321   * Assumes all pages on list are in same zone, and of same order.
29322   * count is the number of pages to free.
29323   *
29324 @@ -797,18 +810,53 @@ static inline int free_pages_check(struct page *page)
29325   * pinned" detection logic.
29326   */
29327  static void free_pcppages_bulk(struct zone *zone, int count,
29328 -                                       struct per_cpu_pages *pcp)
29329 +                              struct list_head *list)
29330  {
29331 -       int migratetype = 0;
29332 -       int batch_free = 0;
29333         int to_free = count;
29334         unsigned long nr_scanned;
29335 +       unsigned long flags;
29336 +
29337 +       spin_lock_irqsave(&zone->lock, flags);
29338  
29339 -       spin_lock(&zone->lock);
29340         nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
29341         if (nr_scanned)
29342                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
29343  
29344 +       while (!list_empty(list)) {
29345 +               struct page *page = list_first_entry(list, struct page, lru);
29346 +               int mt; /* migratetype of the to-be-freed page */
29347 +
29348 +               /* must delete as __free_one_page list manipulates */
29349 +               list_del(&page->lru);
29350 +
29351 +               mt = get_pcppage_migratetype(page);
29352 +               /* MIGRATE_ISOLATE page should not go to pcplists */
29353 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
29354 +               /* Pageblock could have been isolated meanwhile */
29355 +               if (unlikely(has_isolate_pageblock(zone)))
29356 +                       mt = get_pageblock_migratetype(page);
29357 +
29358 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
29359 +               trace_mm_page_pcpu_drain(page, 0, mt);
29360 +               to_free--;
29361 +       }
29362 +       WARN_ON(to_free != 0);
29363 +       spin_unlock_irqrestore(&zone->lock, flags);
29364 +}
29365 +
29366 +/*
29367 + * Moves a number of pages from the PCP lists to free list which
29368 + * is freed outside of the locked region.
29369 + *
29370 + * Assumes all pages on list are in same zone, and of same order.
29371 + * count is the number of pages to free.
29372 + */
29373 +static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
29374 +                             struct list_head *dst)
29375 +{
29376 +       int migratetype = 0;
29377 +       int batch_free = 0;
29378 +
29379         while (to_free) {
29380                 struct page *page;
29381                 struct list_head *list;
29382 @@ -824,7 +872,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
29383                         batch_free++;
29384                         if (++migratetype == MIGRATE_PCPTYPES)
29385                                 migratetype = 0;
29386 -                       list = &pcp->lists[migratetype];
29387 +                       list = &src->lists[migratetype];
29388                 } while (list_empty(list));
29389  
29390                 /* This is the only non-empty list. Free them all. */
29391 @@ -832,24 +880,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
29392                         batch_free = to_free;
29393  
29394                 do {
29395 -                       int mt; /* migratetype of the to-be-freed page */
29396 -
29397 -                       page = list_entry(list->prev, struct page, lru);
29398 -                       /* must delete as __free_one_page list manipulates */
29399 +                       page = list_last_entry(list, struct page, lru);
29400                         list_del(&page->lru);
29401  
29402 -                       mt = get_pcppage_migratetype(page);
29403 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
29404 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
29405 -                       /* Pageblock could have been isolated meanwhile */
29406 -                       if (unlikely(has_isolate_pageblock(zone)))
29407 -                               mt = get_pageblock_migratetype(page);
29408 -
29409 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
29410 -                       trace_mm_page_pcpu_drain(page, 0, mt);
29411 +                       list_add(&page->lru, dst);
29412                 } while (--to_free && --batch_free && !list_empty(list));
29413         }
29414 -       spin_unlock(&zone->lock);
29415  }
29416  
29417  static void free_one_page(struct zone *zone,
29418 @@ -858,7 +894,9 @@ static void free_one_page(struct zone *zone,
29419                                 int migratetype)
29420  {
29421         unsigned long nr_scanned;
29422 -       spin_lock(&zone->lock);
29423 +       unsigned long flags;
29424 +
29425 +       spin_lock_irqsave(&zone->lock, flags);
29426         nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
29427         if (nr_scanned)
29428                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
29429 @@ -868,7 +906,7 @@ static void free_one_page(struct zone *zone,
29430                 migratetype = get_pfnblock_migratetype(page, pfn);
29431         }
29432         __free_one_page(page, pfn, zone, order, migratetype);
29433 -       spin_unlock(&zone->lock);
29434 +       spin_unlock_irqrestore(&zone->lock, flags);
29435  }
29436  
29437  static int free_tail_pages_check(struct page *head_page, struct page *page)
29438 @@ -1019,10 +1057,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
29439                 return;
29440  
29441         migratetype = get_pfnblock_migratetype(page, pfn);
29442 -       local_irq_save(flags);
29443 +       local_lock_irqsave(pa_lock, flags);
29444         __count_vm_events(PGFREE, 1 << order);
29445         free_one_page(page_zone(page), page, pfn, order, migratetype);
29446 -       local_irq_restore(flags);
29447 +       local_unlock_irqrestore(pa_lock, flags);
29448  }
29449  
29450  static void __init __free_pages_boot_core(struct page *page,
29451 @@ -1879,16 +1917,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
29452  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
29453  {
29454         unsigned long flags;
29455 +       LIST_HEAD(dst);
29456         int to_drain, batch;
29457  
29458 -       local_irq_save(flags);
29459 +       local_lock_irqsave(pa_lock, flags);
29460         batch = READ_ONCE(pcp->batch);
29461         to_drain = min(pcp->count, batch);
29462         if (to_drain > 0) {
29463 -               free_pcppages_bulk(zone, to_drain, pcp);
29464 +               isolate_pcp_pages(to_drain, pcp, &dst);
29465                 pcp->count -= to_drain;
29466         }
29467 -       local_irq_restore(flags);
29468 +       local_unlock_irqrestore(pa_lock, flags);
29469 +       free_pcppages_bulk(zone, to_drain, &dst);
29470  }
29471  #endif
29472  
29473 @@ -1904,16 +1944,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
29474         unsigned long flags;
29475         struct per_cpu_pageset *pset;
29476         struct per_cpu_pages *pcp;
29477 +       LIST_HEAD(dst);
29478 +       int count;
29479  
29480 -       local_irq_save(flags);
29481 +       cpu_lock_irqsave(cpu, flags);
29482         pset = per_cpu_ptr(zone->pageset, cpu);
29483  
29484         pcp = &pset->pcp;
29485 -       if (pcp->count) {
29486 -               free_pcppages_bulk(zone, pcp->count, pcp);
29487 +       count = pcp->count;
29488 +       if (count) {
29489 +               isolate_pcp_pages(count, pcp, &dst);
29490                 pcp->count = 0;
29491         }
29492 -       local_irq_restore(flags);
29493 +       cpu_unlock_irqrestore(cpu, flags);
29494 +       if (count)
29495 +               free_pcppages_bulk(zone, count, &dst);
29496  }
29497  
29498  /*
29499 @@ -1999,8 +2044,17 @@ void drain_all_pages(struct zone *zone)
29500                 else
29501                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
29502         }
29503 +#ifndef CONFIG_PREEMPT_RT_BASE
29504         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
29505                                                                 zone, 1);
29506 +#else
29507 +       for_each_cpu(cpu, &cpus_with_pcps) {
29508 +               if (zone)
29509 +                       drain_pages_zone(cpu, zone);
29510 +               else
29511 +                       drain_pages(cpu);
29512 +       }
29513 +#endif
29514  }
29515  
29516  #ifdef CONFIG_HIBERNATION
29517 @@ -2056,7 +2110,7 @@ void free_hot_cold_page(struct page *page, bool cold)
29518  
29519         migratetype = get_pfnblock_migratetype(page, pfn);
29520         set_pcppage_migratetype(page, migratetype);
29521 -       local_irq_save(flags);
29522 +       local_lock_irqsave(pa_lock, flags);
29523         __count_vm_event(PGFREE);
29524  
29525         /*
29526 @@ -2082,12 +2136,17 @@ void free_hot_cold_page(struct page *page, bool cold)
29527         pcp->count++;
29528         if (pcp->count >= pcp->high) {
29529                 unsigned long batch = READ_ONCE(pcp->batch);
29530 -               free_pcppages_bulk(zone, batch, pcp);
29531 +               LIST_HEAD(dst);
29532 +
29533 +               isolate_pcp_pages(batch, pcp, &dst);
29534                 pcp->count -= batch;
29535 +               local_unlock_irqrestore(pa_lock, flags);
29536 +               free_pcppages_bulk(zone, batch, &dst);
29537 +               return;
29538         }
29539  
29540  out:
29541 -       local_irq_restore(flags);
29542 +       local_unlock_irqrestore(pa_lock, flags);
29543  }
29544  
29545  /*
29546 @@ -2222,7 +2281,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29547                 struct per_cpu_pages *pcp;
29548                 struct list_head *list;
29549  
29550 -               local_irq_save(flags);
29551 +               local_lock_irqsave(pa_lock, flags);
29552                 pcp = &this_cpu_ptr(zone->pageset)->pcp;
29553                 list = &pcp->lists[migratetype];
29554                 if (list_empty(list)) {
29555 @@ -2254,7 +2313,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29556                          */
29557                         WARN_ON_ONCE(order > 1);
29558                 }
29559 -               spin_lock_irqsave(&zone->lock, flags);
29560 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
29561  
29562                 page = NULL;
29563                 if (alloc_flags & ALLOC_HARDER) {
29564 @@ -2264,11 +2323,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29565                 }
29566                 if (!page)
29567                         page = __rmqueue(zone, order, migratetype, gfp_flags);
29568 -               spin_unlock(&zone->lock);
29569 -               if (!page)
29570 +               if (!page) {
29571 +                       spin_unlock(&zone->lock);
29572                         goto failed;
29573 +               }
29574                 __mod_zone_freepage_state(zone, -(1 << order),
29575                                           get_pcppage_migratetype(page));
29576 +               spin_unlock(&zone->lock);
29577         }
29578  
29579         __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
29580 @@ -2278,13 +2339,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29581  
29582         __count_zone_vm_events(PGALLOC, zone, 1 << order);
29583         zone_statistics(preferred_zone, zone, gfp_flags);
29584 -       local_irq_restore(flags);
29585 +       local_unlock_irqrestore(pa_lock, flags);
29586  
29587         VM_BUG_ON_PAGE(bad_range(zone, page), page);
29588         return page;
29589  
29590  failed:
29591 -       local_irq_restore(flags);
29592 +       local_unlock_irqrestore(pa_lock, flags);
29593         return NULL;
29594  }
29595  
29596 @@ -5950,6 +6011,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
29597  void __init page_alloc_init(void)
29598  {
29599         hotcpu_notifier(page_alloc_cpu_notify, 0);
29600 +       local_irq_lock_init(pa_lock);
29601  }
29602  
29603  /*
29604 @@ -6844,7 +6906,7 @@ void zone_pcp_reset(struct zone *zone)
29605         struct per_cpu_pageset *pset;
29606  
29607         /* avoid races with drain_pages()  */
29608 -       local_irq_save(flags);
29609 +       local_lock_irqsave(pa_lock, flags);
29610         if (zone->pageset != &boot_pageset) {
29611                 for_each_online_cpu(cpu) {
29612                         pset = per_cpu_ptr(zone->pageset, cpu);
29613 @@ -6853,7 +6915,7 @@ void zone_pcp_reset(struct zone *zone)
29614                 free_percpu(zone->pageset);
29615                 zone->pageset = &boot_pageset;
29616         }
29617 -       local_irq_restore(flags);
29618 +       local_unlock_irqrestore(pa_lock, flags);
29619  }
29620  
29621  #ifdef CONFIG_MEMORY_HOTREMOVE
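
The mm/page_alloc.c changes above introduce the pa_lock local lock in place of the local_irq_save()/restore() sections, make zone->lock IRQ-safe in its own right, and split free_pcppages_bulk(): isolate_pcp_pages() detaches pages from the per-CPU lists onto a private list under pa_lock, and the actual buddy freeing runs afterwards with pa_lock already dropped (on RT, drain_all_pages() also drains remote CPUs directly instead of sending IPIs). Below is a minimal sketch of that isolate-under-the-lock, free-after-dropping-it shape; example_lock, example_pool and example_free_object() are illustrative names, not kernel APIs.

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct example_obj {
        struct list_head lru;
};

static DEFINE_SPINLOCK(example_lock);
static LIST_HEAD(example_pool);         /* kmalloc()ed objects */

static void example_free_object(struct example_obj *obj)
{
        kfree(obj);                     /* the expensive part */
}

/*
 * Detach up to 'count' objects onto a private list while holding the
 * lock, then release them with the lock dropped so the costly freeing
 * does not extend the locked (and, on !RT, IRQ-off) section.
 */
static void example_drain(int count)
{
        struct example_obj *obj, *tmp;
        unsigned long flags;
        LIST_HEAD(dst);

        spin_lock_irqsave(&example_lock, flags);
        while (count-- && !list_empty(&example_pool)) {
                obj = list_first_entry(&example_pool, struct example_obj, lru);
                list_del(&obj->lru);
                list_add(&obj->lru, &dst);
        }
        spin_unlock_irqrestore(&example_lock, flags);

        list_for_each_entry_safe(obj, tmp, &dst, lru) {
                list_del(&obj->lru);
                example_free_object(obj);
        }
}
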
29622 diff --git a/mm/slab.h b/mm/slab.h
29623 index 7b6087197997..afdc57941179 100644
29624 --- a/mm/slab.h
29625 +++ b/mm/slab.h
29626 @@ -324,7 +324,11 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
29627   * The slab lists for all objects.
29628   */
29629  struct kmem_cache_node {
29630 +#ifdef CONFIG_SLUB
29631 +       raw_spinlock_t list_lock;
29632 +#else
29633         spinlock_t list_lock;
29634 +#endif
29635  
29636  #ifdef CONFIG_SLAB
29637         struct list_head slabs_partial; /* partial list first, better asm code */
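
The mm/slab.h hunk turns kmem_cache_node::list_lock into a raw_spinlock_t when SLUB is in use. On PREEMPT_RT an ordinary spinlock_t becomes a sleeping lock, but this lock is taken in paths that run with interrupts disabled (see the slub.c conversions that follow), so it has to remain a genuinely spinning, non-sleeping lock. A minimal sketch of the raw lock declaration and usage, with an illustrative structure name:

#include <linux/spinlock.h>

struct example_node {
        raw_spinlock_t lock;    /* stays a spinning lock even on RT */
        unsigned long nr_items;
};

static void example_node_init(struct example_node *n)
{
        raw_spin_lock_init(&n->lock);
        n->nr_items = 0;
}

static void example_node_account(struct example_node *n)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&n->lock, flags);
        n->nr_items++;
        raw_spin_unlock_irqrestore(&n->lock, flags);
}
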
29638 diff --git a/mm/slub.c b/mm/slub.c
29639 index 65d5f92d51d2..feb4a445a546 100644
29640 --- a/mm/slub.c
29641 +++ b/mm/slub.c
29642 @@ -1075,7 +1075,7 @@ static noinline struct kmem_cache_node *free_debug_processing(
29643         void *object = head;
29644         int cnt = 0;
29645  
29646 -       spin_lock_irqsave(&n->list_lock, *flags);
29647 +       raw_spin_lock_irqsave(&n->list_lock, *flags);
29648         slab_lock(page);
29649  
29650         if (!check_slab(s, page))
29651 @@ -1136,7 +1136,7 @@ out:
29652  
29653  fail:
29654         slab_unlock(page);
29655 -       spin_unlock_irqrestore(&n->list_lock, *flags);
29656 +       raw_spin_unlock_irqrestore(&n->list_lock, *flags);
29657         slab_fix(s, "Object at 0x%p not freed", object);
29658         return NULL;
29659  }
29660 @@ -1263,6 +1263,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
29661  
29662  #endif /* CONFIG_SLUB_DEBUG */
29663  
29664 +struct slub_free_list {
29665 +       raw_spinlock_t          lock;
29666 +       struct list_head        list;
29667 +};
29668 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
29669 +
29670  /*
29671   * Hooks for other subsystems that check memory allocations. In a typical
29672   * production configuration these hooks all should produce no code at all.
29673 @@ -1399,10 +1405,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
29674         gfp_t alloc_gfp;
29675         void *start, *p;
29676         int idx, order;
29677 +       bool enableirqs = false;
29678  
29679         flags &= gfp_allowed_mask;
29680  
29681         if (gfpflags_allow_blocking(flags))
29682 +               enableirqs = true;
29683 +#ifdef CONFIG_PREEMPT_RT_FULL
29684 +       if (system_state == SYSTEM_RUNNING)
29685 +               enableirqs = true;
29686 +#endif
29687 +       if (enableirqs)
29688                 local_irq_enable();
29689  
29690         flags |= s->allocflags;
29691 @@ -1473,7 +1486,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
29692         page->frozen = 1;
29693  
29694  out:
29695 -       if (gfpflags_allow_blocking(flags))
29696 +       if (enableirqs)
29697                 local_irq_disable();
29698         if (!page)
29699                 return NULL;
29700 @@ -1529,6 +1542,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
29701         __free_kmem_pages(page, order);
29702  }
29703  
29704 +static void free_delayed(struct list_head *h)
29705 +{
29706 +       while(!list_empty(h)) {
29707 +               struct page *page = list_first_entry(h, struct page, lru);
29708 +
29709 +               list_del(&page->lru);
29710 +               __free_slab(page->slab_cache, page);
29711 +       }
29712 +}
29713 +
29714  #define need_reserve_slab_rcu                                          \
29715         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
29716  
29717 @@ -1560,6 +1583,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
29718                 }
29719  
29720                 call_rcu(head, rcu_free_slab);
29721 +       } else if (irqs_disabled()) {
29722 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
29723 +
29724 +               raw_spin_lock(&f->lock);
29725 +               list_add(&page->lru, &f->list);
29726 +               raw_spin_unlock(&f->lock);
29727         } else
29728                 __free_slab(s, page);
29729  }
29730 @@ -1673,7 +1702,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
29731         if (!n || !n->nr_partial)
29732                 return NULL;
29733  
29734 -       spin_lock(&n->list_lock);
29735 +       raw_spin_lock(&n->list_lock);
29736         list_for_each_entry_safe(page, page2, &n->partial, lru) {
29737                 void *t;
29738  
29739 @@ -1698,7 +1727,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
29740                         break;
29741  
29742         }
29743 -       spin_unlock(&n->list_lock);
29744 +       raw_spin_unlock(&n->list_lock);
29745         return object;
29746  }
29747  
29748 @@ -1944,7 +1973,7 @@ redo:
29749                          * that acquire_slab() will see a slab page that
29750                          * is frozen
29751                          */
29752 -                       spin_lock(&n->list_lock);
29753 +                       raw_spin_lock(&n->list_lock);
29754                 }
29755         } else {
29756                 m = M_FULL;
29757 @@ -1955,7 +1984,7 @@ redo:
29758                          * slabs from diagnostic functions will not see
29759                          * any frozen slabs.
29760                          */
29761 -                       spin_lock(&n->list_lock);
29762 +                       raw_spin_lock(&n->list_lock);
29763                 }
29764         }
29765  
29766 @@ -1990,7 +2019,7 @@ redo:
29767                 goto redo;
29768  
29769         if (lock)
29770 -               spin_unlock(&n->list_lock);
29771 +               raw_spin_unlock(&n->list_lock);
29772  
29773         if (m == M_FREE) {
29774                 stat(s, DEACTIVATE_EMPTY);
29775 @@ -2022,10 +2051,10 @@ static void unfreeze_partials(struct kmem_cache *s,
29776                 n2 = get_node(s, page_to_nid(page));
29777                 if (n != n2) {
29778                         if (n)
29779 -                               spin_unlock(&n->list_lock);
29780 +                               raw_spin_unlock(&n->list_lock);
29781  
29782                         n = n2;
29783 -                       spin_lock(&n->list_lock);
29784 +                       raw_spin_lock(&n->list_lock);
29785                 }
29786  
29787                 do {
29788 @@ -2054,7 +2083,7 @@ static void unfreeze_partials(struct kmem_cache *s,
29789         }
29790  
29791         if (n)
29792 -               spin_unlock(&n->list_lock);
29793 +               raw_spin_unlock(&n->list_lock);
29794  
29795         while (discard_page) {
29796                 page = discard_page;
29797 @@ -2093,14 +2122,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
29798                         pobjects = oldpage->pobjects;
29799                         pages = oldpage->pages;
29800                         if (drain && pobjects > s->cpu_partial) {
29801 +                               struct slub_free_list *f;
29802                                 unsigned long flags;
29803 +                               LIST_HEAD(tofree);
29804                                 /*
29805                                  * partial array is full. Move the existing
29806                                  * set to the per node partial list.
29807                                  */
29808                                 local_irq_save(flags);
29809                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
29810 +                               f = this_cpu_ptr(&slub_free_list);
29811 +                               raw_spin_lock(&f->lock);
29812 +                               list_splice_init(&f->list, &tofree);
29813 +                               raw_spin_unlock(&f->lock);
29814                                 local_irq_restore(flags);
29815 +                               free_delayed(&tofree);
29816                                 oldpage = NULL;
29817                                 pobjects = 0;
29818                                 pages = 0;
29819 @@ -2172,7 +2208,22 @@ static bool has_cpu_slab(int cpu, void *info)
29820  
29821  static void flush_all(struct kmem_cache *s)
29822  {
29823 +       LIST_HEAD(tofree);
29824 +       int cpu;
29825 +
29826         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
29827 +       for_each_online_cpu(cpu) {
29828 +               struct slub_free_list *f;
29829 +
29830 +               if (!has_cpu_slab(cpu, s))
29831 +                       continue;
29832 +
29833 +               f = &per_cpu(slub_free_list, cpu);
29834 +               raw_spin_lock_irq(&f->lock);
29835 +               list_splice_init(&f->list, &tofree);
29836 +               raw_spin_unlock_irq(&f->lock);
29837 +               free_delayed(&tofree);
29838 +       }
29839  }
29840  
29841  /*
29842 @@ -2208,10 +2259,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
29843         unsigned long x = 0;
29844         struct page *page;
29845  
29846 -       spin_lock_irqsave(&n->list_lock, flags);
29847 +       raw_spin_lock_irqsave(&n->list_lock, flags);
29848         list_for_each_entry(page, &n->partial, lru)
29849                 x += get_count(page);
29850 -       spin_unlock_irqrestore(&n->list_lock, flags);
29851 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29852         return x;
29853  }
29854  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
29855 @@ -2349,8 +2400,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
29856   * already disabled (which is the case for bulk allocation).
29857   */
29858  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
29859 -                         unsigned long addr, struct kmem_cache_cpu *c)
29860 +                         unsigned long addr, struct kmem_cache_cpu *c,
29861 +                         struct list_head *to_free)
29862  {
29863 +       struct slub_free_list *f;
29864         void *freelist;
29865         struct page *page;
29866  
29867 @@ -2410,6 +2463,13 @@ load_freelist:
29868         VM_BUG_ON(!c->page->frozen);
29869         c->freelist = get_freepointer(s, freelist);
29870         c->tid = next_tid(c->tid);
29871 +
29872 +out:
29873 +       f = this_cpu_ptr(&slub_free_list);
29874 +       raw_spin_lock(&f->lock);
29875 +       list_splice_init(&f->list, to_free);
29876 +       raw_spin_unlock(&f->lock);
29877 +
29878         return freelist;
29879  
29880  new_slab:
29881 @@ -2441,7 +2501,7 @@ new_slab:
29882         deactivate_slab(s, page, get_freepointer(s, freelist));
29883         c->page = NULL;
29884         c->freelist = NULL;
29885 -       return freelist;
29886 +       goto out;
29887  }
29888  
29889  /*
29890 @@ -2453,6 +2513,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
29891  {
29892         void *p;
29893         unsigned long flags;
29894 +       LIST_HEAD(tofree);
29895  
29896         local_irq_save(flags);
29897  #ifdef CONFIG_PREEMPT
29898 @@ -2464,8 +2525,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
29899         c = this_cpu_ptr(s->cpu_slab);
29900  #endif
29901  
29902 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
29903 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
29904         local_irq_restore(flags);
29905 +       free_delayed(&tofree);
29906         return p;
29907  }
29908  
29909 @@ -2652,7 +2714,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
29910  
29911         do {
29912                 if (unlikely(n)) {
29913 -                       spin_unlock_irqrestore(&n->list_lock, flags);
29914 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29915                         n = NULL;
29916                 }
29917                 prior = page->freelist;
29918 @@ -2684,7 +2746,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
29919                                  * Otherwise the list_lock will synchronize with
29920                                  * other processors updating the list of slabs.
29921                                  */
29922 -                               spin_lock_irqsave(&n->list_lock, flags);
29923 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
29924  
29925                         }
29926                 }
29927 @@ -2726,7 +2788,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
29928                 add_partial(n, page, DEACTIVATE_TO_TAIL);
29929                 stat(s, FREE_ADD_PARTIAL);
29930         }
29931 -       spin_unlock_irqrestore(&n->list_lock, flags);
29932 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29933         return;
29934  
29935  slab_empty:
29936 @@ -2741,7 +2803,7 @@ slab_empty:
29937                 remove_full(s, n, page);
29938         }
29939  
29940 -       spin_unlock_irqrestore(&n->list_lock, flags);
29941 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29942         stat(s, FREE_SLAB);
29943         discard_slab(s, page);
29944  }
29945 @@ -2913,6 +2975,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
29946                           void **p)
29947  {
29948         struct kmem_cache_cpu *c;
29949 +       LIST_HEAD(to_free);
29950         int i;
29951  
29952         /* memcg and kmem_cache debug support */
29953 @@ -2936,7 +2999,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
29954                          * of re-populating per CPU c->freelist
29955                          */
29956                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
29957 -                                           _RET_IP_, c);
29958 +                                           _RET_IP_, c, &to_free);
29959                         if (unlikely(!p[i]))
29960                                 goto error;
29961  
29962 @@ -2948,6 +3011,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
29963         }
29964         c->tid = next_tid(c->tid);
29965         local_irq_enable();
29966 +       free_delayed(&to_free);
29967  
29968         /* Clear memory outside IRQ disabled fastpath loop */
29969         if (unlikely(flags & __GFP_ZERO)) {
29970 @@ -3095,7 +3159,7 @@ static void
29971  init_kmem_cache_node(struct kmem_cache_node *n)
29972  {
29973         n->nr_partial = 0;
29974 -       spin_lock_init(&n->list_lock);
29975 +       raw_spin_lock_init(&n->list_lock);
29976         INIT_LIST_HEAD(&n->partial);
29977  #ifdef CONFIG_SLUB_DEBUG
29978         atomic_long_set(&n->nr_slabs, 0);
29979 @@ -3677,7 +3741,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
29980                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
29981                         INIT_LIST_HEAD(promote + i);
29982  
29983 -               spin_lock_irqsave(&n->list_lock, flags);
29984 +               raw_spin_lock_irqsave(&n->list_lock, flags);
29985  
29986                 /*
29987                  * Build lists of slabs to discard or promote.
29988 @@ -3708,7 +3772,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
29989                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
29990                         list_splice(promote + i, &n->partial);
29991  
29992 -               spin_unlock_irqrestore(&n->list_lock, flags);
29993 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
29994  
29995                 /* Release empty slabs */
29996                 list_for_each_entry_safe(page, t, &discard, lru)
29997 @@ -3884,6 +3948,12 @@ void __init kmem_cache_init(void)
29998  {
29999         static __initdata struct kmem_cache boot_kmem_cache,
30000                 boot_kmem_cache_node;
30001 +       int cpu;
30002 +
30003 +       for_each_possible_cpu(cpu) {
30004 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
30005 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
30006 +       }
30007  
30008         if (debug_guardpage_minorder())
30009                 slub_max_order = 0;
30010 @@ -4127,7 +4197,7 @@ static int validate_slab_node(struct kmem_cache *s,
30011         struct page *page;
30012         unsigned long flags;
30013  
30014 -       spin_lock_irqsave(&n->list_lock, flags);
30015 +       raw_spin_lock_irqsave(&n->list_lock, flags);
30016  
30017         list_for_each_entry(page, &n->partial, lru) {
30018                 validate_slab_slab(s, page, map);
30019 @@ -4149,7 +4219,7 @@ static int validate_slab_node(struct kmem_cache *s,
30020                        s->name, count, atomic_long_read(&n->nr_slabs));
30021  
30022  out:
30023 -       spin_unlock_irqrestore(&n->list_lock, flags);
30024 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
30025         return count;
30026  }
30027  
30028 @@ -4337,12 +4407,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
30029                 if (!atomic_long_read(&n->nr_slabs))
30030                         continue;
30031  
30032 -               spin_lock_irqsave(&n->list_lock, flags);
30033 +               raw_spin_lock_irqsave(&n->list_lock, flags);
30034                 list_for_each_entry(page, &n->partial, lru)
30035                         process_slab(&t, s, page, alloc, map);
30036                 list_for_each_entry(page, &n->full, lru)
30037                         process_slab(&t, s, page, alloc, map);
30038 -               spin_unlock_irqrestore(&n->list_lock, flags);
30039 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
30040         }
30041  
30042         for (i = 0; i < t.count; i++) {
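
Besides switching every n->list_lock site to the raw_spin_* API, the mm/slub.c hunks above add a per-CPU slub_free_list: when free_slab() runs with interrupts disabled, the page is parked on that list under a raw spinlock and released later through free_delayed(), once the caller (e.g. __slab_alloc(), kmem_cache_alloc_bulk() or flush_all()) has interrupts enabled again and the page allocator's locks may be taken on RT. A minimal sketch of that deferred-release pattern follows; the example_* names are illustrative, and kfree() merely stands in for __free_slab().

#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/irqflags.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct example_item {
        struct list_head lru;
};

struct example_deferred {
        raw_spinlock_t          lock;
        struct list_head        list;
};
static DEFINE_PER_CPU(struct example_deferred, example_deferred);

static void example_release(struct example_item *item)
{
        kfree(item);            /* stands in for __free_slab() */
}

/* May be called with interrupts off: only park the item in that case. */
static void example_free(struct example_item *item)
{
        if (irqs_disabled()) {
                struct example_deferred *d = this_cpu_ptr(&example_deferred);

                raw_spin_lock(&d->lock);
                list_add(&item->lru, &d->list);
                raw_spin_unlock(&d->lock);
        } else {
                example_release(item);
        }
}

/* Called later from a context where releasing is allowed. */
static void example_drain_deferred(void)
{
        struct example_item *item, *tmp;
        struct example_deferred *d;
        LIST_HEAD(tofree);

        d = get_cpu_ptr(&example_deferred);
        raw_spin_lock_irq(&d->lock);
        list_splice_init(&d->list, &tofree);
        raw_spin_unlock_irq(&d->lock);
        put_cpu_ptr(&example_deferred);

        list_for_each_entry_safe(item, tmp, &tofree, lru) {
                list_del(&item->lru);
                example_release(item);
        }
}

/*
 * Per-CPU locks and lists need explicit boot-time setup, just like the
 * loop this patch adds to kmem_cache_init().
 */
static void __init example_deferred_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                raw_spin_lock_init(&per_cpu(example_deferred, cpu).lock);
                INIT_LIST_HEAD(&per_cpu(example_deferred, cpu).list);
        }
}
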
30043 diff --git a/mm/swap.c b/mm/swap.c
30044 index 39395fb549c0..ad16649221d7 100644
30045 --- a/mm/swap.c
30046 +++ b/mm/swap.c
30047 @@ -31,6 +31,7 @@
30048  #include <linux/memcontrol.h>
30049  #include <linux/gfp.h>
30050  #include <linux/uio.h>
30051 +#include <linux/locallock.h>
30052  #include <linux/hugetlb.h>
30053  #include <linux/page_idle.h>
30054  
30055 @@ -46,6 +47,9 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
30056  static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
30057  static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
30058  
30059 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
30060 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
30061 +
30062  /*
30063   * This path almost never happens for VM activity - pages are normally
30064   * freed via pagevecs.  But it gets used by networking.
30065 @@ -481,11 +485,11 @@ void rotate_reclaimable_page(struct page *page)
30066                 unsigned long flags;
30067  
30068                 page_cache_get(page);
30069 -               local_irq_save(flags);
30070 +               local_lock_irqsave(rotate_lock, flags);
30071                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
30072                 if (!pagevec_add(pvec, page))
30073                         pagevec_move_tail(pvec);
30074 -               local_irq_restore(flags);
30075 +               local_unlock_irqrestore(rotate_lock, flags);
30076         }
30077  }
30078  
30079 @@ -536,12 +540,13 @@ static bool need_activate_page_drain(int cpu)
30080  void activate_page(struct page *page)
30081  {
30082         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
30083 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
30084 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
30085 +                                                      activate_page_pvecs);
30086  
30087                 page_cache_get(page);
30088                 if (!pagevec_add(pvec, page))
30089                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
30090 -               put_cpu_var(activate_page_pvecs);
30091 +               put_locked_var(swapvec_lock, activate_page_pvecs);
30092         }
30093  }
30094  
30095 @@ -567,7 +572,7 @@ void activate_page(struct page *page)
30096  
30097  static void __lru_cache_activate_page(struct page *page)
30098  {
30099 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
30100 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
30101         int i;
30102  
30103         /*
30104 @@ -589,7 +594,7 @@ static void __lru_cache_activate_page(struct page *page)
30105                 }
30106         }
30107  
30108 -       put_cpu_var(lru_add_pvec);
30109 +       put_locked_var(swapvec_lock, lru_add_pvec);
30110  }
30111  
30112  /*
30113 @@ -630,13 +635,13 @@ EXPORT_SYMBOL(mark_page_accessed);
30114  
30115  static void __lru_cache_add(struct page *page)
30116  {
30117 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
30118 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
30119  
30120         page_cache_get(page);
30121         if (!pagevec_space(pvec))
30122                 __pagevec_lru_add(pvec);
30123         pagevec_add(pvec, page);
30124 -       put_cpu_var(lru_add_pvec);
30125 +       put_locked_var(swapvec_lock, lru_add_pvec);
30126  }
30127  
30128  /**
30129 @@ -816,9 +821,15 @@ void lru_add_drain_cpu(int cpu)
30130                 unsigned long flags;
30131  
30132                 /* No harm done if a racing interrupt already did this */
30133 -               local_irq_save(flags);
30134 +#ifdef CONFIG_PREEMPT_RT_BASE
30135 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
30136 +               pagevec_move_tail(pvec);
30137 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
30138 +#else
30139 +               local_lock_irqsave(rotate_lock, flags);
30140                 pagevec_move_tail(pvec);
30141 -               local_irq_restore(flags);
30142 +               local_unlock_irqrestore(rotate_lock, flags);
30143 +#endif
30144         }
30145  
30146         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
30147 @@ -846,26 +857,47 @@ void deactivate_file_page(struct page *page)
30148                 return;
30149  
30150         if (likely(get_page_unless_zero(page))) {
30151 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
30152 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
30153 +                                                      lru_deactivate_file_pvecs);
30154  
30155                 if (!pagevec_add(pvec, page))
30156                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
30157 -               put_cpu_var(lru_deactivate_file_pvecs);
30158 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
30159         }
30160  }
30161  
30162  void lru_add_drain(void)
30163  {
30164 -       lru_add_drain_cpu(get_cpu());
30165 -       put_cpu();
30166 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
30167 +       local_unlock_cpu(swapvec_lock);
30168  }
30169  
30170 +
30171 +#ifdef CONFIG_PREEMPT_RT_BASE
30172 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
30173 +{
30174 +       local_lock_on(swapvec_lock, cpu);
30175 +       lru_add_drain_cpu(cpu);
30176 +       local_unlock_on(swapvec_lock, cpu);
30177 +}
30178 +
30179 +#else
30180 +
30181  static void lru_add_drain_per_cpu(struct work_struct *dummy)
30182  {
30183         lru_add_drain();
30184  }
30185  
30186  static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
30187 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
30188 +{
30189 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
30190 +
30191 +       INIT_WORK(work, lru_add_drain_per_cpu);
30192 +       schedule_work_on(cpu, work);
30193 +       cpumask_set_cpu(cpu, has_work);
30194 +}
30195 +#endif
30196  
30197  void lru_add_drain_all(void)
30198  {
30199 @@ -878,20 +910,17 @@ void lru_add_drain_all(void)
30200         cpumask_clear(&has_work);
30201  
30202         for_each_online_cpu(cpu) {
30203 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
30204 -
30205                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
30206                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
30207                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
30208 -                   need_activate_page_drain(cpu)) {
30209 -                       INIT_WORK(work, lru_add_drain_per_cpu);
30210 -                       schedule_work_on(cpu, work);
30211 -                       cpumask_set_cpu(cpu, &has_work);
30212 -               }
30213 +                   need_activate_page_drain(cpu))
30214 +                       remote_lru_add_drain(cpu, &has_work);
30215         }
30216  
30217 +#ifndef CONFIG_PREEMPT_RT_BASE
30218         for_each_cpu(cpu, &has_work)
30219                 flush_work(&per_cpu(lru_add_drain_work, cpu));
30220 +#endif
30221  
30222         put_online_cpus();
30223         mutex_unlock(&lock);
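
In mm/swap.c the per-CPU pagevecs are now reached through get_locked_var()/put_locked_var() on swapvec_lock rather than get_cpu_var()/put_cpu_var(), the lru_rotate_pvecs path gets its own rotate_lock, and on RT lru_add_drain_all() drains remote CPUs by taking their per-CPU lock with local_lock_on() instead of scheduling per-CPU work items. A minimal sketch of the get_locked_var() access pattern, again assuming the locallock primitives from this patch; the example_vec structure and names are illustrative only.

#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/locallock.h>    /* added by this patch */

struct example_vec {
        unsigned int nr;
        void *slots[14];
};

static DEFINE_LOCAL_IRQ_LOCK(example_vec_lock);
static DEFINE_PER_CPU(struct example_vec, example_vecs);

static void example_vec_flush(struct example_vec *vec)
{
        vec->nr = 0;            /* stands in for the real batch operation */
}

/*
 * get_locked_var() behaves like get_cpu_var() on !RT; on RT it takes the
 * per-CPU local lock instead of disabling preemption, so filling the
 * batch stays preemptible.
 */
static void example_vec_add(void *item)
{
        struct example_vec *vec = &get_locked_var(example_vec_lock, example_vecs);

        if (vec->nr == ARRAY_SIZE(vec->slots))
                example_vec_flush(vec);
        vec->slots[vec->nr++] = item;
        put_locked_var(example_vec_lock, example_vecs);
}
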
30224 diff --git a/mm/truncate.c b/mm/truncate.c
30225 index 76e35ad97102..5f196420020c 100644
30226 --- a/mm/truncate.c
30227 +++ b/mm/truncate.c
30228 @@ -56,8 +56,11 @@ static void clear_exceptional_entry(struct address_space *mapping,
30229          * protected by mapping->tree_lock.
30230          */
30231         if (!workingset_node_shadows(node) &&
30232 -           !list_empty(&node->private_list))
30233 -               list_lru_del(&workingset_shadow_nodes, &node->private_list);
30234 +           !list_empty(&node->private_list)) {
30235 +               local_lock(workingset_shadow_lock);
30236 +               list_lru_del(&__workingset_shadow_nodes, &node->private_list);
30237 +               local_unlock(workingset_shadow_lock);
30238 +       }
30239         __radix_tree_delete_node(&mapping->page_tree, node);
30240  unlock:
30241         spin_unlock_irq(&mapping->tree_lock);
30242 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
30243 index 8e3c9c5a3042..68740314ad54 100644
30244 --- a/mm/vmalloc.c
30245 +++ b/mm/vmalloc.c
30246 @@ -821,7 +821,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
30247         struct vmap_block *vb;
30248         struct vmap_area *va;
30249         unsigned long vb_idx;
30250 -       int node, err;
30251 +       int node, err, cpu;
30252         void *vaddr;
30253  
30254         node = numa_node_id();
30255 @@ -864,11 +864,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
30256         BUG_ON(err);
30257         radix_tree_preload_end();
30258  
30259 -       vbq = &get_cpu_var(vmap_block_queue);
30260 +       cpu = get_cpu_light();
30261 +       vbq = this_cpu_ptr(&vmap_block_queue);
30262         spin_lock(&vbq->lock);
30263         list_add_tail_rcu(&vb->free_list, &vbq->free);
30264         spin_unlock(&vbq->lock);
30265 -       put_cpu_var(vmap_block_queue);
30266 +       put_cpu_light();
30267  
30268         return vaddr;
30269  }
30270 @@ -937,6 +938,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
30271         struct vmap_block *vb;
30272         void *vaddr = NULL;
30273         unsigned int order;
30274 +       int cpu;
30275  
30276         BUG_ON(offset_in_page(size));
30277         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
30278 @@ -951,7 +953,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
30279         order = get_order(size);
30280  
30281         rcu_read_lock();
30282 -       vbq = &get_cpu_var(vmap_block_queue);
30283 +       cpu = get_cpu_light();
30284 +       vbq = this_cpu_ptr(&vmap_block_queue);
30285         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
30286                 unsigned long pages_off;
30287  
30288 @@ -974,7 +977,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
30289                 break;
30290         }
30291  
30292 -       put_cpu_var(vmap_block_queue);
30293 +       put_cpu_light();
30294         rcu_read_unlock();
30295  
30296         /* Allocate new block if nothing was found */
30297 diff --git a/mm/vmstat.c b/mm/vmstat.c
30298 index c54fd2924f25..64416fd7c209 100644
30299 --- a/mm/vmstat.c
30300 +++ b/mm/vmstat.c
30301 @@ -226,6 +226,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
30302         long x;
30303         long t;
30304  
30305 +       preempt_disable_rt();
30306         x = delta + __this_cpu_read(*p);
30307  
30308         t = __this_cpu_read(pcp->stat_threshold);
30309 @@ -235,6 +236,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
30310                 x = 0;
30311         }
30312         __this_cpu_write(*p, x);
30313 +       preempt_enable_rt();
30314  }
30315  EXPORT_SYMBOL(__mod_zone_page_state);
30316  
30317 @@ -267,6 +269,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
30318         s8 __percpu *p = pcp->vm_stat_diff + item;
30319         s8 v, t;
30320  
30321 +       preempt_disable_rt();
30322         v = __this_cpu_inc_return(*p);
30323         t = __this_cpu_read(pcp->stat_threshold);
30324         if (unlikely(v > t)) {
30325 @@ -275,6 +278,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
30326                 zone_page_state_add(v + overstep, zone, item);
30327                 __this_cpu_write(*p, -overstep);
30328         }
30329 +       preempt_enable_rt();
30330  }
30331  
30332  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
30333 @@ -289,6 +293,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
30334         s8 __percpu *p = pcp->vm_stat_diff + item;
30335         s8 v, t;
30336  
30337 +       preempt_disable_rt();
30338         v = __this_cpu_dec_return(*p);
30339         t = __this_cpu_read(pcp->stat_threshold);
30340         if (unlikely(v < - t)) {
30341 @@ -297,6 +302,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
30342                 zone_page_state_add(v - overstep, zone, item);
30343                 __this_cpu_write(*p, overstep);
30344         }
30345 +       preempt_enable_rt();
30346  }
30347  
30348  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
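
The mm/vmstat.c hunks (like the earlier mm/mmu_context.c one) wrap the __this_cpu read-modify-write sequences in preempt_disable_rt()/preempt_enable_rt(). On mainline these compile to nothing because the callers already hold an interrupt-disabling or preemption-disabling lock; on RT those locks no longer disable preemption, so the explicit pair pins the task to one CPU for the duration of the update. A minimal sketch with an illustrative per-CPU counter; preempt_disable_rt()/preempt_enable_rt() come from this patch's preempt.h changes.

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(long, example_stat);

/*
 * preempt_disable_rt() is a no-op on !RT (the caller is assumed to be
 * non-preemptible already) and an explicit preempt_disable() on RT, so
 * all __this_cpu operations below hit the same CPU's counter.
 */
static void example_mod_stat(long delta)
{
        preempt_disable_rt();
        __this_cpu_add(example_stat, delta);
        if (__this_cpu_read(example_stat) < 0)
                __this_cpu_write(example_stat, 0);
        preempt_enable_rt();
}
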
30349 diff --git a/mm/workingset.c b/mm/workingset.c
30350 index df66f426fdcf..6db7b243fa0d 100644
30351 --- a/mm/workingset.c
30352 +++ b/mm/workingset.c
30353 @@ -264,7 +264,8 @@ void workingset_activation(struct page *page)
30354   * point where they would still be useful.
30355   */
30356  
30357 -struct list_lru workingset_shadow_nodes;
30358 +struct list_lru __workingset_shadow_nodes;
30359 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
30360  
30361  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
30362                                         struct shrink_control *sc)
30363 @@ -274,9 +275,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
30364         unsigned long pages;
30365  
30366         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
30367 -       local_irq_disable();
30368 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
30369 -       local_irq_enable();
30370 +       local_lock_irq(workingset_shadow_lock);
30371 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
30372 +       local_unlock_irq(workingset_shadow_lock);
30373  
30374         pages = node_present_pages(sc->nid);
30375         /*
30376 @@ -361,9 +362,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
30377         spin_unlock(&mapping->tree_lock);
30378         ret = LRU_REMOVED_RETRY;
30379  out:
30380 -       local_irq_enable();
30381 +       local_unlock_irq(workingset_shadow_lock);
30382         cond_resched();
30383 -       local_irq_disable();
30384 +       local_lock_irq(workingset_shadow_lock);
30385         spin_lock(lru_lock);
30386         return ret;
30387  }
30388 @@ -374,10 +375,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
30389         unsigned long ret;
30390  
30391         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
30392 -       local_irq_disable();
30393 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
30394 +       local_lock_irq(workingset_shadow_lock);
30395 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
30396                                     shadow_lru_isolate, NULL);
30397 -       local_irq_enable();
30398 +       local_unlock_irq(workingset_shadow_lock);
30399         return ret;
30400  }
30401  
30402 @@ -398,7 +399,7 @@ static int __init workingset_init(void)
30403  {
30404         int ret;
30405  
30406 -       ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
30407 +       ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
30408         if (ret)
30409                 goto err;
30410         ret = register_shrinker(&workingset_shadow_shrinker);
30411 @@ -406,7 +407,7 @@ static int __init workingset_init(void)
30412                 goto err_list_lru;
30413         return 0;
30414  err_list_lru:
30415 -       list_lru_destroy(&workingset_shadow_nodes);
30416 +       list_lru_destroy(&__workingset_shadow_nodes);
30417  err:
30418         return ret;
30419  }
30420 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
30421 index c1ea19478119..529552c3716d 100644
30422 --- a/mm/zsmalloc.c
30423 +++ b/mm/zsmalloc.c
30424 @@ -64,6 +64,7 @@
30425  #include <linux/debugfs.h>
30426  #include <linux/zsmalloc.h>
30427  #include <linux/zpool.h>
30428 +#include <linux/locallock.h>
30429  
30430  /*
30431   * This must be power of 2 and greater than of equal to sizeof(link_free).
30432 @@ -403,6 +404,7 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
30433  
30434  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
30435  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
30436 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
30437  
30438  static int is_first_page(struct page *page)
30439  {
30440 @@ -1289,7 +1291,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
30441         class = pool->size_class[class_idx];
30442         off = obj_idx_to_offset(page, obj_idx, class->size);
30443  
30444 -       area = &get_cpu_var(zs_map_area);
30445 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
30446         area->vm_mm = mm;
30447         if (off + class->size <= PAGE_SIZE) {
30448                 /* this object is contained entirely within a page */
30449 @@ -1342,7 +1344,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
30450  
30451                 __zs_unmap_object(area, pages, off, class->size);
30452         }
30453 -       put_cpu_var(zs_map_area);
30454 +       put_locked_var(zs_map_area_lock, zs_map_area);
30455         unpin_tag(handle);
30456  }
30457  EXPORT_SYMBOL_GPL(zs_unmap_object);
30458 diff --git a/net/core/dev.c b/net/core/dev.c
30459 index 0989fea88c44..4d5f550f01f5 100644
30460 --- a/net/core/dev.c
30461 +++ b/net/core/dev.c
30462 @@ -186,6 +186,7 @@ static unsigned int napi_gen_id;
30463  static DEFINE_HASHTABLE(napi_hash, 8);
30464  
30465  static seqcount_t devnet_rename_seq;
30466 +static DEFINE_MUTEX(devnet_rename_mutex);
30467  
30468  static inline void dev_base_seq_inc(struct net *net)
30469  {
30470 @@ -207,14 +208,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
30471  static inline void rps_lock(struct softnet_data *sd)
30472  {
30473  #ifdef CONFIG_RPS
30474 -       spin_lock(&sd->input_pkt_queue.lock);
30475 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
30476  #endif
30477  }
30478  
30479  static inline void rps_unlock(struct softnet_data *sd)
30480  {
30481  #ifdef CONFIG_RPS
30482 -       spin_unlock(&sd->input_pkt_queue.lock);
30483 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
30484  #endif
30485  }
30486  
30487 @@ -884,7 +885,8 @@ retry:
30488         strcpy(name, dev->name);
30489         rcu_read_unlock();
30490         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
30491 -               cond_resched();
30492 +               mutex_lock(&devnet_rename_mutex);
30493 +               mutex_unlock(&devnet_rename_mutex);
30494                 goto retry;
30495         }
30496  
30497 @@ -1153,20 +1155,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
30498         if (dev->flags & IFF_UP)
30499                 return -EBUSY;
30500  
30501 -       write_seqcount_begin(&devnet_rename_seq);
30502 +       mutex_lock(&devnet_rename_mutex);
30503 +       __raw_write_seqcount_begin(&devnet_rename_seq);
30504  
30505 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
30506 -               write_seqcount_end(&devnet_rename_seq);
30507 -               return 0;
30508 -       }
30509 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
30510 +               goto outunlock;
30511  
30512         memcpy(oldname, dev->name, IFNAMSIZ);
30513  
30514         err = dev_get_valid_name(net, dev, newname);
30515 -       if (err < 0) {
30516 -               write_seqcount_end(&devnet_rename_seq);
30517 -               return err;
30518 -       }
30519 +       if (err < 0)
30520 +               goto outunlock;
30521  
30522         if (oldname[0] && !strchr(oldname, '%'))
30523                 netdev_info(dev, "renamed from %s\n", oldname);
30524 @@ -1179,11 +1178,12 @@ rollback:
30525         if (ret) {
30526                 memcpy(dev->name, oldname, IFNAMSIZ);
30527                 dev->name_assign_type = old_assign_type;
30528 -               write_seqcount_end(&devnet_rename_seq);
30529 -               return ret;
30530 +               err = ret;
30531 +               goto outunlock;
30532         }
30533  
30534 -       write_seqcount_end(&devnet_rename_seq);
30535 +       __raw_write_seqcount_end(&devnet_rename_seq);
30536 +       mutex_unlock(&devnet_rename_mutex);
30537  
30538         netdev_adjacent_rename_links(dev, oldname);
30539  
30540 @@ -1204,7 +1204,8 @@ rollback:
30541                 /* err >= 0 after dev_alloc_name() or stores the first errno */
30542                 if (err >= 0) {
30543                         err = ret;
30544 -                       write_seqcount_begin(&devnet_rename_seq);
30545 +                       mutex_lock(&devnet_rename_mutex);
30546 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
30547                         memcpy(dev->name, oldname, IFNAMSIZ);
30548                         memcpy(oldname, newname, IFNAMSIZ);
30549                         dev->name_assign_type = old_assign_type;
30550 @@ -1217,6 +1218,11 @@ rollback:
30551         }
30552  
30553         return err;
30554 +
30555 +outunlock:
30556 +       __raw_write_seqcount_end(&devnet_rename_seq);
30557 +       mutex_unlock(&devnet_rename_mutex);
30558 +       return err;
30559  }
30560  
30561  /**
30562 @@ -2246,6 +2252,7 @@ static inline void __netif_reschedule(struct Qdisc *q)
30563         sd->output_queue_tailp = &q->next_sched;
30564         raise_softirq_irqoff(NET_TX_SOFTIRQ);
30565         local_irq_restore(flags);
30566 +       preempt_check_resched_rt();
30567  }
30568  
30569  void __netif_schedule(struct Qdisc *q)
30570 @@ -2327,6 +2334,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
30571         __this_cpu_write(softnet_data.completion_queue, skb);
30572         raise_softirq_irqoff(NET_TX_SOFTIRQ);
30573         local_irq_restore(flags);
30574 +       preempt_check_resched_rt();
30575  }
30576  EXPORT_SYMBOL(__dev_kfree_skb_irq);
30577  
30578 @@ -2883,7 +2891,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
30579          * This permits __QDISC___STATE_RUNNING owner to get the lock more
30580          * often and dequeue packets faster.
30581          */
30582 +#ifdef CONFIG_PREEMPT_RT_FULL
30583 +       contended = true;
30584 +#else
30585         contended = qdisc_is_running(q);
30586 +#endif
30587         if (unlikely(contended))
30588                 spin_lock(&q->busylock);
30589  
30590 @@ -2943,9 +2955,44 @@ static void skb_update_prio(struct sk_buff *skb)
30591  #define skb_update_prio(skb)
30592  #endif
30593  
30594 +#ifdef CONFIG_PREEMPT_RT_FULL
30595 +
30596 +static inline int xmit_rec_read(void)
30597 +{
30598 +       return current->xmit_recursion;
30599 +}
30600 +
30601 +static inline void xmit_rec_inc(void)
30602 +{
30603 +       current->xmit_recursion++;
30604 +}
30605 +
30606 +static inline void xmit_rec_dec(void)
30607 +{
30608 +       current->xmit_recursion--;
30609 +}
30610 +
30611 +#else
30612 +
30613  DEFINE_PER_CPU(int, xmit_recursion);
30614  EXPORT_SYMBOL(xmit_recursion);
30615  
30616 +static inline int xmit_rec_read(void)
30617 +{
30618 +       return __this_cpu_read(xmit_recursion);
30619 +}
30620 +
30621 +static inline void xmit_rec_inc(void)
30622 +{
30623 +       __this_cpu_inc(xmit_recursion);
30624 +}
30625 +
30626 +static inline void xmit_rec_dec(void)
30627 +{
30628 +       __this_cpu_dec(xmit_recursion);
30629 +}
30630 +#endif
30631 +
30632  #define RECURSION_LIMIT 10
30633  
30634  /**
30635 @@ -3138,7 +3185,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
30636  
30637                 if (txq->xmit_lock_owner != cpu) {
30638  
30639 -                       if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
30640 +                       if (xmit_rec_read() > RECURSION_LIMIT)
30641                                 goto recursion_alert;
30642  
30643                         skb = validate_xmit_skb(skb, dev);
30644 @@ -3148,9 +3195,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
30645                         HARD_TX_LOCK(dev, txq, cpu);
30646  
30647                         if (!netif_xmit_stopped(txq)) {
30648 -                               __this_cpu_inc(xmit_recursion);
30649 +                               xmit_rec_inc();
30650                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
30651 -                               __this_cpu_dec(xmit_recursion);
30652 +                               xmit_rec_dec();
30653                                 if (dev_xmit_complete(rc)) {
30654                                         HARD_TX_UNLOCK(dev, txq);
30655                                         goto out;
30656 @@ -3524,6 +3571,7 @@ drop:
30657         rps_unlock(sd);
30658  
30659         local_irq_restore(flags);
30660 +       preempt_check_resched_rt();
30661  
30662         atomic_long_inc(&skb->dev->rx_dropped);
30663         kfree_skb(skb);
30664 @@ -3542,7 +3590,7 @@ static int netif_rx_internal(struct sk_buff *skb)
30665                 struct rps_dev_flow voidflow, *rflow = &voidflow;
30666                 int cpu;
30667  
30668 -               preempt_disable();
30669 +               migrate_disable();
30670                 rcu_read_lock();
30671  
30672                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
30673 @@ -3552,13 +3600,13 @@ static int netif_rx_internal(struct sk_buff *skb)
30674                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
30675  
30676                 rcu_read_unlock();
30677 -               preempt_enable();
30678 +               migrate_enable();
30679         } else
30680  #endif
30681         {
30682                 unsigned int qtail;
30683 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
30684 -               put_cpu();
30685 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
30686 +               put_cpu_light();
30687         }
30688         return ret;
30689  }
30690 @@ -3592,16 +3640,44 @@ int netif_rx_ni(struct sk_buff *skb)
30691  
30692         trace_netif_rx_ni_entry(skb);
30693  
30694 -       preempt_disable();
30695 +       local_bh_disable();
30696         err = netif_rx_internal(skb);
30697 -       if (local_softirq_pending())
30698 -               do_softirq();
30699 -       preempt_enable();
30700 +       local_bh_enable();
30701  
30702         return err;
30703  }
30704  EXPORT_SYMBOL(netif_rx_ni);
30705  
30706 +#ifdef CONFIG_PREEMPT_RT_FULL
30707 +/*
30708 + * RT runs ksoftirqd as a real time thread and the root_lock is a
30709 + * "sleeping spinlock". If the trylock fails then we can go into an
30710 + * infinite loop when ksoftirqd preempted the task which actually
30711 + * holds the lock, because we requeue q and raise NET_TX softirq
30712 + * causing ksoftirqd to loop forever.
30713 + *
30714 + * It's safe to use spin_lock on RT here as softirqs run in thread
30715 + * context and cannot deadlock against the thread which is holding
30716 + * root_lock.
30717 + *
30718 + * On !RT the trylock might fail, but there we bail out from the
30719 + * softirq loop after 10 attempts which we can't do on RT. And the
30720 + * task holding root_lock cannot be preempted, so the only downside of
30721 + * that trylock is that we need 10 loops to decide that we should have
30722 + * given up in the first one :)
30723 + */
30724 +static inline int take_root_lock(spinlock_t *lock)
30725 +{
30726 +       spin_lock(lock);
30727 +       return 1;
30728 +}
30729 +#else
30730 +static inline int take_root_lock(spinlock_t *lock)
30731 +{
30732 +       return spin_trylock(lock);
30733 +}
30734 +#endif
30735 +
30736  static void net_tx_action(struct softirq_action *h)
30737  {
30738         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
30739 @@ -3643,7 +3719,7 @@ static void net_tx_action(struct softirq_action *h)
30740                         head = head->next_sched;
30741  
30742                         root_lock = qdisc_lock(q);
30743 -                       if (spin_trylock(root_lock)) {
30744 +                       if (take_root_lock(root_lock)) {
30745                                 smp_mb__before_atomic();
30746                                 clear_bit(__QDISC_STATE_SCHED,
30747                                           &q->state);
30748 @@ -4065,7 +4141,7 @@ static void flush_backlog(void *arg)
30749         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
30750                 if (skb->dev == dev) {
30751                         __skb_unlink(skb, &sd->input_pkt_queue);
30752 -                       kfree_skb(skb);
30753 +                       __skb_queue_tail(&sd->tofree_queue, skb);
30754                         input_queue_head_incr(sd);
30755                 }
30756         }
30757 @@ -4074,10 +4150,13 @@ static void flush_backlog(void *arg)
30758         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
30759                 if (skb->dev == dev) {
30760                         __skb_unlink(skb, &sd->process_queue);
30761 -                       kfree_skb(skb);
30762 +                       __skb_queue_tail(&sd->tofree_queue, skb);
30763                         input_queue_head_incr(sd);
30764                 }
30765         }
30766 +
30767 +       if (!skb_queue_empty(&sd->tofree_queue))
30768 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
30769  }
30770  
30771  static int napi_gro_complete(struct sk_buff *skb)
30772 @@ -4531,6 +4610,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
30773                 sd->rps_ipi_list = NULL;
30774  
30775                 local_irq_enable();
30776 +               preempt_check_resched_rt();
30777  
30778                 /* Send pending IPI's to kick RPS processing on remote cpus. */
30779                 while (remsd) {
30780 @@ -4544,6 +4624,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
30781         } else
30782  #endif
30783                 local_irq_enable();
30784 +       preempt_check_resched_rt();
30785  }
30786  
30787  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
30788 @@ -4625,6 +4706,7 @@ void __napi_schedule(struct napi_struct *n)
30789         local_irq_save(flags);
30790         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
30791         local_irq_restore(flags);
30792 +       preempt_check_resched_rt();
30793  }
30794  EXPORT_SYMBOL(__napi_schedule);
30795  
30796 @@ -4901,7 +4983,7 @@ static void net_rx_action(struct softirq_action *h)
30797         list_splice_tail(&repoll, &list);
30798         list_splice(&list, &sd->poll_list);
30799         if (!list_empty(&sd->poll_list))
30800 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
30801 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
30802  
30803         net_rps_action_and_irq_enable(sd);
30804  }
30805 @@ -7234,7 +7316,7 @@ EXPORT_SYMBOL(free_netdev);
30806  void synchronize_net(void)
30807  {
30808         might_sleep();
30809 -       if (rtnl_is_locked())
30810 +       if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
30811                 synchronize_rcu_expedited();
30812         else
30813                 synchronize_rcu();
30814 @@ -7475,16 +7557,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
30815  
30816         raise_softirq_irqoff(NET_TX_SOFTIRQ);
30817         local_irq_enable();
30818 +       preempt_check_resched_rt();
30819  
30820         /* Process offline CPU's input_pkt_queue */
30821         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
30822                 netif_rx_ni(skb);
30823                 input_queue_head_incr(oldsd);
30824         }
30825 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
30826 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
30827                 netif_rx_ni(skb);
30828                 input_queue_head_incr(oldsd);
30829         }
30830 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
30831 +               kfree_skb(skb);
30832 +       }
30833  
30834         return NOTIFY_OK;
30835  }
30836 @@ -7786,8 +7872,9 @@ static int __init net_dev_init(void)
30837         for_each_possible_cpu(i) {
30838                 struct softnet_data *sd = &per_cpu(softnet_data, i);
30839  
30840 -               skb_queue_head_init(&sd->input_pkt_queue);
30841 -               skb_queue_head_init(&sd->process_queue);
30842 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
30843 +               skb_queue_head_init_raw(&sd->process_queue);
30844 +               skb_queue_head_init_raw(&sd->tofree_queue);
30845                 INIT_LIST_HEAD(&sd->poll_list);
30846                 sd->output_queue_tailp = &sd->output_queue;
30847  #ifdef CONFIG_RPS
30848 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
30849 index 4968b5ddea69..c8d778f405dc 100644
30850 --- a/net/core/skbuff.c
30851 +++ b/net/core/skbuff.c
30852 @@ -63,6 +63,7 @@
30853  #include <linux/errqueue.h>
30854  #include <linux/prefetch.h>
30855  #include <linux/if_vlan.h>
30856 +#include <linux/locallock.h>
30857  
30858  #include <net/protocol.h>
30859  #include <net/dst.h>
30860 @@ -351,6 +352,8 @@ EXPORT_SYMBOL(build_skb);
30861  
30862  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
30863  static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
30864 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
30865 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
30866  
30867  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
30868  {
30869 @@ -358,10 +361,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
30870         unsigned long flags;
30871         void *data;
30872  
30873 -       local_irq_save(flags);
30874 +       local_lock_irqsave(netdev_alloc_lock, flags);
30875         nc = this_cpu_ptr(&netdev_alloc_cache);
30876         data = __alloc_page_frag(nc, fragsz, gfp_mask);
30877 -       local_irq_restore(flags);
30878 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
30879         return data;
30880  }
30881  
30882 @@ -380,9 +383,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
30883  
30884  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
30885  {
30886 -       struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
30887 +       struct page_frag_cache *nc;
30888 +       void *data;
30889  
30890 -       return __alloc_page_frag(nc, fragsz, gfp_mask);
30891 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30892 +       data = __alloc_page_frag(nc, fragsz, gfp_mask);
30893 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30894 +       return data;
30895  }
30896  
30897  void *napi_alloc_frag(unsigned int fragsz)
30898 @@ -429,13 +436,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
30899         if (sk_memalloc_socks())
30900                 gfp_mask |= __GFP_MEMALLOC;
30901  
30902 -       local_irq_save(flags);
30903 +       local_lock_irqsave(netdev_alloc_lock, flags);
30904  
30905         nc = this_cpu_ptr(&netdev_alloc_cache);
30906         data = __alloc_page_frag(nc, len, gfp_mask);
30907         pfmemalloc = nc->pfmemalloc;
30908  
30909 -       local_irq_restore(flags);
30910 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
30911  
30912         if (unlikely(!data))
30913                 return NULL;
30914 @@ -476,9 +483,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
30915  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
30916                                  gfp_t gfp_mask)
30917  {
30918 -       struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
30919 +       struct page_frag_cache *nc;
30920         struct sk_buff *skb;
30921         void *data;
30922 +       bool pfmemalloc;
30923  
30924         len += NET_SKB_PAD + NET_IP_ALIGN;
30925  
30926 @@ -496,7 +504,11 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
30927         if (sk_memalloc_socks())
30928                 gfp_mask |= __GFP_MEMALLOC;
30929  
30930 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30931         data = __alloc_page_frag(nc, len, gfp_mask);
30932 +       pfmemalloc = nc->pfmemalloc;
30933 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30934 +
30935         if (unlikely(!data))
30936                 return NULL;
30937  
30938 @@ -507,7 +519,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
30939         }
30940  
30941         /* use OR instead of assignment to avoid clearing of bits in mask */
30942 -       if (nc->pfmemalloc)
30943 +       if (pfmemalloc)
30944                 skb->pfmemalloc = 1;
30945         skb->head_frag = 1;
30946  
30947 diff --git a/net/core/sock.c b/net/core/sock.c
30948 index 0d91f7dca751..9c3234299fc3 100644
30949 --- a/net/core/sock.c
30950 +++ b/net/core/sock.c
30951 @@ -2435,12 +2435,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
30952         if (sk->sk_lock.owned)
30953                 __lock_sock(sk);
30954         sk->sk_lock.owned = 1;
30955 -       spin_unlock(&sk->sk_lock.slock);
30956 +       spin_unlock_bh(&sk->sk_lock.slock);
30957         /*
30958          * The sk_lock has mutex_lock() semantics here:
30959          */
30960         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
30961 -       local_bh_enable();
30962  }
30963  EXPORT_SYMBOL(lock_sock_nested);
30964  
30965 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
30966 index 36e26977c908..ff2593269089 100644
30967 --- a/net/ipv4/icmp.c
30968 +++ b/net/ipv4/icmp.c
30969 @@ -69,6 +69,7 @@
30970  #include <linux/jiffies.h>
30971  #include <linux/kernel.h>
30972  #include <linux/fcntl.h>
30973 +#include <linux/sysrq.h>
30974  #include <linux/socket.h>
30975  #include <linux/in.h>
30976  #include <linux/inet.h>
30977 @@ -77,6 +78,7 @@
30978  #include <linux/string.h>
30979  #include <linux/netfilter_ipv4.h>
30980  #include <linux/slab.h>
30981 +#include <linux/locallock.h>
30982  #include <net/snmp.h>
30983  #include <net/ip.h>
30984  #include <net/route.h>
30985 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
30986   *
30987   *     On SMP we have one ICMP socket per-cpu.
30988   */
30989 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
30990 +
30991  static struct sock *icmp_sk(struct net *net)
30992  {
30993         return *this_cpu_ptr(net->ipv4.icmp_sk);
30994 @@ -215,12 +219,14 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
30995  
30996         local_bh_disable();
30997  
30998 +       local_lock(icmp_sk_lock);
30999         sk = icmp_sk(net);
31000  
31001         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
31002                 /* This can happen if the output path signals a
31003                  * dst_link_failure() for an outgoing ICMP packet.
31004                  */
31005 +               local_unlock(icmp_sk_lock);
31006                 local_bh_enable();
31007                 return NULL;
31008         }
31009 @@ -230,6 +236,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
31010  static inline void icmp_xmit_unlock(struct sock *sk)
31011  {
31012         spin_unlock_bh(&sk->sk_lock.slock);
31013 +       local_unlock(icmp_sk_lock);
31014  }
31015  
31016  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
31017 @@ -358,6 +365,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
31018         struct sock *sk;
31019         struct sk_buff *skb;
31020  
31021 +       local_lock(icmp_sk_lock);
31022         sk = icmp_sk(dev_net((*rt)->dst.dev));
31023         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
31024                            icmp_param->data_len+icmp_param->head_len,
31025 @@ -380,6 +388,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
31026                 skb->ip_summed = CHECKSUM_NONE;
31027                 ip_push_pending_frames(sk, fl4);
31028         }
31029 +       local_unlock(icmp_sk_lock);
31030  }
31031  
31032  /*
31033 @@ -891,6 +900,30 @@ static bool icmp_redirect(struct sk_buff *skb)
31034  }
31035  
31036  /*
31037 + * 32bit and 64bit have different timestamp length, so we check for
31038 + * the cookie at offset 20 and verify it is repeated at offset 50
31039 + */
31040 +#define CO_POS0                20
31041 +#define CO_POS1                50
31042 +#define CO_SIZE                sizeof(int)
31043 +#define ICMP_SYSRQ_SIZE        57
31044 +
31045 +/*
31046 + * We got a ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
31047 + * pattern and if it matches send the next byte as a trigger to sysrq.
31048 + */
31049 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
31050 +{
31051 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
31052 +       char *p = skb->data;
31053 +
31054 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
31055 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
31056 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
31057 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
31058 +}
31059 +
31060 +/*
31061   *     Handle ICMP_ECHO ("ping") requests.
31062   *
31063   *     RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
31064 @@ -917,6 +950,11 @@ static bool icmp_echo(struct sk_buff *skb)
31065                 icmp_param.data_len        = skb->len;
31066                 icmp_param.head_len        = sizeof(struct icmphdr);
31067                 icmp_reply(&icmp_param, skb);
31068 +
31069 +               if (skb->len == ICMP_SYSRQ_SIZE &&
31070 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
31071 +                       icmp_check_sysrq(net, skb);
31072 +               }
31073         }
31074         /* should there be an ICMP stat for ignored echos? */
31075         return true;
31076 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
31077 index a0bd7a55193e..1866f910263f 100644
31078 --- a/net/ipv4/sysctl_net_ipv4.c
31079 +++ b/net/ipv4/sysctl_net_ipv4.c
31080 @@ -818,6 +818,13 @@ static struct ctl_table ipv4_net_table[] = {
31081                 .proc_handler   = proc_dointvec
31082         },
31083         {
31084 +               .procname       = "icmp_echo_sysrq",
31085 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
31086 +               .maxlen         = sizeof(int),
31087 +               .mode           = 0644,
31088 +               .proc_handler   = proc_dointvec
31089 +       },
31090 +       {
31091                 .procname       = "icmp_ignore_bogus_error_responses",
31092                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
31093                 .maxlen         = sizeof(int),
31094 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
31095 index b5853cac3269..de922d86ba2c 100644
31096 --- a/net/ipv4/tcp_ipv4.c
31097 +++ b/net/ipv4/tcp_ipv4.c
31098 @@ -62,6 +62,7 @@
31099  #include <linux/init.h>
31100  #include <linux/times.h>
31101  #include <linux/slab.h>
31102 +#include <linux/locallock.h>
31103  
31104  #include <net/net_namespace.h>
31105  #include <net/icmp.h>
31106 @@ -566,6 +567,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
31107  }
31108  EXPORT_SYMBOL(tcp_v4_send_check);
31109  
31110 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
31111  /*
31112   *     This routine will send an RST to the other tcp.
31113   *
31114 @@ -687,10 +689,13 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
31115                 arg.bound_dev_if = sk->sk_bound_dev_if;
31116  
31117         arg.tos = ip_hdr(skb)->tos;
31118 +
31119 +       local_lock(tcp_sk_lock);
31120         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
31121                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
31122                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
31123                               &arg, arg.iov[0].iov_len);
31124 +       local_unlock(tcp_sk_lock);
31125  
31126         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
31127         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
31128 @@ -772,10 +777,12 @@ static void tcp_v4_send_ack(struct net *net,
31129         if (oif)
31130                 arg.bound_dev_if = oif;
31131         arg.tos = tos;
31132 +       local_lock(tcp_sk_lock);
31133         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
31134                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
31135                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
31136                               &arg, arg.iov[0].iov_len);
31137 +       local_unlock(tcp_sk_lock);
31138  
31139         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
31140  }
31141 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
31142 index a3bb8f7f5fc5..3be977394a80 100644
31143 --- a/net/mac80211/rx.c
31144 +++ b/net/mac80211/rx.c
31145 @@ -3574,7 +3574,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb,
31146         struct ieee80211_supported_band *sband;
31147         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
31148  
31149 -       WARN_ON_ONCE(softirq_count() == 0);
31150 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
31151  
31152         if (WARN_ON(status->band >= IEEE80211_NUM_BANDS))
31153                 goto drop;
31154 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
31155 index f39276d1c2d7..10880c89d62f 100644
31156 --- a/net/netfilter/core.c
31157 +++ b/net/netfilter/core.c
31158 @@ -22,11 +22,17 @@
31159  #include <linux/proc_fs.h>
31160  #include <linux/mutex.h>
31161  #include <linux/slab.h>
31162 +#include <linux/locallock.h>
31163  #include <net/net_namespace.h>
31164  #include <net/sock.h>
31165  
31166  #include "nf_internals.h"
31167  
31168 +#ifdef CONFIG_PREEMPT_RT_BASE
31169 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
31170 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
31171 +#endif
31172 +
31173  static DEFINE_MUTEX(afinfo_mutex);
31174  
31175  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
31176 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
31177 index a86f26d05bc2..77276e3ff2a2 100644
31178 --- a/net/packet/af_packet.c
31179 +++ b/net/packet/af_packet.c
31180 @@ -63,6 +63,7 @@
31181  #include <linux/if_packet.h>
31182  #include <linux/wireless.h>
31183  #include <linux/kernel.h>
31184 +#include <linux/delay.h>
31185  #include <linux/kmod.h>
31186  #include <linux/slab.h>
31187  #include <linux/vmalloc.h>
31188 @@ -695,7 +696,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
31189         if (BLOCK_NUM_PKTS(pbd)) {
31190                 while (atomic_read(&pkc->blk_fill_in_prog)) {
31191                         /* Waiting for skb_copy_bits to finish... */
31192 -                       cpu_relax();
31193 +                       cpu_chill();
31194                 }
31195         }
31196  
31197 @@ -957,7 +958,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
31198                 if (!(status & TP_STATUS_BLK_TMO)) {
31199                         while (atomic_read(&pkc->blk_fill_in_prog)) {
31200                                 /* Waiting for skb_copy_bits to finish... */
31201 -                               cpu_relax();
31202 +                               cpu_chill();
31203                         }
31204                 }
31205                 prb_close_block(pkc, pbd, po, status);
31206 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
31207 index a2340748ec86..19123a97b354 100644
31208 --- a/net/rds/ib_rdma.c
31209 +++ b/net/rds/ib_rdma.c
31210 @@ -34,6 +34,7 @@
31211  #include <linux/slab.h>
31212  #include <linux/rculist.h>
31213  #include <linux/llist.h>
31214 +#include <linux/delay.h>
31215  
31216  #include "rds.h"
31217  #include "ib.h"
31218 @@ -313,7 +314,7 @@ static inline void wait_clean_list_grace(void)
31219         for_each_online_cpu(cpu) {
31220                 flag = &per_cpu(clean_list_grace, cpu);
31221                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
31222 -                       cpu_relax();
31223 +                       cpu_chill();
31224         }
31225  }
31226  
31227 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
31228 index aa4725038f94..00b81cab28f3 100644
31229 --- a/net/sched/sch_generic.c
31230 +++ b/net/sched/sch_generic.c
31231 @@ -893,7 +893,7 @@ void dev_deactivate_many(struct list_head *head)
31232         /* Wait for outstanding qdisc_run calls. */
31233         list_for_each_entry(dev, head, close_list)
31234                 while (some_qdisc_is_busy(dev))
31235 -                       yield();
31236 +                       msleep(1);
31237  }
31238  
31239  void dev_deactivate(struct net_device *dev)
31240 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
31241 index a6cbb2104667..5b69bb580617 100644
31242 --- a/net/sunrpc/svc_xprt.c
31243 +++ b/net/sunrpc/svc_xprt.c
31244 @@ -340,7 +340,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
31245                 goto out;
31246         }
31247  
31248 -       cpu = get_cpu();
31249 +       cpu = get_cpu_light();
31250         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
31251  
31252         atomic_long_inc(&pool->sp_stats.packets);
31253 @@ -376,7 +376,7 @@ redo_search:
31254  
31255                 atomic_long_inc(&pool->sp_stats.threads_woken);
31256                 wake_up_process(rqstp->rq_task);
31257 -               put_cpu();
31258 +               put_cpu_light();
31259                 goto out;
31260         }
31261         rcu_read_unlock();
31262 @@ -397,7 +397,7 @@ redo_search:
31263                 goto redo_search;
31264         }
31265         rqstp = NULL;
31266 -       put_cpu();
31267 +       put_cpu_light();
31268  out:
31269         trace_svc_xprt_do_enqueue(xprt, rqstp);
31270  }
31271 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
31272 index 6fdc97ef6023..523e0420d7f0 100755
31273 --- a/scripts/mkcompile_h
31274 +++ b/scripts/mkcompile_h
31275 @@ -4,7 +4,8 @@ TARGET=$1
31276  ARCH=$2
31277  SMP=$3
31278  PREEMPT=$4
31279 -CC=$5
31280 +RT=$5
31281 +CC=$6
31282  
31283  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
31284  
31285 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
31286  CONFIG_FLAGS=""
31287  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
31288  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
31289 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
31290  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
31291  
31292  # Truncate to maximum length
31293 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
31294 index 4ba64fd49759..34e50186885d 100644
31295 --- a/sound/core/pcm_native.c
31296 +++ b/sound/core/pcm_native.c
31297 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
31298  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
31299  {
31300         if (!substream->pcm->nonatomic)
31301 -               local_irq_disable();
31302 +               local_irq_disable_nort();
31303         snd_pcm_stream_lock(substream);
31304  }
31305  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
31306 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
31307  {
31308         snd_pcm_stream_unlock(substream);
31309         if (!substream->pcm->nonatomic)
31310 -               local_irq_enable();
31311 +               local_irq_enable_nort();
31312  }
31313  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
31314  
31315 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
31316  {
31317         unsigned long flags = 0;
31318         if (!substream->pcm->nonatomic)
31319 -               local_irq_save(flags);
31320 +               local_irq_save_nort(flags);
31321         snd_pcm_stream_lock(substream);
31322         return flags;
31323  }
31324 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
31325  {
31326         snd_pcm_stream_unlock(substream);
31327         if (!substream->pcm->nonatomic)
31328 -               local_irq_restore(flags);
31329 +               local_irq_restore_nort(flags);
31330  }
31331  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
31332  
31333 diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
31334 index 4f70d12e392d..9378d0919ed8 100644
31335 --- a/virt/kvm/async_pf.c
31336 +++ b/virt/kvm/async_pf.c
31337 @@ -98,8 +98,8 @@ static void async_pf_execute(struct work_struct *work)
31338          * This memory barrier pairs with prepare_to_wait's set_current_state()
31339          */
31340         smp_mb();
31341 -       if (waitqueue_active(&vcpu->wq))
31342 -               wake_up_interruptible(&vcpu->wq);
31343 +       if (swait_active(&vcpu->wq))
31344 +               swake_up(&vcpu->wq);
31345  
31346         mmput(mm);
31347         kvm_put_kvm(vcpu->kvm);
31348 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
31349 index 336ed267c407..7748ca386e60 100644
31350 --- a/virt/kvm/kvm_main.c
31351 +++ b/virt/kvm/kvm_main.c
31352 @@ -228,8 +228,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
31353         vcpu->kvm = kvm;
31354         vcpu->vcpu_id = id;
31355         vcpu->pid = NULL;
31356 -       vcpu->halt_poll_ns = 0;
31357 -       init_waitqueue_head(&vcpu->wq);
31358 +       init_swait_queue_head(&vcpu->wq);
31359         kvm_async_pf_vcpu_init(vcpu);
31360  
31361         vcpu->pre_pcpu = -1;
31362 @@ -2005,7 +2004,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
31363  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
31364  {
31365         ktime_t start, cur;
31366 -       DEFINE_WAIT(wait);
31367 +       DECLARE_SWAITQUEUE(wait);
31368         bool waited = false;
31369         u64 block_ns;
31370  
31371 @@ -2030,7 +2029,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
31372         kvm_arch_vcpu_blocking(vcpu);
31373  
31374         for (;;) {
31375 -               prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
31376 +               prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
31377  
31378                 if (kvm_vcpu_check_block(vcpu) < 0)
31379                         break;
31380 @@ -2039,7 +2038,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
31381                 schedule();
31382         }
31383  
31384 -       finish_wait(&vcpu->wq, &wait);
31385 +       finish_swait(&vcpu->wq, &wait);
31386         cur = ktime_get();
31387  
31388         kvm_arch_vcpu_unblocking(vcpu);
31389 @@ -2071,11 +2070,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
31390  {
31391         int me;
31392         int cpu = vcpu->cpu;
31393 -       wait_queue_head_t *wqp;
31394 +       struct swait_queue_head *wqp;
31395  
31396         wqp = kvm_arch_vcpu_wq(vcpu);
31397 -       if (waitqueue_active(wqp)) {
31398 -               wake_up_interruptible(wqp);
31399 +       if (swait_active(wqp)) {
31400 +               swake_up(wqp);
31401                 ++vcpu->stat.halt_wakeup;
31402         }
31403  
31404 @@ -2176,7 +2175,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
31405                                 continue;
31406                         if (vcpu == me)
31407                                 continue;
31408 -                       if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
31409 +                       if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
31410                                 continue;
31411                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
31412                                 continue;