1 diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt
2 new file mode 100644
3 index 000000000000..cb61516483d3
4 --- /dev/null
5 +++ b/Documentation/hwlat_detector.txt
6 @@ -0,0 +1,64 @@
7 +Introduction:
8 +-------------
9 +
10 +The module hwlat_detector is a special purpose kernel module that is used to
11 +detect large system latencies induced by the behavior of certain underlying
12 +hardware or firmware, independent of Linux itself. The code was developed
13 +originally to detect SMIs (System Management Interrupts) on x86 systems;
14 +however, there is nothing x86-specific about this patchset. It was
15 +originally written for use by the "RT" patch since the Real Time
16 +kernel is highly latency sensitive.
17 +
18 +SMIs are usually not serviced by the Linux kernel, which typically does not
19 +even know that they are occurring. SMIs are instead set up by BIOS code
20 +and are serviced by BIOS code, usually for "critical" events such as
21 +management of thermal sensors and fans. Sometimes though, SMIs are used for
22 +other tasks and those tasks can spend an inordinate amount of time in the
23 +handler (sometimes measured in milliseconds). Obviously this is a problem if
24 +you are trying to keep event service latencies down in the microsecond range.
25 +
26 +The hardware latency detector works by hogging all of the CPUs for configurable
27 +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter
28 +for some period, then looking for gaps in the TSC data. Any gap indicates a
29 +time when the polling was interrupted; since the machine is stopped and
30 +interrupts are turned off, the only thing that could cause such a gap is an SMI.
31 +
32 +Note that the SMI detector should *NEVER* be used in a production environment.
33 +It is intended to be run manually to determine if the hardware platform has a
34 +problem with long system firmware service routines.
35 +
36 +Usage:
37 +------
38 +
39 +Loading the module hwlat_detector with the parameter "enabled=1" (or
40 +toggling on the "enable" entry in the "hwlat_detector" debugfs directory)
41 +is the only step required to start the hwlat_detector. It is possible to
42 +redefine the threshold in microseconds (us) above which latency spikes
43 +will be taken into account (parameter "threshold=").
44 +
45 +Example:
46 +
47 +       # modprobe hwlat_detector enabled=1 threshold=100
48 +
49 +After the module is loaded, it creates a directory named "hwlat_detector" under
50 +the debugfs mountpoint, referred to as "/debug/hwlat_detector" in this text.
51 +It is necessary to have debugfs mounted, typically at /sys/kernel/debug.
52 +
53 +The /debug/hwlat_detector interface contains the following files:
54 +
55 +count                  - number of latency spikes observed since last reset
56 +enable                 - a global enable/disable toggle (0/1), resets count
57 +max                    - maximum hardware latency actually observed (usecs)
58 +sample                 - a pipe from which to read current raw sample data
59 +                         in the format <timestamp> <latency observed usecs>
60 +                         (can be opened O_NONBLOCK for a single sample)
61 +threshold              - minimum latency value to be considered (usecs)
62 +width                  - time period to sample with CPUs held (usecs)
63 +                         must be less than the total window size (enforced)
64 +window                 - total period of sampling, width being inside (usecs)
65 +
66 +By default we will set width to 500,000 and window to 1,000,000, meaning that
67 +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we
68 +observe any latencies that exceed the threshold (initially 100 usecs),
69 +then we write to a global sample ring buffer of 8K samples, which is
70 +consumed by reading from the "sample" (pipe) debugfs file interface.
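A minimal usage sketch of the interface described above, assuming debugfs is
mounted at /debug as in this text (the "sample" output format follows the
description of the sample file; actual values will differ):

        # modprobe hwlat_detector enabled=1 threshold=100
        # cat /debug/hwlat_detector/count    # spikes seen since last reset
        # cat /debug/hwlat_detector/max      # worst latency observed (usecs)
        # cat /debug/hwlat_detector/sample   # raw samples: <timestamp> <latency usecs>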
71 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
72 index c360f80c3473..5489dea355a2 100644
73 --- a/Documentation/kernel-parameters.txt
74 +++ b/Documentation/kernel-parameters.txt
75 @@ -1636,6 +1636,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
76         ip=             [IP_PNP]
77                         See Documentation/filesystems/nfs/nfsroot.txt.
78  
79 +       irqaffinity=    [SMP] Set the default irq affinity mask
80 +                       Format:
81 +                       <cpu number>,...,<cpu number>
82 +                       or
83 +                       <cpu number>-<cpu number>
84 +                       (must be a positive range in ascending order)
85 +                       or a mixture
86 +                       <cpu number>,...,<cpu number>-<cpu number>
87 +
88         irqfixup        [HW]
89                         When an interrupt is not handled search all handlers
90                         for it. Intended to get systems with badly broken
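A hypothetical boot command-line fragment using the formats above (the CPU
numbers are chosen purely for illustration):

        irqaffinity=0,1,4-7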
91 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
92 index 13f5619b2203..f64d075ba647 100644
93 --- a/Documentation/sysrq.txt
94 +++ b/Documentation/sysrq.txt
95 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
96  On other - If you know of the key combos for other architectures, please
97             let me know so I can add them to this section.
98  
99 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
100 -
101 +On all -  write a character to /proc/sysrq-trigger, e.g.:
102                 echo t > /proc/sysrq-trigger
103  
104 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
105 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
106 +        Send an ICMP echo request with this pattern plus the particular
107 +        SysRq command key. Example:
108 +               # ping -c1 -s57 -p0102030468
109 +        will trigger the SysRq-H (help) command.
110 +
111 +
112  *  What are the 'command' keys?
113  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
114  'b'     - Will immediately reboot the system without syncing or unmounting
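As a sketch extending the ICMP echo SysRq example above (assuming the same
cookie 0x01020304): other command keys follow the same scheme of appending
the key's ASCII code to the cookie, e.g. 0x74 for 't' (show tasks):

        # ping -c1 -s57 -p0102030474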
115 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
116 new file mode 100644
117 index 000000000000..6f2aeabf7faa
118 --- /dev/null
119 +++ b/Documentation/trace/histograms.txt
120 @@ -0,0 +1,186 @@
121 +               Using the Linux Kernel Latency Histograms
122 +
123 +
124 +This document gives a short explanation of how to enable, configure, and use
125 +latency histograms. Latency histograms are primarily relevant in the
126 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
127 +and are used in the quality management of the Linux real-time
128 +capabilities.
129 +
130 +
131 +* Purpose of latency histograms
132 +
133 +A latency histogram continuously accumulates the frequencies of latency
134 +data. There are two types of histograms:
135 +- potential sources of latencies
136 +- effective latencies
137 +
138 +
139 +* Potential sources of latencies
140 +
141 +Potential sources of latencies are code segments where interrupts,
142 +preemption or both are disabled (aka critical sections). To create
143 +histograms of potential sources of latency, the kernel stores the time
144 +stamp at the start of a critical section, determines the time elapsed
145 +when the end of the section is reached, and increments the frequency
146 +counter of that latency value - irrespective of whether any concurrently
147 +running process is affected by latency or not.
148 +- Configuration items (in the Kernel hacking/Tracers submenu)
149 +  CONFIG_INTERRUPT_OFF_LATENCY
150 +  CONFIG_PREEMPT_OFF_LATENCY
151 +
152 +
153 +* Effective latencies
154 +
155 +Effective latencies are those actually occurring during wakeup of a process. To
156 +determine effective latencies, the kernel stores the time stamp when a
157 +process is scheduled to be woken up, and determines the duration of the
158 +wakeup time shortly before control is passed over to this process. Note
159 +that the apparent latency in user space may be somewhat longer, since the
160 +process may be interrupted after control is passed over to it but before
161 +the execution in user space takes place. Simply measuring the interval
162 +between enqueuing and wakeup may also not be appropriate in cases when a
163 +process is scheduled as a result of a timer expiration. The timer may have
164 +missed its deadline, e.g. due to disabled interrupts, but this latency
165 +would not be registered. Therefore, the offsets of missed timers are
166 +recorded in a separate histogram. If both wakeup latency and missed timer
167 +offsets are configured and enabled, a third histogram may be enabled that
168 +records the overall latency as a sum of the timer latency, if any, and the
169 +wakeup latency. This histogram is called "timerandwakeup".
170 +- Configuration items (in the Kernel hacking/Tracers submenu)
171 +  CONFIG_WAKEUP_LATENCY
172 +  CONFIG_MISSED_TIMER_OFFSETS
173 +
174 +
175 +* Usage
176 +
177 +The interface to the administration of the latency histograms is located
178 +in the debugfs file system. To mount it, either enter
179 +
180 +mount -t sysfs nodev /sys
181 +mount -t debugfs nodev /sys/kernel/debug
182 +
183 +from shell command line level, or add
184 +
185 +nodev  /sys                    sysfs   defaults        0 0
186 +nodev  /sys/kernel/debug       debugfs defaults        0 0
187 +
188 +to the file /etc/fstab. All latency histogram related files are then
189 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
190 +particular histogram type is enabled by writing non-zero to the related
191 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
192 +Select "preemptirqsoff" for the histograms of potential sources of
193 +latencies and "wakeup" for histograms of effective latencies etc. The
194 +histogram data - one per CPU - are available in the files
195 +
196 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
197 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
198 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
199 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
200 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
201 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
202 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
203 +
204 +The histograms are reset by writing non-zero to the file "reset" in a
205 +particular latency directory. To reset all latency data, use
206 +
207 +#!/bin/sh
208 +
209 +TRACINGDIR=/sys/kernel/debug/tracing
210 +HISTDIR=$TRACINGDIR/latency_hist
211 +
212 +if test -d $HISTDIR
213 +then
214 +  cd $HISTDIR
215 +  for i in `find . | grep /reset$`
216 +  do
217 +    echo 1 >$i
218 +  done
219 +fi
220 +
221 +
222 +* Data format
223 +
224 +Latency data are stored with a resolution of one microsecond. The
225 +maximum latency is 10,240 microseconds. The data are only valid, if the
226 +overflow register is empty. Every output line contains the latency in
227 +microseconds in the first row and the number of samples in the second
228 +row. To display only lines with a positive latency count, use, for
229 +example,
230 +
231 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
232 +
233 +#Minimum latency: 0 microseconds.
234 +#Average latency: 0 microseconds.
235 +#Maximum latency: 25 microseconds.
236 +#Total samples: 3104770694
237 +#There are 0 samples greater or equal than 10240 microseconds
238 +#usecs          samples
239 +    0        2984486876
240 +    1          49843506
241 +    2          58219047
242 +    3           5348126
243 +    4           2187960
244 +    5           3388262
245 +    6            959289
246 +    7            208294
247 +    8             40420
248 +    9              4485
249 +   10             14918
250 +   11             18340
251 +   12             25052
252 +   13             19455
253 +   14              5602
254 +   15               969
255 +   16                47
256 +   17                18
257 +   18                14
258 +   19                 1
259 +   20                 3
260 +   21                 2
261 +   22                 5
262 +   23                 2
263 +   25                 1
264 +
265 +
266 +* Wakeup latency of a selected process
267 +
268 +To only collect wakeup latency data of a particular process, write the
269 +PID of the requested process to
270 +
271 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
272 +
273 +PIDs are not considered if this variable is set to 0.
274 +
275 +
276 +* Details of the process with the highest wakeup latency so far
277 +
278 +Selected data of the process that suffered from the highest wakeup
279 +latency that occurred on a particular CPU are available in the file
280 +
281 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
282 +
283 +In addition, other relevant system data at the time when the
284 +latency occurred are given.
285 +
286 +The format of the data is (all in one line):
287 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
288 +<- <PID> <Priority> <Command> <Timestamp>
289 +
290 +The value of <Timeroffset> is only relevant in the combined timer
291 +and wakeup latency recording. In the wakeup recording, it is
292 +always 0; in the missed_timer_offsets recording, it is the same
293 +as <Latency>.
294 +
295 +When retrospectively searching for the origin of a latency while
296 +tracing was not enabled, it may be helpful to know the name and
297 +some basic data of the task that (finally) switched to the
298 +late real-time task. In addition to the victim's data, the data
299 +of the possible culprit are therefore also displayed after the
300 +"<-" symbol.
301 +
302 +Finally, the timestamp of the time when the latency occurred
303 +in <seconds>.<microseconds> after the most recent system boot
304 +is provided.
305 +
306 +These data are also reset when the wakeup histogram is reset.
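A short shell sketch tying the steps above together (paths as documented
above; the PID 1234 and the choice of CPU0 are illustrative only):

        # enable wakeup latency histograms and restrict them to one process
        echo 1    > /sys/kernel/debug/tracing/latency_hist/enable/wakeup
        echo 1234 > /sys/kernel/debug/tracing/latency_hist/wakeup/pid

        # inspect the per-CPU histogram and the worst case recorded so far
        grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/wakeup/CPU0
        cat /sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPU0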
307 diff --git a/Makefile b/Makefile
308 index 98239d56924c..5ed3edefebde 100644
309 --- a/Makefile
310 +++ b/Makefile
311 @@ -394,7 +394,7 @@ KBUILD_CPPFLAGS := -D__KERNEL__
312  KBUILD_CFLAGS   := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
313                    -fno-strict-aliasing -fno-common \
314                    -Werror-implicit-function-declaration \
315 -                  -Wno-format-security \
316 +                  -Wno-format-security -fno-PIE \
317                    -std=gnu89
318  
319  KBUILD_AFLAGS_KERNEL :=
320 diff --git a/arch/Kconfig b/arch/Kconfig
321 index 4e949e58b192..3b26d76933fb 100644
322 --- a/arch/Kconfig
323 +++ b/arch/Kconfig
324 @@ -9,6 +9,7 @@ config OPROFILE
325         tristate "OProfile system profiling"
326         depends on PROFILING
327         depends on HAVE_OPROFILE
328 +       depends on !PREEMPT_RT_FULL
329         select RING_BUFFER
330         select RING_BUFFER_ALLOW_SWAP
331         help
332 @@ -52,6 +53,7 @@ config KPROBES
333  config JUMP_LABEL
334         bool "Optimize very unlikely/likely branches"
335         depends on HAVE_ARCH_JUMP_LABEL
336 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
337         help
338           This option enables a transparent branch optimization that
339          makes certain almost-always-true or almost-always-false branch
340 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
341 index 34e1569a11ee..79c4603e9453 100644
342 --- a/arch/arm/Kconfig
343 +++ b/arch/arm/Kconfig
344 @@ -33,7 +33,7 @@ config ARM
345         select HARDIRQS_SW_RESEND
346         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
347         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
348 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32
349 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !PREEMPT_RT_BASE
350         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32
351         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
352         select HAVE_ARCH_TRACEHOOK
353 @@ -68,6 +68,7 @@ config ARM
354         select HAVE_PERF_EVENTS
355         select HAVE_PERF_REGS
356         select HAVE_PERF_USER_STACK_DUMP
357 +       select HAVE_PREEMPT_LAZY
358         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
359         select HAVE_REGS_AND_STACK_ACCESS_API
360         select HAVE_SYSCALL_TRACEPOINTS
361 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
362 index 12ebfcc1d539..c962084605bc 100644
363 --- a/arch/arm/include/asm/switch_to.h
364 +++ b/arch/arm/include/asm/switch_to.h
365 @@ -3,6 +3,13 @@
366  
367  #include <linux/thread_info.h>
368  
369 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
370 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
371 +#else
372 +static inline void
373 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
374 +#endif
375 +
376  /*
377   * For v7 SMP cores running a preemptible kernel we may be pre-empted
378   * during a TLB maintenance operation, so execute an inner-shareable dsb
379 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
380  #define switch_to(prev,next,last)                                      \
381  do {                                                                   \
382         __complete_pending_tlbi();                                      \
383 +       switch_kmaps(prev, next);                                       \
384         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
385  } while (0)
386  
387 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
388 index 776757d1604a..1f36a4eccc72 100644
389 --- a/arch/arm/include/asm/thread_info.h
390 +++ b/arch/arm/include/asm/thread_info.h
391 @@ -49,6 +49,7 @@ struct cpu_context_save {
392  struct thread_info {
393         unsigned long           flags;          /* low level flags */
394         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
395 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
396         mm_segment_t            addr_limit;     /* address limit */
397         struct task_struct      *task;          /* main task structure */
398         __u32                   cpu;            /* cpu */
399 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
400  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
401  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
402  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
403 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
404 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
405 +#define TIF_NEED_RESCHED_LAZY  7
406  
407  #define TIF_NOHZ               12      /* in adaptive nohz mode */
408  #define TIF_USING_IWMMXT       17
409 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
410  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
411  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
412  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
413 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
414  #define _TIF_UPROBE            (1 << TIF_UPROBE)
415  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
416  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
417 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
418   * Change these and you break ASM code in entry-common.S
419   */
420  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
421 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
422 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
423 +                                _TIF_NEED_RESCHED_LAZY)
424  
425  #endif /* __KERNEL__ */
426  #endif /* __ASM_ARM_THREAD_INFO_H */
427 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
428 index 871b8267d211..4dbe70de7318 100644
429 --- a/arch/arm/kernel/asm-offsets.c
430 +++ b/arch/arm/kernel/asm-offsets.c
431 @@ -65,6 +65,7 @@ int main(void)
432    BLANK();
433    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
434    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
435 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
436    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
437    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
438    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
439 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
440 index 3ce377f7251f..d044cea59f54 100644
441 --- a/arch/arm/kernel/entry-armv.S
442 +++ b/arch/arm/kernel/entry-armv.S
443 @@ -215,11 +215,18 @@ __irq_svc:
444  #ifdef CONFIG_PREEMPT
445         get_thread_info tsk
446         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
447 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
448         teq     r8, #0                          @ if preempt count != 0
449 +       bne     1f                              @ return from exception
450 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
451 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
452 +       blne    svc_preempt                     @ preempt!
453 +
454 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
455 +       teq     r8, #0                          @ if preempt lazy count != 0
456         movne   r0, #0                          @ force flags to 0
457 -       tst     r0, #_TIF_NEED_RESCHED
458 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
459         blne    svc_preempt
460 +1:
461  #endif
462  
463         svc_exit r5, irq = 1                    @ return from exception
464 @@ -234,8 +241,14 @@ svc_preempt:
465  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
466         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
467         tst     r0, #_TIF_NEED_RESCHED
468 +       bne     1b
469 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
470         reteq   r8                              @ go again
471 -       b       1b
472 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
473 +       teq     r0, #0                          @ if preempt lazy count != 0
474 +       beq     1b
475 +       ret     r8                              @ go again
476 +
477  #endif
478  
479  __und_fault:
480 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
481 index 30a7228eaceb..c3bd6cbfce4b 100644
482 --- a/arch/arm/kernel/entry-common.S
483 +++ b/arch/arm/kernel/entry-common.S
484 @@ -36,7 +36,9 @@ ret_fast_syscall:
485   UNWIND(.cantunwind    )
486         disable_irq_notrace                     @ disable interrupts
487         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
488 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
489 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
490 +       bne     fast_work_pending
491 +       tst     r1, #_TIF_SECCOMP
492         bne     fast_work_pending
493  
494         /* perform architecture specific actions before user return */
495 @@ -62,8 +64,11 @@ ret_fast_syscall:
496         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
497         disable_irq_notrace                     @ disable interrupts
498         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
499 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
500 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
501 +       bne     do_slower_path
502 +       tst     r1, #_TIF_SECCOMP
503         beq     no_work_pending
504 +do_slower_path:
505   UNWIND(.fnend         )
506  ENDPROC(ret_fast_syscall)
507  
508 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
509 index 4adfb46e3ee9..15f1d94b47c5 100644
510 --- a/arch/arm/kernel/process.c
511 +++ b/arch/arm/kernel/process.c
512 @@ -319,6 +319,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
513  }
514  
515  #ifdef CONFIG_MMU
516 +/*
517 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
518 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
519 + * fail.
520 + */
521 +static int __init vectors_user_mapping_init_page(void)
522 +{
523 +       struct page *page;
524 +       unsigned long addr = 0xffff0000;
525 +       pgd_t *pgd;
526 +       pud_t *pud;
527 +       pmd_t *pmd;
528 +
529 +       pgd = pgd_offset_k(addr);
530 +       pud = pud_offset(pgd, addr);
531 +       pmd = pmd_offset(pud, addr);
532 +       page = pmd_page(*(pmd));
533 +
534 +       pgtable_page_ctor(page);
535 +
536 +       return 0;
537 +}
538 +late_initcall(vectors_user_mapping_init_page);
539 +
540  #ifdef CONFIG_KUSER_HELPERS
541  /*
542   * The vectors page is always readable from user space for the
543 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
544 index 7b8f2141427b..96541e00b74a 100644
545 --- a/arch/arm/kernel/signal.c
546 +++ b/arch/arm/kernel/signal.c
547 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
548          */
549         trace_hardirqs_off();
550         do {
551 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
552 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
553 +                                          _TIF_NEED_RESCHED_LAZY))) {
554                         schedule();
555                 } else {
556                         if (unlikely(!user_mode(regs)))
557 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
558 index b26361355dae..e5754e3b03c4 100644
559 --- a/arch/arm/kernel/smp.c
560 +++ b/arch/arm/kernel/smp.c
561 @@ -230,8 +230,6 @@ int __cpu_disable(void)
562         flush_cache_louis();
563         local_flush_tlb_all();
564  
565 -       clear_tasks_mm_cpumask(cpu);
566 -
567         return 0;
568  }
569  
570 @@ -247,6 +245,9 @@ void __cpu_die(unsigned int cpu)
571                 pr_err("CPU%u: cpu didn't die\n", cpu);
572                 return;
573         }
574 +
575 +       clear_tasks_mm_cpumask(cpu);
576 +
577         pr_notice("CPU%u: shutdown\n", cpu);
578  
579         /*
580 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
581 index 0bee233fef9a..314cfb232a63 100644
582 --- a/arch/arm/kernel/unwind.c
583 +++ b/arch/arm/kernel/unwind.c
584 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
585  static const struct unwind_idx *__origin_unwind_idx;
586  extern const struct unwind_idx __stop_unwind_idx[];
587  
588 -static DEFINE_SPINLOCK(unwind_lock);
589 +static DEFINE_RAW_SPINLOCK(unwind_lock);
590  static LIST_HEAD(unwind_tables);
591  
592  /* Convert a prel31 symbol to an absolute address */
593 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
594                 /* module unwind tables */
595                 struct unwind_table *table;
596  
597 -               spin_lock_irqsave(&unwind_lock, flags);
598 +               raw_spin_lock_irqsave(&unwind_lock, flags);
599                 list_for_each_entry(table, &unwind_tables, list) {
600                         if (addr >= table->begin_addr &&
601                             addr < table->end_addr) {
602 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
603                                 break;
604                         }
605                 }
606 -               spin_unlock_irqrestore(&unwind_lock, flags);
607 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
608         }
609  
610         pr_debug("%s: idx = %p\n", __func__, idx);
611 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
612         tab->begin_addr = text_addr;
613         tab->end_addr = text_addr + text_size;
614  
615 -       spin_lock_irqsave(&unwind_lock, flags);
616 +       raw_spin_lock_irqsave(&unwind_lock, flags);
617         list_add_tail(&tab->list, &unwind_tables);
618 -       spin_unlock_irqrestore(&unwind_lock, flags);
619 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
620  
621         return tab;
622  }
623 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
624         if (!tab)
625                 return;
626  
627 -       spin_lock_irqsave(&unwind_lock, flags);
628 +       raw_spin_lock_irqsave(&unwind_lock, flags);
629         list_del(&tab->list);
630 -       spin_unlock_irqrestore(&unwind_lock, flags);
631 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
632  
633         kfree(tab);
634  }
635 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
636 index d7bef2144760..36a3e51492f7 100644
637 --- a/arch/arm/kvm/arm.c
638 +++ b/arch/arm/kvm/arm.c
639 @@ -496,18 +496,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
640         struct kvm_vcpu *vcpu;
641  
642         kvm_for_each_vcpu(i, vcpu, kvm) {
643 -               wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
644 +               struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
645  
646                 vcpu->arch.pause = false;
647 -               wake_up_interruptible(wq);
648 +               swake_up(wq);
649         }
650  }
651  
652  static void vcpu_sleep(struct kvm_vcpu *vcpu)
653  {
654 -       wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
655 +       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
656  
657 -       wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
658 +       swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
659                                        (!vcpu->arch.pause)));
660  }
661  
662 @@ -566,7 +566,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
663                  * involves poking the GIC, which must be done in a
664                  * non-preemptible context.
665                  */
666 -               preempt_disable();
667 +               migrate_disable();
668                 kvm_timer_flush_hwstate(vcpu);
669                 kvm_vgic_flush_hwstate(vcpu);
670  
671 @@ -585,7 +585,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
672                         local_irq_enable();
673                         kvm_timer_sync_hwstate(vcpu);
674                         kvm_vgic_sync_hwstate(vcpu);
675 -                       preempt_enable();
676 +                       migrate_enable();
677                         continue;
678                 }
679  
680 @@ -639,7 +639,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
681  
682                 kvm_vgic_sync_hwstate(vcpu);
683  
684 -               preempt_enable();
685 +               migrate_enable();
686  
687                 ret = handle_exit(vcpu, run, ret);
688         }
689 diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
690 index a9b3b905e661..c2b131527a64 100644
691 --- a/arch/arm/kvm/psci.c
692 +++ b/arch/arm/kvm/psci.c
693 @@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
694  {
695         struct kvm *kvm = source_vcpu->kvm;
696         struct kvm_vcpu *vcpu = NULL;
697 -       wait_queue_head_t *wq;
698 +       struct swait_queue_head *wq;
699         unsigned long cpu_id;
700         unsigned long context_id;
701         phys_addr_t target_pc;
702 @@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
703         smp_mb();               /* Make sure the above is visible */
704  
705         wq = kvm_arch_vcpu_wq(vcpu);
706 -       wake_up_interruptible(wq);
707 +       swake_up(wq);
708  
709         return PSCI_RET_SUCCESS;
710  }
711 diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig
712 index 28656c2b54a0..3f501305ca26 100644
713 --- a/arch/arm/mach-at91/Kconfig
714 +++ b/arch/arm/mach-at91/Kconfig
715 @@ -99,6 +99,7 @@ config HAVE_AT91_USB_CLK
716  config COMMON_CLK_AT91
717         bool
718         select COMMON_CLK
719 +       select MFD_SYSCON
720  
721  config HAVE_AT91_SMD
722         bool
723 diff --git a/arch/arm/mach-at91/at91rm9200.c b/arch/arm/mach-at91/at91rm9200.c
724 index c1a7c6cc00e1..63b4fa25b48a 100644
725 --- a/arch/arm/mach-at91/at91rm9200.c
726 +++ b/arch/arm/mach-at91/at91rm9200.c
727 @@ -12,7 +12,6 @@
728  #include <linux/of_platform.h>
729  
730  #include <asm/mach/arch.h>
731 -#include <asm/system_misc.h>
732  
733  #include "generic.h"
734  #include "soc.h"
735 @@ -33,7 +32,6 @@ static void __init at91rm9200_dt_device_init(void)
736  
737         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
738  
739 -       arm_pm_idle = at91rm9200_idle;
740         at91rm9200_pm_init();
741  }
742  
743 diff --git a/arch/arm/mach-at91/at91sam9.c b/arch/arm/mach-at91/at91sam9.c
744 index 7eb64f763034..cada2a6412b3 100644
745 --- a/arch/arm/mach-at91/at91sam9.c
746 +++ b/arch/arm/mach-at91/at91sam9.c
747 @@ -62,8 +62,6 @@ static void __init at91sam9_common_init(void)
748                 soc_dev = soc_device_to_device(soc);
749  
750         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
751 -
752 -       arm_pm_idle = at91sam9_idle;
753  }
754  
755  static void __init at91sam9_dt_device_init(void)
756 diff --git a/arch/arm/mach-at91/generic.h b/arch/arm/mach-at91/generic.h
757 index b0fa7dc7286d..28ca57a2060f 100644
758 --- a/arch/arm/mach-at91/generic.h
759 +++ b/arch/arm/mach-at91/generic.h
760 @@ -11,27 +11,18 @@
761  #ifndef _AT91_GENERIC_H
762  #define _AT91_GENERIC_H
763  
764 -#include <linux/of.h>
765 -#include <linux/reboot.h>
766 -
767 - /* Map io */
768 -extern void __init at91_map_io(void);
769 -extern void __init at91_alt_map_io(void);
770 -
771 -/* idle */
772 -extern void at91rm9200_idle(void);
773 -extern void at91sam9_idle(void);
774 -
775  #ifdef CONFIG_PM
776  extern void __init at91rm9200_pm_init(void);
777  extern void __init at91sam9260_pm_init(void);
778  extern void __init at91sam9g45_pm_init(void);
779  extern void __init at91sam9x5_pm_init(void);
780 +extern void __init sama5_pm_init(void);
781  #else
782  static inline void __init at91rm9200_pm_init(void) { }
783  static inline void __init at91sam9260_pm_init(void) { }
784  static inline void __init at91sam9g45_pm_init(void) { }
785  static inline void __init at91sam9x5_pm_init(void) { }
786 +static inline void __init sama5_pm_init(void) { }
787  #endif
788  
789  #endif /* _AT91_GENERIC_H */
790 diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
791 index 23726fb31741..f06270198bf1 100644
792 --- a/arch/arm/mach-at91/pm.c
793 +++ b/arch/arm/mach-at91/pm.c
794 @@ -31,10 +31,13 @@
795  #include <asm/mach/irq.h>
796  #include <asm/fncpy.h>
797  #include <asm/cacheflush.h>
798 +#include <asm/system_misc.h>
799  
800  #include "generic.h"
801  #include "pm.h"
802  
803 +static void __iomem *pmc;
804 +
805  /*
806   * FIXME: this is needed to communicate between the pinctrl driver and
807   * the PM implementation in the machine. Possibly part of the PM
808 @@ -87,7 +90,7 @@ static int at91_pm_verify_clocks(void)
809         unsigned long scsr;
810         int i;
811  
812 -       scsr = at91_pmc_read(AT91_PMC_SCSR);
813 +       scsr = readl(pmc + AT91_PMC_SCSR);
814  
815         /* USB must not be using PLLB */
816         if ((scsr & at91_pm_data.uhp_udp_mask) != 0) {
817 @@ -101,8 +104,7 @@ static int at91_pm_verify_clocks(void)
818  
819                 if ((scsr & (AT91_PMC_PCK0 << i)) == 0)
820                         continue;
821 -
822 -               css = at91_pmc_read(AT91_PMC_PCKR(i)) & AT91_PMC_CSS;
823 +               css = readl(pmc + AT91_PMC_PCKR(i)) & AT91_PMC_CSS;
824                 if (css != AT91_PMC_CSS_SLOW) {
825                         pr_err("AT91: PM - Suspend-to-RAM with PCK%d src %d\n", i, css);
826                         return 0;
827 @@ -145,8 +147,8 @@ static void at91_pm_suspend(suspend_state_t state)
828         flush_cache_all();
829         outer_disable();
830  
831 -       at91_suspend_sram_fn(at91_pmc_base, at91_ramc_base[0],
832 -                               at91_ramc_base[1], pm_data);
833 +       at91_suspend_sram_fn(pmc, at91_ramc_base[0],
834 +                            at91_ramc_base[1], pm_data);
835  
836         outer_resume();
837  }
838 @@ -353,6 +355,21 @@ static __init void at91_dt_ramc(void)
839         at91_pm_set_standby(standby);
840  }
841  
842 +void at91rm9200_idle(void)
843 +{
844 +       /*
845 +        * Disable the processor clock.  The processor will be automatically
846 +        * re-enabled by an interrupt or by a reset.
847 +        */
848 +       writel(AT91_PMC_PCK, pmc + AT91_PMC_SCDR);
849 +}
850 +
851 +void at91sam9_idle(void)
852 +{
853 +       writel(AT91_PMC_PCK, pmc + AT91_PMC_SCDR);
854 +       cpu_do_idle();
855 +}
856 +
857  static void __init at91_pm_sram_init(void)
858  {
859         struct gen_pool *sram_pool;
860 @@ -399,13 +416,36 @@ static void __init at91_pm_sram_init(void)
861                         &at91_pm_suspend_in_sram, at91_pm_suspend_in_sram_sz);
862  }
863  
864 -static void __init at91_pm_init(void)
865 +static const struct of_device_id atmel_pmc_ids[] __initconst = {
866 +       { .compatible = "atmel,at91rm9200-pmc"  },
867 +       { .compatible = "atmel,at91sam9260-pmc" },
868 +       { .compatible = "atmel,at91sam9g45-pmc" },
869 +       { .compatible = "atmel,at91sam9n12-pmc" },
870 +       { .compatible = "atmel,at91sam9x5-pmc" },
871 +       { .compatible = "atmel,sama5d3-pmc" },
872 +       { .compatible = "atmel,sama5d2-pmc" },
873 +       { /* sentinel */ },
874 +};
875 +
876 +static void __init at91_pm_init(void (*pm_idle)(void))
877  {
878 -       at91_pm_sram_init();
879 +       struct device_node *pmc_np;
880  
881         if (at91_cpuidle_device.dev.platform_data)
882                 platform_device_register(&at91_cpuidle_device);
883  
884 +       pmc_np = of_find_matching_node(NULL, atmel_pmc_ids);
885 +       pmc = of_iomap(pmc_np, 0);
886 +       if (!pmc) {
887 +               pr_err("AT91: PM not supported, PMC not found\n");
888 +               return;
889 +       }
890 +
891 +       if (pm_idle)
892 +               arm_pm_idle = pm_idle;
893 +
894 +       at91_pm_sram_init();
895 +
896         if (at91_suspend_sram_fn)
897                 suspend_set_ops(&at91_pm_ops);
898         else
899 @@ -424,7 +464,7 @@ void __init at91rm9200_pm_init(void)
900         at91_pm_data.uhp_udp_mask = AT91RM9200_PMC_UHP | AT91RM9200_PMC_UDP;
901         at91_pm_data.memctrl = AT91_MEMCTRL_MC;
902  
903 -       at91_pm_init();
904 +       at91_pm_init(at91rm9200_idle);
905  }
906  
907  void __init at91sam9260_pm_init(void)
908 @@ -432,7 +472,7 @@ void __init at91sam9260_pm_init(void)
909         at91_dt_ramc();
910         at91_pm_data.memctrl = AT91_MEMCTRL_SDRAMC;
911         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
912 -       return at91_pm_init();
913 +       at91_pm_init(at91sam9_idle);
914  }
915  
916  void __init at91sam9g45_pm_init(void)
917 @@ -440,7 +480,7 @@ void __init at91sam9g45_pm_init(void)
918         at91_dt_ramc();
919         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP;
920         at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
921 -       return at91_pm_init();
922 +       at91_pm_init(at91sam9_idle);
923  }
924  
925  void __init at91sam9x5_pm_init(void)
926 @@ -448,5 +488,13 @@ void __init at91sam9x5_pm_init(void)
927         at91_dt_ramc();
928         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
929         at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
930 -       return at91_pm_init();
931 +       at91_pm_init(at91sam9_idle);
932 +}
933 +
934 +void __init sama5_pm_init(void)
935 +{
936 +       at91_dt_ramc();
937 +       at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
938 +       at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
939 +       at91_pm_init(NULL);
940  }
941 diff --git a/arch/arm/mach-at91/sama5.c b/arch/arm/mach-at91/sama5.c
942 index d9cf6799aec0..df8fdf1cf66d 100644
943 --- a/arch/arm/mach-at91/sama5.c
944 +++ b/arch/arm/mach-at91/sama5.c
945 @@ -51,7 +51,7 @@ static void __init sama5_dt_device_init(void)
946                 soc_dev = soc_device_to_device(soc);
947  
948         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
949 -       at91sam9x5_pm_init();
950 +       sama5_pm_init();
951  }
952  
953  static const char *const sama5_dt_board_compat[] __initconst = {
954 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
955 index 98a2c0cbb833..310dce500d3e 100644
956 --- a/arch/arm/mach-exynos/platsmp.c
957 +++ b/arch/arm/mach-exynos/platsmp.c
958 @@ -230,7 +230,7 @@ static void __iomem *scu_base_addr(void)
959         return (void __iomem *)(S5P_VA_SCU);
960  }
961  
962 -static DEFINE_SPINLOCK(boot_lock);
963 +static DEFINE_RAW_SPINLOCK(boot_lock);
964  
965  static void exynos_secondary_init(unsigned int cpu)
966  {
967 @@ -243,8 +243,8 @@ static void exynos_secondary_init(unsigned int cpu)
968         /*
969          * Synchronise with the boot thread.
970          */
971 -       spin_lock(&boot_lock);
972 -       spin_unlock(&boot_lock);
973 +       raw_spin_lock(&boot_lock);
974 +       raw_spin_unlock(&boot_lock);
975  }
976  
977  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
978 @@ -308,7 +308,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
979          * Set synchronisation state between this boot processor
980          * and the secondary one
981          */
982 -       spin_lock(&boot_lock);
983 +       raw_spin_lock(&boot_lock);
984  
985         /*
986          * The secondary processor is waiting to be released from
987 @@ -335,7 +335,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
988  
989                 if (timeout == 0) {
990                         printk(KERN_ERR "cpu1 power enable failed");
991 -                       spin_unlock(&boot_lock);
992 +                       raw_spin_unlock(&boot_lock);
993                         return -ETIMEDOUT;
994                 }
995         }
996 @@ -381,7 +381,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
997          * calibrations, then wait for it to finish
998          */
999  fail:
1000 -       spin_unlock(&boot_lock);
1001 +       raw_spin_unlock(&boot_lock);
1002  
1003         return pen_release != -1 ? ret : 0;
1004  }
1005 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
1006 index b5f8f5ffda79..9753a84df9c4 100644
1007 --- a/arch/arm/mach-hisi/platmcpm.c
1008 +++ b/arch/arm/mach-hisi/platmcpm.c
1009 @@ -61,7 +61,7 @@
1010  
1011  static void __iomem *sysctrl, *fabric;
1012  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
1013 -static DEFINE_SPINLOCK(boot_lock);
1014 +static DEFINE_RAW_SPINLOCK(boot_lock);
1015  static u32 fabric_phys_addr;
1016  /*
1017   * [0]: bootwrapper physical address
1018 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
1019         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
1020                 return -EINVAL;
1021  
1022 -       spin_lock_irq(&boot_lock);
1023 +       raw_spin_lock_irq(&boot_lock);
1024  
1025         if (hip04_cpu_table[cluster][cpu])
1026                 goto out;
1027 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
1028  
1029  out:
1030         hip04_cpu_table[cluster][cpu]++;
1031 -       spin_unlock_irq(&boot_lock);
1032 +       raw_spin_unlock_irq(&boot_lock);
1033  
1034         return 0;
1035  }
1036 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
1037         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
1038         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
1039  
1040 -       spin_lock(&boot_lock);
1041 +       raw_spin_lock(&boot_lock);
1042         hip04_cpu_table[cluster][cpu]--;
1043         if (hip04_cpu_table[cluster][cpu] == 1) {
1044                 /* A power_up request went ahead of us. */
1045 -               spin_unlock(&boot_lock);
1046 +               raw_spin_unlock(&boot_lock);
1047                 return;
1048         } else if (hip04_cpu_table[cluster][cpu] > 1) {
1049                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
1050 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
1051         }
1052  
1053         last_man = hip04_cluster_is_down(cluster);
1054 -       spin_unlock(&boot_lock);
1055 +       raw_spin_unlock(&boot_lock);
1056         if (last_man) {
1057                 /* Since it's Cortex A15, disable L2 prefetching. */
1058                 asm volatile(
1059 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1060                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
1061  
1062         count = TIMEOUT_MSEC / POLL_MSEC;
1063 -       spin_lock_irq(&boot_lock);
1064 +       raw_spin_lock_irq(&boot_lock);
1065         for (tries = 0; tries < count; tries++) {
1066                 if (hip04_cpu_table[cluster][cpu])
1067                         goto err;
1068 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1069                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
1070                 if (data & CORE_WFI_STATUS(cpu))
1071                         break;
1072 -               spin_unlock_irq(&boot_lock);
1073 +               raw_spin_unlock_irq(&boot_lock);
1074                 /* Wait for clean L2 when the whole cluster is down. */
1075                 msleep(POLL_MSEC);
1076 -               spin_lock_irq(&boot_lock);
1077 +               raw_spin_lock_irq(&boot_lock);
1078         }
1079         if (tries >= count)
1080                 goto err;
1081 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1082                 goto err;
1083         if (hip04_cluster_is_down(cluster))
1084                 hip04_set_snoop_filter(cluster, 0);
1085 -       spin_unlock_irq(&boot_lock);
1086 +       raw_spin_unlock_irq(&boot_lock);
1087         return 1;
1088  err:
1089 -       spin_unlock_irq(&boot_lock);
1090 +       raw_spin_unlock_irq(&boot_lock);
1091         return 0;
1092  }
1093  #endif
1094 diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig
1095 index 8ceda2844c4f..08bcf8fb76f2 100644
1096 --- a/arch/arm/mach-imx/Kconfig
1097 +++ b/arch/arm/mach-imx/Kconfig
1098 @@ -524,7 +524,7 @@ config SOC_IMX6Q
1099         bool "i.MX6 Quad/DualLite support"
1100         select ARM_ERRATA_764369 if SMP
1101         select HAVE_ARM_SCU if SMP
1102 -       select HAVE_ARM_TWD if SMP
1103 +       select HAVE_ARM_TWD
1104         select PCI_DOMAINS if PCI
1105         select PINCTRL_IMX6Q
1106         select SOC_IMX6
1107 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
1108 index 79e1f876d1c9..7e625c17f78e 100644
1109 --- a/arch/arm/mach-omap2/omap-smp.c
1110 +++ b/arch/arm/mach-omap2/omap-smp.c
1111 @@ -43,7 +43,7 @@
1112  /* SCU base address */
1113  static void __iomem *scu_base;
1114  
1115 -static DEFINE_SPINLOCK(boot_lock);
1116 +static DEFINE_RAW_SPINLOCK(boot_lock);
1117  
1118  void __iomem *omap4_get_scu_base(void)
1119  {
1120 @@ -74,8 +74,8 @@ static void omap4_secondary_init(unsigned int cpu)
1121         /*
1122          * Synchronise with the boot thread.
1123          */
1124 -       spin_lock(&boot_lock);
1125 -       spin_unlock(&boot_lock);
1126 +       raw_spin_lock(&boot_lock);
1127 +       raw_spin_unlock(&boot_lock);
1128  }
1129  
1130  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1131 @@ -89,7 +89,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1132          * Set synchronisation state between this boot processor
1133          * and the secondary one
1134          */
1135 -       spin_lock(&boot_lock);
1136 +       raw_spin_lock(&boot_lock);
1137  
1138         /*
1139          * Update the AuxCoreBoot0 with boot state for secondary core.
1140 @@ -166,7 +166,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1141          * Now the secondary core is starting up let it run its
1142          * calibrations, then wait for it to finish
1143          */
1144 -       spin_unlock(&boot_lock);
1145 +       raw_spin_unlock(&boot_lock);
1146  
1147         return 0;
1148  }
1149 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
1150 index e46c91094dde..dcb3ed0c26da 100644
1151 --- a/arch/arm/mach-prima2/platsmp.c
1152 +++ b/arch/arm/mach-prima2/platsmp.c
1153 @@ -22,7 +22,7 @@
1154  
1155  static void __iomem *clk_base;
1156  
1157 -static DEFINE_SPINLOCK(boot_lock);
1158 +static DEFINE_RAW_SPINLOCK(boot_lock);
1159  
1160  static void sirfsoc_secondary_init(unsigned int cpu)
1161  {
1162 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
1163         /*
1164          * Synchronise with the boot thread.
1165          */
1166 -       spin_lock(&boot_lock);
1167 -       spin_unlock(&boot_lock);
1168 +       raw_spin_lock(&boot_lock);
1169 +       raw_spin_unlock(&boot_lock);
1170  }
1171  
1172  static const struct of_device_id clk_ids[]  = {
1173 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
1174         /* make sure write buffer is drained */
1175         mb();
1176  
1177 -       spin_lock(&boot_lock);
1178 +       raw_spin_lock(&boot_lock);
1179  
1180         /*
1181          * The secondary processor is waiting to be released from
1182 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
1183          * now the secondary core is starting up let it run its
1184          * calibrations, then wait for it to finish
1185          */
1186 -       spin_unlock(&boot_lock);
1187 +       raw_spin_unlock(&boot_lock);
1188  
1189         return pen_release != -1 ? -ENOSYS : 0;
1190  }
1191 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
1192 index 9b00123a315d..0a49fe1bc8cf 100644
1193 --- a/arch/arm/mach-qcom/platsmp.c
1194 +++ b/arch/arm/mach-qcom/platsmp.c
1195 @@ -46,7 +46,7 @@
1196  
1197  extern void secondary_startup_arm(void);
1198  
1199 -static DEFINE_SPINLOCK(boot_lock);
1200 +static DEFINE_RAW_SPINLOCK(boot_lock);
1201  
1202  #ifdef CONFIG_HOTPLUG_CPU
1203  static void qcom_cpu_die(unsigned int cpu)
1204 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
1205         /*
1206          * Synchronise with the boot thread.
1207          */
1208 -       spin_lock(&boot_lock);
1209 -       spin_unlock(&boot_lock);
1210 +       raw_spin_lock(&boot_lock);
1211 +       raw_spin_unlock(&boot_lock);
1212  }
1213  
1214  static int scss_release_secondary(unsigned int cpu)
1215 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
1216          * set synchronisation state between this boot processor
1217          * and the secondary one
1218          */
1219 -       spin_lock(&boot_lock);
1220 +       raw_spin_lock(&boot_lock);
1221  
1222         /*
1223          * Send the secondary CPU a soft interrupt, thereby causing
1224 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
1225          * now the secondary core is starting up let it run its
1226          * calibrations, then wait for it to finish
1227          */
1228 -       spin_unlock(&boot_lock);
1229 +       raw_spin_unlock(&boot_lock);
1230  
1231         return ret;
1232  }
1233 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
1234 index fd4297713d67..b0553b2c2d53 100644
1235 --- a/arch/arm/mach-spear/platsmp.c
1236 +++ b/arch/arm/mach-spear/platsmp.c
1237 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1238         sync_cache_w(&pen_release);
1239  }
1240  
1241 -static DEFINE_SPINLOCK(boot_lock);
1242 +static DEFINE_RAW_SPINLOCK(boot_lock);
1243  
1244  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
1245  
1246 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
1247         /*
1248          * Synchronise with the boot thread.
1249          */
1250 -       spin_lock(&boot_lock);
1251 -       spin_unlock(&boot_lock);
1252 +       raw_spin_lock(&boot_lock);
1253 +       raw_spin_unlock(&boot_lock);
1254  }
1255  
1256  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1257 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1258          * set synchronisation state between this boot processor
1259          * and the secondary one
1260          */
1261 -       spin_lock(&boot_lock);
1262 +       raw_spin_lock(&boot_lock);
1263  
1264         /*
1265          * The secondary processor is waiting to be released from
1266 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1267          * now the secondary core is starting up let it run its
1268          * calibrations, then wait for it to finish
1269          */
1270 -       spin_unlock(&boot_lock);
1271 +       raw_spin_unlock(&boot_lock);
1272  
1273         return pen_release != -1 ? -ENOSYS : 0;
1274  }
1275 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
1276 index c4ad6eae67fa..e830b20b212f 100644
1277 --- a/arch/arm/mach-sti/platsmp.c
1278 +++ b/arch/arm/mach-sti/platsmp.c
1279 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
1280         sync_cache_w(&pen_release);
1281  }
1282  
1283 -static DEFINE_SPINLOCK(boot_lock);
1284 +static DEFINE_RAW_SPINLOCK(boot_lock);
1285  
1286  static void sti_secondary_init(unsigned int cpu)
1287  {
1288 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
1289         /*
1290          * Synchronise with the boot thread.
1291          */
1292 -       spin_lock(&boot_lock);
1293 -       spin_unlock(&boot_lock);
1294 +       raw_spin_lock(&boot_lock);
1295 +       raw_spin_unlock(&boot_lock);
1296  }
1297  
1298  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1299 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1300          * set synchronisation state between this boot processor
1301          * and the secondary one
1302          */
1303 -       spin_lock(&boot_lock);
1304 +       raw_spin_lock(&boot_lock);
1305  
1306         /*
1307          * The secondary processor is waiting to be released from
1308 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1309          * now the secondary core is starting up let it run its
1310          * calibrations, then wait for it to finish
1311          */
1312 -       spin_unlock(&boot_lock);
1313 +       raw_spin_unlock(&boot_lock);
1314  
1315         return pen_release != -1 ? -ENOSYS : 0;
1316  }
1317 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
1318 index daafcf121ce0..b8aa1e9ee8ee 100644
1319 --- a/arch/arm/mm/fault.c
1320 +++ b/arch/arm/mm/fault.c
1321 @@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1322         if (addr < TASK_SIZE)
1323                 return do_page_fault(addr, fsr, regs);
1324  
1325 +       if (interrupts_enabled(regs))
1326 +               local_irq_enable();
1327 +
1328         if (user_mode(regs))
1329                 goto bad_area;
1330  
1331 @@ -497,6 +500,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1332  static int
1333  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
1334  {
1335 +       if (interrupts_enabled(regs))
1336 +               local_irq_enable();
1337 +
1338         do_bad_area(addr, fsr, regs);
1339         return 0;
1340  }
1341 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
1342 index d02f8187b1cc..542692dbd40a 100644
1343 --- a/arch/arm/mm/highmem.c
1344 +++ b/arch/arm/mm/highmem.c
1345 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
1346         return *ptep;
1347  }
1348  
1349 +static unsigned int fixmap_idx(int type)
1350 +{
1351 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1352 +}
1353 +
1354  void *kmap(struct page *page)
1355  {
1356         might_sleep();
1357 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
1358  
1359  void *kmap_atomic(struct page *page)
1360  {
1361 +       pte_t pte = mk_pte(page, kmap_prot);
1362         unsigned int idx;
1363         unsigned long vaddr;
1364         void *kmap;
1365         int type;
1366  
1367 -       preempt_disable();
1368 +       preempt_disable_nort();
1369         pagefault_disable();
1370         if (!PageHighMem(page))
1371                 return page_address(page);
1372 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1373  
1374         type = kmap_atomic_idx_push();
1375  
1376 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1377 +       idx = fixmap_idx(type);
1378         vaddr = __fix_to_virt(idx);
1379  #ifdef CONFIG_DEBUG_HIGHMEM
1380         /*
1381 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1382          * in place, so the contained TLB flush ensures the TLB is updated
1383          * with the new mapping.
1384          */
1385 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1386 +#ifdef CONFIG_PREEMPT_RT_FULL
1387 +       current->kmap_pte[type] = pte;
1388 +#endif
1389 +       set_fixmap_pte(idx, pte);
1390  
1391         return (void *)vaddr;
1392  }
1393 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1394  
1395         if (kvaddr >= (void *)FIXADDR_START) {
1396                 type = kmap_atomic_idx();
1397 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1398 +               idx = fixmap_idx(type);
1399  
1400                 if (cache_is_vivt())
1401                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1402 +#ifdef CONFIG_PREEMPT_RT_FULL
1403 +               current->kmap_pte[type] = __pte(0);
1404 +#endif
1405  #ifdef CONFIG_DEBUG_HIGHMEM
1406                 BUG_ON(vaddr != __fix_to_virt(idx));
1407 -               set_fixmap_pte(idx, __pte(0));
1408  #else
1409                 (void) idx;  /* to kill a warning */
1410  #endif
1411 +               set_fixmap_pte(idx, __pte(0));
1412                 kmap_atomic_idx_pop();
1413         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1414                 /* this address was obtained through kmap_high_get() */
1415                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1416         }
1417         pagefault_enable();
1418 -       preempt_enable();
1419 +       preempt_enable_nort();
1420  }
1421  EXPORT_SYMBOL(__kunmap_atomic);
1422  
1423  void *kmap_atomic_pfn(unsigned long pfn)
1424  {
1425 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1426         unsigned long vaddr;
1427         int idx, type;
1428         struct page *page = pfn_to_page(pfn);
1429  
1430 -       preempt_disable();
1431 +       preempt_disable_nort();
1432         pagefault_disable();
1433         if (!PageHighMem(page))
1434                 return page_address(page);
1435  
1436         type = kmap_atomic_idx_push();
1437 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1438 +       idx = fixmap_idx(type);
1439         vaddr = __fix_to_virt(idx);
1440  #ifdef CONFIG_DEBUG_HIGHMEM
1441         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1442  #endif
1443 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1444 +#ifdef CONFIG_PREEMPT_RT_FULL
1445 +       current->kmap_pte[type] = pte;
1446 +#endif
1447 +       set_fixmap_pte(idx, pte);
1448  
1449         return (void *)vaddr;
1450  }
1451 +#if defined CONFIG_PREEMPT_RT_FULL
1452 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1453 +{
1454 +       int i;
1455 +
1456 +       /*
1457 +        * Clear @prev's kmap_atomic mappings
1458 +        */
1459 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1460 +               int idx = fixmap_idx(i);
1461 +
1462 +               set_fixmap_pte(idx, __pte(0));
1463 +       }
1464 +       /*
1465 +        * Restore @next_p's kmap_atomic mappings
1466 +        */
1467 +       for (i = 0; i < next_p->kmap_idx; i++) {
1468 +               int idx = fixmap_idx(i);
1469 +
1470 +               if (!pte_none(next_p->kmap_pte[i]))
1471 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1472 +       }
1473 +}
1474 +#endif
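With this change, kmap_atomic() on PREEMPT_RT_FULL no longer hard-disables preemption (preempt_disable_nort() is effectively a no-op there); instead each task's active fixmap PTEs are parked in current->kmap_pte[] and replayed by switch_kmaps() when the task is scheduled back in, so the mapping stays valid across preemption. Callers keep the usual API. A minimal, illustrative user, not part of the patch:

#include <linux/highmem.h>
#include <linux/string.h>

/* Copy one (possibly highmem) page into a kernel buffer. */
static void demo_copy_page_to_buf(struct page *page, void *buf)
{
        void *vaddr = kmap_atomic(page);

        memcpy(buf, vaddr, PAGE_SIZE);
        kunmap_atomic(vaddr);
}
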
1475 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1476 index 53feb90c840c..b4a8d54fc3f3 100644
1477 --- a/arch/arm/plat-versatile/platsmp.c
1478 +++ b/arch/arm/plat-versatile/platsmp.c
1479 @@ -30,7 +30,7 @@ static void write_pen_release(int val)
1480         sync_cache_w(&pen_release);
1481  }
1482  
1483 -static DEFINE_SPINLOCK(boot_lock);
1484 +static DEFINE_RAW_SPINLOCK(boot_lock);
1485  
1486  void versatile_secondary_init(unsigned int cpu)
1487  {
1488 @@ -43,8 +43,8 @@ void versatile_secondary_init(unsigned int cpu)
1489         /*
1490          * Synchronise with the boot thread.
1491          */
1492 -       spin_lock(&boot_lock);
1493 -       spin_unlock(&boot_lock);
1494 +       raw_spin_lock(&boot_lock);
1495 +       raw_spin_unlock(&boot_lock);
1496  }
1497  
1498  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1499 @@ -55,7 +55,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1500          * Set synchronisation state between this boot processor
1501          * and the secondary one
1502          */
1503 -       spin_lock(&boot_lock);
1504 +       raw_spin_lock(&boot_lock);
1505  
1506         /*
1507          * This is really belt and braces; we hold unintended secondary
1508 @@ -85,7 +85,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1509          * now the secondary core is starting up let it run its
1510          * calibrations, then wait for it to finish
1511          */
1512 -       spin_unlock(&boot_lock);
1513 +       raw_spin_unlock(&boot_lock);
1514  
1515         return pen_release != -1 ? -ENOSYS : 0;
1516  }
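All of these platsmp conversions follow the same rule: boot_lock is held while a secondary CPU sits in its holding pen, a context that must never sleep, so the lock has to remain a true spinning lock. On PREEMPT_RT_FULL spinlock_t becomes a sleeping rt_mutex-based lock, while raw_spinlock_t keeps busy-wait semantics. A minimal sketch of the resulting pattern (the function name is illustrative):

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(boot_lock);

static void demo_release_secondary(void)
{
        raw_spin_lock(&boot_lock);
        /* short, non-sleeping critical section only */
        raw_spin_unlock(&boot_lock);
}
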
1517 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1518 index 14cdc6dea493..9196cf82f7be 100644
1519 --- a/arch/arm64/Kconfig
1520 +++ b/arch/arm64/Kconfig
1521 @@ -76,6 +76,7 @@ config ARM64
1522         select HAVE_PERF_REGS
1523         select HAVE_PERF_USER_STACK_DUMP
1524         select HAVE_RCU_TABLE_FREE
1525 +       select HAVE_PREEMPT_LAZY
1526         select HAVE_SYSCALL_TRACEPOINTS
1527         select IOMMU_DMA if IOMMU_SUPPORT
1528         select IRQ_DOMAIN
1529 @@ -582,7 +583,7 @@ config XEN_DOM0
1530  
1531  config XEN
1532         bool "Xen guest support on ARM64"
1533 -       depends on ARM64 && OF
1534 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1535         select SWIOTLB_XEN
1536         help
1537           Say Y if you want to run Linux in a Virtual Machine on Xen on ARM64.
1538 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1539 index 90c7ff233735..5f4e89fbc290 100644
1540 --- a/arch/arm64/include/asm/thread_info.h
1541 +++ b/arch/arm64/include/asm/thread_info.h
1542 @@ -49,6 +49,7 @@ struct thread_info {
1543         mm_segment_t            addr_limit;     /* address limit */
1544         struct task_struct      *task;          /* main task structure */
1545         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1546 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1547         int                     cpu;            /* cpu */
1548  };
1549  
1550 @@ -103,6 +104,7 @@ static inline struct thread_info *current_thread_info(void)
1551  #define TIF_NEED_RESCHED       1
1552  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1553  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1554 +#define TIF_NEED_RESCHED_LAZY  4
1555  #define TIF_NOHZ               7
1556  #define TIF_SYSCALL_TRACE      8
1557  #define TIF_SYSCALL_AUDIT      9
1558 @@ -118,6 +120,7 @@ static inline struct thread_info *current_thread_info(void)
1559  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1560  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1561  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1562 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1563  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1564  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1565  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1566 @@ -126,7 +129,8 @@ static inline struct thread_info *current_thread_info(void)
1567  #define _TIF_32BIT             (1 << TIF_32BIT)
1568  
1569  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1570 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1571 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1572 +                                _TIF_NEED_RESCHED_LAZY)
1573  
1574  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1575                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1576 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1577 index 087cf9a65359..d74475928399 100644
1578 --- a/arch/arm64/kernel/asm-offsets.c
1579 +++ b/arch/arm64/kernel/asm-offsets.c
1580 @@ -35,6 +35,7 @@ int main(void)
1581    BLANK();
1582    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1583    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1584 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1585    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1586    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1587    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1588 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1589 index 5a3753d09e20..05d73c4c03f6 100644
1590 --- a/arch/arm64/kernel/entry.S
1591 +++ b/arch/arm64/kernel/entry.S
1592 @@ -376,11 +376,16 @@ el1_irq:
1593  #ifdef CONFIG_PREEMPT
1594         get_thread_info tsk
1595         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1596 -       cbnz    w24, 1f                         // preempt count != 0
1597 +       cbnz    w24, 2f                         // preempt count != 0
1598         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1599 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1600 -       bl      el1_preempt
1601 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1602 +
1603 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1604 +       cbnz    w24, 2f                         // preempt lazy count != 0
1605 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1606  1:
1607 +       bl      el1_preempt
1608 +2:
1609  #endif
1610  #ifdef CONFIG_TRACE_IRQFLAGS
1611         bl      trace_hardirqs_on
1612 @@ -394,6 +399,7 @@ el1_preempt:
1613  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1614         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1615         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1616 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1617         ret     x24
1618  #endif
1619  
1620 @@ -638,6 +644,7 @@ ret_fast_syscall_trace:
1621   */
1622  work_pending:
1623         tbnz    x1, #TIF_NEED_RESCHED, work_resched
1624 +       tbnz    x1, #TIF_NEED_RESCHED_LAZY, work_resched
1625         /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
1626         ldr     x2, [sp, #S_PSTATE]
1627         mov     x0, sp                          // 'regs'
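The modified el1_irq exit path preempts in exactly two cases: an ordinary TIF_NEED_RESCHED with preempt_count == 0, or TIF_NEED_RESCHED_LAZY with both preempt_count and preempt_lazy_count zero. A hypothetical C rendering of that check (the real code is the assembly above; the helper name is invented here):

#include <linux/types.h>
#include <linux/thread_info.h>

static bool demo_should_preempt_on_irq_exit(struct thread_info *ti)
{
        if (ti->preempt_count)
                return false;                   /* preemption disabled */
        if (ti->flags & _TIF_NEED_RESCHED)
                return true;                    /* hard resched request */
        if (ti->preempt_lazy_count)
                return false;                   /* lazy preemption held off */
        return ti->flags & _TIF_NEED_RESCHED_LAZY;
}
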
1628 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1629 index db459612de44..bd8be6a0e745 100644
1630 --- a/arch/mips/Kconfig
1631 +++ b/arch/mips/Kconfig
1632 @@ -2410,7 +2410,7 @@ config CPU_R4400_WORKAROUNDS
1633  #
1634  config HIGHMEM
1635         bool "High Memory Support"
1636 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1637 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1638  
1639  config CPU_SUPPORTS_HIGHMEM
1640         bool
1641 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
1642 index e86b7499921a..b2a2f678c5dc 100644
1643 --- a/arch/mips/kvm/mips.c
1644 +++ b/arch/mips/kvm/mips.c
1645 @@ -454,8 +454,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1646  
1647         dvcpu->arch.wait = 0;
1648  
1649 -       if (waitqueue_active(&dvcpu->wq))
1650 -               wake_up_interruptible(&dvcpu->wq);
1651 +       if (swait_active(&dvcpu->wq))
1652 +               swake_up(&dvcpu->wq);
1653  
1654         return 0;
1655  }
1656 @@ -1183,8 +1183,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
1657         kvm_mips_callbacks->queue_timer_int(vcpu);
1658  
1659         vcpu->arch.wait = 0;
1660 -       if (waitqueue_active(&vcpu->wq))
1661 -               wake_up_interruptible(&vcpu->wq);
1662 +       if (swait_active(&vcpu->wq))
1663 +               swake_up(&vcpu->wq);
1664  }
1665  
1666  /* low level hrtimer wake routine */
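The waitqueue-to-swait conversions in the KVM code matter on RT because these wake-ups can come from contexts that must not sleep: the simple wait queues added elsewhere in this patch use a raw lock internally, so swake_up() stays usable there, whereas wake_up_interruptible() on a regular waitqueue would take a sleeping lock on PREEMPT_RT_FULL. A minimal waker-side sketch with an illustrative queue name:

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);

static void demo_wake(void)
{
        /* Wake one waiter; cheap no-op if nobody is waiting. */
        if (swait_active(&demo_wq))
                swake_up(&demo_wq);
}
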
1667 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1668 index db49e0d796b1..1d2be228661c 100644
1669 --- a/arch/powerpc/Kconfig
1670 +++ b/arch/powerpc/Kconfig
1671 @@ -60,10 +60,11 @@ config LOCKDEP_SUPPORT
1672  
1673  config RWSEM_GENERIC_SPINLOCK
1674         bool
1675 +       default y if PREEMPT_RT_FULL
1676  
1677  config RWSEM_XCHGADD_ALGORITHM
1678         bool
1679 -       default y
1680 +       default y if !PREEMPT_RT_FULL
1681  
1682  config GENERIC_LOCKBREAK
1683         bool
1684 @@ -141,6 +142,7 @@ config PPC
1685         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1686         select GENERIC_STRNCPY_FROM_USER
1687         select GENERIC_STRNLEN_USER
1688 +       select HAVE_PREEMPT_LAZY
1689         select HAVE_MOD_ARCH_SPECIFIC
1690         select MODULES_USE_ELF_RELA
1691         select CLONE_BACKWARDS
1692 @@ -319,7 +321,7 @@ menu "Kernel options"
1693  
1694  config HIGHMEM
1695         bool "High memory support"
1696 -       depends on PPC32
1697 +       depends on PPC32 && !PREEMPT_RT_FULL
1698  
1699  source kernel/Kconfig.hz
1700  source kernel/Kconfig.preempt
1701 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
1702 index cfa758c6b4f6..f8673ff84b31 100644
1703 --- a/arch/powerpc/include/asm/kvm_host.h
1704 +++ b/arch/powerpc/include/asm/kvm_host.h
1705 @@ -286,7 +286,7 @@ struct kvmppc_vcore {
1706         struct list_head runnable_threads;
1707         struct list_head preempt_list;
1708         spinlock_t lock;
1709 -       wait_queue_head_t wq;
1710 +       struct swait_queue_head wq;
1711         spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
1712         u64 stolen_tb;
1713         u64 preempt_tb;
1714 @@ -626,7 +626,7 @@ struct kvm_vcpu_arch {
1715         u8 prodded;
1716         u32 last_inst;
1717  
1718 -       wait_queue_head_t *wqp;
1719 +       struct swait_queue_head *wqp;
1720         struct kvmppc_vcore *vcore;
1721         int ret;
1722         int trap;
1723 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1724 index 7efee4a3240b..40e6fa1b85b2 100644
1725 --- a/arch/powerpc/include/asm/thread_info.h
1726 +++ b/arch/powerpc/include/asm/thread_info.h
1727 @@ -42,6 +42,8 @@ struct thread_info {
1728         int             cpu;                    /* cpu we're on */
1729         int             preempt_count;          /* 0 => preemptable,
1730                                                    <0 => BUG */
1731 +       int             preempt_lazy_count;      /* 0 => preemptable,
1732 +                                                  <0 => BUG */
1733         unsigned long   local_flags;            /* private flags for thread */
1734  
1735         /* low level flags - has atomic operations done on it */
1736 @@ -82,8 +84,7 @@ static inline struct thread_info *current_thread_info(void)
1737  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1738  #define TIF_SIGPENDING         1       /* signal pending */
1739  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1740 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1741 -                                          TIF_NEED_RESCHED */
1742 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1743  #define TIF_32BIT              4       /* 32 bit binary */
1744  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1745  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1746 @@ -101,6 +102,8 @@ static inline struct thread_info *current_thread_info(void)
1747  #if defined(CONFIG_PPC64)
1748  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1749  #endif
1750 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1751 +                                          TIF_NEED_RESCHED */
1752  
1753  /* as above, but as bit values */
1754  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1755 @@ -119,14 +122,16 @@ static inline struct thread_info *current_thread_info(void)
1756  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1757  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1758  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1759 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1760  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1761                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1762                                  _TIF_NOHZ)
1763  
1764  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1765                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1766 -                                _TIF_RESTORE_TM)
1767 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1768  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1769 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1770  
1771  /* Bits in local_flags */
1772  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1773 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1774 index 221d584d089f..d6d0c59ef8ae 100644
1775 --- a/arch/powerpc/kernel/asm-offsets.c
1776 +++ b/arch/powerpc/kernel/asm-offsets.c
1777 @@ -160,6 +160,7 @@ int main(void)
1778         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1779         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1780         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1781 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1782         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1783         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1784  
1785 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1786 index 2405631e91a2..c21b4b42eaa0 100644
1787 --- a/arch/powerpc/kernel/entry_32.S
1788 +++ b/arch/powerpc/kernel/entry_32.S
1789 @@ -818,7 +818,14 @@ resume_kernel:
1790         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1791         bne     restore
1792         andi.   r8,r8,_TIF_NEED_RESCHED
1793 +       bne+    1f
1794 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1795 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1796 +       bne     restore
1797 +       lwz     r0,TI_FLAGS(r9)
1798 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1799         beq+    restore
1800 +1:
1801         lwz     r3,_MSR(r1)
1802         andi.   r0,r3,MSR_EE    /* interrupts off? */
1803         beq     restore         /* don't schedule if so */
1804 @@ -829,11 +836,11 @@ resume_kernel:
1805          */
1806         bl      trace_hardirqs_off
1807  #endif
1808 -1:     bl      preempt_schedule_irq
1809 +2:     bl      preempt_schedule_irq
1810         CURRENT_THREAD_INFO(r9, r1)
1811         lwz     r3,TI_FLAGS(r9)
1812 -       andi.   r0,r3,_TIF_NEED_RESCHED
1813 -       bne-    1b
1814 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1815 +       bne-    2b
1816  #ifdef CONFIG_TRACE_IRQFLAGS
1817         /* And now, to properly rebalance the above, we tell lockdep they
1818          * are being turned back on, which will happen when we return
1819 @@ -1154,7 +1161,7 @@ global_dbcr0:
1820  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1821  
1822  do_work:                       /* r10 contains MSR_KERNEL here */
1823 -       andi.   r0,r9,_TIF_NEED_RESCHED
1824 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1825         beq     do_user_signal
1826  
1827  do_resched:                    /* r10 contains MSR_KERNEL here */
1828 @@ -1175,7 +1182,7 @@ recheck:
1829         MTMSRD(r10)             /* disable interrupts */
1830         CURRENT_THREAD_INFO(r9, r1)
1831         lwz     r9,TI_FLAGS(r9)
1832 -       andi.   r0,r9,_TIF_NEED_RESCHED
1833 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1834         bne-    do_resched
1835         andi.   r0,r9,_TIF_USER_WORK_MASK
1836         beq     restore_user
1837 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1838 index edba294620db..1aae3fdb0c2a 100644
1839 --- a/arch/powerpc/kernel/entry_64.S
1840 +++ b/arch/powerpc/kernel/entry_64.S
1841 @@ -683,7 +683,7 @@ _GLOBAL(ret_from_except_lite)
1842  #else
1843         beq     restore
1844  #endif
1845 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1846 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1847         beq     2f
1848         bl      restore_interrupts
1849         SCHEDULE_USER
1850 @@ -745,10 +745,18 @@ resume_kernel:
1851  
1852  #ifdef CONFIG_PREEMPT
1853         /* Check if we need to preempt */
1854 +       lwz     r8,TI_PREEMPT(r9)
1855 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1856 +       bne     restore
1857         andi.   r0,r4,_TIF_NEED_RESCHED
1858 +       bne+    check_count
1859 +
1860 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1861         beq+    restore
1862 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1863 +
1864         /* Check that preempt_count() == 0 and interrupts are enabled */
1865 -       lwz     r8,TI_PREEMPT(r9)
1866 +check_count:
1867         cmpwi   cr1,r8,0
1868         ld      r0,SOFTE(r1)
1869         cmpdi   r0,0
1870 @@ -765,7 +773,7 @@ resume_kernel:
1871         /* Re-test flags and eventually loop */
1872         CURRENT_THREAD_INFO(r9, r1)
1873         ld      r4,TI_FLAGS(r9)
1874 -       andi.   r0,r4,_TIF_NEED_RESCHED
1875 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1876         bne     1b
1877  
1878         /*
1879 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1880 index 290559df1e8b..070afa6da35d 100644
1881 --- a/arch/powerpc/kernel/irq.c
1882 +++ b/arch/powerpc/kernel/irq.c
1883 @@ -614,6 +614,7 @@ void irq_ctx_init(void)
1884         }
1885  }
1886  
1887 +#ifndef CONFIG_PREEMPT_RT_FULL
1888  void do_softirq_own_stack(void)
1889  {
1890         struct thread_info *curtp, *irqtp;
1891 @@ -631,6 +632,7 @@ void do_softirq_own_stack(void)
1892         if (irqtp->flags)
1893                 set_bits(irqtp->flags, &curtp->flags);
1894  }
1895 +#endif
1896  
1897  irq_hw_number_t virq_to_hw(unsigned int virq)
1898  {
1899 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1900 index ed3ab509faca..8b261416c070 100644
1901 --- a/arch/powerpc/kernel/misc_32.S
1902 +++ b/arch/powerpc/kernel/misc_32.S
1903 @@ -40,6 +40,7 @@
1904   * We store the saved ksp_limit in the unused part
1905   * of the STACK_FRAME_OVERHEAD
1906   */
1907 +#ifndef CONFIG_PREEMPT_RT_FULL
1908  _GLOBAL(call_do_softirq)
1909         mflr    r0
1910         stw     r0,4(r1)
1911 @@ -56,6 +57,7 @@ _GLOBAL(call_do_softirq)
1912         stw     r10,THREAD+KSP_LIMIT(r2)
1913         mtlr    r0
1914         blr
1915 +#endif
1916  
1917  /*
1918   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1919 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1920 index db475d41b57a..96b7ef80e05d 100644
1921 --- a/arch/powerpc/kernel/misc_64.S
1922 +++ b/arch/powerpc/kernel/misc_64.S
1923 @@ -30,6 +30,7 @@
1924  
1925         .text
1926  
1927 +#ifndef CONFIG_PREEMPT_RT_FULL
1928  _GLOBAL(call_do_softirq)
1929         mflr    r0
1930         std     r0,16(r1)
1931 @@ -40,6 +41,7 @@ _GLOBAL(call_do_softirq)
1932         ld      r0,16(r1)
1933         mtlr    r0
1934         blr
1935 +#endif
1936  
1937  _GLOBAL(call_do_irq)
1938         mflr    r0
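The powerpc irq.c, misc_32.S and misc_64.S hunks above (and the matching sh, sparc and x86 hunks below) compile out call_do_softirq/do_softirq_own_stack() because those helpers exist only to run softirqs on a dedicated per-CPU stack from hard interrupt context; on PREEMPT_RT_FULL softirqs are processed in thread context instead. For comparison, when an architecture provides no dedicated softirq stack at all, mainline's generic fallback in include/linux/interrupt.h is roughly:

static inline void do_softirq_own_stack(void)
{
        __do_softirq();
}
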
1939 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1940 index c2024ac9d4e8..2303788da7e1 100644
1941 --- a/arch/powerpc/kvm/Kconfig
1942 +++ b/arch/powerpc/kvm/Kconfig
1943 @@ -172,6 +172,7 @@ config KVM_E500MC
1944  config KVM_MPIC
1945         bool "KVM in-kernel MPIC emulation"
1946         depends on KVM && E500
1947 +       depends on !PREEMPT_RT_FULL
1948         select HAVE_KVM_IRQCHIP
1949         select HAVE_KVM_IRQFD
1950         select HAVE_KVM_IRQ_ROUTING
1951 diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
1952 index a7352b59e6f9..df34a6432873 100644
1953 --- a/arch/powerpc/kvm/book3s_hv.c
1954 +++ b/arch/powerpc/kvm/book3s_hv.c
1955 @@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
1956  static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
1957  {
1958         int cpu;
1959 -       wait_queue_head_t *wqp;
1960 +       struct swait_queue_head *wqp;
1961  
1962         wqp = kvm_arch_vcpu_wq(vcpu);
1963 -       if (waitqueue_active(wqp)) {
1964 -               wake_up_interruptible(wqp);
1965 +       if (swait_active(wqp)) {
1966 +               swake_up(wqp);
1967                 ++vcpu->stat.halt_wakeup;
1968         }
1969  
1970 @@ -707,8 +707,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
1971                 tvcpu->arch.prodded = 1;
1972                 smp_mb();
1973                 if (vcpu->arch.ceded) {
1974 -                       if (waitqueue_active(&vcpu->wq)) {
1975 -                               wake_up_interruptible(&vcpu->wq);
1976 +                       if (swait_active(&vcpu->wq)) {
1977 +                               swake_up(&vcpu->wq);
1978                                 vcpu->stat.halt_wakeup++;
1979                         }
1980                 }
1981 @@ -1447,7 +1447,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
1982         INIT_LIST_HEAD(&vcore->runnable_threads);
1983         spin_lock_init(&vcore->lock);
1984         spin_lock_init(&vcore->stoltb_lock);
1985 -       init_waitqueue_head(&vcore->wq);
1986 +       init_swait_queue_head(&vcore->wq);
1987         vcore->preempt_tb = TB_NIL;
1988         vcore->lpcr = kvm->arch.lpcr;
1989         vcore->first_vcpuid = core * threads_per_subcore;
1990 @@ -2519,10 +2519,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
1991  {
1992         struct kvm_vcpu *vcpu;
1993         int do_sleep = 1;
1994 +       DECLARE_SWAITQUEUE(wait);
1995  
1996 -       DEFINE_WAIT(wait);
1997 -
1998 -       prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
1999 +       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2000  
2001         /*
2002          * Check one last time for pending exceptions and ceded state after
2003 @@ -2536,7 +2535,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2004         }
2005  
2006         if (!do_sleep) {
2007 -               finish_wait(&vc->wq, &wait);
2008 +               finish_swait(&vc->wq, &wait);
2009                 return;
2010         }
2011  
2012 @@ -2544,7 +2543,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2013         trace_kvmppc_vcore_blocked(vc, 0);
2014         spin_unlock(&vc->lock);
2015         schedule();
2016 -       finish_wait(&vc->wq, &wait);
2017 +       finish_swait(&vc->wq, &wait);
2018         spin_lock(&vc->lock);
2019         vc->vcore_state = VCORE_INACTIVE;
2020         trace_kvmppc_vcore_blocked(vc, 1);
2021 @@ -2600,7 +2599,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2022                         kvmppc_start_thread(vcpu, vc);
2023                         trace_kvm_guest_enter(vcpu);
2024                 } else if (vc->vcore_state == VCORE_SLEEPING) {
2025 -                       wake_up(&vc->wq);
2026 +                       swake_up(&vc->wq);
2027                 }
2028  
2029         }
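For the sleeper side, kvmppc_vcore_blocked() above shows the swait idiom that replaces DEFINE_WAIT()/prepare_to_wait(); stripped down it looks roughly like this (the helper name and the done condition are illustrative, not from the patch):

#include <linux/swait.h>
#include <linux/sched.h>

static void demo_swait_for(struct swait_queue_head *wq, bool *done)
{
        DECLARE_SWAITQUEUE(wait);

        for (;;) {
                prepare_to_swait(wq, &wait, TASK_INTERRUPTIBLE);
                if (*done)
                        break;
                schedule();
        }
        finish_swait(wq, &wait);
}
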
2030 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
2031 index 3f175e8aedb4..c4c02f91904c 100644
2032 --- a/arch/powerpc/platforms/ps3/device-init.c
2033 +++ b/arch/powerpc/platforms/ps3/device-init.c
2034 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
2035         }
2036         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
2037  
2038 -       res = wait_event_interruptible(dev->done.wait,
2039 +       res = swait_event_interruptible(dev->done.wait,
2040                                        dev->done.done || kthread_should_stop());
2041         if (kthread_should_stop())
2042                 res = -EINTR;
2043 diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
2044 index e9a983f40a24..bbdc539fb3c6 100644
2045 --- a/arch/s390/include/asm/kvm_host.h
2046 +++ b/arch/s390/include/asm/kvm_host.h
2047 @@ -427,7 +427,7 @@ struct kvm_s390_irq_payload {
2048  struct kvm_s390_local_interrupt {
2049         spinlock_t lock;
2050         struct kvm_s390_float_interrupt *float_int;
2051 -       wait_queue_head_t *wq;
2052 +       struct swait_queue_head *wq;
2053         atomic_t *cpuflags;
2054         DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
2055         struct kvm_s390_irq_payload irq;
2056 diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
2057 index 6a75352f453c..cc862c486002 100644
2058 --- a/arch/s390/kvm/interrupt.c
2059 +++ b/arch/s390/kvm/interrupt.c
2060 @@ -868,13 +868,13 @@ no_timer:
2061  
2062  void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
2063  {
2064 -       if (waitqueue_active(&vcpu->wq)) {
2065 +       if (swait_active(&vcpu->wq)) {
2066                 /*
2067                  * The vcpu gave up the cpu voluntarily, mark it as a good
2068                  * yield-candidate.
2069                  */
2070                 vcpu->preempted = true;
2071 -               wake_up_interruptible(&vcpu->wq);
2072 +               swake_up(&vcpu->wq);
2073                 vcpu->stat.halt_wakeup++;
2074         }
2075  }
2076 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
2077 index 6c0378c0b8b5..abd58b4dff97 100644
2078 --- a/arch/sh/kernel/irq.c
2079 +++ b/arch/sh/kernel/irq.c
2080 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
2081         hardirq_ctx[cpu] = NULL;
2082  }
2083  
2084 +#ifndef CONFIG_PREEMPT_RT_FULL
2085  void do_softirq_own_stack(void)
2086  {
2087         struct thread_info *curctx;
2088 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
2089                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
2090         );
2091  }
2092 +#endif
2093  #else
2094  static inline void handle_one_irq(unsigned int irq)
2095  {
2096 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
2097 index 56442d2d7bbc..8c9598f534c9 100644
2098 --- a/arch/sparc/Kconfig
2099 +++ b/arch/sparc/Kconfig
2100 @@ -189,12 +189,10 @@ config NR_CPUS
2101  source kernel/Kconfig.hz
2102  
2103  config RWSEM_GENERIC_SPINLOCK
2104 -       bool
2105 -       default y if SPARC32
2106 +       def_bool PREEMPT_RT_FULL
2107  
2108  config RWSEM_XCHGADD_ALGORITHM
2109 -       bool
2110 -       default y if SPARC64
2111 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2112  
2113  config GENERIC_HWEIGHT
2114         bool
2115 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
2116 index e22416ce56ea..d359de71153a 100644
2117 --- a/arch/sparc/kernel/irq_64.c
2118 +++ b/arch/sparc/kernel/irq_64.c
2119 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
2120         set_irq_regs(old_regs);
2121  }
2122  
2123 +#ifndef CONFIG_PREEMPT_RT_FULL
2124  void do_softirq_own_stack(void)
2125  {
2126         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
2127 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
2128         __asm__ __volatile__("mov %0, %%sp"
2129                              : : "r" (orig_sp));
2130  }
2131 +#endif
2132  
2133  #ifdef CONFIG_HOTPLUG_CPU
2134  void fixup_irqs(void)
2135 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
2136 index 436639a31624..6ee1dd0deadc 100644
2137 --- a/arch/x86/Kconfig
2138 +++ b/arch/x86/Kconfig
2139 @@ -17,6 +17,7 @@ config X86_64
2140  ### Arch settings
2141  config X86
2142         def_bool y
2143 +       select HAVE_PREEMPT_LAZY
2144         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
2145         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
2146         select ANON_INODES
2147 @@ -212,8 +213,11 @@ config ARCH_MAY_HAVE_PC_FDC
2148         def_bool y
2149         depends on ISA_DMA_API
2150  
2151 +config RWSEM_GENERIC_SPINLOCK
2152 +       def_bool PREEMPT_RT_FULL
2153 +
2154  config RWSEM_XCHGADD_ALGORITHM
2155 -       def_bool y
2156 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2157  
2158  config GENERIC_CALIBRATE_DELAY
2159         def_bool y
2160 @@ -848,7 +852,7 @@ config IOMMU_HELPER
2161  config MAXSMP
2162         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
2163         depends on X86_64 && SMP && DEBUG_KERNEL
2164 -       select CPUMASK_OFFSTACK
2165 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
2166         ---help---
2167           Enable maximum number of CPUS and NUMA Nodes for this architecture.
2168           If unsure, say N.
2169 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
2170 index 3633ad6145c5..c6d5458ee7f9 100644
2171 --- a/arch/x86/crypto/aesni-intel_glue.c
2172 +++ b/arch/x86/crypto/aesni-intel_glue.c
2173 @@ -383,14 +383,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
2174         err = blkcipher_walk_virt(desc, &walk);
2175         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2176  
2177 -       kernel_fpu_begin();
2178         while ((nbytes = walk.nbytes)) {
2179 +               kernel_fpu_begin();
2180                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2181 -                             nbytes & AES_BLOCK_MASK);
2182 +                               nbytes & AES_BLOCK_MASK);
2183 +               kernel_fpu_end();
2184                 nbytes &= AES_BLOCK_SIZE - 1;
2185                 err = blkcipher_walk_done(desc, &walk, nbytes);
2186         }
2187 -       kernel_fpu_end();
2188  
2189         return err;
2190  }
2191 @@ -407,14 +407,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
2192         err = blkcipher_walk_virt(desc, &walk);
2193         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2194  
2195 -       kernel_fpu_begin();
2196         while ((nbytes = walk.nbytes)) {
2197 +               kernel_fpu_begin();
2198                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2199                               nbytes & AES_BLOCK_MASK);
2200 +               kernel_fpu_end();
2201                 nbytes &= AES_BLOCK_SIZE - 1;
2202                 err = blkcipher_walk_done(desc, &walk, nbytes);
2203         }
2204 -       kernel_fpu_end();
2205  
2206         return err;
2207  }
2208 @@ -431,14 +431,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
2209         err = blkcipher_walk_virt(desc, &walk);
2210         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2211  
2212 -       kernel_fpu_begin();
2213         while ((nbytes = walk.nbytes)) {
2214 +               kernel_fpu_begin();
2215                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2216                               nbytes & AES_BLOCK_MASK, walk.iv);
2217 +               kernel_fpu_end();
2218                 nbytes &= AES_BLOCK_SIZE - 1;
2219                 err = blkcipher_walk_done(desc, &walk, nbytes);
2220         }
2221 -       kernel_fpu_end();
2222  
2223         return err;
2224  }
2225 @@ -455,14 +455,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
2226         err = blkcipher_walk_virt(desc, &walk);
2227         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2228  
2229 -       kernel_fpu_begin();
2230         while ((nbytes = walk.nbytes)) {
2231 +               kernel_fpu_begin();
2232                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2233                               nbytes & AES_BLOCK_MASK, walk.iv);
2234 +               kernel_fpu_end();
2235                 nbytes &= AES_BLOCK_SIZE - 1;
2236                 err = blkcipher_walk_done(desc, &walk, nbytes);
2237         }
2238 -       kernel_fpu_end();
2239  
2240         return err;
2241  }
2242 @@ -514,18 +514,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
2243         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
2244         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2245  
2246 -       kernel_fpu_begin();
2247         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
2248 +               kernel_fpu_begin();
2249                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2250                                       nbytes & AES_BLOCK_MASK, walk.iv);
2251 +               kernel_fpu_end();
2252                 nbytes &= AES_BLOCK_SIZE - 1;
2253                 err = blkcipher_walk_done(desc, &walk, nbytes);
2254         }
2255         if (walk.nbytes) {
2256 +               kernel_fpu_begin();
2257                 ctr_crypt_final(ctx, &walk);
2258 +               kernel_fpu_end();
2259                 err = blkcipher_walk_done(desc, &walk, 0);
2260         }
2261 -       kernel_fpu_end();
2262  
2263         return err;
2264  }
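The point of this aesni change, and of the cast5 and glue_helper changes that follow, is to bound how long kernel_fpu_begin() keeps preemption disabled: instead of one begin/end pair around the whole request, each walk step gets its own pair, so other tasks can preempt between blocks on RT. The same pattern in generic form, as a sketch only (process_chunk() is a placeholder, not a real kernel function):

#include <linux/kernel.h>
#include <linux/types.h>
#include <asm/fpu/api.h>

static void process_chunk(u8 *dst, const u8 *src, unsigned int n); /* placeholder */

static void demo_crypt_buffer(u8 *dst, const u8 *src, unsigned int len)
{
        while (len) {
                unsigned int chunk = min_t(unsigned int, len, 4096);

                kernel_fpu_begin();             /* disables preemption */
                process_chunk(dst, src, chunk); /* SIMD work on one chunk */
                kernel_fpu_end();               /* preemption point on RT */

                dst += chunk;
                src += chunk;
                len -= chunk;
        }
}
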
2265 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
2266 index 8648158f3916..d7699130ee36 100644
2267 --- a/arch/x86/crypto/cast5_avx_glue.c
2268 +++ b/arch/x86/crypto/cast5_avx_glue.c
2269 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
2270  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2271                      bool enc)
2272  {
2273 -       bool fpu_enabled = false;
2274 +       bool fpu_enabled;
2275         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
2276         const unsigned int bsize = CAST5_BLOCK_SIZE;
2277         unsigned int nbytes;
2278 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2279                 u8 *wsrc = walk->src.virt.addr;
2280                 u8 *wdst = walk->dst.virt.addr;
2281  
2282 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2283 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2284  
2285                 /* Process multi-block batch */
2286                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
2287 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2288                 } while (nbytes >= bsize);
2289  
2290  done:
2291 +               cast5_fpu_end(fpu_enabled);
2292                 err = blkcipher_walk_done(desc, walk, nbytes);
2293         }
2294 -
2295 -       cast5_fpu_end(fpu_enabled);
2296         return err;
2297  }
2298  
2299 @@ -227,7 +226,7 @@ done:
2300  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2301                        struct scatterlist *src, unsigned int nbytes)
2302  {
2303 -       bool fpu_enabled = false;
2304 +       bool fpu_enabled;
2305         struct blkcipher_walk walk;
2306         int err;
2307  
2308 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2309         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2310  
2311         while ((nbytes = walk.nbytes)) {
2312 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2313 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2314                 nbytes = __cbc_decrypt(desc, &walk);
2315 +               cast5_fpu_end(fpu_enabled);
2316                 err = blkcipher_walk_done(desc, &walk, nbytes);
2317         }
2318 -
2319 -       cast5_fpu_end(fpu_enabled);
2320         return err;
2321  }
2322  
2323 @@ -311,7 +309,7 @@ done:
2324  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2325                      struct scatterlist *src, unsigned int nbytes)
2326  {
2327 -       bool fpu_enabled = false;
2328 +       bool fpu_enabled;
2329         struct blkcipher_walk walk;
2330         int err;
2331  
2332 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2333         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2334  
2335         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
2336 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2337 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2338                 nbytes = __ctr_crypt(desc, &walk);
2339 +               cast5_fpu_end(fpu_enabled);
2340                 err = blkcipher_walk_done(desc, &walk, nbytes);
2341         }
2342  
2343 -       cast5_fpu_end(fpu_enabled);
2344 -
2345         if (walk.nbytes) {
2346                 ctr_crypt_final(desc, &walk);
2347                 err = blkcipher_walk_done(desc, &walk, 0);
2348 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
2349 index 6a85598931b5..3a506ce7ed93 100644
2350 --- a/arch/x86/crypto/glue_helper.c
2351 +++ b/arch/x86/crypto/glue_helper.c
2352 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2353         void *ctx = crypto_blkcipher_ctx(desc->tfm);
2354         const unsigned int bsize = 128 / 8;
2355         unsigned int nbytes, i, func_bytes;
2356 -       bool fpu_enabled = false;
2357 +       bool fpu_enabled;
2358         int err;
2359  
2360         err = blkcipher_walk_virt(desc, walk);
2361 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2362                 u8 *wdst = walk->dst.virt.addr;
2363  
2364                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2365 -                                            desc, fpu_enabled, nbytes);
2366 +                                            desc, false, nbytes);
2367  
2368                 for (i = 0; i < gctx->num_funcs; i++) {
2369                         func_bytes = bsize * gctx->funcs[i].num_blocks;
2370 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2371                 }
2372  
2373  done:
2374 +               glue_fpu_end(fpu_enabled);
2375                 err = blkcipher_walk_done(desc, walk, nbytes);
2376         }
2377  
2378 -       glue_fpu_end(fpu_enabled);
2379         return err;
2380  }
2381  
2382 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
2383                             struct scatterlist *src, unsigned int nbytes)
2384  {
2385         const unsigned int bsize = 128 / 8;
2386 -       bool fpu_enabled = false;
2387 +       bool fpu_enabled;
2388         struct blkcipher_walk walk;
2389         int err;
2390  
2391 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
2392  
2393         while ((nbytes = walk.nbytes)) {
2394                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2395 -                                            desc, fpu_enabled, nbytes);
2396 +                                            desc, false, nbytes);
2397                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
2398 +               glue_fpu_end(fpu_enabled);
2399                 err = blkcipher_walk_done(desc, &walk, nbytes);
2400         }
2401  
2402 -       glue_fpu_end(fpu_enabled);
2403         return err;
2404  }
2405  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
2406 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
2407                           struct scatterlist *src, unsigned int nbytes)
2408  {
2409         const unsigned int bsize = 128 / 8;
2410 -       bool fpu_enabled = false;
2411 +       bool fpu_enabled;
2412         struct blkcipher_walk walk;
2413         int err;
2414  
2415 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
2416  
2417         while ((nbytes = walk.nbytes) >= bsize) {
2418                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2419 -                                            desc, fpu_enabled, nbytes);
2420 +                                            desc, false, nbytes);
2421                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
2422 +               glue_fpu_end(fpu_enabled);
2423                 err = blkcipher_walk_done(desc, &walk, nbytes);
2424         }
2425  
2426 -       glue_fpu_end(fpu_enabled);
2427 -
2428         if (walk.nbytes) {
2429                 glue_ctr_crypt_final_128bit(
2430                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
2431 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
2432                           void *tweak_ctx, void *crypt_ctx)
2433  {
2434         const unsigned int bsize = 128 / 8;
2435 -       bool fpu_enabled = false;
2436 +       bool fpu_enabled;
2437         struct blkcipher_walk walk;
2438         int err;
2439  
2440 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
2441  
2442         /* set minimum length to bsize, for tweak_fn */
2443         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2444 -                                    desc, fpu_enabled,
2445 +                                    desc, false,
2446                                      nbytes < bsize ? bsize : nbytes);
2447 -
2448         /* calculate first value of T */
2449         tweak_fn(tweak_ctx, walk.iv, walk.iv);
2450 +       glue_fpu_end(fpu_enabled);
2451  
2452         while (nbytes) {
2453 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2454 +                               desc, false, nbytes);
2455                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
2456  
2457 +               glue_fpu_end(fpu_enabled);
2458                 err = blkcipher_walk_done(desc, &walk, nbytes);
2459                 nbytes = walk.nbytes;
2460         }
2461 -
2462 -       glue_fpu_end(fpu_enabled);
2463 -
2464         return err;
2465  }
2466  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
2467 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
2468 index 1a4477cedc49..75a301b6a5b6 100644
2469 --- a/arch/x86/entry/common.c
2470 +++ b/arch/x86/entry/common.c
2471 @@ -220,7 +220,7 @@ long syscall_trace_enter(struct pt_regs *regs)
2472  
2473  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
2474         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
2475 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
2476 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
2477  
2478  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2479  {
2480 @@ -236,9 +236,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2481                 /* We have work to do. */
2482                 local_irq_enable();
2483  
2484 -               if (cached_flags & _TIF_NEED_RESCHED)
2485 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
2486                         schedule();
2487  
2488 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2489 +               if (unlikely(current->forced_info.si_signo)) {
2490 +                       struct task_struct *t = current;
2491 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2492 +                       t->forced_info.si_signo = 0;
2493 +               }
2494 +#endif
2495                 if (cached_flags & _TIF_UPROBE)
2496                         uprobe_notify_resume(regs);
2497  
2498 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
2499 index f3b6d54e0042..2d722ee01fc2 100644
2500 --- a/arch/x86/entry/entry_32.S
2501 +++ b/arch/x86/entry/entry_32.S
2502 @@ -278,8 +278,24 @@ END(ret_from_exception)
2503  ENTRY(resume_kernel)
2504         DISABLE_INTERRUPTS(CLBR_ANY)
2505  need_resched:
2506 +       # preempt count == 0 + NEED_RS set?
2507         cmpl    $0, PER_CPU_VAR(__preempt_count)
2508 +#ifndef CONFIG_PREEMPT_LAZY
2509         jnz     restore_all
2510 +#else
2511 +       jz test_int_off
2512 +
2513 +       # at least preempt count == 0 ?
2514 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2515 +       jne restore_all
2516 +
2517 +       cmpl $0,TI_preempt_lazy_count(%ebp)     # non-zero preempt_lazy_count ?
2518 +       jnz restore_all
2519 +
2520 +       testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp)
2521 +       jz restore_all
2522 +test_int_off:
2523 +#endif
2524         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2525         jz      restore_all
2526         call    preempt_schedule_irq
2527 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2528 index a55697d19824..316081a2ca85 100644
2529 --- a/arch/x86/entry/entry_64.S
2530 +++ b/arch/x86/entry/entry_64.S
2531 @@ -579,7 +579,23 @@ retint_kernel:
2532         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2533         jnc     1f
2534  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2535 +#ifndef CONFIG_PREEMPT_LAZY
2536         jnz     1f
2537 +#else
2538 +       jz      do_preempt_schedule_irq
2539 +
2540 +       # at least preempt count == 0 ?
2541 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2542 +       jnz     1f
2543 +
2544 +       GET_THREAD_INFO(%rcx)
2545 +       cmpl    $0, TI_preempt_lazy_count(%rcx)
2546 +       jnz     1f
2547 +
2548 +       bt      $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
2549 +       jnc     1f
2550 +do_preempt_schedule_irq:
2551 +#endif
2552         call    preempt_schedule_irq
2553         jmp     0b
2554  1:
2555 @@ -867,6 +883,7 @@ bad_gs:
2556         jmp     2b
2557         .previous
2558  
2559 +#ifndef CONFIG_PREEMPT_RT_FULL
2560  /* Call softirq on interrupt stack. Interrupts are off. */
2561  ENTRY(do_softirq_own_stack)
2562         pushq   %rbp
2563 @@ -879,6 +896,7 @@ ENTRY(do_softirq_own_stack)
2564         decl    PER_CPU_VAR(irq_count)
2565         ret
2566  END(do_softirq_own_stack)
2567 +#endif
2568  
2569  #ifdef CONFIG_XEN
2570  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2571 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2572 index 01bcde84d3e4..6f432adc55cd 100644
2573 --- a/arch/x86/include/asm/preempt.h
2574 +++ b/arch/x86/include/asm/preempt.h
2575 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2576   * a decrement which hits zero means we have no preempt_count and should
2577   * reschedule.
2578   */
2579 -static __always_inline bool __preempt_count_dec_and_test(void)
2580 +static __always_inline bool ____preempt_count_dec_and_test(void)
2581  {
2582         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
2583  }
2584  
2585 +static __always_inline bool __preempt_count_dec_and_test(void)
2586 +{
2587 +       if (____preempt_count_dec_and_test())
2588 +               return true;
2589 +#ifdef CONFIG_PREEMPT_LAZY
2590 +       if (current_thread_info()->preempt_lazy_count)
2591 +               return false;
2592 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2593 +#else
2594 +       return false;
2595 +#endif
2596 +}
2597 +
2598  /*
2599   * Returns true when we need to resched and can (barring IRQ state).
2600   */
2601  static __always_inline bool should_resched(int preempt_offset)
2602  {
2603 +#ifdef CONFIG_PREEMPT_LAZY
2604 +       u32 tmp;
2605 +
2606 +       tmp = raw_cpu_read_4(__preempt_count);
2607 +       if (tmp == preempt_offset)
2608 +               return true;
2609 +
2610 +       /* preempt count == 0 ? */
2611 +       tmp &= ~PREEMPT_NEED_RESCHED;
2612 +       if (tmp)
2613 +               return false;
2614 +       if (current_thread_info()->preempt_lazy_count)
2615 +               return false;
2616 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2617 +#else
2618         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2619 +#endif
2620  }
2621  
2622  #ifdef CONFIG_PREEMPT
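Extending __preempt_count_dec_and_test() this way means every preempt_enable() also becomes a lazy-reschedule point once the lazy count has dropped to zero. For reference, mainline's preempt_enable() on a CONFIG_PREEMPT kernel expands roughly to:

#define preempt_enable()                                \
do {                                                    \
        barrier();                                      \
        if (unlikely(preempt_count_dec_and_test()))     \
                __preempt_schedule();                   \
} while (0)
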
2623 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2624 index 2138c9ae19ee..3f5b4ee2e2c1 100644
2625 --- a/arch/x86/include/asm/signal.h
2626 +++ b/arch/x86/include/asm/signal.h
2627 @@ -23,6 +23,19 @@ typedef struct {
2628         unsigned long sig[_NSIG_WORDS];
2629  } sigset_t;
2630  
2631 +/*
2632 + * Because some traps use the IST stack, we must keep preemption
2633 + * disabled while calling do_trap(), but do_trap() may call
2634 + * force_sig_info() which will grab the signal spin_locks for the
2635 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2636 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2637 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2638 + * trap.
2639 + */
2640 +#if defined(CONFIG_PREEMPT_RT_FULL)
2641 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2642 +#endif
2643 +
2644  #ifndef CONFIG_COMPAT
2645  typedef sigset_t compat_sigset_t;
2646  #endif
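The consumer of ARCH_RT_DELAYS_SIGNAL_SEND is the exit_to_usermode_loop() hunk earlier in this patch: the trap handler only records the signal, and it is actually raised on the way back to user space, where taking the (sleeping, on RT) signal locks is permitted. In isolation that delivery step looks roughly like this (the helper name is invented; forced_info is the task_struct field this patch adds):

#include <linux/sched.h>
#include <linux/signal.h>

#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
static void demo_deliver_delayed_signal(void)
{
        struct task_struct *t = current;

        if (unlikely(t->forced_info.si_signo)) {
                force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
                t->forced_info.si_signo = 0;
        }
}
#endif
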
2647 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2648 index 58505f01962f..02fa39652cd6 100644
2649 --- a/arch/x86/include/asm/stackprotector.h
2650 +++ b/arch/x86/include/asm/stackprotector.h
2651 @@ -59,7 +59,7 @@
2652   */
2653  static __always_inline void boot_init_stack_canary(void)
2654  {
2655 -       u64 canary;
2656 +       u64 uninitialized_var(canary);
2657         u64 tsc;
2658  
2659  #ifdef CONFIG_X86_64
2660 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2661          * of randomness. The TSC only matters for very early init,
2662          * there it already has some randomness on most systems. Later
2663          * on during the bootup the random pool has true entropy too.
2664 +        *
2665 +        * For preempt-rt we need to weaken the randomness a bit, as
2666 +        * we can't call into the random generator from atomic context
2667 +        * due to locking constraints. We just leave canary
2668 +        * uninitialized and use the TSC based randomness on top of it.
2669          */
2670 +#ifndef CONFIG_PREEMPT_RT_FULL
2671         get_random_bytes(&canary, sizeof(canary));
2672 +#endif
2673         tsc = rdtsc();
2674         canary += tsc + (tsc << 32UL);
2675  
2676 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2677 index c7b551028740..ddb63bd90e3c 100644
2678 --- a/arch/x86/include/asm/thread_info.h
2679 +++ b/arch/x86/include/asm/thread_info.h
2680 @@ -58,6 +58,8 @@ struct thread_info {
2681         __u32                   status;         /* thread synchronous flags */
2682         __u32                   cpu;            /* current CPU */
2683         mm_segment_t            addr_limit;
2684 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2685 +                                                         <0 => BUG */
2686         unsigned int            sig_on_uaccess_error:1;
2687         unsigned int            uaccess_err:1;  /* uaccess failed */
2688  };
2689 @@ -95,6 +97,7 @@ struct thread_info {
2690  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2691  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2692  #define TIF_SECCOMP            8       /* secure computing */
2693 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2694  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2695  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2696  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2697 @@ -119,6 +122,7 @@ struct thread_info {
2698  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2699  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2700  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2701 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2702  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2703  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2704  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2705 @@ -152,6 +156,8 @@ struct thread_info {
2706  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2707  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2708  
2709 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2710 +
2711  #define STACK_WARN             (THREAD_SIZE/8)
2712  
2713  /*
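
The TIF_NEED_RESCHED_LAZY bit and _TIF_NEED_RESCHED_MASK added above are the x86 hooks for the RT tree's lazy preemption: non-realtime wakeups can set only the lazy bit, and the request is honoured at the next safe preemption point or on return to user space rather than forcing an immediate kernel preemption. A minimal sketch (not code from this patch; exit_to_user_has_work() is a hypothetical helper) of how an exit path can test both bits at once:

/*
 * Illustrative only: "any reschedule request, lazy or immediate,
 * or a pending signal" in a single test, using the mask above.
 */
static inline bool exit_to_user_has_work(struct thread_info *ti)
{
        return ti->flags & (_TIF_NEED_RESCHED_MASK | _TIF_SIGPENDING);
}
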
2714 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2715 index fc808b83fccb..ebb40118abf5 100644
2716 --- a/arch/x86/include/asm/uv/uv_bau.h
2717 +++ b/arch/x86/include/asm/uv/uv_bau.h
2718 @@ -615,9 +615,9 @@ struct bau_control {
2719         cycles_t                send_message;
2720         cycles_t                period_end;
2721         cycles_t                period_time;
2722 -       spinlock_t              uvhub_lock;
2723 -       spinlock_t              queue_lock;
2724 -       spinlock_t              disable_lock;
2725 +       raw_spinlock_t          uvhub_lock;
2726 +       raw_spinlock_t          queue_lock;
2727 +       raw_spinlock_t          disable_lock;
2728         /* tunables */
2729         int                     max_concurr;
2730         int                     max_concurr_const;
2731 @@ -776,15 +776,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2732   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2733   * on equal.
2734   */
2735 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2736 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2737  {
2738 -       spin_lock(lock);
2739 +       raw_spin_lock(lock);
2740         if (atomic_read(v) >= u) {
2741 -               spin_unlock(lock);
2742 +               raw_spin_unlock(lock);
2743                 return 0;
2744         }
2745         atomic_inc(v);
2746 -       spin_unlock(lock);
2747 +       raw_spin_unlock(lock);
2748         return 1;
2749  }
2750  
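
The spinlock_t -> raw_spinlock_t conversions in this header (and in several files below) follow the usual PREEMPT_RT rule: on PREEMPT_RT_FULL a plain spinlock_t becomes a sleeping rtmutex, so any lock that may be taken from a context that cannot sleep has to stay a true spinning lock, i.e. raw_spinlock_t. A generic sketch of the pattern, with made-up names (bau_stats_lock, bau_stats_inc):

#include <linux/spinlock.h>

/* A raw spinlock keeps spin-with-irqs-off semantics even on RT. */
static DEFINE_RAW_SPINLOCK(bau_stats_lock);
static unsigned long bau_stats_count;

static void bau_stats_inc(void)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&bau_stats_lock, flags);
        bau_stats_count++;
        raw_spin_unlock_irqrestore(&bau_stats_lock, flags);
}
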
2751 diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
2752 index ea7074784cc4..01ec643ce66e 100644
2753 --- a/arch/x86/include/asm/uv/uv_hub.h
2754 +++ b/arch/x86/include/asm/uv/uv_hub.h
2755 @@ -492,7 +492,7 @@ struct uv_blade_info {
2756         unsigned short  nr_online_cpus;
2757         unsigned short  pnode;
2758         short           memory_nid;
2759 -       spinlock_t      nmi_lock;       /* obsolete, see uv_hub_nmi */
2760 +       raw_spinlock_t  nmi_lock;       /* obsolete, see uv_hub_nmi */
2761         unsigned long   nmi_count;      /* obsolete, see uv_hub_nmi */
2762  };
2763  extern struct uv_blade_info *uv_blade_info;
2764 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
2765 index e75907601a41..a29fc4f84fc4 100644
2766 --- a/arch/x86/kernel/acpi/boot.c
2767 +++ b/arch/x86/kernel/acpi/boot.c
2768 @@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
2769   *             ->ioapic_mutex
2770   *                     ->ioapic_lock
2771   */
2772 +#ifdef CONFIG_X86_IO_APIC
2773  static DEFINE_MUTEX(acpi_ioapic_lock);
2774 +#endif
2775  
2776  /* --------------------------------------------------------------------------
2777                                Boot-time Configuration
2778 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2779 index fdb0fbfb1197..678c711e2a16 100644
2780 --- a/arch/x86/kernel/apic/io_apic.c
2781 +++ b/arch/x86/kernel/apic/io_apic.c
2782 @@ -1711,7 +1711,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2783  static inline bool ioapic_irqd_mask(struct irq_data *data)
2784  {
2785         /* If we are moving the irq we need to mask it */
2786 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2787 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2788 +                    !irqd_irq_inprogress(data))) {
2789                 mask_ioapic_irq(data);
2790                 return true;
2791         }
2792 diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
2793 index 4a139465f1d4..ad2afff02b36 100644
2794 --- a/arch/x86/kernel/apic/x2apic_uv_x.c
2795 +++ b/arch/x86/kernel/apic/x2apic_uv_x.c
2796 @@ -947,7 +947,7 @@ void __init uv_system_init(void)
2797                         uv_blade_info[blade].pnode = pnode;
2798                         uv_blade_info[blade].nr_possible_cpus = 0;
2799                         uv_blade_info[blade].nr_online_cpus = 0;
2800 -                       spin_lock_init(&uv_blade_info[blade].nmi_lock);
2801 +                       raw_spin_lock_init(&uv_blade_info[blade].nmi_lock);
2802                         min_pnode = min(pnode, min_pnode);
2803                         max_pnode = max(pnode, max_pnode);
2804                         blade++;
2805 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2806 index 439df975bc7a..b7954ddd6a0a 100644
2807 --- a/arch/x86/kernel/asm-offsets.c
2808 +++ b/arch/x86/kernel/asm-offsets.c
2809 @@ -32,6 +32,7 @@ void common(void) {
2810         OFFSET(TI_flags, thread_info, flags);
2811         OFFSET(TI_status, thread_info, status);
2812         OFFSET(TI_addr_limit, thread_info, addr_limit);
2813 +       OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count);
2814  
2815         BLANK();
2816         OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
2817 @@ -89,4 +90,5 @@ void common(void) {
2818  
2819         BLANK();
2820         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2821 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2822  }
2823 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2824 index 7e8a736d09db..430a4ec07811 100644
2825 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2826 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2827 @@ -41,6 +41,8 @@
2828  #include <linux/debugfs.h>
2829  #include <linux/irq_work.h>
2830  #include <linux/export.h>
2831 +#include <linux/jiffies.h>
2832 +#include <linux/swork.h>
2833  
2834  #include <asm/processor.h>
2835  #include <asm/traps.h>
2836 @@ -1236,7 +1238,7 @@ void mce_log_therm_throt_event(__u64 status)
2837  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2838  
2839  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2840 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2841 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2842  
2843  static unsigned long mce_adjust_timer_default(unsigned long interval)
2844  {
2845 @@ -1245,32 +1247,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2846  
2847  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2848  
2849 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2850 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2851  {
2852 -       unsigned long when = jiffies + interval;
2853 -       unsigned long flags;
2854 -
2855 -       local_irq_save(flags);
2856 -
2857 -       if (timer_pending(t)) {
2858 -               if (time_before(when, t->expires))
2859 -                       mod_timer_pinned(t, when);
2860 -       } else {
2861 -               t->expires = round_jiffies(when);
2862 -               add_timer_on(t, smp_processor_id());
2863 -       }
2864 -
2865 -       local_irq_restore(flags);
2866 +       if (!interval)
2867 +               return HRTIMER_NORESTART;
2868 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2869 +       return HRTIMER_RESTART;
2870  }
2871  
2872 -static void mce_timer_fn(unsigned long data)
2873 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2874  {
2875 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2876 -       int cpu = smp_processor_id();
2877         unsigned long iv;
2878  
2879 -       WARN_ON(cpu != data);
2880 -
2881         iv = __this_cpu_read(mce_next_interval);
2882  
2883         if (mce_available(this_cpu_ptr(&cpu_info))) {
2884 @@ -1293,7 +1281,7 @@ static void mce_timer_fn(unsigned long data)
2885  
2886  done:
2887         __this_cpu_write(mce_next_interval, iv);
2888 -       __restart_timer(t, iv);
2889 +       return __restart_timer(timer, iv);
2890  }
2891  
2892  /*
2893 @@ -1301,7 +1289,7 @@ done:
2894   */
2895  void mce_timer_kick(unsigned long interval)
2896  {
2897 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2898 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2899         unsigned long iv = __this_cpu_read(mce_next_interval);
2900  
2901         __restart_timer(t, interval);
2902 @@ -1316,7 +1304,7 @@ static void mce_timer_delete_all(void)
2903         int cpu;
2904  
2905         for_each_online_cpu(cpu)
2906 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2907 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2908  }
2909  
2910  static void mce_do_trigger(struct work_struct *work)
2911 @@ -1326,6 +1314,56 @@ static void mce_do_trigger(struct work_struct *work)
2912  
2913  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2914  
2915 +static void __mce_notify_work(struct swork_event *event)
2916 +{
2917 +       /* Not more than two messages every minute */
2918 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2919 +
2920 +       /* wake processes polling /dev/mcelog */
2921 +       wake_up_interruptible(&mce_chrdev_wait);
2922 +
2923 +       /*
2924 +        * There is no risk of missing notifications because
2925 +        * work_pending is always cleared before the function is
2926 +        * executed.
2927 +        */
2928 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2929 +               schedule_work(&mce_trigger_work);
2930 +
2931 +       if (__ratelimit(&ratelimit))
2932 +               pr_info(HW_ERR "Machine check events logged\n");
2933 +}
2934 +
2935 +#ifdef CONFIG_PREEMPT_RT_FULL
2936 +static bool notify_work_ready __read_mostly;
2937 +static struct swork_event notify_work;
2938 +
2939 +static int mce_notify_work_init(void)
2940 +{
2941 +       int err;
2942 +
2943 +       err = swork_get();
2944 +       if (err)
2945 +               return err;
2946 +
2947 +       INIT_SWORK(&notify_work, __mce_notify_work);
2948 +       notify_work_ready = true;
2949 +       return 0;
2950 +}
2951 +
2952 +static void mce_notify_work(void)
2953 +{
2954 +       if (notify_work_ready)
2955 +               swork_queue(&notify_work);
2956 +}
2957 +#else
2958 +static void mce_notify_work(void)
2959 +{
2960 +       __mce_notify_work(NULL);
2961 +}
2962 +static inline int mce_notify_work_init(void) { return 0; }
2963 +#endif
2964 +
2965  /*
2966   * Notify the user(s) about new machine check events.
2967   * Can be called from interrupt context, but not from machine check/NMI
2968 @@ -1333,19 +1371,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2969   */
2970  int mce_notify_irq(void)
2971  {
2972 -       /* Not more than two messages every minute */
2973 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2974 -
2975         if (test_and_clear_bit(0, &mce_need_notify)) {
2976 -               /* wake processes polling /dev/mcelog */
2977 -               wake_up_interruptible(&mce_chrdev_wait);
2978 -
2979 -               if (mce_helper[0])
2980 -                       schedule_work(&mce_trigger_work);
2981 -
2982 -               if (__ratelimit(&ratelimit))
2983 -                       pr_info(HW_ERR "Machine check events logged\n");
2984 -
2985 +               mce_notify_work();
2986                 return 1;
2987         }
2988         return 0;
2989 @@ -1639,7 +1666,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2990         }
2991  }
2992  
2993 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2994 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2995  {
2996         unsigned long iv = check_interval * HZ;
2997  
2998 @@ -1648,16 +1675,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2999  
3000         per_cpu(mce_next_interval, cpu) = iv;
3001  
3002 -       t->expires = round_jiffies(jiffies + iv);
3003 -       add_timer_on(t, cpu);
3004 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
3005 +                       0, HRTIMER_MODE_REL_PINNED);
3006  }
3007  
3008  static void __mcheck_cpu_init_timer(void)
3009  {
3010 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
3011 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
3012         unsigned int cpu = smp_processor_id();
3013  
3014 -       setup_timer(t, mce_timer_fn, cpu);
3015 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3016 +       t->function = mce_timer_fn;
3017         mce_start_timer(cpu, t);
3018  }
3019  
3020 @@ -2376,6 +2404,8 @@ static void mce_disable_cpu(void *h)
3021         if (!mce_available(raw_cpu_ptr(&cpu_info)))
3022                 return;
3023  
3024 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
3025 +
3026         if (!(action & CPU_TASKS_FROZEN))
3027                 cmci_clear();
3028  
3029 @@ -2398,6 +2428,7 @@ static void mce_reenable_cpu(void *h)
3030                 if (b->init)
3031                         wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
3032         }
3033 +       __mcheck_cpu_init_timer();
3034  }
3035  
3036  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
3037 @@ -2405,7 +2436,6 @@ static int
3038  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
3039  {
3040         unsigned int cpu = (unsigned long)hcpu;
3041 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
3042  
3043         switch (action & ~CPU_TASKS_FROZEN) {
3044         case CPU_ONLINE:
3045 @@ -2425,11 +2455,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
3046                 break;
3047         case CPU_DOWN_PREPARE:
3048                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
3049 -               del_timer_sync(t);
3050                 break;
3051         case CPU_DOWN_FAILED:
3052                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
3053 -               mce_start_timer(cpu, t);
3054                 break;
3055         }
3056  
3057 @@ -2468,6 +2496,10 @@ static __init int mcheck_init_device(void)
3058                 goto err_out;
3059         }
3060  
3061 +       err = mce_notify_work_init();
3062 +       if (err)
3063 +               goto err_out;
3064 +
3065         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
3066                 err = -ENOMEM;
3067                 goto err_out;
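
The mce.c changes above replace the per-CPU timer_list poller with a self-rearming hrtimer and push the user-visible notification (the /dev/mcelog wakeup and the trigger work) out of the interrupt path into swork, the RT tree's simple work mechanism, so those wakeups run from schedulable context. A minimal sketch of the self-rearming hrtimer pattern, with placeholder names (poll_timer, POLL_INTERVAL_NS):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

#define POLL_INTERVAL_NS        (100 * NSEC_PER_MSEC)

static struct hrtimer poll_timer;

static enum hrtimer_restart poll_timer_fn(struct hrtimer *timer)
{
        /* ... do the periodic check here ... */
        hrtimer_forward_now(timer, ns_to_ktime(POLL_INTERVAL_NS));
        return HRTIMER_RESTART;
}

static void poll_timer_start(void)
{
        hrtimer_init(&poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        poll_timer.function = poll_timer_fn;
        hrtimer_start(&poll_timer, ns_to_ktime(POLL_INTERVAL_NS),
                      HRTIMER_MODE_REL);
}
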
3068 diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3069 index ed446bdcbf31..d2ac364e2118 100644
3070 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3071 +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3072 @@ -117,7 +117,7 @@ static struct perf_pmu_events_attr event_attr_##v = {                       \
3073  };
3074  
3075  struct rapl_pmu {
3076 -       spinlock_t       lock;
3077 +       raw_spinlock_t   lock;
3078         int              n_active; /* number of active events */
3079         struct list_head active_list;
3080         struct pmu       *pmu; /* pointer to rapl_pmu_class */
3081 @@ -220,13 +220,13 @@ static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
3082         if (!pmu->n_active)
3083                 return HRTIMER_NORESTART;
3084  
3085 -       spin_lock_irqsave(&pmu->lock, flags);
3086 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3087  
3088         list_for_each_entry(event, &pmu->active_list, active_entry) {
3089                 rapl_event_update(event);
3090         }
3091  
3092 -       spin_unlock_irqrestore(&pmu->lock, flags);
3093 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3094  
3095         hrtimer_forward_now(hrtimer, pmu->timer_interval);
3096  
3097 @@ -263,9 +263,9 @@ static void rapl_pmu_event_start(struct perf_event *event, int mode)
3098         struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
3099         unsigned long flags;
3100  
3101 -       spin_lock_irqsave(&pmu->lock, flags);
3102 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3103         __rapl_pmu_event_start(pmu, event);
3104 -       spin_unlock_irqrestore(&pmu->lock, flags);
3105 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3106  }
3107  
3108  static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3109 @@ -274,7 +274,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3110         struct hw_perf_event *hwc = &event->hw;
3111         unsigned long flags;
3112  
3113 -       spin_lock_irqsave(&pmu->lock, flags);
3114 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3115  
3116         /* mark event as deactivated and stopped */
3117         if (!(hwc->state & PERF_HES_STOPPED)) {
3118 @@ -299,7 +299,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3119                 hwc->state |= PERF_HES_UPTODATE;
3120         }
3121  
3122 -       spin_unlock_irqrestore(&pmu->lock, flags);
3123 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3124  }
3125  
3126  static int rapl_pmu_event_add(struct perf_event *event, int mode)
3127 @@ -308,14 +308,14 @@ static int rapl_pmu_event_add(struct perf_event *event, int mode)
3128         struct hw_perf_event *hwc = &event->hw;
3129         unsigned long flags;
3130  
3131 -       spin_lock_irqsave(&pmu->lock, flags);
3132 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3133  
3134         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
3135  
3136         if (mode & PERF_EF_START)
3137                 __rapl_pmu_event_start(pmu, event);
3138  
3139 -       spin_unlock_irqrestore(&pmu->lock, flags);
3140 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3141  
3142         return 0;
3143  }
3144 @@ -603,7 +603,7 @@ static int rapl_cpu_prepare(int cpu)
3145         pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
3146         if (!pmu)
3147                 return -1;
3148 -       spin_lock_init(&pmu->lock);
3149 +       raw_spin_lock_init(&pmu->lock);
3150  
3151         INIT_LIST_HEAD(&pmu->active_list);
3152  
3153 diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
3154 index 464ffd69b92e..00db1aad1548 100644
3155 --- a/arch/x86/kernel/dumpstack_32.c
3156 +++ b/arch/x86/kernel/dumpstack_32.c
3157 @@ -42,7 +42,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3158                 unsigned long *stack, unsigned long bp,
3159                 const struct stacktrace_ops *ops, void *data)
3160  {
3161 -       const unsigned cpu = get_cpu();
3162 +       const unsigned cpu = get_cpu_light();
3163         int graph = 0;
3164         u32 *prev_esp;
3165  
3166 @@ -86,7 +86,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3167                         break;
3168                 touch_nmi_watchdog();
3169         }
3170 -       put_cpu();
3171 +       put_cpu_light();
3172  }
3173  EXPORT_SYMBOL(dump_trace);
3174  
3175 diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
3176 index 5f1c6266eb30..c331e3fef465 100644
3177 --- a/arch/x86/kernel/dumpstack_64.c
3178 +++ b/arch/x86/kernel/dumpstack_64.c
3179 @@ -152,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3180                 unsigned long *stack, unsigned long bp,
3181                 const struct stacktrace_ops *ops, void *data)
3182  {
3183 -       const unsigned cpu = get_cpu();
3184 +       const unsigned cpu = get_cpu_light();
3185         struct thread_info *tinfo;
3186         unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
3187         unsigned long dummy;
3188 @@ -241,7 +241,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3189          * This handles the process stack:
3190          */
3191         bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
3192 -       put_cpu();
3193 +       put_cpu_light();
3194  }
3195  EXPORT_SYMBOL(dump_trace);
3196  
3197 @@ -255,7 +255,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
3198         int cpu;
3199         int i;
3200  
3201 -       preempt_disable();
3202 +       migrate_disable();
3203         cpu = smp_processor_id();
3204  
3205         irq_stack_end   = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
3206 @@ -291,7 +291,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
3207                         pr_cont(" %016lx", *stack++);
3208                 touch_nmi_watchdog();
3209         }
3210 -       preempt_enable();
3211 +       migrate_enable();
3212  
3213         pr_cont("\n");
3214         show_trace_log_lvl(task, regs, sp, bp, log_lvl);
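
get_cpu()/put_cpu() disable preemption for the whole stack dump, which can take a long time; the RT tree replaces them here with get_cpu_light()/put_cpu_light() (and preempt_disable() with migrate_disable() in show_stack_log_lvl()), which only pin the task to its current CPU. Roughly what the RT-only helpers amount to, as a sketch rather than the exact definitions used elsewhere in this patch:

#ifdef CONFIG_PREEMPT_RT_FULL
# define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
# define put_cpu_light()        migrate_enable()
#else
# define get_cpu_light()        get_cpu()
# define put_cpu_light()        put_cpu()
#endif
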
3215 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
3216 index 38da8f29a9c8..ce71f7098f15 100644
3217 --- a/arch/x86/kernel/irq_32.c
3218 +++ b/arch/x86/kernel/irq_32.c
3219 @@ -128,6 +128,7 @@ void irq_ctx_init(int cpu)
3220                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
3221  }
3222  
3223 +#ifndef CONFIG_PREEMPT_RT_FULL
3224  void do_softirq_own_stack(void)
3225  {
3226         struct thread_info *curstk;
3227 @@ -146,6 +147,7 @@ void do_softirq_own_stack(void)
3228  
3229         call_on_stack(__do_softirq, isp);
3230  }
3231 +#endif
3232  
3233  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
3234  {
3235 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
3236 index 47190bd399e7..807950860fb7 100644
3237 --- a/arch/x86/kernel/kvm.c
3238 +++ b/arch/x86/kernel/kvm.c
3239 @@ -36,6 +36,7 @@
3240  #include <linux/kprobes.h>
3241  #include <linux/debugfs.h>
3242  #include <linux/nmi.h>
3243 +#include <linux/swait.h>
3244  #include <asm/timer.h>
3245  #include <asm/cpu.h>
3246  #include <asm/traps.h>
3247 @@ -91,14 +92,14 @@ static void kvm_io_delay(void)
3248  
3249  struct kvm_task_sleep_node {
3250         struct hlist_node link;
3251 -       wait_queue_head_t wq;
3252 +       struct swait_queue_head wq;
3253         u32 token;
3254         int cpu;
3255         bool halted;
3256  };
3257  
3258  static struct kvm_task_sleep_head {
3259 -       spinlock_t lock;
3260 +       raw_spinlock_t lock;
3261         struct hlist_head list;
3262  } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
3263  
3264 @@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token)
3265         u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
3266         struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
3267         struct kvm_task_sleep_node n, *e;
3268 -       DEFINE_WAIT(wait);
3269 +       DECLARE_SWAITQUEUE(wait);
3270  
3271         rcu_irq_enter();
3272  
3273 -       spin_lock(&b->lock);
3274 +       raw_spin_lock(&b->lock);
3275         e = _find_apf_task(b, token);
3276         if (e) {
3277                 /* dummy entry exist -> wake up was delivered ahead of PF */
3278                 hlist_del(&e->link);
3279                 kfree(e);
3280 -               spin_unlock(&b->lock);
3281 +               raw_spin_unlock(&b->lock);
3282  
3283                 rcu_irq_exit();
3284                 return;
3285 @@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token)
3286         n.token = token;
3287         n.cpu = smp_processor_id();
3288         n.halted = is_idle_task(current) || preempt_count() > 1;
3289 -       init_waitqueue_head(&n.wq);
3290 +       init_swait_queue_head(&n.wq);
3291         hlist_add_head(&n.link, &b->list);
3292 -       spin_unlock(&b->lock);
3293 +       raw_spin_unlock(&b->lock);
3294  
3295         for (;;) {
3296                 if (!n.halted)
3297 -                       prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
3298 +                       prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
3299                 if (hlist_unhashed(&n.link))
3300                         break;
3301  
3302 @@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token)
3303                 }
3304         }
3305         if (!n.halted)
3306 -               finish_wait(&n.wq, &wait);
3307 +               finish_swait(&n.wq, &wait);
3308  
3309         rcu_irq_exit();
3310         return;
3311 @@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
3312         hlist_del_init(&n->link);
3313         if (n->halted)
3314                 smp_send_reschedule(n->cpu);
3315 -       else if (waitqueue_active(&n->wq))
3316 -               wake_up(&n->wq);
3317 +       else if (swait_active(&n->wq))
3318 +               swake_up(&n->wq);
3319  }
3320  
3321  static void apf_task_wake_all(void)
3322 @@ -189,14 +190,14 @@ static void apf_task_wake_all(void)
3323         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
3324                 struct hlist_node *p, *next;
3325                 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
3326 -               spin_lock(&b->lock);
3327 +               raw_spin_lock(&b->lock);
3328                 hlist_for_each_safe(p, next, &b->list) {
3329                         struct kvm_task_sleep_node *n =
3330                                 hlist_entry(p, typeof(*n), link);
3331                         if (n->cpu == smp_processor_id())
3332                                 apf_task_wake_one(n);
3333                 }
3334 -               spin_unlock(&b->lock);
3335 +               raw_spin_unlock(&b->lock);
3336         }
3337  }
3338  
3339 @@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token)
3340         }
3341  
3342  again:
3343 -       spin_lock(&b->lock);
3344 +       raw_spin_lock(&b->lock);
3345         n = _find_apf_task(b, token);
3346         if (!n) {
3347                 /*
3348 @@ -225,17 +226,17 @@ again:
3349                          * Allocation failed! Busy wait while other cpu
3350                          * handles async PF.
3351                          */
3352 -                       spin_unlock(&b->lock);
3353 +                       raw_spin_unlock(&b->lock);
3354                         cpu_relax();
3355                         goto again;
3356                 }
3357                 n->token = token;
3358                 n->cpu = smp_processor_id();
3359 -               init_waitqueue_head(&n->wq);
3360 +               init_swait_queue_head(&n->wq);
3361                 hlist_add_head(&n->link, &b->list);
3362         } else
3363                 apf_task_wake_one(n);
3364 -       spin_unlock(&b->lock);
3365 +       raw_spin_unlock(&b->lock);
3366         return;
3367  }
3368  EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
3369 @@ -486,7 +487,7 @@ void __init kvm_guest_init(void)
3370         paravirt_ops_setup();
3371         register_reboot_notifier(&kvm_pv_reboot_nb);
3372         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
3373 -               spin_lock_init(&async_pf_sleepers[i].lock);
3374 +               raw_spin_lock_init(&async_pf_sleepers[i].lock);
3375         if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
3376                 x86_init.irqs.trap_init = kvm_apf_trap_init;
3377  
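
The kvm async page-fault path above is switched from regular waitqueues to simple waitqueues (linux/swait.h) and from spinlock_t to raw_spinlock_t, because the wakeup side runs in a context that must not take sleeping locks on RT. A generic sketch of the simple-waitqueue API the code now uses (event_wq, event_ready and the two functions are illustrative names; a real user also has to care about ordering between the flag update and the wakeup):

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(event_wq);
static bool event_ready;

static void wait_for_event(void)
{
        swait_event(event_wq, event_ready);
}

/* May be called from atomic context: swake_up() only takes a raw lock. */
static void signal_event(void)
{
        event_ready = true;
        swake_up(&event_wq);
}
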
3378 diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
3379 index 697f90db0e37..424aec4a4c71 100644
3380 --- a/arch/x86/kernel/nmi.c
3381 +++ b/arch/x86/kernel/nmi.c
3382 @@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
3383  #endif
3384  
3385         if (panic_on_unrecovered_nmi)
3386 -               panic("NMI: Not continuing");
3387 +               nmi_panic(regs, "NMI: Not continuing");
3388  
3389         pr_emerg("Dazed and confused, but trying to continue\n");
3390  
3391 @@ -255,8 +255,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
3392                  reason, smp_processor_id());
3393         show_regs(regs);
3394  
3395 -       if (panic_on_io_nmi)
3396 -               panic("NMI IOCK error: Not continuing");
3397 +       if (panic_on_io_nmi) {
3398 +               nmi_panic(regs, "NMI IOCK error: Not continuing");
3399 +
3400 +               /*
3401 +                * If we end up here, it means we have received an NMI while
3402 +                * processing panic(). Simply return without delaying and
3403 +                * re-enabling NMIs.
3404 +                */
3405 +               return;
3406 +       }
3407  
3408         /* Re-enable the IOCK line, wait for a few seconds */
3409         reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
3410 @@ -297,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
3411  
3412         pr_emerg("Do you have a strange power saving mode enabled?\n");
3413         if (unknown_nmi_panic || panic_on_unrecovered_nmi)
3414 -               panic("NMI: Not continuing");
3415 +               nmi_panic(regs, "NMI: Not continuing");
3416  
3417         pr_emerg("Dazed and confused, but trying to continue\n");
3418  }
3419 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
3420 index 9f950917528b..4dd4beae917a 100644
3421 --- a/arch/x86/kernel/process_32.c
3422 +++ b/arch/x86/kernel/process_32.c
3423 @@ -35,6 +35,7 @@
3424  #include <linux/uaccess.h>
3425  #include <linux/io.h>
3426  #include <linux/kdebug.h>
3427 +#include <linux/highmem.h>
3428  
3429  #include <asm/pgtable.h>
3430  #include <asm/ldt.h>
3431 @@ -210,6 +211,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
3432  }
3433  EXPORT_SYMBOL_GPL(start_thread);
3434  
3435 +#ifdef CONFIG_PREEMPT_RT_FULL
3436 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
3437 +{
3438 +       int i;
3439 +
3440 +       /*
3441 +        * Clear @prev's kmap_atomic mappings
3442 +        */
3443 +       for (i = 0; i < prev_p->kmap_idx; i++) {
3444 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3445 +               pte_t *ptep = kmap_pte - idx;
3446 +
3447 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
3448 +       }
3449 +       /*
3450 +        * Restore @next_p's kmap_atomic mappings
3451 +        */
3452 +       for (i = 0; i < next_p->kmap_idx; i++) {
3453 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3454 +
3455 +               if (!pte_none(next_p->kmap_pte[i]))
3456 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
3457 +       }
3458 +}
3459 +#else
3460 +static inline void
3461 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
3462 +#endif
3463 +
3464  
3465  /*
3466   *     switch_to(x,y) should switch tasks from x to y.
3467 @@ -286,6 +316,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
3468                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
3469                 __switch_to_xtra(prev_p, next_p, tss);
3470  
3471 +       switch_kmaps(prev_p, next_p);
3472 +
3473         /*
3474          * Leave lazy mode, flushing any hypercalls made here.
3475          * This must be done before restoring TLS segments so
3476 diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
3477 index f660d63f40fe..8384207adde2 100644
3478 --- a/arch/x86/kernel/reboot.c
3479 +++ b/arch/x86/kernel/reboot.c
3480 @@ -726,6 +726,7 @@ static int crashing_cpu;
3481  static nmi_shootdown_cb shootdown_callback;
3482  
3483  static atomic_t waiting_for_crash_ipi;
3484 +static int crash_ipi_issued;
3485  
3486  static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
3487  {
3488 @@ -788,6 +789,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3489  
3490         smp_send_nmi_allbutself();
3491  
3492 +       /* Kick CPUs looping in NMI context. */
3493 +       WRITE_ONCE(crash_ipi_issued, 1);
3494 +
3495         msecs = 1000; /* Wait at most a second for the other cpus to stop */
3496         while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
3497                 mdelay(1);
3498 @@ -796,6 +800,22 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3499  
3500         /* Leave the nmi callback set */
3501  }
3502 +
3503 +/* Override the weak function in kernel/panic.c */
3504 +void nmi_panic_self_stop(struct pt_regs *regs)
3505 +{
3506 +       while (1) {
3507 +               /*
3508 +                * Wait for the crash dumping IPI to be issued, and then
3509 +                * call its callback directly.
3510 +                */
3511 +               if (READ_ONCE(crash_ipi_issued))
3512 +                       crash_nmi_callback(0, regs); /* Don't return */
3513 +
3514 +               cpu_relax();
3515 +       }
3516 +}
3517 +
3518  #else /* !CONFIG_SMP */
3519  void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3520  {
3521 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
3522 index 4d30b865be30..20d9e9fb3b74 100644
3523 --- a/arch/x86/kvm/lapic.c
3524 +++ b/arch/x86/kvm/lapic.c
3525 @@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
3526  static void apic_timer_expired(struct kvm_lapic *apic)
3527  {
3528         struct kvm_vcpu *vcpu = apic->vcpu;
3529 -       wait_queue_head_t *q = &vcpu->wq;
3530 +       struct swait_queue_head *q = &vcpu->wq;
3531         struct kvm_timer *ktimer = &apic->lapic_timer;
3532  
3533         if (atomic_read(&apic->lapic_timer.pending))
3534 @@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
3535         atomic_inc(&apic->lapic_timer.pending);
3536         kvm_set_pending_timer(vcpu);
3537  
3538 -       if (waitqueue_active(q))
3539 -               wake_up_interruptible(q);
3540 +       if (swait_active(q))
3541 +               swake_up(q);
3542  
3543         if (apic_lvtt_tscdeadline(apic))
3544                 ktimer->expired_tscdeadline = ktimer->tscdeadline;
3545 @@ -1801,6 +1801,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
3546         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
3547                      HRTIMER_MODE_ABS);
3548         apic->lapic_timer.timer.function = apic_timer_fn;
3549 +       apic->lapic_timer.timer.irqsafe = 1;
3550  
3551         /*
3552          * APIC is created enabled. This will prevent kvm_lapic_set_base from
3553 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
3554 index d7cb9577fa31..77c1bdd802df 100644
3555 --- a/arch/x86/kvm/x86.c
3556 +++ b/arch/x86/kvm/x86.c
3557 @@ -5792,6 +5792,13 @@ int kvm_arch_init(void *opaque)
3558                 goto out;
3559         }
3560  
3561 +#ifdef CONFIG_PREEMPT_RT_FULL
3562 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3563 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
3564 +               return -EOPNOTSUPP;
3565 +       }
3566 +#endif
3567 +
3568         r = kvm_mmu_module_init();
3569         if (r)
3570                 goto out_free_percpu;
3571 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
3572 index a6d739258137..bd24ba1c4a86 100644
3573 --- a/arch/x86/mm/highmem_32.c
3574 +++ b/arch/x86/mm/highmem_32.c
3575 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
3576   */
3577  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3578  {
3579 +       pte_t pte = mk_pte(page, prot);
3580         unsigned long vaddr;
3581         int idx, type;
3582  
3583 -       preempt_disable();
3584 +       preempt_disable_nort();
3585         pagefault_disable();
3586  
3587         if (!PageHighMem(page))
3588 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3589         idx = type + KM_TYPE_NR*smp_processor_id();
3590         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3591         BUG_ON(!pte_none(*(kmap_pte-idx)));
3592 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
3593 +#ifdef CONFIG_PREEMPT_RT_FULL
3594 +       current->kmap_pte[type] = pte;
3595 +#endif
3596 +       set_pte(kmap_pte-idx, pte);
3597         arch_flush_lazy_mmu_mode();
3598  
3599         return (void *)vaddr;
3600 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
3601                  * is a bad idea also, in case the page changes cacheability
3602                  * attributes or becomes a protected page in a hypervisor.
3603                  */
3604 +#ifdef CONFIG_PREEMPT_RT_FULL
3605 +               current->kmap_pte[type] = __pte(0);
3606 +#endif
3607                 kpte_clear_flush(kmap_pte-idx, vaddr);
3608                 kmap_atomic_idx_pop();
3609                 arch_flush_lazy_mmu_mode();
3610 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
3611  #endif
3612  
3613         pagefault_enable();
3614 -       preempt_enable();
3615 +       preempt_enable_nort();
3616  }
3617  EXPORT_SYMBOL(__kunmap_atomic);
3618  
3619 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
3620 index 9c0ff045fdd4..dd25dd1671b6 100644
3621 --- a/arch/x86/mm/iomap_32.c
3622 +++ b/arch/x86/mm/iomap_32.c
3623 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
3624  
3625  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3626  {
3627 +       pte_t pte = pfn_pte(pfn, prot);
3628         unsigned long vaddr;
3629         int idx, type;
3630  
3631 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3632         type = kmap_atomic_idx_push();
3633         idx = type + KM_TYPE_NR * smp_processor_id();
3634         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3635 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
3636 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
3637 +
3638 +#ifdef CONFIG_PREEMPT_RT_FULL
3639 +       current->kmap_pte[type] = pte;
3640 +#endif
3641 +       set_pte(kmap_pte - idx, pte);
3642         arch_flush_lazy_mmu_mode();
3643  
3644         return (void *)vaddr;
3645 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
3646                  * is a bad idea also, in case the page changes cacheability
3647                  * attributes or becomes a protected page in a hypervisor.
3648                  */
3649 +#ifdef CONFIG_PREEMPT_RT_FULL
3650 +               current->kmap_pte[type] = __pte(0);
3651 +#endif
3652                 kpte_clear_flush(kmap_pte-idx, vaddr);
3653                 kmap_atomic_idx_pop();
3654         }
3655 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
3656 index 3b6ec42718e4..7871083de089 100644
3657 --- a/arch/x86/platform/uv/tlb_uv.c
3658 +++ b/arch/x86/platform/uv/tlb_uv.c
3659 @@ -714,9 +714,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
3660  
3661                 quiesce_local_uvhub(hmaster);
3662  
3663 -               spin_lock(&hmaster->queue_lock);
3664 +               raw_spin_lock(&hmaster->queue_lock);
3665                 reset_with_ipi(&bau_desc->distribution, bcp);
3666 -               spin_unlock(&hmaster->queue_lock);
3667 +               raw_spin_unlock(&hmaster->queue_lock);
3668  
3669                 end_uvhub_quiesce(hmaster);
3670  
3671 @@ -736,9 +736,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
3672  
3673                 quiesce_local_uvhub(hmaster);
3674  
3675 -               spin_lock(&hmaster->queue_lock);
3676 +               raw_spin_lock(&hmaster->queue_lock);
3677                 reset_with_ipi(&bau_desc->distribution, bcp);
3678 -               spin_unlock(&hmaster->queue_lock);
3679 +               raw_spin_unlock(&hmaster->queue_lock);
3680  
3681                 end_uvhub_quiesce(hmaster);
3682  
3683 @@ -759,7 +759,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
3684         cycles_t tm1;
3685  
3686         hmaster = bcp->uvhub_master;
3687 -       spin_lock(&hmaster->disable_lock);
3688 +       raw_spin_lock(&hmaster->disable_lock);
3689         if (!bcp->baudisabled) {
3690                 stat->s_bau_disabled++;
3691                 tm1 = get_cycles();
3692 @@ -772,7 +772,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
3693                         }
3694                 }
3695         }
3696 -       spin_unlock(&hmaster->disable_lock);
3697 +       raw_spin_unlock(&hmaster->disable_lock);
3698  }
3699  
3700  static void count_max_concurr(int stat, struct bau_control *bcp,
3701 @@ -835,7 +835,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
3702   */
3703  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
3704  {
3705 -       spinlock_t *lock = &hmaster->uvhub_lock;
3706 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
3707         atomic_t *v;
3708  
3709         v = &hmaster->active_descriptor_count;
3710 @@ -968,7 +968,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
3711         struct bau_control *hmaster;
3712  
3713         hmaster = bcp->uvhub_master;
3714 -       spin_lock(&hmaster->disable_lock);
3715 +       raw_spin_lock(&hmaster->disable_lock);
3716         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
3717                 stat->s_bau_reenabled++;
3718                 for_each_present_cpu(tcpu) {
3719 @@ -980,10 +980,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
3720                                 tbcp->period_giveups = 0;
3721                         }
3722                 }
3723 -               spin_unlock(&hmaster->disable_lock);
3724 +               raw_spin_unlock(&hmaster->disable_lock);
3725                 return 0;
3726         }
3727 -       spin_unlock(&hmaster->disable_lock);
3728 +       raw_spin_unlock(&hmaster->disable_lock);
3729         return -1;
3730  }
3731  
3732 @@ -1901,9 +1901,9 @@ static void __init init_per_cpu_tunables(void)
3733                 bcp->cong_reps                  = congested_reps;
3734                 bcp->disabled_period =          sec_2_cycles(disabled_period);
3735                 bcp->giveup_limit =             giveup_limit;
3736 -               spin_lock_init(&bcp->queue_lock);
3737 -               spin_lock_init(&bcp->uvhub_lock);
3738 -               spin_lock_init(&bcp->disable_lock);
3739 +               raw_spin_lock_init(&bcp->queue_lock);
3740 +               raw_spin_lock_init(&bcp->uvhub_lock);
3741 +               raw_spin_lock_init(&bcp->disable_lock);
3742         }
3743  }
3744  
3745 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
3746 index 2b158a9fa1d7..5e0b122620cb 100644
3747 --- a/arch/x86/platform/uv/uv_time.c
3748 +++ b/arch/x86/platform/uv/uv_time.c
3749 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
3750  
3751  /* There is one of these allocated per node */
3752  struct uv_rtc_timer_head {
3753 -       spinlock_t      lock;
3754 +       raw_spinlock_t  lock;
3755         /* next cpu waiting for timer, local node relative: */
3756         int             next_cpu;
3757         /* number of cpus on this node: */
3758 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
3759                                 uv_rtc_deallocate_timers();
3760                                 return -ENOMEM;
3761                         }
3762 -                       spin_lock_init(&head->lock);
3763 +                       raw_spin_lock_init(&head->lock);
3764                         head->ncpus = uv_blade_nr_possible_cpus(bid);
3765                         head->next_cpu = -1;
3766                         blade_info[bid] = head;
3767 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
3768         unsigned long flags;
3769         int next_cpu;
3770  
3771 -       spin_lock_irqsave(&head->lock, flags);
3772 +       raw_spin_lock_irqsave(&head->lock, flags);
3773  
3774         next_cpu = head->next_cpu;
3775         *t = expires;
3776 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
3777                 if (uv_setup_intr(cpu, expires)) {
3778                         *t = ULLONG_MAX;
3779                         uv_rtc_find_next_timer(head, pnode);
3780 -                       spin_unlock_irqrestore(&head->lock, flags);
3781 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
3782                         return -ETIME;
3783                 }
3784         }
3785  
3786 -       spin_unlock_irqrestore(&head->lock, flags);
3787 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3788         return 0;
3789  }
3790  
3791 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
3792         unsigned long flags;
3793         int rc = 0;
3794  
3795 -       spin_lock_irqsave(&head->lock, flags);
3796 +       raw_spin_lock_irqsave(&head->lock, flags);
3797  
3798         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
3799                 rc = 1;
3800 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
3801                         uv_rtc_find_next_timer(head, pnode);
3802         }
3803  
3804 -       spin_unlock_irqrestore(&head->lock, flags);
3805 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3806  
3807         return rc;
3808  }
3809 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
3810  static cycle_t uv_read_rtc(struct clocksource *cs)
3811  {
3812         unsigned long offset;
3813 +       cycle_t cycles;
3814  
3815 +       preempt_disable();
3816         if (uv_get_min_hub_revision_id() == 1)
3817                 offset = 0;
3818         else
3819                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
3820  
3821 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3822 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3823 +       preempt_enable();
3824 +
3825 +       return cycles;
3826  }
3827  
3828  /*
3829 diff --git a/block/blk-core.c b/block/blk-core.c
3830 index 4fab5d610805..52d2fe2fec8f 100644
3831 --- a/block/blk-core.c
3832 +++ b/block/blk-core.c
3833 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
3834  
3835         INIT_LIST_HEAD(&rq->queuelist);
3836         INIT_LIST_HEAD(&rq->timeout_list);
3837 +#ifdef CONFIG_PREEMPT_RT_FULL
3838 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3839 +#endif
3840         rq->cpu = -1;
3841         rq->q = q;
3842         rq->__sector = (sector_t) -1;
3843 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
3844   **/
3845  void blk_start_queue(struct request_queue *q)
3846  {
3847 -       WARN_ON(!irqs_disabled());
3848 +       WARN_ON_NONRT(!irqs_disabled());
3849  
3850         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
3851         __blk_run_queue(q);
3852 @@ -659,7 +662,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp)
3853                 if (!gfpflags_allow_blocking(gfp))
3854                         return -EBUSY;
3855  
3856 -               ret = wait_event_interruptible(q->mq_freeze_wq,
3857 +               ret = swait_event_interruptible(q->mq_freeze_wq,
3858                                 !atomic_read(&q->mq_freeze_depth) ||
3859                                 blk_queue_dying(q));
3860                 if (blk_queue_dying(q))
3861 @@ -679,7 +682,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
3862         struct request_queue *q =
3863                 container_of(ref, struct request_queue, q_usage_counter);
3864  
3865 -       wake_up_all(&q->mq_freeze_wq);
3866 +       swake_up_all(&q->mq_freeze_wq);
3867  }
3868  
3869  struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3870 @@ -741,7 +744,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3871         q->bypass_depth = 1;
3872         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3873  
3874 -       init_waitqueue_head(&q->mq_freeze_wq);
3875 +       init_swait_queue_head(&q->mq_freeze_wq);
3876  
3877         /*
3878          * Init percpu_ref in atomic mode so that it's faster to shutdown.
3879 @@ -3200,7 +3203,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
3880                 blk_run_queue_async(q);
3881         else
3882                 __blk_run_queue(q);
3883 -       spin_unlock(q->queue_lock);
3884 +       spin_unlock_irq(q->queue_lock);
3885  }
3886  
3887  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3888 @@ -3248,7 +3251,6 @@ EXPORT_SYMBOL(blk_check_plugged);
3889  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3890  {
3891         struct request_queue *q;
3892 -       unsigned long flags;
3893         struct request *rq;
3894         LIST_HEAD(list);
3895         unsigned int depth;
3896 @@ -3268,11 +3270,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3897         q = NULL;
3898         depth = 0;
3899  
3900 -       /*
3901 -        * Save and disable interrupts here, to avoid doing it for every
3902 -        * queue lock we have to take.
3903 -        */
3904 -       local_irq_save(flags);
3905         while (!list_empty(&list)) {
3906                 rq = list_entry_rq(list.next);
3907                 list_del_init(&rq->queuelist);
3908 @@ -3285,7 +3282,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3909                                 queue_unplugged(q, depth, from_schedule);
3910                         q = rq->q;
3911                         depth = 0;
3912 -                       spin_lock(q->queue_lock);
3913 +                       spin_lock_irq(q->queue_lock);
3914                 }
3915  
3916                 /*
3917 @@ -3312,8 +3309,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3918          */
3919         if (q)
3920                 queue_unplugged(q, depth, from_schedule);
3921 -
3922 -       local_irq_restore(flags);
3923  }
3924  
3925  void blk_finish_plug(struct blk_plug *plug)
3926 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3927 index 381cb50a673c..dc8785233d94 100644
3928 --- a/block/blk-ioc.c
3929 +++ b/block/blk-ioc.c
3930 @@ -7,6 +7,7 @@
3931  #include <linux/bio.h>
3932  #include <linux/blkdev.h>
3933  #include <linux/slab.h>
3934 +#include <linux/delay.h>
3935  
3936  #include "blk.h"
3937  
3938 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3939                         spin_unlock(q->queue_lock);
3940                 } else {
3941                         spin_unlock_irqrestore(&ioc->lock, flags);
3942 -                       cpu_relax();
3943 +                       cpu_chill();
3944                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3945                 }
3946         }
3947 @@ -187,7 +188,7 @@ retry:
3948                         spin_unlock(icq->q->queue_lock);
3949                 } else {
3950                         spin_unlock_irqrestore(&ioc->lock, flags);
3951 -                       cpu_relax();
3952 +                       cpu_chill();
3953                         goto retry;
3954                 }
3955         }
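
cpu_relax() busy-waits for the other context to drop its lock, which on RT can livelock against a preempted lock holder; the RT-patch helper cpu_chill() (the reason the hunk above adds <linux/delay.h>) sleeps briefly on RT and falls back to cpu_relax() otherwise. A sketch of the same trylock-and-back-off pattern with made-up types and names (struct pair, pair_lock_both):

#include <linux/spinlock.h>
#include <linux/delay.h>        /* cpu_chill() on RT, cpu_relax() otherwise */

struct pair {
        spinlock_t a;
        spinlock_t b;
};

/* Take a then b without deadlocking against the b-then-a order;
 * on failure drop a, back off, and retry.  Returns with both held.
 */
static void pair_lock_both(struct pair *p, unsigned long *flags)
{
retry:
        spin_lock_irqsave(&p->a, *flags);
        if (!spin_trylock(&p->b)) {
                spin_unlock_irqrestore(&p->a, *flags);
                cpu_chill();
                goto retry;
        }
}
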
3956 diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
3957 index 0736729d6494..3e21e31d0d7e 100644
3958 --- a/block/blk-iopoll.c
3959 +++ b/block/blk-iopoll.c
3960 @@ -35,6 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop)
3961         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
3962         __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
3963         local_irq_restore(flags);
3964 +       preempt_check_resched_rt();
3965  }
3966  EXPORT_SYMBOL(blk_iopoll_sched);
3967  
3968 @@ -132,6 +133,7 @@ static void blk_iopoll_softirq(struct softirq_action *h)
3969                 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
3970  
3971         local_irq_enable();
3972 +       preempt_check_resched_rt();
3973  }
3974  
3975  /**
3976 @@ -201,6 +203,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self,
3977                                  this_cpu_ptr(&blk_cpu_iopoll));
3978                 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
3979                 local_irq_enable();
3980 +               preempt_check_resched_rt();
3981         }
3982  
3983         return NOTIFY_OK;
3984 diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
3985 index bb3ed488f7b5..628c6c13c482 100644
3986 --- a/block/blk-mq-cpu.c
3987 +++ b/block/blk-mq-cpu.c
3988 @@ -16,7 +16,7 @@
3989  #include "blk-mq.h"
3990  
3991  static LIST_HEAD(blk_mq_cpu_notify_list);
3992 -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
3993 +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
3994  
3995  static int blk_mq_main_cpu_notify(struct notifier_block *self,
3996                                   unsigned long action, void *hcpu)
3997 @@ -25,7 +25,10 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
3998         struct blk_mq_cpu_notifier *notify;
3999         int ret = NOTIFY_OK;
4000  
4001 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4002 +       if (action != CPU_POST_DEAD)
4003 +               return NOTIFY_OK;
4004 +
4005 +       spin_lock(&blk_mq_cpu_notify_lock);
4006  
4007         list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
4008                 ret = notify->notify(notify->data, action, cpu);
4009 @@ -33,7 +36,7 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
4010                         break;
4011         }
4012  
4013 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4014 +       spin_unlock(&blk_mq_cpu_notify_lock);
4015         return ret;
4016  }
4017  
4018 @@ -41,16 +44,16 @@ void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
4019  {
4020         BUG_ON(!notifier->notify);
4021  
4022 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4023 +       spin_lock(&blk_mq_cpu_notify_lock);
4024         list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
4025 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4026 +       spin_unlock(&blk_mq_cpu_notify_lock);
4027  }
4028  
4029  void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
4030  {
4031 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4032 +       spin_lock(&blk_mq_cpu_notify_lock);
4033         list_del(&notifier->list);
4034 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4035 +       spin_unlock(&blk_mq_cpu_notify_lock);
4036  }
4037  
4038  void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
4039 diff --git a/block/blk-mq.c b/block/blk-mq.c
4040 index c3e461ec40e4..03dfc2c91595 100644
4041 --- a/block/blk-mq.c
4042 +++ b/block/blk-mq.c
4043 @@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
4044  
4045  static void blk_mq_freeze_queue_wait(struct request_queue *q)
4046  {
4047 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
4048 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
4049  }
4050  
4051  /*
4052 @@ -130,7 +130,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
4053         WARN_ON_ONCE(freeze_depth < 0);
4054         if (!freeze_depth) {
4055                 percpu_ref_reinit(&q->q_usage_counter);
4056 -               wake_up_all(&q->mq_freeze_wq);
4057 +               swake_up_all(&q->mq_freeze_wq);
4058         }
4059  }
4060  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
4061 @@ -149,7 +149,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
4062          * dying, we need to ensure that processes currently waiting on
4063          * the queue are notified as well.
4064          */
4065 -       wake_up_all(&q->mq_freeze_wq);
4066 +       swake_up_all(&q->mq_freeze_wq);
4067  }
4068  
4069  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
4070 @@ -196,6 +196,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
4071         rq->resid_len = 0;
4072         rq->sense = NULL;
4073  
4074 +#ifdef CONFIG_PREEMPT_RT_FULL
4075 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
4076 +#endif
4077         INIT_LIST_HEAD(&rq->timeout_list);
4078         rq->timeout = 0;
4079  
4080 @@ -325,6 +328,17 @@ void blk_mq_end_request(struct request *rq, int error)
4081  }
4082  EXPORT_SYMBOL(blk_mq_end_request);
4083  
4084 +#ifdef CONFIG_PREEMPT_RT_FULL
4085 +
4086 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
4087 +{
4088 +       struct request *rq = container_of(work, struct request, work);
4089 +
4090 +       rq->q->softirq_done_fn(rq);
4091 +}
4092 +
4093 +#else
4094 +
4095  static void __blk_mq_complete_request_remote(void *data)
4096  {
4097         struct request *rq = data;
4098 @@ -332,6 +346,8 @@ static void __blk_mq_complete_request_remote(void *data)
4099         rq->q->softirq_done_fn(rq);
4100  }
4101  
4102 +#endif
4103 +
4104  static void blk_mq_ipi_complete_request(struct request *rq)
4105  {
4106         struct blk_mq_ctx *ctx = rq->mq_ctx;
4107 @@ -343,19 +359,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
4108                 return;
4109         }
4110  
4111 -       cpu = get_cpu();
4112 +       cpu = get_cpu_light();
4113         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
4114                 shared = cpus_share_cache(cpu, ctx->cpu);
4115  
4116         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
4117 +#ifdef CONFIG_PREEMPT_RT_FULL
4118 +               schedule_work_on(ctx->cpu, &rq->work);
4119 +#else
4120                 rq->csd.func = __blk_mq_complete_request_remote;
4121                 rq->csd.info = rq;
4122                 rq->csd.flags = 0;
4123                 smp_call_function_single_async(ctx->cpu, &rq->csd);
4124 +#endif
4125         } else {
4126                 rq->q->softirq_done_fn(rq);
4127         }
4128 -       put_cpu();
4129 +       put_cpu_light();
4130  }
4131  
4132  static void __blk_mq_complete_request(struct request *rq)
4133 @@ -864,14 +884,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
4134                 return;
4135  
4136         if (!async) {
4137 -               int cpu = get_cpu();
4138 +               int cpu = get_cpu_light();
4139                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
4140                         __blk_mq_run_hw_queue(hctx);
4141 -                       put_cpu();
4142 +                       put_cpu_light();
4143                         return;
4144                 }
4145  
4146 -               put_cpu();
4147 +               put_cpu_light();
4148         }
4149  
4150         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
4151 @@ -1619,7 +1639,7 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
4152  {
4153         struct blk_mq_hw_ctx *hctx = data;
4154  
4155 -       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
4156 +       if (action == CPU_POST_DEAD)
4157                 return blk_mq_hctx_cpu_offline(hctx, cpu);
4158  
4159         /*
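Among the blk-mq.c changes above, the queue-freeze waitqueue moves from wait_event()/wake_up_all() to the simple-wait API (swait_event()/swake_up_all()), whose wakeup path does bounded work under a raw lock. A minimal sketch of that API, assuming the swait interface added elsewhere in this patch (DECLARE_SWAIT_QUEUE_HEAD and friends):

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_done;

static void demo_wait(void)
{
        /* Sleep until demo_done is set; the condition is re-checked on wakeup. */
        swait_event(demo_wq, demo_done);
}

static void demo_complete(void)
{
        demo_done = true;
        swake_up_all(&demo_wq);         /* wake every waiter on the simple waitqueue */
}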
4160 diff --git a/block/blk-mq.h b/block/blk-mq.h
4161 index 713820b47b31..3cb6feb4fe23 100644
4162 --- a/block/blk-mq.h
4163 +++ b/block/blk-mq.h
4164 @@ -74,7 +74,10 @@ struct blk_align_bitmap {
4165  static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
4166                                            unsigned int cpu)
4167  {
4168 -       return per_cpu_ptr(q->queue_ctx, cpu);
4169 +       struct blk_mq_ctx *ctx;
4170 +
4171 +       ctx = per_cpu_ptr(q->queue_ctx, cpu);
4172 +       return ctx;
4173  }
4174  
4175  /*
4176 @@ -85,12 +88,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
4177   */
4178  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
4179  {
4180 -       return __blk_mq_get_ctx(q, get_cpu());
4181 +       return __blk_mq_get_ctx(q, get_cpu_light());
4182  }
4183  
4184  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
4185  {
4186 -       put_cpu();
4187 +       put_cpu_light();
4188  }
4189  
4190  struct blk_mq_alloc_data {
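blk_mq_get_ctx()/blk_mq_put_ctx() above now pin the CPU with get_cpu_light()/put_cpu_light() instead of get_cpu()/put_cpu(), so the caller stays on one CPU without disabling preemption. A rough sketch of that usage pattern, assuming get_cpu_light()/put_cpu_light() are the migrate_disable()-based helpers introduced elsewhere in this patch:

/* Run fn() on the local CPU if that CPU is in @mask, without disabling preemption. */
static bool try_run_local(const struct cpumask *mask, void (*fn)(void))
{
        int cpu = get_cpu_light();      /* pins the task to this CPU, stays preemptible */
        bool ran = false;

        if (cpumask_test_cpu(cpu, mask)) {
                fn();                   /* may sleep on RT; only migration is blocked */
                ran = true;
        }
        put_cpu_light();
        return ran;
}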
4191 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
4192 index 53b1737e978d..81c3c0a62edf 100644
4193 --- a/block/blk-softirq.c
4194 +++ b/block/blk-softirq.c
4195 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
4196                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
4197  
4198         local_irq_restore(flags);
4199 +       preempt_check_resched_rt();
4200  }
4201  
4202  /*
4203 @@ -93,6 +94,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
4204                                  this_cpu_ptr(&blk_cpu_done));
4205                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
4206                 local_irq_enable();
4207 +               preempt_check_resched_rt();
4208         }
4209  
4210         return NOTIFY_OK;
4211 @@ -150,6 +152,7 @@ do_local:
4212                 goto do_local;
4213  
4214         local_irq_restore(flags);
4215 +       preempt_check_resched_rt();
4216  }
4217  
4218  /**
4219 diff --git a/block/bounce.c b/block/bounce.c
4220 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
4221 --- a/block/bounce.c
4222 +++ b/block/bounce.c
4223 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
4224         unsigned long flags;
4225         unsigned char *vto;
4226  
4227 -       local_irq_save(flags);
4228 +       local_irq_save_nort(flags);
4229         vto = kmap_atomic(to->bv_page);
4230         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
4231         kunmap_atomic(vto);
4232 -       local_irq_restore(flags);
4233 +       local_irq_restore_nort(flags);
4234  }
4235  
4236  #else /* CONFIG_HIGHMEM */
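bounce_copy_vec() above (and the libata PIO paths further down) switch local_irq_save()/restore() to the _nort variants: a stock kernel still disables interrupts around the copy, while with the *_nort macros this patch introduces, PREEMPT_RT only saves the flags and leaves interrupts enabled. A minimal sketch of the same pattern:

static void copy_to_page(struct page *page, const void *src, size_t len)
{
        unsigned long flags;
        void *dst;

        local_irq_save_nort(flags);     /* irqs off on !RT; flags-only on RT */
        dst = kmap_atomic(page);
        memcpy(dst + 0, src, len);
        kunmap_atomic(dst);
        local_irq_restore_nort(flags);
}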
4237 diff --git a/crypto/algapi.c b/crypto/algapi.c
4238 index 59bf491fe3d8..f98e79c8cd77 100644
4239 --- a/crypto/algapi.c
4240 +++ b/crypto/algapi.c
4241 @@ -719,13 +719,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
4242  
4243  int crypto_register_notifier(struct notifier_block *nb)
4244  {
4245 -       return blocking_notifier_chain_register(&crypto_chain, nb);
4246 +       return srcu_notifier_chain_register(&crypto_chain, nb);
4247  }
4248  EXPORT_SYMBOL_GPL(crypto_register_notifier);
4249  
4250  int crypto_unregister_notifier(struct notifier_block *nb)
4251  {
4252 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
4253 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
4254  }
4255  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
4256  
4257 diff --git a/crypto/api.c b/crypto/api.c
4258 index bbc147cb5dec..bc1a848f02ec 100644
4259 --- a/crypto/api.c
4260 +++ b/crypto/api.c
4261 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
4262  DECLARE_RWSEM(crypto_alg_sem);
4263  EXPORT_SYMBOL_GPL(crypto_alg_sem);
4264  
4265 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
4266 +SRCU_NOTIFIER_HEAD(crypto_chain);
4267  EXPORT_SYMBOL_GPL(crypto_chain);
4268  
4269  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
4270 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
4271  {
4272         int ok;
4273  
4274 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
4275 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
4276         if (ok == NOTIFY_DONE) {
4277                 request_module("cryptomgr");
4278 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
4279 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
4280         }
4281  
4282         return ok;
4283 diff --git a/crypto/internal.h b/crypto/internal.h
4284 index 00e42a3ed814..2e85551e235f 100644
4285 --- a/crypto/internal.h
4286 +++ b/crypto/internal.h
4287 @@ -47,7 +47,7 @@ struct crypto_larval {
4288  
4289  extern struct list_head crypto_alg_list;
4290  extern struct rw_semaphore crypto_alg_sem;
4291 -extern struct blocking_notifier_head crypto_chain;
4292 +extern struct srcu_notifier_head crypto_chain;
4293  
4294  #ifdef CONFIG_PROC_FS
4295  void __init crypto_init_proc(void);
4296 @@ -143,7 +143,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
4297  
4298  static inline void crypto_notify(unsigned long val, void *v)
4299  {
4300 -       blocking_notifier_call_chain(&crypto_chain, val, v);
4301 +       srcu_notifier_call_chain(&crypto_chain, val, v);
4302  }
4303  
4304  #endif /* _CRYPTO_INTERNAL_H */
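The crypto notifier chain above is converted from a blocking notifier (whose call path takes an rw_semaphore) to an SRCU notifier, whose call path only enters an SRCU read-side section. A minimal sketch of the stock SRCU notifier interface used in those hunks, assuming the head can be declared statically as crypto/api.c does above:

#include <linux/notifier.h>

static SRCU_NOTIFIER_HEAD(demo_chain);

static int demo_event(struct notifier_block *nb, unsigned long val, void *data)
{
        return NOTIFY_OK;
}

static struct notifier_block demo_nb = { .notifier_call = demo_event };

static void demo(void)
{
        srcu_notifier_chain_register(&demo_chain, &demo_nb);
        srcu_notifier_call_chain(&demo_chain, 1UL, NULL);   /* SRCU read lock only */
        srcu_notifier_chain_unregister(&demo_chain, &demo_nb);
}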
4305 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
4306 index faa97604d878..941497f31cf0 100644
4307 --- a/drivers/acpi/acpica/acglobal.h
4308 +++ b/drivers/acpi/acpica/acglobal.h
4309 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
4310   * interrupt level
4311   */
4312  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
4313 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
4314 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
4315  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
4316  
4317  /* Mutex for _OSI support */
4318 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
4319 index 3cf77afd142c..dc32e72132f1 100644
4320 --- a/drivers/acpi/acpica/hwregs.c
4321 +++ b/drivers/acpi/acpica/hwregs.c
4322 @@ -269,14 +269,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
4323                           ACPI_BITMASK_ALL_FIXED_STATUS,
4324                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
4325  
4326 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
4327 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
4328  
4329         /* Clear the fixed events in PM1 A/B */
4330  
4331         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
4332                                         ACPI_BITMASK_ALL_FIXED_STATUS);
4333  
4334 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
4335 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
4336  
4337         if (ACPI_FAILURE(status)) {
4338                 goto exit;
4339 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
4340 index b2e50d8007fe..ff007084dc48 100644
4341 --- a/drivers/acpi/acpica/hwxface.c
4342 +++ b/drivers/acpi/acpica/hwxface.c
4343 @@ -374,7 +374,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
4344                 return_ACPI_STATUS(AE_BAD_PARAMETER);
4345         }
4346  
4347 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
4348 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
4349  
4350         /*
4351          * At this point, we know that the parent register is one of the
4352 @@ -435,7 +435,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
4353  
4354  unlock_and_exit:
4355  
4356 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
4357 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
4358         return_ACPI_STATUS(status);
4359  }
4360  
4361 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
4362 index ce406e39b669..41a75eb3ae9d 100644
4363 --- a/drivers/acpi/acpica/utmutex.c
4364 +++ b/drivers/acpi/acpica/utmutex.c
4365 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
4366                 return_ACPI_STATUS (status);
4367         }
4368  
4369 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
4370 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
4371         if (ACPI_FAILURE (status)) {
4372                 return_ACPI_STATUS (status);
4373         }
4374 @@ -156,7 +156,7 @@ void acpi_ut_mutex_terminate(void)
4375         /* Delete the spinlocks */
4376  
4377         acpi_os_delete_lock(acpi_gbl_gpe_lock);
4378 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
4379 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
4380         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
4381  
4382         /* Delete the reader/writer lock */
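The ACPICA hunks above turn acpi_gbl_hardware_lock into a raw spinlock: on PREEMPT_RT ordinary spinlocks can sleep, and the hardware register accessors must not. The conversion follows the usual RT recipe, sketched here with a generic raw lock (acpi_os_create_raw_lock()/acpi_os_delete_raw_lock() are wrappers this patch adds around the same primitive):

static DEFINE_RAW_SPINLOCK(hw_lock);    /* never sleeps, even on RT */

static void poke_hw_register(void __iomem *reg, u32 val)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&hw_lock, flags);
        writel(val, reg);               /* short, bounded critical section */
        raw_spin_unlock_irqrestore(&hw_lock, flags);
}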
4383 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
4384 index 7dbba387d12a..65beb7abb4e7 100644
4385 --- a/drivers/ata/libata-sff.c
4386 +++ b/drivers/ata/libata-sff.c
4387 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
4388         unsigned long flags;
4389         unsigned int consumed;
4390  
4391 -       local_irq_save(flags);
4392 +       local_irq_save_nort(flags);
4393         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
4394 -       local_irq_restore(flags);
4395 +       local_irq_restore_nort(flags);
4396  
4397         return consumed;
4398  }
4399 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
4400                 unsigned long flags;
4401  
4402                 /* FIXME: use a bounce buffer */
4403 -               local_irq_save(flags);
4404 +               local_irq_save_nort(flags);
4405                 buf = kmap_atomic(page);
4406  
4407                 /* do the actual data transfer */
4408 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
4409                                        do_write);
4410  
4411                 kunmap_atomic(buf);
4412 -               local_irq_restore(flags);
4413 +               local_irq_restore_nort(flags);
4414         } else {
4415                 buf = page_address(page);
4416                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
4417 @@ -864,7 +864,7 @@ next_sg:
4418                 unsigned long flags;
4419  
4420                 /* FIXME: use bounce buffer */
4421 -               local_irq_save(flags);
4422 +               local_irq_save_nort(flags);
4423                 buf = kmap_atomic(page);
4424  
4425                 /* do the actual data transfer */
4426 @@ -872,7 +872,7 @@ next_sg:
4427                                                                 count, rw);
4428  
4429                 kunmap_atomic(buf);
4430 -               local_irq_restore(flags);
4431 +               local_irq_restore_nort(flags);
4432         } else {
4433                 buf = page_address(page);
4434                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
4435 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
4436 index 370c2f76016d..65e0b375a291 100644
4437 --- a/drivers/block/zram/zram_drv.c
4438 +++ b/drivers/block/zram/zram_drv.c
4439 @@ -520,6 +520,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
4440                 goto out_error;
4441         }
4442  
4443 +       zram_meta_init_table_locks(meta, disksize);
4444 +
4445         return meta;
4446  
4447  out_error:
4448 @@ -568,12 +570,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
4449         unsigned long handle;
4450         size_t size;
4451  
4452 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4453 +       zram_lock_table(&meta->table[index]);
4454         handle = meta->table[index].handle;
4455         size = zram_get_obj_size(meta, index);
4456  
4457         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
4458 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4459 +               zram_unlock_table(&meta->table[index]);
4460                 clear_page(mem);
4461                 return 0;
4462         }
4463 @@ -584,7 +586,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
4464         else
4465                 ret = zcomp_decompress(zram->comp, cmem, size, mem);
4466         zs_unmap_object(meta->mem_pool, handle);
4467 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4468 +       zram_unlock_table(&meta->table[index]);
4469  
4470         /* Should NEVER happen. Return bio error if it does. */
4471         if (unlikely(ret)) {
4472 @@ -604,14 +606,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
4473         struct zram_meta *meta = zram->meta;
4474         page = bvec->bv_page;
4475  
4476 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4477 +       zram_lock_table(&meta->table[index]);
4478         if (unlikely(!meta->table[index].handle) ||
4479                         zram_test_flag(meta, index, ZRAM_ZERO)) {
4480 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4481 +               zram_unlock_table(&meta->table[index]);
4482                 handle_zero_page(bvec);
4483                 return 0;
4484         }
4485 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4486 +       zram_unlock_table(&meta->table[index]);
4487  
4488         if (is_partial_io(bvec))
4489                 /* Use  a temporary buffer to decompress the page */
4490 @@ -689,10 +691,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
4491                 if (user_mem)
4492                         kunmap_atomic(user_mem);
4493                 /* Free memory associated with this sector now. */
4494 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4495 +               zram_lock_table(&meta->table[index]);
4496                 zram_free_page(zram, index);
4497                 zram_set_flag(meta, index, ZRAM_ZERO);
4498 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4499 +               zram_unlock_table(&meta->table[index]);
4500  
4501                 atomic64_inc(&zram->stats.zero_pages);
4502                 ret = 0;
4503 @@ -752,12 +754,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
4504          * Free memory associated with this sector
4505          * before overwriting unused sectors.
4506          */
4507 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4508 +       zram_lock_table(&meta->table[index]);
4509         zram_free_page(zram, index);
4510  
4511         meta->table[index].handle = handle;
4512         zram_set_obj_size(meta, index, clen);
4513 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4514 +       zram_unlock_table(&meta->table[index]);
4515  
4516         /* Update stats */
4517         atomic64_add(clen, &zram->stats.compr_data_size);
4518 @@ -800,9 +802,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
4519         }
4520  
4521         while (n >= PAGE_SIZE) {
4522 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4523 +               zram_lock_table(&meta->table[index]);
4524                 zram_free_page(zram, index);
4525 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4526 +               zram_unlock_table(&meta->table[index]);
4527                 atomic64_inc(&zram->stats.notify_free);
4528                 index++;
4529                 n -= PAGE_SIZE;
4530 @@ -928,9 +930,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
4531         zram = bdev->bd_disk->private_data;
4532         meta = zram->meta;
4533  
4534 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4535 +       zram_lock_table(&meta->table[index]);
4536         zram_free_page(zram, index);
4537 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4538 +       zram_unlock_table(&meta->table[index]);
4539         atomic64_inc(&zram->stats.notify_free);
4540  }
4541  
4542 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
4543 index 8e92339686d7..9e3e953d680e 100644
4544 --- a/drivers/block/zram/zram_drv.h
4545 +++ b/drivers/block/zram/zram_drv.h
4546 @@ -72,6 +72,9 @@ enum zram_pageflags {
4547  struct zram_table_entry {
4548         unsigned long handle;
4549         unsigned long value;
4550 +#ifdef CONFIG_PREEMPT_RT_BASE
4551 +       spinlock_t lock;
4552 +#endif
4553  };
4554  
4555  struct zram_stats {
4556 @@ -119,4 +122,42 @@ struct zram {
4557          */
4558         bool claim; /* Protected by bdev->bd_mutex */
4559  };
4560 +
4561 +#ifndef CONFIG_PREEMPT_RT_BASE
4562 +static inline void zram_lock_table(struct zram_table_entry *table)
4563 +{
4564 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
4565 +}
4566 +
4567 +static inline void zram_unlock_table(struct zram_table_entry *table)
4568 +{
4569 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
4570 +}
4571 +
4572 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
4573 +#else /* CONFIG_PREEMPT_RT_BASE */
4574 +static inline void zram_lock_table(struct zram_table_entry *table)
4575 +{
4576 +       spin_lock(&table->lock);
4577 +       __set_bit(ZRAM_ACCESS, &table->value);
4578 +}
4579 +
4580 +static inline void zram_unlock_table(struct zram_table_entry *table)
4581 +{
4582 +       __clear_bit(ZRAM_ACCESS, &table->value);
4583 +       spin_unlock(&table->lock);
4584 +}
4585 +
4586 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
4587 +{
4588 +        size_t num_pages = disksize >> PAGE_SHIFT;
4589 +        size_t index;
4590 +
4591 +        for (index = 0; index < num_pages; index++) {
4592 +               spinlock_t *lock = &meta->table[index].lock;
4593 +               spin_lock_init(lock);
4594 +        }
4595 +}
4596 +#endif /* CONFIG_PREEMPT_RT_BASE */
4597 +
4598  #endif
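With these helpers, zram_drv.c above brackets every table[index] access with zram_lock_table()/zram_unlock_table(): on a stock kernel that is still the ZRAM_ACCESS bit spinlock, on RT it is a real per-entry spinlock_t plus the flag bit. A usage sketch in the style of the zram_bvec_read() hunk, reusing the zram_meta/zram_table_entry definitions above:

static unsigned long read_handle(struct zram_meta *meta, u32 index)
{
        unsigned long handle;

        zram_lock_table(&meta->table[index]);   /* bit spinlock on !RT, spinlock_t on RT */
        handle = meta->table[index].handle;
        zram_unlock_table(&meta->table[index]);

        return handle;
}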
4599 diff --git a/drivers/char/random.c b/drivers/char/random.c
4600 index 491a4dce13fe..cf69b6b42208 100644
4601 --- a/drivers/char/random.c
4602 +++ b/drivers/char/random.c
4603 @@ -799,8 +799,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
4604         } sample;
4605         long delta, delta2, delta3;
4606  
4607 -       preempt_disable();
4608 -
4609         sample.jiffies = jiffies;
4610         sample.cycles = random_get_entropy();
4611         sample.num = num;
4612 @@ -841,7 +839,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
4613                  */
4614                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
4615         }
4616 -       preempt_enable();
4617  }
4618  
4619  void add_input_randomness(unsigned int type, unsigned int code,
4620 @@ -894,28 +891,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
4621         return *(ptr + f->reg_idx++);
4622  }
4623  
4624 -void add_interrupt_randomness(int irq, int irq_flags)
4625 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
4626  {
4627         struct entropy_store    *r;
4628         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
4629 -       struct pt_regs          *regs = get_irq_regs();
4630         unsigned long           now = jiffies;
4631         cycles_t                cycles = random_get_entropy();
4632         __u32                   c_high, j_high;
4633 -       __u64                   ip;
4634         unsigned long           seed;
4635         int                     credit = 0;
4636  
4637         if (cycles == 0)
4638 -               cycles = get_reg(fast_pool, regs);
4639 +               cycles = get_reg(fast_pool, NULL);
4640         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
4641         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
4642         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
4643         fast_pool->pool[1] ^= now ^ c_high;
4644 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
4645 +       if (!ip)
4646 +               ip = _RET_IP_;
4647         fast_pool->pool[2] ^= ip;
4648         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
4649 -               get_reg(fast_pool, regs);
4650 +               get_reg(fast_pool, NULL);
4651  
4652         fast_mix(fast_pool);
4653         add_interrupt_bench(cycles);
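add_interrupt_randomness() above no longer reads the registers from hard-interrupt context itself; the caller passes the instruction pointer in, which lets the RT kernel feed entropy from the threaded-interrupt path instead. A hypothetical caller sketch (names are illustrative; passing 0 simply falls back to _RET_IP_, as the hunk shows):

static void demo_feed_entropy(int irq, struct pt_regs *regs)
{
        __u64 ip = regs ? instruction_pointer(regs) : 0;

        add_interrupt_randomness(irq, 0, ip);   /* ip == 0 -> _RET_IP_ inside */
}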
4654 diff --git a/drivers/clk/at91/clk-generated.c b/drivers/clk/at91/clk-generated.c
4655 index abc80949e1dd..4ad3298eb372 100644
4656 --- a/drivers/clk/at91/clk-generated.c
4657 +++ b/drivers/clk/at91/clk-generated.c
4658 @@ -15,8 +15,8 @@
4659  #include <linux/clkdev.h>
4660  #include <linux/clk/at91_pmc.h>
4661  #include <linux/of.h>
4662 -#include <linux/of_address.h>
4663 -#include <linux/io.h>
4664 +#include <linux/mfd/syscon.h>
4665 +#include <linux/regmap.h>
4666  
4667  #include "pmc.h"
4668  
4669 @@ -28,8 +28,9 @@
4670  
4671  struct clk_generated {
4672         struct clk_hw hw;
4673 -       struct at91_pmc *pmc;
4674 +       struct regmap *regmap;
4675         struct clk_range range;
4676 +       spinlock_t *lock;
4677         u32 id;
4678         u32 gckdiv;
4679         u8 parent_id;
4680 @@ -41,49 +42,52 @@ struct clk_generated {
4681  static int clk_generated_enable(struct clk_hw *hw)
4682  {
4683         struct clk_generated *gck = to_clk_generated(hw);
4684 -       struct at91_pmc *pmc = gck->pmc;
4685 -       u32 tmp;
4686 +       unsigned long flags;
4687  
4688         pr_debug("GCLK: %s, gckdiv = %d, parent id = %d\n",
4689                  __func__, gck->gckdiv, gck->parent_id);
4690  
4691 -       pmc_lock(pmc);
4692 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4693 -       tmp = pmc_read(pmc, AT91_PMC_PCR) &
4694 -                       ~(AT91_PMC_PCR_GCKDIV_MASK | AT91_PMC_PCR_GCKCSS_MASK);
4695 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_GCKCSS(gck->parent_id)
4696 -                                        | AT91_PMC_PCR_CMD
4697 -                                        | AT91_PMC_PCR_GCKDIV(gck->gckdiv)
4698 -                                        | AT91_PMC_PCR_GCKEN);
4699 -       pmc_unlock(pmc);
4700 +       spin_lock_irqsave(gck->lock, flags);
4701 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4702 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4703 +       regmap_update_bits(gck->regmap, AT91_PMC_PCR,
4704 +                          AT91_PMC_PCR_GCKDIV_MASK | AT91_PMC_PCR_GCKCSS_MASK |
4705 +                          AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
4706 +                          AT91_PMC_PCR_GCKCSS(gck->parent_id) |
4707 +                          AT91_PMC_PCR_CMD |
4708 +                          AT91_PMC_PCR_GCKDIV(gck->gckdiv) |
4709 +                          AT91_PMC_PCR_GCKEN);
4710 +       spin_unlock_irqrestore(gck->lock, flags);
4711         return 0;
4712  }
4713  
4714  static void clk_generated_disable(struct clk_hw *hw)
4715  {
4716         struct clk_generated *gck = to_clk_generated(hw);
4717 -       struct at91_pmc *pmc = gck->pmc;
4718 -       u32 tmp;
4719 -
4720 -       pmc_lock(pmc);
4721 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4722 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_GCKEN;
4723 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_CMD);
4724 -       pmc_unlock(pmc);
4725 +       unsigned long flags;
4726 +
4727 +       spin_lock_irqsave(gck->lock, flags);
4728 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4729 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4730 +       regmap_update_bits(gck->regmap, AT91_PMC_PCR,
4731 +                          AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
4732 +                          AT91_PMC_PCR_CMD);
4733 +       spin_unlock_irqrestore(gck->lock, flags);
4734  }
4735  
4736  static int clk_generated_is_enabled(struct clk_hw *hw)
4737  {
4738         struct clk_generated *gck = to_clk_generated(hw);
4739 -       struct at91_pmc *pmc = gck->pmc;
4740 -       int ret;
4741 +       unsigned long flags;
4742 +       unsigned int status;
4743  
4744 -       pmc_lock(pmc);
4745 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4746 -       ret = !!(pmc_read(pmc, AT91_PMC_PCR) & AT91_PMC_PCR_GCKEN);
4747 -       pmc_unlock(pmc);
4748 +       spin_lock_irqsave(gck->lock, flags);
4749 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4750 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4751 +       regmap_read(gck->regmap, AT91_PMC_PCR, &status);
4752 +       spin_unlock_irqrestore(gck->lock, flags);
4753  
4754 -       return ret;
4755 +       return status & AT91_PMC_PCR_GCKEN ? 1 : 0;
4756  }
4757  
4758  static unsigned long
4759 @@ -214,13 +218,14 @@ static const struct clk_ops generated_ops = {
4760   */
4761  static void clk_generated_startup(struct clk_generated *gck)
4762  {
4763 -       struct at91_pmc *pmc = gck->pmc;
4764         u32 tmp;
4765 +       unsigned long flags;
4766  
4767 -       pmc_lock(pmc);
4768 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4769 -       tmp = pmc_read(pmc, AT91_PMC_PCR);
4770 -       pmc_unlock(pmc);
4771 +       spin_lock_irqsave(gck->lock, flags);
4772 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4773 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4774 +       regmap_read(gck->regmap, AT91_PMC_PCR, &tmp);
4775 +       spin_unlock_irqrestore(gck->lock, flags);
4776  
4777         gck->parent_id = (tmp & AT91_PMC_PCR_GCKCSS_MASK)
4778                                         >> AT91_PMC_PCR_GCKCSS_OFFSET;
4779 @@ -229,8 +234,8 @@ static void clk_generated_startup(struct clk_generated *gck)
4780  }
4781  
4782  static struct clk * __init
4783 -at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4784 -                           const char **parent_names, u8 num_parents,
4785 +at91_clk_register_generated(struct regmap *regmap,  spinlock_t *lock, const char
4786 +                           *name, const char **parent_names, u8 num_parents,
4787                             u8 id, const struct clk_range *range)
4788  {
4789         struct clk_generated *gck;
4790 @@ -249,7 +254,8 @@ at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4791  
4792         gck->id = id;
4793         gck->hw.init = &init;
4794 -       gck->pmc = pmc;
4795 +       gck->regmap = regmap;
4796 +       gck->lock = lock;
4797         gck->range = *range;
4798  
4799         clk = clk_register(NULL, &gck->hw);
4800 @@ -261,8 +267,7 @@ at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4801         return clk;
4802  }
4803  
4804 -void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4805 -                                          struct at91_pmc *pmc)
4806 +void __init of_sama5d2_clk_generated_setup(struct device_node *np)
4807  {
4808         int num;
4809         u32 id;
4810 @@ -272,6 +277,7 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4811         const char *parent_names[GENERATED_SOURCE_MAX];
4812         struct device_node *gcknp;
4813         struct clk_range range = CLK_RANGE(0, 0);
4814 +       struct regmap *regmap;
4815  
4816         num_parents = of_clk_get_parent_count(np);
4817         if (num_parents <= 0 || num_parents > GENERATED_SOURCE_MAX)
4818 @@ -283,6 +289,10 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4819         if (!num || num > PERIPHERAL_MAX)
4820                 return;
4821  
4822 +       regmap = syscon_node_to_regmap(of_get_parent(np));
4823 +       if (IS_ERR(regmap))
4824 +               return;
4825 +
4826         for_each_child_of_node(np, gcknp) {
4827                 if (of_property_read_u32(gcknp, "reg", &id))
4828                         continue;
4829 @@ -296,11 +306,14 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4830                 of_at91_get_clk_range(gcknp, "atmel,clk-output-range",
4831                                       &range);
4832  
4833 -               clk = at91_clk_register_generated(pmc, name, parent_names,
4834 -                                                 num_parents, id, &range);
4835 +               clk = at91_clk_register_generated(regmap, &pmc_pcr_lock, name,
4836 +                                                 parent_names, num_parents,
4837 +                                                 id, &range);
4838                 if (IS_ERR(clk))
4839                         continue;
4840  
4841                 of_clk_add_provider(gcknp, of_clk_src_simple_get, clk);
4842         }
4843  }
4844 +CLK_OF_DECLARE(of_sama5d2_clk_generated_setup, "atmel,sama5d2-clk-generated",
4845 +              of_sama5d2_clk_generated_setup);
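The generated-clock driver above stops going through pmc_read()/pmc_write() under the PMC's own lock and instead uses the parent syscon regmap, serialised by the shared pmc_pcr_lock spinlock. regmap_update_bits() performs the read-modify-write in one call; a minimal sketch of the core pattern from the enable hunk:

static void gck_enable(struct regmap *regmap, spinlock_t *lock, u32 id)
{
        unsigned long flags;

        spin_lock_irqsave(lock, flags);
        /* Select the peripheral, then set the enable bits without touching others. */
        regmap_write(regmap, AT91_PMC_PCR, id & AT91_PMC_PCR_PID_MASK);
        regmap_update_bits(regmap, AT91_PMC_PCR,
                           AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
                           AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN);
        spin_unlock_irqrestore(lock, flags);
}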
4846 diff --git a/drivers/clk/at91/clk-h32mx.c b/drivers/clk/at91/clk-h32mx.c
4847 index a165230e7eda..8e20c8a76db7 100644
4848 --- a/drivers/clk/at91/clk-h32mx.c
4849 +++ b/drivers/clk/at91/clk-h32mx.c
4850 @@ -15,15 +15,9 @@
4851  #include <linux/clk-provider.h>
4852  #include <linux/clkdev.h>
4853  #include <linux/clk/at91_pmc.h>
4854 -#include <linux/delay.h>
4855  #include <linux/of.h>
4856 -#include <linux/of_address.h>
4857 -#include <linux/of_irq.h>
4858 -#include <linux/io.h>
4859 -#include <linux/interrupt.h>
4860 -#include <linux/irq.h>
4861 -#include <linux/sched.h>
4862 -#include <linux/wait.h>
4863 +#include <linux/regmap.h>
4864 +#include <linux/mfd/syscon.h>
4865  
4866  #include "pmc.h"
4867  
4868 @@ -31,7 +25,7 @@
4869  
4870  struct clk_sama5d4_h32mx {
4871         struct clk_hw hw;
4872 -       struct at91_pmc *pmc;
4873 +       struct regmap *regmap;
4874  };
4875  
4876  #define to_clk_sama5d4_h32mx(hw) container_of(hw, struct clk_sama5d4_h32mx, hw)
4877 @@ -40,8 +34,10 @@ static unsigned long clk_sama5d4_h32mx_recalc_rate(struct clk_hw *hw,
4878                                                  unsigned long parent_rate)
4879  {
4880         struct clk_sama5d4_h32mx *h32mxclk = to_clk_sama5d4_h32mx(hw);
4881 +       unsigned int mckr;
4882  
4883 -       if (pmc_read(h32mxclk->pmc, AT91_PMC_MCKR) & AT91_PMC_H32MXDIV)
4884 +       regmap_read(h32mxclk->regmap, AT91_PMC_MCKR, &mckr);
4885 +       if (mckr & AT91_PMC_H32MXDIV)
4886                 return parent_rate / 2;
4887  
4888         if (parent_rate > H32MX_MAX_FREQ)
4889 @@ -70,18 +66,16 @@ static int clk_sama5d4_h32mx_set_rate(struct clk_hw *hw, unsigned long rate,
4890                                     unsigned long parent_rate)
4891  {
4892         struct clk_sama5d4_h32mx *h32mxclk = to_clk_sama5d4_h32mx(hw);
4893 -       struct at91_pmc *pmc = h32mxclk->pmc;
4894 -       u32 tmp;
4895 +       u32 mckr = 0;
4896  
4897         if (parent_rate != rate && (parent_rate / 2) != rate)
4898                 return -EINVAL;
4899  
4900 -       pmc_lock(pmc);
4901 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & ~AT91_PMC_H32MXDIV;
4902         if ((parent_rate / 2) == rate)
4903 -               tmp |= AT91_PMC_H32MXDIV;
4904 -       pmc_write(pmc, AT91_PMC_MCKR, tmp);
4905 -       pmc_unlock(pmc);
4906 +               mckr = AT91_PMC_H32MXDIV;
4907 +
4908 +       regmap_update_bits(h32mxclk->regmap, AT91_PMC_MCKR,
4909 +                          AT91_PMC_H32MXDIV, mckr);
4910  
4911         return 0;
4912  }
4913 @@ -92,14 +86,18 @@ static const struct clk_ops h32mx_ops = {
4914         .set_rate = clk_sama5d4_h32mx_set_rate,
4915  };
4916  
4917 -void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4918 -                                    struct at91_pmc *pmc)
4919 +static void __init of_sama5d4_clk_h32mx_setup(struct device_node *np)
4920  {
4921         struct clk_sama5d4_h32mx *h32mxclk;
4922         struct clk_init_data init;
4923         const char *parent_name;
4924 +       struct regmap *regmap;
4925         struct clk *clk;
4926  
4927 +       regmap = syscon_node_to_regmap(of_get_parent(np));
4928 +       if (IS_ERR(regmap))
4929 +               return;
4930 +
4931         h32mxclk = kzalloc(sizeof(*h32mxclk), GFP_KERNEL);
4932         if (!h32mxclk)
4933                 return;
4934 @@ -113,7 +111,7 @@ void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4935         init.flags = CLK_SET_RATE_GATE;
4936  
4937         h32mxclk->hw.init = &init;
4938 -       h32mxclk->pmc = pmc;
4939 +       h32mxclk->regmap = regmap;
4940  
4941         clk = clk_register(NULL, &h32mxclk->hw);
4942         if (IS_ERR(clk)) {
4943 @@ -123,3 +121,5 @@ void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4944  
4945         of_clk_add_provider(np, of_clk_src_simple_get, clk);
4946  }
4947 +CLK_OF_DECLARE(of_sama5d4_clk_h32mx_setup, "atmel,sama5d4-clk-h32mx",
4948 +              of_sama5d4_clk_h32mx_setup);
4949 diff --git a/drivers/clk/at91/clk-main.c b/drivers/clk/at91/clk-main.c
4950 index fd7247deabdc..4bfc94d6c26e 100644
4951 --- a/drivers/clk/at91/clk-main.c
4952 +++ b/drivers/clk/at91/clk-main.c
4953 @@ -13,13 +13,8 @@
4954  #include <linux/clk/at91_pmc.h>
4955  #include <linux/delay.h>
4956  #include <linux/of.h>
4957 -#include <linux/of_address.h>
4958 -#include <linux/of_irq.h>
4959 -#include <linux/io.h>
4960 -#include <linux/interrupt.h>
4961 -#include <linux/irq.h>
4962 -#include <linux/sched.h>
4963 -#include <linux/wait.h>
4964 +#include <linux/mfd/syscon.h>
4965 +#include <linux/regmap.h>
4966  
4967  #include "pmc.h"
4968  
4969 @@ -34,18 +29,14 @@
4970  
4971  struct clk_main_osc {
4972         struct clk_hw hw;
4973 -       struct at91_pmc *pmc;
4974 -       unsigned int irq;
4975 -       wait_queue_head_t wait;
4976 +       struct regmap *regmap;
4977  };
4978  
4979  #define to_clk_main_osc(hw) container_of(hw, struct clk_main_osc, hw)
4980  
4981  struct clk_main_rc_osc {
4982         struct clk_hw hw;
4983 -       struct at91_pmc *pmc;
4984 -       unsigned int irq;
4985 -       wait_queue_head_t wait;
4986 +       struct regmap *regmap;
4987         unsigned long frequency;
4988         unsigned long accuracy;
4989  };
4990 @@ -54,51 +45,47 @@ struct clk_main_rc_osc {
4991  
4992  struct clk_rm9200_main {
4993         struct clk_hw hw;
4994 -       struct at91_pmc *pmc;
4995 +       struct regmap *regmap;
4996  };
4997  
4998  #define to_clk_rm9200_main(hw) container_of(hw, struct clk_rm9200_main, hw)
4999  
5000  struct clk_sam9x5_main {
5001         struct clk_hw hw;
5002 -       struct at91_pmc *pmc;
5003 -       unsigned int irq;
5004 -       wait_queue_head_t wait;
5005 +       struct regmap *regmap;
5006         u8 parent;
5007  };
5008  
5009  #define to_clk_sam9x5_main(hw) container_of(hw, struct clk_sam9x5_main, hw)
5010  
5011 -static irqreturn_t clk_main_osc_irq_handler(int irq, void *dev_id)
5012 +static inline bool clk_main_osc_ready(struct regmap *regmap)
5013  {
5014 -       struct clk_main_osc *osc = dev_id;
5015 +       unsigned int status;
5016  
5017 -       wake_up(&osc->wait);
5018 -       disable_irq_nosync(osc->irq);
5019 +       regmap_read(regmap, AT91_PMC_SR, &status);
5020  
5021 -       return IRQ_HANDLED;
5022 +       return status & AT91_PMC_MOSCS;
5023  }
5024  
5025  static int clk_main_osc_prepare(struct clk_hw *hw)
5026  {
5027         struct clk_main_osc *osc = to_clk_main_osc(hw);
5028 -       struct at91_pmc *pmc = osc->pmc;
5029 +       struct regmap *regmap = osc->regmap;
5030         u32 tmp;
5031  
5032 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5033 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5034 +       tmp &= ~MOR_KEY_MASK;
5035 +
5036         if (tmp & AT91_PMC_OSCBYPASS)
5037                 return 0;
5038  
5039         if (!(tmp & AT91_PMC_MOSCEN)) {
5040                 tmp |= AT91_PMC_MOSCEN | AT91_PMC_KEY;
5041 -               pmc_write(pmc, AT91_CKGR_MOR, tmp);
5042 +               regmap_write(regmap, AT91_CKGR_MOR, tmp);
5043         }
5044  
5045 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS)) {
5046 -               enable_irq(osc->irq);
5047 -               wait_event(osc->wait,
5048 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS);
5049 -       }
5050 +       while (!clk_main_osc_ready(regmap))
5051 +               cpu_relax();
5052  
5053         return 0;
5054  }
5055 @@ -106,9 +93,10 @@ static int clk_main_osc_prepare(struct clk_hw *hw)
5056  static void clk_main_osc_unprepare(struct clk_hw *hw)
5057  {
5058         struct clk_main_osc *osc = to_clk_main_osc(hw);
5059 -       struct at91_pmc *pmc = osc->pmc;
5060 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5061 +       struct regmap *regmap = osc->regmap;
5062 +       u32 tmp;
5063  
5064 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5065         if (tmp & AT91_PMC_OSCBYPASS)
5066                 return;
5067  
5068 @@ -116,20 +104,22 @@ static void clk_main_osc_unprepare(struct clk_hw *hw)
5069                 return;
5070  
5071         tmp &= ~(AT91_PMC_KEY | AT91_PMC_MOSCEN);
5072 -       pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5073 +       regmap_write(regmap, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5074  }
5075  
5076  static int clk_main_osc_is_prepared(struct clk_hw *hw)
5077  {
5078         struct clk_main_osc *osc = to_clk_main_osc(hw);
5079 -       struct at91_pmc *pmc = osc->pmc;
5080 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5081 +       struct regmap *regmap = osc->regmap;
5082 +       u32 tmp, status;
5083  
5084 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5085         if (tmp & AT91_PMC_OSCBYPASS)
5086                 return 1;
5087  
5088 -       return !!((pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS) &&
5089 -                 (pmc_read(pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCEN));
5090 +       regmap_read(regmap, AT91_PMC_SR, &status);
5091 +
5092 +       return (status & AT91_PMC_MOSCS) && (tmp & AT91_PMC_MOSCEN);
5093  }
5094  
5095  static const struct clk_ops main_osc_ops = {
5096 @@ -139,18 +129,16 @@ static const struct clk_ops main_osc_ops = {
5097  };
5098  
5099  static struct clk * __init
5100 -at91_clk_register_main_osc(struct at91_pmc *pmc,
5101 -                          unsigned int irq,
5102 +at91_clk_register_main_osc(struct regmap *regmap,
5103                            const char *name,
5104                            const char *parent_name,
5105                            bool bypass)
5106  {
5107 -       int ret;
5108         struct clk_main_osc *osc;
5109         struct clk *clk = NULL;
5110         struct clk_init_data init;
5111  
5112 -       if (!pmc || !irq || !name || !parent_name)
5113 +       if (!name || !parent_name)
5114                 return ERR_PTR(-EINVAL);
5115  
5116         osc = kzalloc(sizeof(*osc), GFP_KERNEL);
5117 @@ -164,85 +152,70 @@ at91_clk_register_main_osc(struct at91_pmc *pmc,
5118         init.flags = CLK_IGNORE_UNUSED;
5119  
5120         osc->hw.init = &init;
5121 -       osc->pmc = pmc;
5122 -       osc->irq = irq;
5123 -
5124 -       init_waitqueue_head(&osc->wait);
5125 -       irq_set_status_flags(osc->irq, IRQ_NOAUTOEN);
5126 -       ret = request_irq(osc->irq, clk_main_osc_irq_handler,
5127 -                         IRQF_TRIGGER_HIGH, name, osc);
5128 -       if (ret) {
5129 -               kfree(osc);
5130 -               return ERR_PTR(ret);
5131 -       }
5132 +       osc->regmap = regmap;
5133  
5134         if (bypass)
5135 -               pmc_write(pmc, AT91_CKGR_MOR,
5136 -                         (pmc_read(pmc, AT91_CKGR_MOR) &
5137 -                          ~(MOR_KEY_MASK | AT91_PMC_MOSCEN)) |
5138 -                         AT91_PMC_OSCBYPASS | AT91_PMC_KEY);
5139 +               regmap_update_bits(regmap,
5140 +                                  AT91_CKGR_MOR, MOR_KEY_MASK |
5141 +                                  AT91_PMC_MOSCEN,
5142 +                                  AT91_PMC_OSCBYPASS | AT91_PMC_KEY);
5143  
5144         clk = clk_register(NULL, &osc->hw);
5145 -       if (IS_ERR(clk)) {
5146 -               free_irq(irq, osc);
5147 +       if (IS_ERR(clk))
5148                 kfree(osc);
5149 -       }
5150  
5151         return clk;
5152  }
5153  
5154 -void __init of_at91rm9200_clk_main_osc_setup(struct device_node *np,
5155 -                                            struct at91_pmc *pmc)
5156 +static void __init of_at91rm9200_clk_main_osc_setup(struct device_node *np)
5157  {
5158         struct clk *clk;
5159 -       unsigned int irq;
5160         const char *name = np->name;
5161         const char *parent_name;
5162 +       struct regmap *regmap;
5163         bool bypass;
5164  
5165         of_property_read_string(np, "clock-output-names", &name);
5166         bypass = of_property_read_bool(np, "atmel,osc-bypass");
5167         parent_name = of_clk_get_parent_name(np, 0);
5168  
5169 -       irq = irq_of_parse_and_map(np, 0);
5170 -       if (!irq)
5171 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5172 +       if (IS_ERR(regmap))
5173                 return;
5174  
5175 -       clk = at91_clk_register_main_osc(pmc, irq, name, parent_name, bypass);
5176 +       clk = at91_clk_register_main_osc(regmap, name, parent_name, bypass);
5177         if (IS_ERR(clk))
5178                 return;
5179  
5180         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5181  }
5182 +CLK_OF_DECLARE(at91rm9200_clk_main_osc, "atmel,at91rm9200-clk-main-osc",
5183 +              of_at91rm9200_clk_main_osc_setup);
5184  
5185 -static irqreturn_t clk_main_rc_osc_irq_handler(int irq, void *dev_id)
5186 +static bool clk_main_rc_osc_ready(struct regmap *regmap)
5187  {
5188 -       struct clk_main_rc_osc *osc = dev_id;
5189 +       unsigned int status;
5190  
5191 -       wake_up(&osc->wait);
5192 -       disable_irq_nosync(osc->irq);
5193 +       regmap_read(regmap, AT91_PMC_SR, &status);
5194  
5195 -       return IRQ_HANDLED;
5196 +       return status & AT91_PMC_MOSCRCS;
5197  }
5198  
5199  static int clk_main_rc_osc_prepare(struct clk_hw *hw)
5200  {
5201         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5202 -       struct at91_pmc *pmc = osc->pmc;
5203 -       u32 tmp;
5204 +       struct regmap *regmap = osc->regmap;
5205 +       unsigned int mor;
5206  
5207 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5208 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5209  
5210 -       if (!(tmp & AT91_PMC_MOSCRCEN)) {
5211 -               tmp |= AT91_PMC_MOSCRCEN | AT91_PMC_KEY;
5212 -               pmc_write(pmc, AT91_CKGR_MOR, tmp);
5213 -       }
5214 +       if (!(mor & AT91_PMC_MOSCRCEN))
5215 +               regmap_update_bits(regmap, AT91_CKGR_MOR,
5216 +                                  MOR_KEY_MASK | AT91_PMC_MOSCRCEN,
5217 +                                  AT91_PMC_MOSCRCEN | AT91_PMC_KEY);
5218  
5219 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS)) {
5220 -               enable_irq(osc->irq);
5221 -               wait_event(osc->wait,
5222 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS);
5223 -       }
5224 +       while (!clk_main_rc_osc_ready(regmap))
5225 +               cpu_relax();
5226  
5227         return 0;
5228  }
5229 @@ -250,23 +223,28 @@ static int clk_main_rc_osc_prepare(struct clk_hw *hw)
5230  static void clk_main_rc_osc_unprepare(struct clk_hw *hw)
5231  {
5232         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5233 -       struct at91_pmc *pmc = osc->pmc;
5234 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5235 +       struct regmap *regmap = osc->regmap;
5236 +       unsigned int mor;
5237 +
5238 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5239  
5240 -       if (!(tmp & AT91_PMC_MOSCRCEN))
5241 +       if (!(mor & AT91_PMC_MOSCRCEN))
5242                 return;
5243  
5244 -       tmp &= ~(MOR_KEY_MASK | AT91_PMC_MOSCRCEN);
5245 -       pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5246 +       regmap_update_bits(regmap, AT91_CKGR_MOR,
5247 +                          MOR_KEY_MASK | AT91_PMC_MOSCRCEN, AT91_PMC_KEY);
5248  }
5249  
5250  static int clk_main_rc_osc_is_prepared(struct clk_hw *hw)
5251  {
5252         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5253 -       struct at91_pmc *pmc = osc->pmc;
5254 +       struct regmap *regmap = osc->regmap;
5255 +       unsigned int mor, status;
5256  
5257 -       return !!((pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS) &&
5258 -                 (pmc_read(pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCRCEN));
5259 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5260 +       regmap_read(regmap, AT91_PMC_SR, &status);
5261 +
5262 +       return (mor & AT91_PMC_MOSCRCEN) && (status & AT91_PMC_MOSCRCS);
5263  }
5264  
5265  static unsigned long clk_main_rc_osc_recalc_rate(struct clk_hw *hw,
5266 @@ -294,17 +272,15 @@ static const struct clk_ops main_rc_osc_ops = {
5267  };
5268  
5269  static struct clk * __init
5270 -at91_clk_register_main_rc_osc(struct at91_pmc *pmc,
5271 -                             unsigned int irq,
5272 +at91_clk_register_main_rc_osc(struct regmap *regmap,
5273                               const char *name,
5274                               u32 frequency, u32 accuracy)
5275  {
5276 -       int ret;
5277         struct clk_main_rc_osc *osc;
5278         struct clk *clk = NULL;
5279         struct clk_init_data init;
5280  
5281 -       if (!pmc || !irq || !name || !frequency)
5282 +       if (!name || !frequency)
5283                 return ERR_PTR(-EINVAL);
5284  
5285         osc = kzalloc(sizeof(*osc), GFP_KERNEL);
5286 @@ -318,63 +294,53 @@ at91_clk_register_main_rc_osc(struct at91_pmc *pmc,
5287         init.flags = CLK_IS_ROOT | CLK_IGNORE_UNUSED;
5288  
5289         osc->hw.init = &init;
5290 -       osc->pmc = pmc;
5291 -       osc->irq = irq;
5292 +       osc->regmap = regmap;
5293         osc->frequency = frequency;
5294         osc->accuracy = accuracy;
5295  
5296 -       init_waitqueue_head(&osc->wait);
5297 -       irq_set_status_flags(osc->irq, IRQ_NOAUTOEN);
5298 -       ret = request_irq(osc->irq, clk_main_rc_osc_irq_handler,
5299 -                         IRQF_TRIGGER_HIGH, name, osc);
5300 -       if (ret)
5301 -               return ERR_PTR(ret);
5302 -
5303         clk = clk_register(NULL, &osc->hw);
5304 -       if (IS_ERR(clk)) {
5305 -               free_irq(irq, osc);
5306 +       if (IS_ERR(clk))
5307                 kfree(osc);
5308 -       }
5309  
5310         return clk;
5311  }
5312  
5313 -void __init of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np,
5314 -                                               struct at91_pmc *pmc)
5315 +static void __init of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np)
5316  {
5317         struct clk *clk;
5318 -       unsigned int irq;
5319         u32 frequency = 0;
5320         u32 accuracy = 0;
5321         const char *name = np->name;
5322 +       struct regmap *regmap;
5323  
5324         of_property_read_string(np, "clock-output-names", &name);
5325         of_property_read_u32(np, "clock-frequency", &frequency);
5326         of_property_read_u32(np, "clock-accuracy", &accuracy);
5327  
5328 -       irq = irq_of_parse_and_map(np, 0);
5329 -       if (!irq)
5330 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5331 +       if (IS_ERR(regmap))
5332                 return;
5333  
5334 -       clk = at91_clk_register_main_rc_osc(pmc, irq, name, frequency,
5335 -                                           accuracy);
5336 +       clk = at91_clk_register_main_rc_osc(regmap, name, frequency, accuracy);
5337         if (IS_ERR(clk))
5338                 return;
5339  
5340         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5341  }
5342 +CLK_OF_DECLARE(at91sam9x5_clk_main_rc_osc, "atmel,at91sam9x5-clk-main-rc-osc",
5343 +              of_at91sam9x5_clk_main_rc_osc_setup);
5344  
5345  
5346 -static int clk_main_probe_frequency(struct at91_pmc *pmc)
5347 +static int clk_main_probe_frequency(struct regmap *regmap)
5348  {
5349         unsigned long prep_time, timeout;
5350 -       u32 tmp;
5351 +       unsigned int mcfr;
5352  
5353         timeout = jiffies + usecs_to_jiffies(MAINFRDY_TIMEOUT);
5354         do {
5355                 prep_time = jiffies;
5356 -               tmp = pmc_read(pmc, AT91_CKGR_MCFR);
5357 -               if (tmp & AT91_PMC_MAINRDY)
5358 +               regmap_read(regmap, AT91_CKGR_MCFR, &mcfr);
5359 +               if (mcfr & AT91_PMC_MAINRDY)
5360                         return 0;
5361                 usleep_range(MAINF_LOOP_MIN_WAIT, MAINF_LOOP_MAX_WAIT);
5362         } while (time_before(prep_time, timeout));
5363 @@ -382,34 +348,37 @@ static int clk_main_probe_frequency(struct at91_pmc *pmc)
5364         return -ETIMEDOUT;
5365  }
5366  
5367 -static unsigned long clk_main_recalc_rate(struct at91_pmc *pmc,
5368 +static unsigned long clk_main_recalc_rate(struct regmap *regmap,
5369                                           unsigned long parent_rate)
5370  {
5371 -       u32 tmp;
5372 +       unsigned int mcfr;
5373  
5374         if (parent_rate)
5375                 return parent_rate;
5376  
5377         pr_warn("Main crystal frequency not set, using approximate value\n");
5378 -       tmp = pmc_read(pmc, AT91_CKGR_MCFR);
5379 -       if (!(tmp & AT91_PMC_MAINRDY))
5380 +       regmap_read(regmap, AT91_CKGR_MCFR, &mcfr);
5381 +       if (!(mcfr & AT91_PMC_MAINRDY))
5382                 return 0;
5383  
5384 -       return ((tmp & AT91_PMC_MAINF) * SLOW_CLOCK_FREQ) / MAINF_DIV;
5385 +       return ((mcfr & AT91_PMC_MAINF) * SLOW_CLOCK_FREQ) / MAINF_DIV;
5386  }
5387  
5388  static int clk_rm9200_main_prepare(struct clk_hw *hw)
5389  {
5390         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5391  
5392 -       return clk_main_probe_frequency(clkmain->pmc);
5393 +       return clk_main_probe_frequency(clkmain->regmap);
5394  }
5395  
5396  static int clk_rm9200_main_is_prepared(struct clk_hw *hw)
5397  {
5398         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5399 +       unsigned int status;
5400 +
5401 +       regmap_read(clkmain->regmap, AT91_CKGR_MCFR, &status);
5402  
5403 -       return !!(pmc_read(clkmain->pmc, AT91_CKGR_MCFR) & AT91_PMC_MAINRDY);
5404 +       return status & AT91_PMC_MAINRDY ? 1 : 0;
5405  }
5406  
5407  static unsigned long clk_rm9200_main_recalc_rate(struct clk_hw *hw,
5408 @@ -417,7 +386,7 @@ static unsigned long clk_rm9200_main_recalc_rate(struct clk_hw *hw,
5409  {
5410         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5411  
5412 -       return clk_main_recalc_rate(clkmain->pmc, parent_rate);
5413 +       return clk_main_recalc_rate(clkmain->regmap, parent_rate);
5414  }
5415  
5416  static const struct clk_ops rm9200_main_ops = {
5417 @@ -427,7 +396,7 @@ static const struct clk_ops rm9200_main_ops = {
5418  };
5419  
5420  static struct clk * __init
5421 -at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5422 +at91_clk_register_rm9200_main(struct regmap *regmap,
5423                               const char *name,
5424                               const char *parent_name)
5425  {
5426 @@ -435,7 +404,7 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5427         struct clk *clk = NULL;
5428         struct clk_init_data init;
5429  
5430 -       if (!pmc || !name)
5431 +       if (!name)
5432                 return ERR_PTR(-EINVAL);
5433  
5434         if (!parent_name)
5435 @@ -452,7 +421,7 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5436         init.flags = 0;
5437  
5438         clkmain->hw.init = &init;
5439 -       clkmain->pmc = pmc;
5440 +       clkmain->regmap = regmap;
5441  
5442         clk = clk_register(NULL, &clkmain->hw);
5443         if (IS_ERR(clk))
5444 @@ -461,52 +430,54 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5445         return clk;
5446  }
5447  
5448 -void __init of_at91rm9200_clk_main_setup(struct device_node *np,
5449 -                                        struct at91_pmc *pmc)
5450 +static void __init of_at91rm9200_clk_main_setup(struct device_node *np)
5451  {
5452         struct clk *clk;
5453         const char *parent_name;
5454         const char *name = np->name;
5455 +       struct regmap *regmap;
5456  
5457         parent_name = of_clk_get_parent_name(np, 0);
5458         of_property_read_string(np, "clock-output-names", &name);
5459  
5460 -       clk = at91_clk_register_rm9200_main(pmc, name, parent_name);
5461 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5462 +       if (IS_ERR(regmap))
5463 +               return;
5464 +
5465 +       clk = at91_clk_register_rm9200_main(regmap, name, parent_name);
5466         if (IS_ERR(clk))
5467                 return;
5468  
5469         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5470  }
5471 +CLK_OF_DECLARE(at91rm9200_clk_main, "atmel,at91rm9200-clk-main",
5472 +              of_at91rm9200_clk_main_setup);
5473  
5474 -static irqreturn_t clk_sam9x5_main_irq_handler(int irq, void *dev_id)
5475 +static inline bool clk_sam9x5_main_ready(struct regmap *regmap)
5476  {
5477 -       struct clk_sam9x5_main *clkmain = dev_id;
5478 +       unsigned int status;
5479  
5480 -       wake_up(&clkmain->wait);
5481 -       disable_irq_nosync(clkmain->irq);
5482 +       regmap_read(regmap, AT91_PMC_SR, &status);
5483  
5484 -       return IRQ_HANDLED;
5485 +       return status & AT91_PMC_MOSCSELS ? 1 : 0;
5486  }
5487  
5488  static int clk_sam9x5_main_prepare(struct clk_hw *hw)
5489  {
5490         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5491 -       struct at91_pmc *pmc = clkmain->pmc;
5492 +       struct regmap *regmap = clkmain->regmap;
5493  
5494 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS)) {
5495 -               enable_irq(clkmain->irq);
5496 -               wait_event(clkmain->wait,
5497 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5498 -       }
5499 +       while (!clk_sam9x5_main_ready(regmap))
5500 +               cpu_relax();
5501  
5502 -       return clk_main_probe_frequency(pmc);
5503 +       return clk_main_probe_frequency(regmap);
5504  }
5505  
5506  static int clk_sam9x5_main_is_prepared(struct clk_hw *hw)
5507  {
5508         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5509  
5510 -       return !!(pmc_read(clkmain->pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5511 +       return clk_sam9x5_main_ready(clkmain->regmap);
5512  }
5513  
5514  static unsigned long clk_sam9x5_main_recalc_rate(struct clk_hw *hw,
5515 @@ -514,30 +485,28 @@ static unsigned long clk_sam9x5_main_recalc_rate(struct clk_hw *hw,
5516  {
5517         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5518  
5519 -       return clk_main_recalc_rate(clkmain->pmc, parent_rate);
5520 +       return clk_main_recalc_rate(clkmain->regmap, parent_rate);
5521  }
5522  
5523  static int clk_sam9x5_main_set_parent(struct clk_hw *hw, u8 index)
5524  {
5525         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5526 -       struct at91_pmc *pmc = clkmain->pmc;
5527 -       u32 tmp;
5528 +       struct regmap *regmap = clkmain->regmap;
5529 +       unsigned int tmp;
5530  
5531         if (index > 1)
5532                 return -EINVAL;
5533  
5534 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5535 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5536 +       tmp &= ~MOR_KEY_MASK;
5537  
5538         if (index && !(tmp & AT91_PMC_MOSCSEL))
5539 -               pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_MOSCSEL);
5540 +               regmap_write(regmap, AT91_CKGR_MOR, tmp | AT91_PMC_MOSCSEL);
5541         else if (!index && (tmp & AT91_PMC_MOSCSEL))
5542 -               pmc_write(pmc, AT91_CKGR_MOR, tmp & ~AT91_PMC_MOSCSEL);
5543 +               regmap_write(regmap, AT91_CKGR_MOR, tmp & ~AT91_PMC_MOSCSEL);
5544  
5545 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS)) {
5546 -               enable_irq(clkmain->irq);
5547 -               wait_event(clkmain->wait,
5548 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5549 -       }
5550 +       while (!clk_sam9x5_main_ready(regmap))
5551 +               cpu_relax();
5552  
5553         return 0;
5554  }
5555 @@ -545,8 +514,11 @@ static int clk_sam9x5_main_set_parent(struct clk_hw *hw, u8 index)
5556  static u8 clk_sam9x5_main_get_parent(struct clk_hw *hw)
5557  {
5558         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5559 +       unsigned int status;
5560 +
5561 +       regmap_read(clkmain->regmap, AT91_CKGR_MOR, &status);
5562  
5563 -       return !!(pmc_read(clkmain->pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCEN);
5564 +       return status & AT91_PMC_MOSCEN ? 1 : 0;
5565  }
5566  
5567  static const struct clk_ops sam9x5_main_ops = {
5568 @@ -558,18 +530,17 @@ static const struct clk_ops sam9x5_main_ops = {
5569  };
5570  
5571  static struct clk * __init
5572 -at91_clk_register_sam9x5_main(struct at91_pmc *pmc,
5573 -                             unsigned int irq,
5574 +at91_clk_register_sam9x5_main(struct regmap *regmap,
5575                               const char *name,
5576                               const char **parent_names,
5577                               int num_parents)
5578  {
5579 -       int ret;
5580         struct clk_sam9x5_main *clkmain;
5581         struct clk *clk = NULL;
5582         struct clk_init_data init;
5583 +       unsigned int status;
5584  
5585 -       if (!pmc || !irq || !name)
5586 +       if (!name)
5587                 return ERR_PTR(-EINVAL);
5588  
5589         if (!parent_names || !num_parents)
5590 @@ -586,51 +557,42 @@ at91_clk_register_sam9x5_main(struct at91_pmc *pmc,
5591         init.flags = CLK_SET_PARENT_GATE;
5592  
5593         clkmain->hw.init = &init;
5594 -       clkmain->pmc = pmc;
5595 -       clkmain->irq = irq;
5596 -       clkmain->parent = !!(pmc_read(clkmain->pmc, AT91_CKGR_MOR) &
5597 -                            AT91_PMC_MOSCEN);
5598 -       init_waitqueue_head(&clkmain->wait);
5599 -       irq_set_status_flags(clkmain->irq, IRQ_NOAUTOEN);
5600 -       ret = request_irq(clkmain->irq, clk_sam9x5_main_irq_handler,
5601 -                         IRQF_TRIGGER_HIGH, name, clkmain);
5602 -       if (ret)
5603 -               return ERR_PTR(ret);
5604 +       clkmain->regmap = regmap;
5605 +       regmap_read(clkmain->regmap, AT91_CKGR_MOR, &status);
5606 +       clkmain->parent = status & AT91_PMC_MOSCEN ? 1 : 0;
5607  
5608         clk = clk_register(NULL, &clkmain->hw);
5609 -       if (IS_ERR(clk)) {
5610 -               free_irq(clkmain->irq, clkmain);
5611 +       if (IS_ERR(clk))
5612                 kfree(clkmain);
5613 -       }
5614  
5615         return clk;
5616  }
5617  
5618 -void __init of_at91sam9x5_clk_main_setup(struct device_node *np,
5619 -                                        struct at91_pmc *pmc)
5620 +static void __init of_at91sam9x5_clk_main_setup(struct device_node *np)
5621  {
5622         struct clk *clk;
5623         const char *parent_names[2];
5624         int num_parents;
5625 -       unsigned int irq;
5626         const char *name = np->name;
5627 +       struct regmap *regmap;
5628  
5629         num_parents = of_clk_get_parent_count(np);
5630         if (num_parents <= 0 || num_parents > 2)
5631                 return;
5632  
5633         of_clk_parent_fill(np, parent_names, num_parents);
5634 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5635 +       if (IS_ERR(regmap))
5636 +               return;
5637  
5638         of_property_read_string(np, "clock-output-names", &name);
5639  
5640 -       irq = irq_of_parse_and_map(np, 0);
5641 -       if (!irq)
5642 -               return;
5643 -
5644 -       clk = at91_clk_register_sam9x5_main(pmc, irq, name, parent_names,
5645 +       clk = at91_clk_register_sam9x5_main(regmap, name, parent_names,
5646                                             num_parents);
5647         if (IS_ERR(clk))
5648                 return;
5649  
5650         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5651  }
5652 +CLK_OF_DECLARE(at91sam9x5_clk_main, "atmel,at91sam9x5-clk-main",
5653 +              of_at91sam9x5_clk_main_setup);
5654 diff --git a/drivers/clk/at91/clk-master.c b/drivers/clk/at91/clk-master.c
5655 index 620ea323356b..7d4a1864ea7c 100644
5656 --- a/drivers/clk/at91/clk-master.c
5657 +++ b/drivers/clk/at91/clk-master.c
5658 @@ -12,13 +12,8 @@
5659  #include <linux/clkdev.h>
5660  #include <linux/clk/at91_pmc.h>
5661  #include <linux/of.h>
5662 -#include <linux/of_address.h>
5663 -#include <linux/of_irq.h>
5664 -#include <linux/io.h>
5665 -#include <linux/wait.h>
5666 -#include <linux/sched.h>
5667 -#include <linux/interrupt.h>
5668 -#include <linux/irq.h>
5669 +#include <linux/mfd/syscon.h>
5670 +#include <linux/regmap.h>
5671  
5672  #include "pmc.h"
5673  
5674 @@ -44,32 +39,26 @@ struct clk_master_layout {
5675  
5676  struct clk_master {
5677         struct clk_hw hw;
5678 -       struct at91_pmc *pmc;
5679 -       unsigned int irq;
5680 -       wait_queue_head_t wait;
5681 +       struct regmap *regmap;
5682         const struct clk_master_layout *layout;
5683         const struct clk_master_characteristics *characteristics;
5684  };
5685  
5686 -static irqreturn_t clk_master_irq_handler(int irq, void *dev_id)
5687 +static inline bool clk_master_ready(struct regmap *regmap)
5688  {
5689 -       struct clk_master *master = (struct clk_master *)dev_id;
5690 +       unsigned int status;
5691  
5692 -       wake_up(&master->wait);
5693 -       disable_irq_nosync(master->irq);
5694 +       regmap_read(regmap, AT91_PMC_SR, &status);
5695  
5696 -       return IRQ_HANDLED;
5697 +       return status & AT91_PMC_MCKRDY ? 1 : 0;
5698  }
5699 +
5700  static int clk_master_prepare(struct clk_hw *hw)
5701  {
5702         struct clk_master *master = to_clk_master(hw);
5703 -       struct at91_pmc *pmc = master->pmc;
5704  
5705 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY)) {
5706 -               enable_irq(master->irq);
5707 -               wait_event(master->wait,
5708 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY);
5709 -       }
5710 +       while (!clk_master_ready(master->regmap))
5711 +               cpu_relax();
5712  
5713         return 0;
5714  }
5715 @@ -78,7 +67,7 @@ static int clk_master_is_prepared(struct clk_hw *hw)
5716  {
5717         struct clk_master *master = to_clk_master(hw);
5718  
5719 -       return !!(pmc_read(master->pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY);
5720 +       return clk_master_ready(master->regmap);
5721  }
5722  
5723  static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5724 @@ -88,18 +77,16 @@ static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5725         u8 div;
5726         unsigned long rate = parent_rate;
5727         struct clk_master *master = to_clk_master(hw);
5728 -       struct at91_pmc *pmc = master->pmc;
5729         const struct clk_master_layout *layout = master->layout;
5730         const struct clk_master_characteristics *characteristics =
5731                                                 master->characteristics;
5732 -       u32 tmp;
5733 +       unsigned int mckr;
5734  
5735 -       pmc_lock(pmc);
5736 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & layout->mask;
5737 -       pmc_unlock(pmc);
5738 +       regmap_read(master->regmap, AT91_PMC_MCKR, &mckr);
5739 +       mckr &= layout->mask;
5740  
5741 -       pres = (tmp >> layout->pres_shift) & MASTER_PRES_MASK;
5742 -       div = (tmp >> MASTER_DIV_SHIFT) & MASTER_DIV_MASK;
5743 +       pres = (mckr >> layout->pres_shift) & MASTER_PRES_MASK;
5744 +       div = (mckr >> MASTER_DIV_SHIFT) & MASTER_DIV_MASK;
5745  
5746         if (characteristics->have_div3_pres && pres == MASTER_PRES_MAX)
5747                 rate /= 3;
5748 @@ -119,9 +106,11 @@ static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5749  static u8 clk_master_get_parent(struct clk_hw *hw)
5750  {
5751         struct clk_master *master = to_clk_master(hw);
5752 -       struct at91_pmc *pmc = master->pmc;
5753 +       unsigned int mckr;
5754  
5755 -       return pmc_read(pmc, AT91_PMC_MCKR) & AT91_PMC_CSS;
5756 +       regmap_read(master->regmap, AT91_PMC_MCKR, &mckr);
5757 +
5758 +       return mckr & AT91_PMC_CSS;
5759  }
5760  
5761  static const struct clk_ops master_ops = {
5762 @@ -132,18 +121,17 @@ static const struct clk_ops master_ops = {
5763  };
5764  
5765  static struct clk * __init
5766 -at91_clk_register_master(struct at91_pmc *pmc, unsigned int irq,
5767 +at91_clk_register_master(struct regmap *regmap,
5768                 const char *name, int num_parents,
5769                 const char **parent_names,
5770                 const struct clk_master_layout *layout,
5771                 const struct clk_master_characteristics *characteristics)
5772  {
5773 -       int ret;
5774         struct clk_master *master;
5775         struct clk *clk = NULL;
5776         struct clk_init_data init;
5777  
5778 -       if (!pmc || !irq || !name || !num_parents || !parent_names)
5779 +       if (!name || !num_parents || !parent_names)
5780                 return ERR_PTR(-EINVAL);
5781  
5782         master = kzalloc(sizeof(*master), GFP_KERNEL);
5783 @@ -159,20 +147,10 @@ at91_clk_register_master(struct at91_pmc *pmc, unsigned int irq,
5784         master->hw.init = &init;
5785         master->layout = layout;
5786         master->characteristics = characteristics;
5787 -       master->pmc = pmc;
5788 -       master->irq = irq;
5789 -       init_waitqueue_head(&master->wait);
5790 -       irq_set_status_flags(master->irq, IRQ_NOAUTOEN);
5791 -       ret = request_irq(master->irq, clk_master_irq_handler,
5792 -                         IRQF_TRIGGER_HIGH, "clk-master", master);
5793 -       if (ret) {
5794 -               kfree(master);
5795 -               return ERR_PTR(ret);
5796 -       }
5797 +       master->regmap = regmap;
5798  
5799         clk = clk_register(NULL, &master->hw);
5800         if (IS_ERR(clk)) {
5801 -               free_irq(master->irq, master);
5802                 kfree(master);
5803         }
5804  
5805 @@ -217,15 +195,15 @@ out_free_characteristics:
5806  }
5807  
5808  static void __init
5809 -of_at91_clk_master_setup(struct device_node *np, struct at91_pmc *pmc,
5810 +of_at91_clk_master_setup(struct device_node *np,
5811                          const struct clk_master_layout *layout)
5812  {
5813         struct clk *clk;
5814         int num_parents;
5815 -       unsigned int irq;
5816         const char *parent_names[MASTER_SOURCE_MAX];
5817         const char *name = np->name;
5818         struct clk_master_characteristics *characteristics;
5819 +       struct regmap *regmap;
5820  
5821         num_parents = of_clk_get_parent_count(np);
5822         if (num_parents <= 0 || num_parents > MASTER_SOURCE_MAX)
5823 @@ -239,11 +217,11 @@ of_at91_clk_master_setup(struct device_node *np, struct at91_pmc *pmc,
5824         if (!characteristics)
5825                 return;
5826  
5827 -       irq = irq_of_parse_and_map(np, 0);
5828 -       if (!irq)
5829 -               goto out_free_characteristics;
5830 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5831 +       if (IS_ERR(regmap))
5832 +               return;
5833  
5834 -       clk = at91_clk_register_master(pmc, irq, name, num_parents,
5835 +       clk = at91_clk_register_master(regmap, name, num_parents,
5836                                        parent_names, layout,
5837                                        characteristics);
5838         if (IS_ERR(clk))
5839 @@ -256,14 +234,16 @@ out_free_characteristics:
5840         kfree(characteristics);
5841  }
5842  
5843 -void __init of_at91rm9200_clk_master_setup(struct device_node *np,
5844 -                                          struct at91_pmc *pmc)
5845 +static void __init of_at91rm9200_clk_master_setup(struct device_node *np)
5846  {
5847 -       of_at91_clk_master_setup(np, pmc, &at91rm9200_master_layout);
5848 +       of_at91_clk_master_setup(np, &at91rm9200_master_layout);
5849  }
5850 +CLK_OF_DECLARE(at91rm9200_clk_master, "atmel,at91rm9200-clk-master",
5851 +              of_at91rm9200_clk_master_setup);
5852  
5853 -void __init of_at91sam9x5_clk_master_setup(struct device_node *np,
5854 -                                          struct at91_pmc *pmc)
5855 +static void __init of_at91sam9x5_clk_master_setup(struct device_node *np)
5856  {
5857 -       of_at91_clk_master_setup(np, pmc, &at91sam9x5_master_layout);
5858 +       of_at91_clk_master_setup(np, &at91sam9x5_master_layout);
5859  }
5860 +CLK_OF_DECLARE(at91sam9x5_clk_master, "atmel,at91sam9x5-clk-master",
5861 +              of_at91sam9x5_clk_master_setup);
5862 diff --git a/drivers/clk/at91/clk-peripheral.c b/drivers/clk/at91/clk-peripheral.c
5863 index 58f3b568e9cb..d69cd2a121b1 100644
5864 --- a/drivers/clk/at91/clk-peripheral.c
5865 +++ b/drivers/clk/at91/clk-peripheral.c
5866 @@ -12,11 +12,13 @@
5867  #include <linux/clkdev.h>
5868  #include <linux/clk/at91_pmc.h>
5869  #include <linux/of.h>
5870 -#include <linux/of_address.h>
5871 -#include <linux/io.h>
5872 +#include <linux/mfd/syscon.h>
5873 +#include <linux/regmap.h>
5874  
5875  #include "pmc.h"
5876  
5877 +DEFINE_SPINLOCK(pmc_pcr_lock);
5878 +
5879  #define PERIPHERAL_MAX         64
5880  
5881  #define PERIPHERAL_AT91RM9200  0
5882 @@ -33,7 +35,7 @@
5883  
5884  struct clk_peripheral {
5885         struct clk_hw hw;
5886 -       struct at91_pmc *pmc;
5887 +       struct regmap *regmap;
5888         u32 id;
5889  };
5890  
5891 @@ -41,8 +43,9 @@ struct clk_peripheral {
5892  
5893  struct clk_sam9x5_peripheral {
5894         struct clk_hw hw;
5895 -       struct at91_pmc *pmc;
5896 +       struct regmap *regmap;
5897         struct clk_range range;
5898 +       spinlock_t *lock;
5899         u32 id;
5900         u32 div;
5901         bool auto_div;
5902 @@ -54,7 +57,6 @@ struct clk_sam9x5_peripheral {
5903  static int clk_peripheral_enable(struct clk_hw *hw)
5904  {
5905         struct clk_peripheral *periph = to_clk_peripheral(hw);
5906 -       struct at91_pmc *pmc = periph->pmc;
5907         int offset = AT91_PMC_PCER;
5908         u32 id = periph->id;
5909  
5910 @@ -62,14 +64,14 @@ static int clk_peripheral_enable(struct clk_hw *hw)
5911                 return 0;
5912         if (id > PERIPHERAL_ID_MAX)
5913                 offset = AT91_PMC_PCER1;
5914 -       pmc_write(pmc, offset, PERIPHERAL_MASK(id));
5915 +       regmap_write(periph->regmap, offset, PERIPHERAL_MASK(id));
5916 +
5917         return 0;
5918  }
5919  
5920  static void clk_peripheral_disable(struct clk_hw *hw)
5921  {
5922         struct clk_peripheral *periph = to_clk_peripheral(hw);
5923 -       struct at91_pmc *pmc = periph->pmc;
5924         int offset = AT91_PMC_PCDR;
5925         u32 id = periph->id;
5926  
5927 @@ -77,21 +79,23 @@ static void clk_peripheral_disable(struct clk_hw *hw)
5928                 return;
5929         if (id > PERIPHERAL_ID_MAX)
5930                 offset = AT91_PMC_PCDR1;
5931 -       pmc_write(pmc, offset, PERIPHERAL_MASK(id));
5932 +       regmap_write(periph->regmap, offset, PERIPHERAL_MASK(id));
5933  }
5934  
5935  static int clk_peripheral_is_enabled(struct clk_hw *hw)
5936  {
5937         struct clk_peripheral *periph = to_clk_peripheral(hw);
5938 -       struct at91_pmc *pmc = periph->pmc;
5939         int offset = AT91_PMC_PCSR;
5940 +       unsigned int status;
5941         u32 id = periph->id;
5942  
5943         if (id < PERIPHERAL_ID_MIN)
5944                 return 1;
5945         if (id > PERIPHERAL_ID_MAX)
5946                 offset = AT91_PMC_PCSR1;
5947 -       return !!(pmc_read(pmc, offset) & PERIPHERAL_MASK(id));
5948 +       regmap_read(periph->regmap, offset, &status);
5949 +
5950 +       return status & PERIPHERAL_MASK(id) ? 1 : 0;
5951  }
5952  
5953  static const struct clk_ops peripheral_ops = {
5954 @@ -101,14 +105,14 @@ static const struct clk_ops peripheral_ops = {
5955  };
5956  
5957  static struct clk * __init
5958 -at91_clk_register_peripheral(struct at91_pmc *pmc, const char *name,
5959 +at91_clk_register_peripheral(struct regmap *regmap, const char *name,
5960                              const char *parent_name, u32 id)
5961  {
5962         struct clk_peripheral *periph;
5963         struct clk *clk = NULL;
5964         struct clk_init_data init;
5965  
5966 -       if (!pmc || !name || !parent_name || id > PERIPHERAL_ID_MAX)
5967 +       if (!name || !parent_name || id > PERIPHERAL_ID_MAX)
5968                 return ERR_PTR(-EINVAL);
5969  
5970         periph = kzalloc(sizeof(*periph), GFP_KERNEL);
5971 @@ -123,7 +127,7 @@ at91_clk_register_peripheral(struct at91_pmc *pmc, const char *name,
5972  
5973         periph->id = id;
5974         periph->hw.init = &init;
5975 -       periph->pmc = pmc;
5976 +       periph->regmap = regmap;
5977  
5978         clk = clk_register(NULL, &periph->hw);
5979         if (IS_ERR(clk))
5980 @@ -160,53 +164,58 @@ static void clk_sam9x5_peripheral_autodiv(struct clk_sam9x5_peripheral *periph)
5981  static int clk_sam9x5_peripheral_enable(struct clk_hw *hw)
5982  {
5983         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
5984 -       struct at91_pmc *pmc = periph->pmc;
5985 -       u32 tmp;
5986 +       unsigned long flags;
5987  
5988         if (periph->id < PERIPHERAL_ID_MIN)
5989                 return 0;
5990  
5991 -       pmc_lock(pmc);
5992 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
5993 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_DIV_MASK;
5994 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_DIV(periph->div)
5995 -                                        | AT91_PMC_PCR_CMD
5996 -                                        | AT91_PMC_PCR_EN);
5997 -       pmc_unlock(pmc);
5998 +       spin_lock_irqsave(periph->lock, flags);
5999 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6000 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6001 +       regmap_update_bits(periph->regmap, AT91_PMC_PCR,
6002 +                          AT91_PMC_PCR_DIV_MASK | AT91_PMC_PCR_CMD |
6003 +                          AT91_PMC_PCR_EN,
6004 +                          AT91_PMC_PCR_DIV(periph->div) |
6005 +                          AT91_PMC_PCR_CMD |
6006 +                          AT91_PMC_PCR_EN);
6007 +       spin_unlock_irqrestore(periph->lock, flags);
6008 +
6009         return 0;
6010  }
6011  
6012  static void clk_sam9x5_peripheral_disable(struct clk_hw *hw)
6013  {
6014         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6015 -       struct at91_pmc *pmc = periph->pmc;
6016 -       u32 tmp;
6017 +       unsigned long flags;
6018  
6019         if (periph->id < PERIPHERAL_ID_MIN)
6020                 return;
6021  
6022 -       pmc_lock(pmc);
6023 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6024 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_EN;
6025 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_CMD);
6026 -       pmc_unlock(pmc);
6027 +       spin_lock_irqsave(periph->lock, flags);
6028 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6029 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6030 +       regmap_update_bits(periph->regmap, AT91_PMC_PCR,
6031 +                          AT91_PMC_PCR_EN | AT91_PMC_PCR_CMD,
6032 +                          AT91_PMC_PCR_CMD);
6033 +       spin_unlock_irqrestore(periph->lock, flags);
6034  }
6035  
6036  static int clk_sam9x5_peripheral_is_enabled(struct clk_hw *hw)
6037  {
6038         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6039 -       struct at91_pmc *pmc = periph->pmc;
6040 -       int ret;
6041 +       unsigned long flags;
6042 +       unsigned int status;
6043  
6044         if (periph->id < PERIPHERAL_ID_MIN)
6045                 return 1;
6046  
6047 -       pmc_lock(pmc);
6048 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6049 -       ret = !!(pmc_read(pmc, AT91_PMC_PCR) & AT91_PMC_PCR_EN);
6050 -       pmc_unlock(pmc);
6051 +       spin_lock_irqsave(periph->lock, flags);
6052 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6053 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6054 +       regmap_read(periph->regmap, AT91_PMC_PCR, &status);
6055 +       spin_unlock_irqrestore(periph->lock, flags);
6056  
6057 -       return ret;
6058 +       return status & AT91_PMC_PCR_EN ? 1 : 0;
6059  }
6060  
6061  static unsigned long
6062 @@ -214,19 +223,20 @@ clk_sam9x5_peripheral_recalc_rate(struct clk_hw *hw,
6063                                   unsigned long parent_rate)
6064  {
6065         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6066 -       struct at91_pmc *pmc = periph->pmc;
6067 -       u32 tmp;
6068 +       unsigned long flags;
6069 +       unsigned int status;
6070  
6071         if (periph->id < PERIPHERAL_ID_MIN)
6072                 return parent_rate;
6073  
6074 -       pmc_lock(pmc);
6075 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6076 -       tmp = pmc_read(pmc, AT91_PMC_PCR);
6077 -       pmc_unlock(pmc);
6078 +       spin_lock_irqsave(periph->lock, flags);
6079 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6080 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6081 +       regmap_read(periph->regmap, AT91_PMC_PCR, &status);
6082 +       spin_unlock_irqrestore(periph->lock, flags);
6083  
6084 -       if (tmp & AT91_PMC_PCR_EN) {
6085 -               periph->div = PERIPHERAL_RSHIFT(tmp);
6086 +       if (status & AT91_PMC_PCR_EN) {
6087 +               periph->div = PERIPHERAL_RSHIFT(status);
6088                 periph->auto_div = false;
6089         } else {
6090                 clk_sam9x5_peripheral_autodiv(periph);
6091 @@ -318,15 +328,15 @@ static const struct clk_ops sam9x5_peripheral_ops = {
6092  };
6093  
6094  static struct clk * __init
6095 -at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6096 -                                   const char *parent_name, u32 id,
6097 -                                   const struct clk_range *range)
6098 +at91_clk_register_sam9x5_peripheral(struct regmap *regmap, spinlock_t *lock,
6099 +                                   const char *name, const char *parent_name,
6100 +                                   u32 id, const struct clk_range *range)
6101  {
6102         struct clk_sam9x5_peripheral *periph;
6103         struct clk *clk = NULL;
6104         struct clk_init_data init;
6105  
6106 -       if (!pmc || !name || !parent_name)
6107 +       if (!name || !parent_name)
6108                 return ERR_PTR(-EINVAL);
6109  
6110         periph = kzalloc(sizeof(*periph), GFP_KERNEL);
6111 @@ -342,7 +352,8 @@ at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6112         periph->id = id;
6113         periph->hw.init = &init;
6114         periph->div = 0;
6115 -       periph->pmc = pmc;
6116 +       periph->regmap = regmap;
6117 +       periph->lock = lock;
6118         periph->auto_div = true;
6119         periph->range = *range;
6120  
6121 @@ -356,7 +367,7 @@ at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6122  }
6123  
6124  static void __init
6125 -of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6126 +of_at91_clk_periph_setup(struct device_node *np, u8 type)
6127  {
6128         int num;
6129         u32 id;
6130 @@ -364,6 +375,7 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6131         const char *parent_name;
6132         const char *name;
6133         struct device_node *periphclknp;
6134 +       struct regmap *regmap;
6135  
6136         parent_name = of_clk_get_parent_name(np, 0);
6137         if (!parent_name)
6138 @@ -373,6 +385,10 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6139         if (!num || num > PERIPHERAL_MAX)
6140                 return;
6141  
6142 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6143 +       if (IS_ERR(regmap))
6144 +               return;
6145 +
6146         for_each_child_of_node(np, periphclknp) {
6147                 if (of_property_read_u32(periphclknp, "reg", &id))
6148                         continue;
6149 @@ -384,7 +400,7 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6150                         name = periphclknp->name;
6151  
6152                 if (type == PERIPHERAL_AT91RM9200) {
6153 -                       clk = at91_clk_register_peripheral(pmc, name,
6154 +                       clk = at91_clk_register_peripheral(regmap, name,
6155                                                            parent_name, id);
6156                 } else {
6157                         struct clk_range range = CLK_RANGE(0, 0);
6158 @@ -393,7 +409,9 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6159                                               "atmel,clk-output-range",
6160                                               &range);
6161  
6162 -                       clk = at91_clk_register_sam9x5_peripheral(pmc, name,
6163 +                       clk = at91_clk_register_sam9x5_peripheral(regmap,
6164 +                                                                 &pmc_pcr_lock,
6165 +                                                                 name,
6166                                                                   parent_name,
6167                                                                   id, &range);
6168                 }
6169 @@ -405,14 +423,16 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6170         }
6171  }
6172  
6173 -void __init of_at91rm9200_clk_periph_setup(struct device_node *np,
6174 -                                          struct at91_pmc *pmc)
6175 +static void __init of_at91rm9200_clk_periph_setup(struct device_node *np)
6176  {
6177 -       of_at91_clk_periph_setup(np, pmc, PERIPHERAL_AT91RM9200);
6178 +       of_at91_clk_periph_setup(np, PERIPHERAL_AT91RM9200);
6179  }
6180 +CLK_OF_DECLARE(at91rm9200_clk_periph, "atmel,at91rm9200-clk-peripheral",
6181 +              of_at91rm9200_clk_periph_setup);
6182  
6183 -void __init of_at91sam9x5_clk_periph_setup(struct device_node *np,
6184 -                                          struct at91_pmc *pmc)
6185 +static void __init of_at91sam9x5_clk_periph_setup(struct device_node *np)
6186  {
6187 -       of_at91_clk_periph_setup(np, pmc, PERIPHERAL_AT91SAM9X5);
6188 +       of_at91_clk_periph_setup(np, PERIPHERAL_AT91SAM9X5);
6189  }
6190 +CLK_OF_DECLARE(at91sam9x5_clk_periph, "atmel,at91sam9x5-clk-peripheral",
6191 +              of_at91sam9x5_clk_periph_setup);
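Note on the peripheral hunks above: the old pmc_lock()/pmc_unlock() pair becomes a shared pmc_pcr_lock spinlock because AT91_PMC_PCR is accessed indirectly. The driver first writes a peripheral ID into PCR to select a target, then reads or updates PCR to act on that peripheral, and the two steps must not be interleaved with another caller's selection. A minimal user-space sketch of that select-then-query sequence, assuming a stubbed 32-bit register (pcr_reg, pcr_read(), pcr_write() and the bit values are illustrative stand-ins, not the kernel API):

        #include <stdint.h>
        #include <stdio.h>

        /* Illustrative stand-ins for the PCR layout used in the hunks above. */
        #define PCR_PID_MASK  0x3f
        #define PCR_EN        (1u << 28)

        /* Fake backing register; real hardware returns the *selected*
         * peripheral's state on the read that follows the select write. */
        static uint32_t pcr_reg;

        static uint32_t pcr_read(void)        { return pcr_reg; }
        static void     pcr_write(uint32_t v) { pcr_reg = v; }

        /* Select peripheral `id`, then report whether its clock is enabled.
         * In the driver this whole sequence runs under pmc_pcr_lock so no
         * other caller can re-select a different id between the two steps. */
        static int periph_is_enabled(unsigned int id)
        {
                pcr_write(id & PCR_PID_MASK);          /* step 1: select  */
                return (pcr_read() & PCR_EN) ? 1 : 0;  /* step 2: query   */
        }

        int main(void)
        {
                printf("periph 2 enabled: %d\n", periph_is_enabled(2));
                return 0;
        }

In the driver the write-back paths additionally set AT91_PMC_PCR_CMD through regmap_update_bits() while still holding the same lock, so the selected ID and the committed configuration always belong to the same peripheral.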
6192 diff --git a/drivers/clk/at91/clk-pll.c b/drivers/clk/at91/clk-pll.c
6193 index 18b60f4895a6..fb2e0b56d4b7 100644
6194 --- a/drivers/clk/at91/clk-pll.c
6195 +++ b/drivers/clk/at91/clk-pll.c
6196 @@ -12,14 +12,8 @@
6197  #include <linux/clkdev.h>
6198  #include <linux/clk/at91_pmc.h>
6199  #include <linux/of.h>
6200 -#include <linux/of_address.h>
6201 -#include <linux/of_irq.h>
6202 -#include <linux/io.h>
6203 -#include <linux/kernel.h>
6204 -#include <linux/wait.h>
6205 -#include <linux/sched.h>
6206 -#include <linux/interrupt.h>
6207 -#include <linux/irq.h>
6208 +#include <linux/mfd/syscon.h>
6209 +#include <linux/regmap.h>
6210  
6211  #include "pmc.h"
6212  
6213 @@ -58,9 +52,7 @@ struct clk_pll_layout {
6214  
6215  struct clk_pll {
6216         struct clk_hw hw;
6217 -       struct at91_pmc *pmc;
6218 -       unsigned int irq;
6219 -       wait_queue_head_t wait;
6220 +       struct regmap *regmap;
6221         u8 id;
6222         u8 div;
6223         u8 range;
6224 @@ -69,20 +61,19 @@ struct clk_pll {
6225         const struct clk_pll_characteristics *characteristics;
6226  };
6227  
6228 -static irqreturn_t clk_pll_irq_handler(int irq, void *dev_id)
6229 +static inline bool clk_pll_ready(struct regmap *regmap, int id)
6230  {
6231 -       struct clk_pll *pll = (struct clk_pll *)dev_id;
6232 +       unsigned int status;
6233  
6234 -       wake_up(&pll->wait);
6235 -       disable_irq_nosync(pll->irq);
6236 +       regmap_read(regmap, AT91_PMC_SR, &status);
6237  
6238 -       return IRQ_HANDLED;
6239 +       return status & PLL_STATUS_MASK(id) ? 1 : 0;
6240  }
6241  
6242  static int clk_pll_prepare(struct clk_hw *hw)
6243  {
6244         struct clk_pll *pll = to_clk_pll(hw);
6245 -       struct at91_pmc *pmc = pll->pmc;
6246 +       struct regmap *regmap = pll->regmap;
6247         const struct clk_pll_layout *layout = pll->layout;
6248         const struct clk_pll_characteristics *characteristics =
6249                                                         pll->characteristics;
6250 @@ -90,39 +81,34 @@ static int clk_pll_prepare(struct clk_hw *hw)
6251         u32 mask = PLL_STATUS_MASK(id);
6252         int offset = PLL_REG(id);
6253         u8 out = 0;
6254 -       u32 pllr, icpr;
6255 +       unsigned int pllr;
6256 +       unsigned int status;
6257         u8 div;
6258         u16 mul;
6259  
6260 -       pllr = pmc_read(pmc, offset);
6261 +       regmap_read(regmap, offset, &pllr);
6262         div = PLL_DIV(pllr);
6263         mul = PLL_MUL(pllr, layout);
6264  
6265 -       if ((pmc_read(pmc, AT91_PMC_SR) & mask) &&
6266 +       regmap_read(regmap, AT91_PMC_SR, &status);
6267 +       if ((status & mask) &&
6268             (div == pll->div && mul == pll->mul))
6269                 return 0;
6270  
6271         if (characteristics->out)
6272                 out = characteristics->out[pll->range];
6273 -       if (characteristics->icpll) {
6274 -               icpr = pmc_read(pmc, AT91_PMC_PLLICPR) & ~PLL_ICPR_MASK(id);
6275 -               icpr |= (characteristics->icpll[pll->range] <<
6276 -                       PLL_ICPR_SHIFT(id));
6277 -               pmc_write(pmc, AT91_PMC_PLLICPR, icpr);
6278 -       }
6279  
6280 -       pllr &= ~layout->pllr_mask;
6281 -       pllr |= layout->pllr_mask &
6282 -              (pll->div | (PLL_MAX_COUNT << PLL_COUNT_SHIFT) |
6283 -               (out << PLL_OUT_SHIFT) |
6284 -               ((pll->mul & layout->mul_mask) << layout->mul_shift));
6285 -       pmc_write(pmc, offset, pllr);
6286 -
6287 -       while (!(pmc_read(pmc, AT91_PMC_SR) & mask)) {
6288 -               enable_irq(pll->irq);
6289 -               wait_event(pll->wait,
6290 -                          pmc_read(pmc, AT91_PMC_SR) & mask);
6291 -       }
6292 +       if (characteristics->icpll)
6293 +               regmap_update_bits(regmap, AT91_PMC_PLLICPR, PLL_ICPR_MASK(id),
6294 +                       characteristics->icpll[pll->range] << PLL_ICPR_SHIFT(id));
6295 +
6296 +       regmap_update_bits(regmap, offset, layout->pllr_mask,
6297 +                       pll->div | (PLL_MAX_COUNT << PLL_COUNT_SHIFT) |
6298 +                       (out << PLL_OUT_SHIFT) |
6299 +                       ((pll->mul & layout->mul_mask) << layout->mul_shift));
6300 +
6301 +       while (!clk_pll_ready(regmap, pll->id))
6302 +               cpu_relax();
6303  
6304         return 0;
6305  }
6306 @@ -130,32 +116,35 @@ static int clk_pll_prepare(struct clk_hw *hw)
6307  static int clk_pll_is_prepared(struct clk_hw *hw)
6308  {
6309         struct clk_pll *pll = to_clk_pll(hw);
6310 -       struct at91_pmc *pmc = pll->pmc;
6311  
6312 -       return !!(pmc_read(pmc, AT91_PMC_SR) &
6313 -                 PLL_STATUS_MASK(pll->id));
6314 +       return clk_pll_ready(pll->regmap, pll->id);
6315  }
6316  
6317  static void clk_pll_unprepare(struct clk_hw *hw)
6318  {
6319         struct clk_pll *pll = to_clk_pll(hw);
6320 -       struct at91_pmc *pmc = pll->pmc;
6321 -       const struct clk_pll_layout *layout = pll->layout;
6322 -       int offset = PLL_REG(pll->id);
6323 -       u32 tmp = pmc_read(pmc, offset) & ~(layout->pllr_mask);
6324 +       unsigned int mask = pll->layout->pllr_mask;
6325  
6326 -       pmc_write(pmc, offset, tmp);
6327 +       regmap_update_bits(pll->regmap, PLL_REG(pll->id), mask, ~mask);
6328  }
6329  
6330  static unsigned long clk_pll_recalc_rate(struct clk_hw *hw,
6331                                          unsigned long parent_rate)
6332  {
6333         struct clk_pll *pll = to_clk_pll(hw);
6334 +       unsigned int pllr;
6335 +       u16 mul;
6336 +       u8 div;
6337  
6338 -       if (!pll->div || !pll->mul)
6339 +       regmap_read(pll->regmap, PLL_REG(pll->id), &pllr);
6340 +
6341 +       div = PLL_DIV(pllr);
6342 +       mul = PLL_MUL(pllr, pll->layout);
6343 +
6344 +       if (!div || !mul)
6345                 return 0;
6346  
6347 -       return (parent_rate / pll->div) * (pll->mul + 1);
6348 +       return (parent_rate / div) * (mul + 1);
6349  }
6350  
6351  static long clk_pll_get_best_div_mul(struct clk_pll *pll, unsigned long rate,
6352 @@ -308,7 +297,7 @@ static const struct clk_ops pll_ops = {
6353  };
6354  
6355  static struct clk * __init
6356 -at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6357 +at91_clk_register_pll(struct regmap *regmap, const char *name,
6358                       const char *parent_name, u8 id,
6359                       const struct clk_pll_layout *layout,
6360                       const struct clk_pll_characteristics *characteristics)
6361 @@ -316,9 +305,8 @@ at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6362         struct clk_pll *pll;
6363         struct clk *clk = NULL;
6364         struct clk_init_data init;
6365 -       int ret;
6366         int offset = PLL_REG(id);
6367 -       u32 tmp;
6368 +       unsigned int pllr;
6369  
6370         if (id > PLL_MAX_ID)
6371                 return ERR_PTR(-EINVAL);
6372 @@ -337,23 +325,13 @@ at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6373         pll->hw.init = &init;
6374         pll->layout = layout;
6375         pll->characteristics = characteristics;
6376 -       pll->pmc = pmc;
6377 -       pll->irq = irq;
6378 -       tmp = pmc_read(pmc, offset) & layout->pllr_mask;
6379 -       pll->div = PLL_DIV(tmp);
6380 -       pll->mul = PLL_MUL(tmp, layout);
6381 -       init_waitqueue_head(&pll->wait);
6382 -       irq_set_status_flags(pll->irq, IRQ_NOAUTOEN);
6383 -       ret = request_irq(pll->irq, clk_pll_irq_handler, IRQF_TRIGGER_HIGH,
6384 -                         id ? "clk-pllb" : "clk-plla", pll);
6385 -       if (ret) {
6386 -               kfree(pll);
6387 -               return ERR_PTR(ret);
6388 -       }
6389 +       pll->regmap = regmap;
6390 +       regmap_read(regmap, offset, &pllr);
6391 +       pll->div = PLL_DIV(pllr);
6392 +       pll->mul = PLL_MUL(pllr, layout);
6393  
6394         clk = clk_register(NULL, &pll->hw);
6395         if (IS_ERR(clk)) {
6396 -               free_irq(pll->irq, pll);
6397                 kfree(pll);
6398         }
6399  
6400 @@ -483,12 +461,12 @@ out_free_characteristics:
6401  }
6402  
6403  static void __init
6404 -of_at91_clk_pll_setup(struct device_node *np, struct at91_pmc *pmc,
6405 +of_at91_clk_pll_setup(struct device_node *np,
6406                       const struct clk_pll_layout *layout)
6407  {
6408         u32 id;
6409 -       unsigned int irq;
6410         struct clk *clk;
6411 +       struct regmap *regmap;
6412         const char *parent_name;
6413         const char *name = np->name;
6414         struct clk_pll_characteristics *characteristics;
6415 @@ -500,15 +478,15 @@ of_at91_clk_pll_setup(struct device_node *np, struct at91_pmc *pmc,
6416  
6417         of_property_read_string(np, "clock-output-names", &name);
6418  
6419 -       characteristics = of_at91_clk_pll_get_characteristics(np);
6420 -       if (!characteristics)
6421 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6422 +       if (IS_ERR(regmap))
6423                 return;
6424  
6425 -       irq = irq_of_parse_and_map(np, 0);
6426 -       if (!irq)
6427 +       characteristics = of_at91_clk_pll_get_characteristics(np);
6428 +       if (!characteristics)
6429                 return;
6430  
6431 -       clk = at91_clk_register_pll(pmc, irq, name, parent_name, id, layout,
6432 +       clk = at91_clk_register_pll(regmap, name, parent_name, id, layout,
6433                                     characteristics);
6434         if (IS_ERR(clk))
6435                 goto out_free_characteristics;
6436 @@ -520,26 +498,30 @@ out_free_characteristics:
6437         kfree(characteristics);
6438  }
6439  
6440 -void __init of_at91rm9200_clk_pll_setup(struct device_node *np,
6441 -                                              struct at91_pmc *pmc)
6442 +static void __init of_at91rm9200_clk_pll_setup(struct device_node *np)
6443  {
6444 -       of_at91_clk_pll_setup(np, pmc, &at91rm9200_pll_layout);
6445 +       of_at91_clk_pll_setup(np, &at91rm9200_pll_layout);
6446  }
6447 +CLK_OF_DECLARE(at91rm9200_clk_pll, "atmel,at91rm9200-clk-pll",
6448 +              of_at91rm9200_clk_pll_setup);
6449  
6450 -void __init of_at91sam9g45_clk_pll_setup(struct device_node *np,
6451 -                                               struct at91_pmc *pmc)
6452 +static void __init of_at91sam9g45_clk_pll_setup(struct device_node *np)
6453  {
6454 -       of_at91_clk_pll_setup(np, pmc, &at91sam9g45_pll_layout);
6455 +       of_at91_clk_pll_setup(np, &at91sam9g45_pll_layout);
6456  }
6457 +CLK_OF_DECLARE(at91sam9g45_clk_pll, "atmel,at91sam9g45-clk-pll",
6458 +              of_at91sam9g45_clk_pll_setup);
6459  
6460 -void __init of_at91sam9g20_clk_pllb_setup(struct device_node *np,
6461 -                                                struct at91_pmc *pmc)
6462 +static void __init of_at91sam9g20_clk_pllb_setup(struct device_node *np)
6463  {
6464 -       of_at91_clk_pll_setup(np, pmc, &at91sam9g20_pllb_layout);
6465 +       of_at91_clk_pll_setup(np, &at91sam9g20_pllb_layout);
6466  }
6467 +CLK_OF_DECLARE(at91sam9g20_clk_pllb, "atmel,at91sam9g20-clk-pllb",
6468 +              of_at91sam9g20_clk_pllb_setup);
6469  
6470 -void __init of_sama5d3_clk_pll_setup(struct device_node *np,
6471 -                                           struct at91_pmc *pmc)
6472 +static void __init of_sama5d3_clk_pll_setup(struct device_node *np)
6473  {
6474 -       of_at91_clk_pll_setup(np, pmc, &sama5d3_pll_layout);
6475 +       of_at91_clk_pll_setup(np, &sama5d3_pll_layout);
6476  }
6477 +CLK_OF_DECLARE(sama5d3_clk_pll, "atmel,sama5d3-clk-pll",
6478 +              of_sama5d3_clk_pll_setup);
6479 diff --git a/drivers/clk/at91/clk-plldiv.c b/drivers/clk/at91/clk-plldiv.c
6480 index ea226562bb40..2bed26481027 100644
6481 --- a/drivers/clk/at91/clk-plldiv.c
6482 +++ b/drivers/clk/at91/clk-plldiv.c
6483 @@ -12,8 +12,8 @@
6484  #include <linux/clkdev.h>
6485  #include <linux/clk/at91_pmc.h>
6486  #include <linux/of.h>
6487 -#include <linux/of_address.h>
6488 -#include <linux/io.h>
6489 +#include <linux/mfd/syscon.h>
6490 +#include <linux/regmap.h>
6491  
6492  #include "pmc.h"
6493  
6494 @@ -21,16 +21,18 @@
6495  
6496  struct clk_plldiv {
6497         struct clk_hw hw;
6498 -       struct at91_pmc *pmc;
6499 +       struct regmap *regmap;
6500  };
6501  
6502  static unsigned long clk_plldiv_recalc_rate(struct clk_hw *hw,
6503                                             unsigned long parent_rate)
6504  {
6505         struct clk_plldiv *plldiv = to_clk_plldiv(hw);
6506 -       struct at91_pmc *pmc = plldiv->pmc;
6507 +       unsigned int mckr;
6508  
6509 -       if (pmc_read(pmc, AT91_PMC_MCKR) & AT91_PMC_PLLADIV2)
6510 +       regmap_read(plldiv->regmap, AT91_PMC_MCKR, &mckr);
6511 +
6512 +       if (mckr & AT91_PMC_PLLADIV2)
6513                 return parent_rate / 2;
6514  
6515         return parent_rate;
6516 @@ -57,18 +59,12 @@ static int clk_plldiv_set_rate(struct clk_hw *hw, unsigned long rate,
6517                                unsigned long parent_rate)
6518  {
6519         struct clk_plldiv *plldiv = to_clk_plldiv(hw);
6520 -       struct at91_pmc *pmc = plldiv->pmc;
6521 -       u32 tmp;
6522  
6523 -       if (parent_rate != rate && (parent_rate / 2) != rate)
6524 +       if ((parent_rate != rate) && (parent_rate / 2 != rate))
6525                 return -EINVAL;
6526  
6527 -       pmc_lock(pmc);
6528 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & ~AT91_PMC_PLLADIV2;
6529 -       if ((parent_rate / 2) == rate)
6530 -               tmp |= AT91_PMC_PLLADIV2;
6531 -       pmc_write(pmc, AT91_PMC_MCKR, tmp);
6532 -       pmc_unlock(pmc);
6533 +       regmap_update_bits(plldiv->regmap, AT91_PMC_MCKR, AT91_PMC_PLLADIV2,
6534 +                          parent_rate != rate ? AT91_PMC_PLLADIV2 : 0);
6535  
6536         return 0;
6537  }
6538 @@ -80,7 +76,7 @@ static const struct clk_ops plldiv_ops = {
6539  };
6540  
6541  static struct clk * __init
6542 -at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6543 +at91_clk_register_plldiv(struct regmap *regmap, const char *name,
6544                          const char *parent_name)
6545  {
6546         struct clk_plldiv *plldiv;
6547 @@ -98,7 +94,7 @@ at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6548         init.flags = CLK_SET_RATE_GATE;
6549  
6550         plldiv->hw.init = &init;
6551 -       plldiv->pmc = pmc;
6552 +       plldiv->regmap = regmap;
6553  
6554         clk = clk_register(NULL, &plldiv->hw);
6555  
6556 @@ -109,27 +105,27 @@ at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6557  }
6558  
6559  static void __init
6560 -of_at91_clk_plldiv_setup(struct device_node *np, struct at91_pmc *pmc)
6561 +of_at91sam9x5_clk_plldiv_setup(struct device_node *np)
6562  {
6563         struct clk *clk;
6564         const char *parent_name;
6565         const char *name = np->name;
6566 +       struct regmap *regmap;
6567  
6568         parent_name = of_clk_get_parent_name(np, 0);
6569  
6570         of_property_read_string(np, "clock-output-names", &name);
6571  
6572 -       clk = at91_clk_register_plldiv(pmc, name, parent_name);
6573 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6574 +       if (IS_ERR(regmap))
6575 +               return;
6576  
6577 +       clk = at91_clk_register_plldiv(regmap, name, parent_name);
6578         if (IS_ERR(clk))
6579                 return;
6580  
6581         of_clk_add_provider(np, of_clk_src_simple_get, clk);
6582         return;
6583  }
6584 -
6585 -void __init of_at91sam9x5_clk_plldiv_setup(struct device_node *np,
6586 -                                          struct at91_pmc *pmc)
6587 -{
6588 -       of_at91_clk_plldiv_setup(np, pmc);
6589 -}
6590 +CLK_OF_DECLARE(at91sam9x5_clk_plldiv, "atmel,at91sam9x5-clk-plldiv",
6591 +              of_at91sam9x5_clk_plldiv_setup);
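Note on the clk-pll.c and clk-plldiv.c hunks above: the explicit read/mask/or/write sequences on PLLR, PLLICPR and MCKR are folded into single regmap_update_bits() calls. The effect is a read-modify-write restricted to the bits in the mask argument: the register becomes (old & ~mask) | (val & mask). A small self-contained sketch of that semantics against a plain variable, assuming an illustrative update_bits() helper and demo masks (not the regmap API itself):

        #include <stdint.h>
        #include <stdio.h>

        /* Illustrative re-implementation of the update-bits semantics:
         * only the bits covered by `mask` are replaced, taken from `val`. */
        static void update_bits(uint32_t *reg, uint32_t mask, uint32_t val)
        {
                *reg = (*reg & ~mask) | (val & mask);
        }

        int main(void)
        {
                uint32_t reg = 0x00003f01;          /* arbitrary start value */

                /* Clear a field the way clk_pll_unprepare() now does:
                 * with val = ~mask, (val & mask) is 0, so the field is zeroed. */
                update_bits(&reg, 0x000000ff, ~0x000000ffu);
                printf("after clear: 0x%08x\n", reg);   /* 0x00003f00 */

                /* Set a single flag without touching the rest, the shape of
                 * the AT91_PMC_PLLADIV2 update in clk-plldiv.c (demo bit). */
                update_bits(&reg, 1u << 12, 1u << 12);
                printf("after set:   0x%08x\n", reg);   /* 0x00004f00 */

                return 0;
        }

Passing val = ~mask, as clk_pll_unprepare() does above, therefore clears the masked field, because ~mask contributes no bits inside mask.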
6592 diff --git a/drivers/clk/at91/clk-programmable.c b/drivers/clk/at91/clk-programmable.c
6593 index 14b270b85fec..bc0be629671b 100644
6594 --- a/drivers/clk/at91/clk-programmable.c
6595 +++ b/drivers/clk/at91/clk-programmable.c
6596 @@ -12,10 +12,8 @@
6597  #include <linux/clkdev.h>
6598  #include <linux/clk/at91_pmc.h>
6599  #include <linux/of.h>
6600 -#include <linux/of_address.h>
6601 -#include <linux/io.h>
6602 -#include <linux/wait.h>
6603 -#include <linux/sched.h>
6604 +#include <linux/mfd/syscon.h>
6605 +#include <linux/regmap.h>
6606  
6607  #include "pmc.h"
6608  
6609 @@ -24,6 +22,7 @@
6610  
6611  #define PROG_STATUS_MASK(id)   (1 << ((id) + 8))
6612  #define PROG_PRES_MASK         0x7
6613 +#define PROG_PRES(layout, pckr)        ((pckr >> layout->pres_shift) & PROG_PRES_MASK)
6614  #define PROG_MAX_RM9200_CSS    3
6615  
6616  struct clk_programmable_layout {
6617 @@ -34,7 +33,7 @@ struct clk_programmable_layout {
6618  
6619  struct clk_programmable {
6620         struct clk_hw hw;
6621 -       struct at91_pmc *pmc;
6622 +       struct regmap *regmap;
6623         u8 id;
6624         const struct clk_programmable_layout *layout;
6625  };
6626 @@ -44,14 +43,12 @@ struct clk_programmable {
6627  static unsigned long clk_programmable_recalc_rate(struct clk_hw *hw,
6628                                                   unsigned long parent_rate)
6629  {
6630 -       u32 pres;
6631         struct clk_programmable *prog = to_clk_programmable(hw);
6632 -       struct at91_pmc *pmc = prog->pmc;
6633 -       const struct clk_programmable_layout *layout = prog->layout;
6634 +       unsigned int pckr;
6635 +
6636 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6637  
6638 -       pres = (pmc_read(pmc, AT91_PMC_PCKR(prog->id)) >> layout->pres_shift) &
6639 -              PROG_PRES_MASK;
6640 -       return parent_rate >> pres;
6641 +       return parent_rate >> PROG_PRES(prog->layout, pckr);
6642  }
6643  
6644  static int clk_programmable_determine_rate(struct clk_hw *hw,
6645 @@ -101,36 +98,36 @@ static int clk_programmable_set_parent(struct clk_hw *hw, u8 index)
6646  {
6647         struct clk_programmable *prog = to_clk_programmable(hw);
6648         const struct clk_programmable_layout *layout = prog->layout;
6649 -       struct at91_pmc *pmc = prog->pmc;
6650 -       u32 tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id)) & ~layout->css_mask;
6651 +       unsigned int mask = layout->css_mask;
6652 +       unsigned int pckr = 0;
6653  
6654         if (layout->have_slck_mck)
6655 -               tmp &= AT91_PMC_CSSMCK_MCK;
6656 +               mask |= AT91_PMC_CSSMCK_MCK;
6657  
6658         if (index > layout->css_mask) {
6659 -               if (index > PROG_MAX_RM9200_CSS && layout->have_slck_mck) {
6660 -                       tmp |= AT91_PMC_CSSMCK_MCK;
6661 -                       return 0;
6662 -               } else {
6663 +               if (index > PROG_MAX_RM9200_CSS && !layout->have_slck_mck)
6664                         return -EINVAL;
6665 -               }
6666 +
6667 +               pckr |= AT91_PMC_CSSMCK_MCK;
6668         }
6669  
6670 -       pmc_write(pmc, AT91_PMC_PCKR(prog->id), tmp | index);
6671 +       regmap_update_bits(prog->regmap, AT91_PMC_PCKR(prog->id), mask, pckr);
6672 +
6673         return 0;
6674  }
6675  
6676  static u8 clk_programmable_get_parent(struct clk_hw *hw)
6677  {
6678 -       u32 tmp;
6679 -       u8 ret;
6680         struct clk_programmable *prog = to_clk_programmable(hw);
6681 -       struct at91_pmc *pmc = prog->pmc;
6682         const struct clk_programmable_layout *layout = prog->layout;
6683 +       unsigned int pckr;
6684 +       u8 ret;
6685 +
6686 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6687 +
6688 +       ret = pckr & layout->css_mask;
6689  
6690 -       tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id));
6691 -       ret = tmp & layout->css_mask;
6692 -       if (layout->have_slck_mck && (tmp & AT91_PMC_CSSMCK_MCK) && !ret)
6693 +       if (layout->have_slck_mck && (pckr & AT91_PMC_CSSMCK_MCK) && !ret)
6694                 ret = PROG_MAX_RM9200_CSS + 1;
6695  
6696         return ret;
6697 @@ -140,26 +137,27 @@ static int clk_programmable_set_rate(struct clk_hw *hw, unsigned long rate,
6698                                      unsigned long parent_rate)
6699  {
6700         struct clk_programmable *prog = to_clk_programmable(hw);
6701 -       struct at91_pmc *pmc = prog->pmc;
6702         const struct clk_programmable_layout *layout = prog->layout;
6703         unsigned long div = parent_rate / rate;
6704 +       unsigned int pckr;
6705         int shift = 0;
6706 -       u32 tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id)) &
6707 -                 ~(PROG_PRES_MASK << layout->pres_shift);
6708 +
6709 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6710  
6711         if (!div)
6712                 return -EINVAL;
6713  
6714         shift = fls(div) - 1;
6715  
6716 -       if (div != (1<<shift))
6717 +       if (div != (1 << shift))
6718                 return -EINVAL;
6719  
6720         if (shift >= PROG_PRES_MASK)
6721                 return -EINVAL;
6722  
6723 -       pmc_write(pmc, AT91_PMC_PCKR(prog->id),
6724 -                 tmp | (shift << layout->pres_shift));
6725 +       regmap_update_bits(prog->regmap, AT91_PMC_PCKR(prog->id),
6726 +                          PROG_PRES_MASK << layout->pres_shift,
6727 +                          shift << layout->pres_shift);
6728  
6729         return 0;
6730  }
6731 @@ -173,7 +171,7 @@ static const struct clk_ops programmable_ops = {
6732  };
6733  
6734  static struct clk * __init
6735 -at91_clk_register_programmable(struct at91_pmc *pmc,
6736 +at91_clk_register_programmable(struct regmap *regmap,
6737                                const char *name, const char **parent_names,
6738                                u8 num_parents, u8 id,
6739                                const struct clk_programmable_layout *layout)
6740 @@ -198,7 +196,7 @@ at91_clk_register_programmable(struct at91_pmc *pmc,
6741         prog->id = id;
6742         prog->layout = layout;
6743         prog->hw.init = &init;
6744 -       prog->pmc = pmc;
6745 +       prog->regmap = regmap;
6746  
6747         clk = clk_register(NULL, &prog->hw);
6748         if (IS_ERR(clk))
6749 @@ -226,7 +224,7 @@ static const struct clk_programmable_layout at91sam9x5_programmable_layout = {
6750  };
6751  
6752  static void __init
6753 -of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6754 +of_at91_clk_prog_setup(struct device_node *np,
6755                        const struct clk_programmable_layout *layout)
6756  {
6757         int num;
6758 @@ -236,6 +234,7 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6759         const char *parent_names[PROG_SOURCE_MAX];
6760         const char *name;
6761         struct device_node *progclknp;
6762 +       struct regmap *regmap;
6763  
6764         num_parents = of_clk_get_parent_count(np);
6765         if (num_parents <= 0 || num_parents > PROG_SOURCE_MAX)
6766 @@ -247,6 +246,10 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6767         if (!num || num > (PROG_ID_MAX + 1))
6768                 return;
6769  
6770 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6771 +       if (IS_ERR(regmap))
6772 +               return;
6773 +
6774         for_each_child_of_node(np, progclknp) {
6775                 if (of_property_read_u32(progclknp, "reg", &id))
6776                         continue;
6777 @@ -254,7 +257,7 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6778                 if (of_property_read_string(np, "clock-output-names", &name))
6779                         name = progclknp->name;
6780  
6781 -               clk = at91_clk_register_programmable(pmc, name,
6782 +               clk = at91_clk_register_programmable(regmap, name,
6783                                                      parent_names, num_parents,
6784                                                      id, layout);
6785                 if (IS_ERR(clk))
6786 @@ -265,20 +268,23 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6787  }
6788  
6789  
6790 -void __init of_at91rm9200_clk_prog_setup(struct device_node *np,
6791 -                                        struct at91_pmc *pmc)
6792 +static void __init of_at91rm9200_clk_prog_setup(struct device_node *np)
6793  {
6794 -       of_at91_clk_prog_setup(np, pmc, &at91rm9200_programmable_layout);
6795 +       of_at91_clk_prog_setup(np, &at91rm9200_programmable_layout);
6796  }
6797 +CLK_OF_DECLARE(at91rm9200_clk_prog, "atmel,at91rm9200-clk-programmable",
6798 +              of_at91rm9200_clk_prog_setup);
6799  
6800 -void __init of_at91sam9g45_clk_prog_setup(struct device_node *np,
6801 -                                         struct at91_pmc *pmc)
6802 +static void __init of_at91sam9g45_clk_prog_setup(struct device_node *np)
6803  {
6804 -       of_at91_clk_prog_setup(np, pmc, &at91sam9g45_programmable_layout);
6805 +       of_at91_clk_prog_setup(np, &at91sam9g45_programmable_layout);
6806  }
6807 +CLK_OF_DECLARE(at91sam9g45_clk_prog, "atmel,at91sam9g45-clk-programmable",
6808 +              of_at91sam9g45_clk_prog_setup);
6809  
6810 -void __init of_at91sam9x5_clk_prog_setup(struct device_node *np,
6811 -                                        struct at91_pmc *pmc)
6812 +static void __init of_at91sam9x5_clk_prog_setup(struct device_node *np)
6813  {
6814 -       of_at91_clk_prog_setup(np, pmc, &at91sam9x5_programmable_layout);
6815 +       of_at91_clk_prog_setup(np, &at91sam9x5_programmable_layout);
6816  }
6817 +CLK_OF_DECLARE(at91sam9x5_clk_prog, "atmel,at91sam9x5-clk-programmable",
6818 +              of_at91sam9x5_clk_prog_setup);
6819 diff --git a/drivers/clk/at91/clk-slow.c b/drivers/clk/at91/clk-slow.c
6820 index d0d5076a9b94..221c09684ba3 100644
6821 --- a/drivers/clk/at91/clk-slow.c
6822 +++ b/drivers/clk/at91/clk-slow.c
6823 @@ -13,17 +13,11 @@
6824  #include <linux/clk.h>
6825  #include <linux/clk-provider.h>
6826  #include <linux/clkdev.h>
6827 -#include <linux/slab.h>
6828  #include <linux/clk/at91_pmc.h>
6829  #include <linux/delay.h>
6830  #include <linux/of.h>
6831 -#include <linux/of_address.h>
6832 -#include <linux/of_irq.h>
6833 -#include <linux/io.h>
6834 -#include <linux/interrupt.h>
6835 -#include <linux/irq.h>
6836 -#include <linux/sched.h>
6837 -#include <linux/wait.h>
6838 +#include <linux/mfd/syscon.h>
6839 +#include <linux/regmap.h>
6840  
6841  #include "pmc.h"
6842  #include "sckc.h"
6843 @@ -59,7 +53,7 @@ struct clk_slow_rc_osc {
6844  
6845  struct clk_sam9260_slow {
6846         struct clk_hw hw;
6847 -       struct at91_pmc *pmc;
6848 +       struct regmap *regmap;
6849  };
6850  
6851  #define to_clk_sam9260_slow(hw) container_of(hw, struct clk_sam9260_slow, hw)
6852 @@ -393,8 +387,11 @@ void __init of_at91sam9x5_clk_slow_setup(struct device_node *np,
6853  static u8 clk_sam9260_slow_get_parent(struct clk_hw *hw)
6854  {
6855         struct clk_sam9260_slow *slowck = to_clk_sam9260_slow(hw);
6856 +       unsigned int status;
6857  
6858 -       return !!(pmc_read(slowck->pmc, AT91_PMC_SR) & AT91_PMC_OSCSEL);
6859 +       regmap_read(slowck->regmap, AT91_PMC_SR, &status);
6860 +
6861 +       return status & AT91_PMC_OSCSEL ? 1 : 0;
6862  }
6863  
6864  static const struct clk_ops sam9260_slow_ops = {
6865 @@ -402,7 +399,7 @@ static const struct clk_ops sam9260_slow_ops = {
6866  };
6867  
6868  static struct clk * __init
6869 -at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6870 +at91_clk_register_sam9260_slow(struct regmap *regmap,
6871                                const char *name,
6872                                const char **parent_names,
6873                                int num_parents)
6874 @@ -411,7 +408,7 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6875         struct clk *clk = NULL;
6876         struct clk_init_data init;
6877  
6878 -       if (!pmc || !name)
6879 +       if (!name)
6880                 return ERR_PTR(-EINVAL);
6881  
6882         if (!parent_names || !num_parents)
6883 @@ -428,7 +425,7 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6884         init.flags = 0;
6885  
6886         slowck->hw.init = &init;
6887 -       slowck->pmc = pmc;
6888 +       slowck->regmap = regmap;
6889  
6890         clk = clk_register(NULL, &slowck->hw);
6891         if (IS_ERR(clk))
6892 @@ -439,29 +436,34 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6893         return clk;
6894  }
6895  
6896 -void __init of_at91sam9260_clk_slow_setup(struct device_node *np,
6897 -                                         struct at91_pmc *pmc)
6898 +static void __init of_at91sam9260_clk_slow_setup(struct device_node *np)
6899  {
6900         struct clk *clk;
6901         const char *parent_names[2];
6902         int num_parents;
6903         const char *name = np->name;
6904 +       struct regmap *regmap;
6905  
6906         num_parents = of_clk_get_parent_count(np);
6907         if (num_parents != 2)
6908                 return;
6909  
6910         of_clk_parent_fill(np, parent_names, num_parents);
6911 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6912 +       if (IS_ERR(regmap))
6913 +               return;
6914  
6915         of_property_read_string(np, "clock-output-names", &name);
6916  
6917 -       clk = at91_clk_register_sam9260_slow(pmc, name, parent_names,
6918 +       clk = at91_clk_register_sam9260_slow(regmap, name, parent_names,
6919                                              num_parents);
6920         if (IS_ERR(clk))
6921                 return;
6922  
6923         of_clk_add_provider(np, of_clk_src_simple_get, clk);
6924  }
6925 +CLK_OF_DECLARE(at91sam9260_clk_slow, "atmel,at91sam9260-clk-slow",
6926 +              of_at91sam9260_clk_slow_setup);
6927  
6928  /*
6929   * FIXME: All slow clk users are not properly claiming it (get + prepare +
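In the read paths, the pmc_read() return value becomes an output parameter of regmap_read(); the converted helpers test the bit in the value that regmap_read() filled in and ignore its error code, since the MMIO-backed PMC regmap is not expected to fail. A minimal sketch of the idiom, reusing the register and bit names from the hunk above (the explicit error check is only there to show the full signature):

#include <linux/clk/at91_pmc.h>
#include <linux/regmap.h>

static u8 sam9260_slow_get_parent_sketch(struct regmap *regmap)
{
        unsigned int status;

        /* regmap_read() returns 0 or -errno and fills in *status. */
        if (regmap_read(regmap, AT91_PMC_SR, &status))
                return 0;

        /* OSCSEL selects between the two slow clock parents (index 0 or 1). */
        return status & AT91_PMC_OSCSEL ? 1 : 0;
}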
6930 diff --git a/drivers/clk/at91/clk-smd.c b/drivers/clk/at91/clk-smd.c
6931 index a7f8501cfa05..e6948a52005a 100644
6932 --- a/drivers/clk/at91/clk-smd.c
6933 +++ b/drivers/clk/at91/clk-smd.c
6934 @@ -12,8 +12,8 @@
6935  #include <linux/clkdev.h>
6936  #include <linux/clk/at91_pmc.h>
6937  #include <linux/of.h>
6938 -#include <linux/of_address.h>
6939 -#include <linux/io.h>
6940 +#include <linux/mfd/syscon.h>
6941 +#include <linux/regmap.h>
6942  
6943  #include "pmc.h"
6944  
6945 @@ -24,7 +24,7 @@
6946  
6947  struct at91sam9x5_clk_smd {
6948         struct clk_hw hw;
6949 -       struct at91_pmc *pmc;
6950 +       struct regmap *regmap;
6951  };
6952  
6953  #define to_at91sam9x5_clk_smd(hw) \
6954 @@ -33,13 +33,13 @@ struct at91sam9x5_clk_smd {
6955  static unsigned long at91sam9x5_clk_smd_recalc_rate(struct clk_hw *hw,
6956                                                     unsigned long parent_rate)
6957  {
6958 -       u32 tmp;
6959 -       u8 smddiv;
6960         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
6961 -       struct at91_pmc *pmc = smd->pmc;
6962 +       unsigned int smdr;
6963 +       u8 smddiv;
6964 +
6965 +       regmap_read(smd->regmap, AT91_PMC_SMD, &smdr);
6966 +       smddiv = (smdr & AT91_PMC_SMD_DIV) >> SMD_DIV_SHIFT;
6967  
6968 -       tmp = pmc_read(pmc, AT91_PMC_SMD);
6969 -       smddiv = (tmp & AT91_PMC_SMD_DIV) >> SMD_DIV_SHIFT;
6970         return parent_rate / (smddiv + 1);
6971  }
6972  
6973 @@ -67,40 +67,38 @@ static long at91sam9x5_clk_smd_round_rate(struct clk_hw *hw, unsigned long rate,
6974  
6975  static int at91sam9x5_clk_smd_set_parent(struct clk_hw *hw, u8 index)
6976  {
6977 -       u32 tmp;
6978         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
6979 -       struct at91_pmc *pmc = smd->pmc;
6980  
6981         if (index > 1)
6982                 return -EINVAL;
6983 -       tmp = pmc_read(pmc, AT91_PMC_SMD) & ~AT91_PMC_SMDS;
6984 -       if (index)
6985 -               tmp |= AT91_PMC_SMDS;
6986 -       pmc_write(pmc, AT91_PMC_SMD, tmp);
6987 +
6988 +       regmap_update_bits(smd->regmap, AT91_PMC_SMD, AT91_PMC_SMDS,
6989 +                          index ? AT91_PMC_SMDS : 0);
6990 +
6991         return 0;
6992  }
6993  
6994  static u8 at91sam9x5_clk_smd_get_parent(struct clk_hw *hw)
6995  {
6996         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
6997 -       struct at91_pmc *pmc = smd->pmc;
6998 +       unsigned int smdr;
6999  
7000 -       return pmc_read(pmc, AT91_PMC_SMD) & AT91_PMC_SMDS;
7001 +       regmap_read(smd->regmap, AT91_PMC_SMD, &smdr);
7002 +
7003 +       return smdr & AT91_PMC_SMDS;
7004  }
7005  
7006  static int at91sam9x5_clk_smd_set_rate(struct clk_hw *hw, unsigned long rate,
7007                                        unsigned long parent_rate)
7008  {
7009 -       u32 tmp;
7010         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
7011 -       struct at91_pmc *pmc = smd->pmc;
7012         unsigned long div = parent_rate / rate;
7013  
7014         if (parent_rate % rate || div < 1 || div > (SMD_MAX_DIV + 1))
7015                 return -EINVAL;
7016 -       tmp = pmc_read(pmc, AT91_PMC_SMD) & ~AT91_PMC_SMD_DIV;
7017 -       tmp |= (div - 1) << SMD_DIV_SHIFT;
7018 -       pmc_write(pmc, AT91_PMC_SMD, tmp);
7019 +
7020 +       regmap_update_bits(smd->regmap, AT91_PMC_SMD, AT91_PMC_SMD_DIV,
7021 +                          (div - 1) << SMD_DIV_SHIFT);
7022  
7023         return 0;
7024  }
7025 @@ -114,7 +112,7 @@ static const struct clk_ops at91sam9x5_smd_ops = {
7026  };
7027  
7028  static struct clk * __init
7029 -at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7030 +at91sam9x5_clk_register_smd(struct regmap *regmap, const char *name,
7031                             const char **parent_names, u8 num_parents)
7032  {
7033         struct at91sam9x5_clk_smd *smd;
7034 @@ -132,7 +130,7 @@ at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7035         init.flags = CLK_SET_RATE_GATE | CLK_SET_PARENT_GATE;
7036  
7037         smd->hw.init = &init;
7038 -       smd->pmc = pmc;
7039 +       smd->regmap = regmap;
7040  
7041         clk = clk_register(NULL, &smd->hw);
7042         if (IS_ERR(clk))
7043 @@ -141,13 +139,13 @@ at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7044         return clk;
7045  }
7046  
7047 -void __init of_at91sam9x5_clk_smd_setup(struct device_node *np,
7048 -                                       struct at91_pmc *pmc)
7049 +static void __init of_at91sam9x5_clk_smd_setup(struct device_node *np)
7050  {
7051         struct clk *clk;
7052         int num_parents;
7053         const char *parent_names[SMD_SOURCE_MAX];
7054         const char *name = np->name;
7055 +       struct regmap *regmap;
7056  
7057         num_parents = of_clk_get_parent_count(np);
7058         if (num_parents <= 0 || num_parents > SMD_SOURCE_MAX)
7059 @@ -157,10 +155,16 @@ void __init of_at91sam9x5_clk_smd_setup(struct device_node *np,
7060  
7061         of_property_read_string(np, "clock-output-names", &name);
7062  
7063 -       clk = at91sam9x5_clk_register_smd(pmc, name, parent_names,
7064 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7065 +       if (IS_ERR(regmap))
7066 +               return;
7067 +
7068 +       clk = at91sam9x5_clk_register_smd(regmap, name, parent_names,
7069                                           num_parents);
7070         if (IS_ERR(clk))
7071                 return;
7072  
7073         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7074  }
7075 +CLK_OF_DECLARE(at91sam9x5_clk_smd, "atmel,at91sam9x5-clk-smd",
7076 +              of_at91sam9x5_clk_smd_setup);
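The set_parent()/set_rate() conversions collapse the old pmc_read() / clear field / OR in new value / pmc_write() sequence into a single regmap_update_bits() call, which performs the read-modify-write under the regmap's own locking. A minimal sketch for the SMD divider field touched above; the shift value mirrors the driver's local SMD_DIV_SHIFT define:

#include <linux/clk/at91_pmc.h>
#include <linux/regmap.h>

#define SMD_DIV_SHIFT   8       /* mirrors the local define in clk-smd.c */

static int smd_set_div_sketch(struct regmap *regmap, unsigned int div)
{
        /* Only the bits covered by AT91_PMC_SMD_DIV are rewritten. */
        return regmap_update_bits(regmap, AT91_PMC_SMD, AT91_PMC_SMD_DIV,
                                  (div - 1) << SMD_DIV_SHIFT);
}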
7077 diff --git a/drivers/clk/at91/clk-system.c b/drivers/clk/at91/clk-system.c
7078 index 3f5314344286..8f35d8172909 100644
7079 --- a/drivers/clk/at91/clk-system.c
7080 +++ b/drivers/clk/at91/clk-system.c
7081 @@ -12,13 +12,8 @@
7082  #include <linux/clkdev.h>
7083  #include <linux/clk/at91_pmc.h>
7084  #include <linux/of.h>
7085 -#include <linux/of_address.h>
7086 -#include <linux/io.h>
7087 -#include <linux/irq.h>
7088 -#include <linux/of_irq.h>
7089 -#include <linux/interrupt.h>
7090 -#include <linux/wait.h>
7091 -#include <linux/sched.h>
7092 +#include <linux/mfd/syscon.h>
7093 +#include <linux/regmap.h>
7094  
7095  #include "pmc.h"
7096  
7097 @@ -29,9 +24,7 @@
7098  #define to_clk_system(hw) container_of(hw, struct clk_system, hw)
7099  struct clk_system {
7100         struct clk_hw hw;
7101 -       struct at91_pmc *pmc;
7102 -       unsigned int irq;
7103 -       wait_queue_head_t wait;
7104 +       struct regmap *regmap;
7105         u8 id;
7106  };
7107  
7108 @@ -39,58 +32,54 @@ static inline int is_pck(int id)
7109  {
7110         return (id >= 8) && (id <= 15);
7111  }
7112 -static irqreturn_t clk_system_irq_handler(int irq, void *dev_id)
7113 +
7114 +static inline bool clk_system_ready(struct regmap *regmap, int id)
7115  {
7116 -       struct clk_system *sys = (struct clk_system *)dev_id;
7117 +       unsigned int status;
7118  
7119 -       wake_up(&sys->wait);
7120 -       disable_irq_nosync(sys->irq);
7121 +       regmap_read(regmap, AT91_PMC_SR, &status);
7122  
7123 -       return IRQ_HANDLED;
7124 +       return status & (1 << id) ? 1 : 0;
7125  }
7126  
7127  static int clk_system_prepare(struct clk_hw *hw)
7128  {
7129         struct clk_system *sys = to_clk_system(hw);
7130 -       struct at91_pmc *pmc = sys->pmc;
7131 -       u32 mask = 1 << sys->id;
7132  
7133 -       pmc_write(pmc, AT91_PMC_SCER, mask);
7134 +       regmap_write(sys->regmap, AT91_PMC_SCER, 1 << sys->id);
7135  
7136         if (!is_pck(sys->id))
7137                 return 0;
7138  
7139 -       while (!(pmc_read(pmc, AT91_PMC_SR) & mask)) {
7140 -               if (sys->irq) {
7141 -                       enable_irq(sys->irq);
7142 -                       wait_event(sys->wait,
7143 -                                  pmc_read(pmc, AT91_PMC_SR) & mask);
7144 -               } else
7145 -                       cpu_relax();
7146 -       }
7147 +       while (!clk_system_ready(sys->regmap, sys->id))
7148 +               cpu_relax();
7149 +
7150         return 0;
7151  }
7152  
7153  static void clk_system_unprepare(struct clk_hw *hw)
7154  {
7155         struct clk_system *sys = to_clk_system(hw);
7156 -       struct at91_pmc *pmc = sys->pmc;
7157  
7158 -       pmc_write(pmc, AT91_PMC_SCDR, 1 << sys->id);
7159 +       regmap_write(sys->regmap, AT91_PMC_SCDR, 1 << sys->id);
7160  }
7161  
7162  static int clk_system_is_prepared(struct clk_hw *hw)
7163  {
7164         struct clk_system *sys = to_clk_system(hw);
7165 -       struct at91_pmc *pmc = sys->pmc;
7166 +       unsigned int status;
7167 +
7168 +       regmap_read(sys->regmap, AT91_PMC_SCSR, &status);
7169  
7170 -       if (!(pmc_read(pmc, AT91_PMC_SCSR) & (1 << sys->id)))
7171 +       if (!(status & (1 << sys->id)))
7172                 return 0;
7173  
7174         if (!is_pck(sys->id))
7175                 return 1;
7176  
7177 -       return !!(pmc_read(pmc, AT91_PMC_SR) & (1 << sys->id));
7178 +       regmap_read(sys->regmap, AT91_PMC_SR, &status);
7179 +
7180 +       return status & (1 << sys->id) ? 1 : 0;
7181  }
7182  
7183  static const struct clk_ops system_ops = {
7184 @@ -100,13 +89,12 @@ static const struct clk_ops system_ops = {
7185  };
7186  
7187  static struct clk * __init
7188 -at91_clk_register_system(struct at91_pmc *pmc, const char *name,
7189 -                        const char *parent_name, u8 id, int irq)
7190 +at91_clk_register_system(struct regmap *regmap, const char *name,
7191 +                        const char *parent_name, u8 id)
7192  {
7193         struct clk_system *sys;
7194         struct clk *clk = NULL;
7195         struct clk_init_data init;
7196 -       int ret;
7197  
7198         if (!parent_name || id > SYSTEM_MAX_ID)
7199                 return ERR_PTR(-EINVAL);
7200 @@ -123,44 +111,33 @@ at91_clk_register_system(struct at91_pmc *pmc, const char *name,
7201  
7202         sys->id = id;
7203         sys->hw.init = &init;
7204 -       sys->pmc = pmc;
7205 -       sys->irq = irq;
7206 -       if (irq) {
7207 -               init_waitqueue_head(&sys->wait);
7208 -               irq_set_status_flags(sys->irq, IRQ_NOAUTOEN);
7209 -               ret = request_irq(sys->irq, clk_system_irq_handler,
7210 -                               IRQF_TRIGGER_HIGH, name, sys);
7211 -               if (ret) {
7212 -                       kfree(sys);
7213 -                       return ERR_PTR(ret);
7214 -               }
7215 -       }
7216 +       sys->regmap = regmap;
7217  
7218         clk = clk_register(NULL, &sys->hw);
7219 -       if (IS_ERR(clk)) {
7220 -               if (irq)
7221 -                       free_irq(sys->irq, sys);
7222 +       if (IS_ERR(clk))
7223                 kfree(sys);
7224 -       }
7225  
7226         return clk;
7227  }
7228  
7229 -static void __init
7230 -of_at91_clk_sys_setup(struct device_node *np, struct at91_pmc *pmc)
7231 +static void __init of_at91rm9200_clk_sys_setup(struct device_node *np)
7232  {
7233         int num;
7234 -       int irq = 0;
7235         u32 id;
7236         struct clk *clk;
7237         const char *name;
7238         struct device_node *sysclknp;
7239         const char *parent_name;
7240 +       struct regmap *regmap;
7241  
7242         num = of_get_child_count(np);
7243         if (num > (SYSTEM_MAX_ID + 1))
7244                 return;
7245  
7246 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7247 +       if (IS_ERR(regmap))
7248 +               return;
7249 +
7250         for_each_child_of_node(np, sysclknp) {
7251                 if (of_property_read_u32(sysclknp, "reg", &id))
7252                         continue;
7253 @@ -168,21 +145,14 @@ of_at91_clk_sys_setup(struct device_node *np, struct at91_pmc *pmc)
7254                 if (of_property_read_string(np, "clock-output-names", &name))
7255                         name = sysclknp->name;
7256  
7257 -               if (is_pck(id))
7258 -                       irq = irq_of_parse_and_map(sysclknp, 0);
7259 -
7260                 parent_name = of_clk_get_parent_name(sysclknp, 0);
7261  
7262 -               clk = at91_clk_register_system(pmc, name, parent_name, id, irq);
7263 +               clk = at91_clk_register_system(regmap, name, parent_name, id);
7264                 if (IS_ERR(clk))
7265                         continue;
7266  
7267                 of_clk_add_provider(sysclknp, of_clk_src_simple_get, clk);
7268         }
7269  }
7270 -
7271 -void __init of_at91rm9200_clk_sys_setup(struct device_node *np,
7272 -                                       struct at91_pmc *pmc)
7273 -{
7274 -       of_at91_clk_sys_setup(np, pmc);
7275 -}
7276 +CLK_OF_DECLARE(at91rm9200_clk_sys, "atmel,at91rm9200-clk-system",
7277 +              of_at91rm9200_clk_sys_setup);
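clk-system (and clk-utmi further below) drops the PMC interrupt plumbing entirely: instead of enabling a PMC irq and sleeping in wait_event() until the ready bit fires, prepare() now spins on the status register with cpu_relax(). A condensed sketch of the polling helper; "mask" stands for whichever AT91_PMC_SR bit the caller waits on (a PCKxRDY or LOCK bit):

#include <linux/clk/at91_pmc.h>
#include <linux/regmap.h>

static void pmc_wait_ready_sketch(struct regmap *regmap, unsigned int mask)
{
        unsigned int status;

        /* Poll AT91_PMC_SR until the requested status bit is set. */
        do {
                regmap_read(regmap, AT91_PMC_SR, &status);
                cpu_relax();
        } while (!(status & mask));
}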
7278 diff --git a/drivers/clk/at91/clk-usb.c b/drivers/clk/at91/clk-usb.c
7279 index 8ab8502778a2..650ca45892c0 100644
7280 --- a/drivers/clk/at91/clk-usb.c
7281 +++ b/drivers/clk/at91/clk-usb.c
7282 @@ -12,8 +12,8 @@
7283  #include <linux/clkdev.h>
7284  #include <linux/clk/at91_pmc.h>
7285  #include <linux/of.h>
7286 -#include <linux/of_address.h>
7287 -#include <linux/io.h>
7288 +#include <linux/mfd/syscon.h>
7289 +#include <linux/regmap.h>
7290  
7291  #include "pmc.h"
7292  
7293 @@ -27,7 +27,7 @@
7294  
7295  struct at91sam9x5_clk_usb {
7296         struct clk_hw hw;
7297 -       struct at91_pmc *pmc;
7298 +       struct regmap *regmap;
7299  };
7300  
7301  #define to_at91sam9x5_clk_usb(hw) \
7302 @@ -35,7 +35,7 @@ struct at91sam9x5_clk_usb {
7303  
7304  struct at91rm9200_clk_usb {
7305         struct clk_hw hw;
7306 -       struct at91_pmc *pmc;
7307 +       struct regmap *regmap;
7308         u32 divisors[4];
7309  };
7310  
7311 @@ -45,13 +45,12 @@ struct at91rm9200_clk_usb {
7312  static unsigned long at91sam9x5_clk_usb_recalc_rate(struct clk_hw *hw,
7313                                                     unsigned long parent_rate)
7314  {
7315 -       u32 tmp;
7316 -       u8 usbdiv;
7317         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7318 -       struct at91_pmc *pmc = usb->pmc;
7319 +       unsigned int usbr;
7320 +       u8 usbdiv;
7321  
7322 -       tmp = pmc_read(pmc, AT91_PMC_USB);
7323 -       usbdiv = (tmp & AT91_PMC_OHCIUSBDIV) >> SAM9X5_USB_DIV_SHIFT;
7324 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7325 +       usbdiv = (usbr & AT91_PMC_OHCIUSBDIV) >> SAM9X5_USB_DIV_SHIFT;
7326  
7327         return DIV_ROUND_CLOSEST(parent_rate, (usbdiv + 1));
7328  }
7329 @@ -109,33 +108,31 @@ static int at91sam9x5_clk_usb_determine_rate(struct clk_hw *hw,
7330  
7331  static int at91sam9x5_clk_usb_set_parent(struct clk_hw *hw, u8 index)
7332  {
7333 -       u32 tmp;
7334         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7335 -       struct at91_pmc *pmc = usb->pmc;
7336  
7337         if (index > 1)
7338                 return -EINVAL;
7339 -       tmp = pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_USBS;
7340 -       if (index)
7341 -               tmp |= AT91_PMC_USBS;
7342 -       pmc_write(pmc, AT91_PMC_USB, tmp);
7343 +
7344 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS,
7345 +                          index ? AT91_PMC_USBS : 0);
7346 +
7347         return 0;
7348  }
7349  
7350  static u8 at91sam9x5_clk_usb_get_parent(struct clk_hw *hw)
7351  {
7352         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7353 -       struct at91_pmc *pmc = usb->pmc;
7354 +       unsigned int usbr;
7355  
7356 -       return pmc_read(pmc, AT91_PMC_USB) & AT91_PMC_USBS;
7357 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7358 +
7359 +       return usbr & AT91_PMC_USBS;
7360  }
7361  
7362  static int at91sam9x5_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7363                                        unsigned long parent_rate)
7364  {
7365 -       u32 tmp;
7366         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7367 -       struct at91_pmc *pmc = usb->pmc;
7368         unsigned long div;
7369  
7370         if (!rate)
7371 @@ -145,9 +142,8 @@ static int at91sam9x5_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7372         if (div > SAM9X5_USB_MAX_DIV + 1 || !div)
7373                 return -EINVAL;
7374  
7375 -       tmp = pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_OHCIUSBDIV;
7376 -       tmp |= (div - 1) << SAM9X5_USB_DIV_SHIFT;
7377 -       pmc_write(pmc, AT91_PMC_USB, tmp);
7378 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_OHCIUSBDIV,
7379 +                          (div - 1) << SAM9X5_USB_DIV_SHIFT);
7380  
7381         return 0;
7382  }
7383 @@ -163,28 +159,28 @@ static const struct clk_ops at91sam9x5_usb_ops = {
7384  static int at91sam9n12_clk_usb_enable(struct clk_hw *hw)
7385  {
7386         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7387 -       struct at91_pmc *pmc = usb->pmc;
7388  
7389 -       pmc_write(pmc, AT91_PMC_USB,
7390 -                 pmc_read(pmc, AT91_PMC_USB) | AT91_PMC_USBS);
7391 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS,
7392 +                          AT91_PMC_USBS);
7393 +
7394         return 0;
7395  }
7396  
7397  static void at91sam9n12_clk_usb_disable(struct clk_hw *hw)
7398  {
7399         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7400 -       struct at91_pmc *pmc = usb->pmc;
7401  
7402 -       pmc_write(pmc, AT91_PMC_USB,
7403 -                 pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_USBS);
7404 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS, 0);
7405  }
7406  
7407  static int at91sam9n12_clk_usb_is_enabled(struct clk_hw *hw)
7408  {
7409         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7410 -       struct at91_pmc *pmc = usb->pmc;
7411 +       unsigned int usbr;
7412  
7413 -       return !!(pmc_read(pmc, AT91_PMC_USB) & AT91_PMC_USBS);
7414 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7415 +
7416 +       return usbr & AT91_PMC_USBS;
7417  }
7418  
7419  static const struct clk_ops at91sam9n12_usb_ops = {
7420 @@ -197,7 +193,7 @@ static const struct clk_ops at91sam9n12_usb_ops = {
7421  };
7422  
7423  static struct clk * __init
7424 -at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7425 +at91sam9x5_clk_register_usb(struct regmap *regmap, const char *name,
7426                             const char **parent_names, u8 num_parents)
7427  {
7428         struct at91sam9x5_clk_usb *usb;
7429 @@ -216,7 +212,7 @@ at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7430                      CLK_SET_RATE_PARENT;
7431  
7432         usb->hw.init = &init;
7433 -       usb->pmc = pmc;
7434 +       usb->regmap = regmap;
7435  
7436         clk = clk_register(NULL, &usb->hw);
7437         if (IS_ERR(clk))
7438 @@ -226,7 +222,7 @@ at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7439  }
7440  
7441  static struct clk * __init
7442 -at91sam9n12_clk_register_usb(struct at91_pmc *pmc, const char *name,
7443 +at91sam9n12_clk_register_usb(struct regmap *regmap, const char *name,
7444                              const char *parent_name)
7445  {
7446         struct at91sam9x5_clk_usb *usb;
7447 @@ -244,7 +240,7 @@ at91sam9n12_clk_register_usb(struct at91_pmc *pmc, const char *name,
7448         init.flags = CLK_SET_RATE_GATE | CLK_SET_RATE_PARENT;
7449  
7450         usb->hw.init = &init;
7451 -       usb->pmc = pmc;
7452 +       usb->regmap = regmap;
7453  
7454         clk = clk_register(NULL, &usb->hw);
7455         if (IS_ERR(clk))
7456 @@ -257,12 +253,12 @@ static unsigned long at91rm9200_clk_usb_recalc_rate(struct clk_hw *hw,
7457                                                     unsigned long parent_rate)
7458  {
7459         struct at91rm9200_clk_usb *usb = to_at91rm9200_clk_usb(hw);
7460 -       struct at91_pmc *pmc = usb->pmc;
7461 -       u32 tmp;
7462 +       unsigned int pllbr;
7463         u8 usbdiv;
7464  
7465 -       tmp = pmc_read(pmc, AT91_CKGR_PLLBR);
7466 -       usbdiv = (tmp & AT91_PMC_USBDIV) >> RM9200_USB_DIV_SHIFT;
7467 +       regmap_read(usb->regmap, AT91_CKGR_PLLBR, &pllbr);
7468 +
7469 +       usbdiv = (pllbr & AT91_PMC_USBDIV) >> RM9200_USB_DIV_SHIFT;
7470         if (usb->divisors[usbdiv])
7471                 return parent_rate / usb->divisors[usbdiv];
7472  
7473 @@ -310,10 +306,8 @@ static long at91rm9200_clk_usb_round_rate(struct clk_hw *hw, unsigned long rate,
7474  static int at91rm9200_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7475                                        unsigned long parent_rate)
7476  {
7477 -       u32 tmp;
7478         int i;
7479         struct at91rm9200_clk_usb *usb = to_at91rm9200_clk_usb(hw);
7480 -       struct at91_pmc *pmc = usb->pmc;
7481         unsigned long div;
7482  
7483         if (!rate)
7484 @@ -323,10 +317,10 @@ static int at91rm9200_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7485  
7486         for (i = 0; i < RM9200_USB_DIV_TAB_SIZE; i++) {
7487                 if (usb->divisors[i] == div) {
7488 -                       tmp = pmc_read(pmc, AT91_CKGR_PLLBR) &
7489 -                             ~AT91_PMC_USBDIV;
7490 -                       tmp |= i << RM9200_USB_DIV_SHIFT;
7491 -                       pmc_write(pmc, AT91_CKGR_PLLBR, tmp);
7492 +                       regmap_update_bits(usb->regmap, AT91_CKGR_PLLBR,
7493 +                                          AT91_PMC_USBDIV,
7494 +                                          i << RM9200_USB_DIV_SHIFT);
7495 +
7496                         return 0;
7497                 }
7498         }
7499 @@ -341,7 +335,7 @@ static const struct clk_ops at91rm9200_usb_ops = {
7500  };
7501  
7502  static struct clk * __init
7503 -at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7504 +at91rm9200_clk_register_usb(struct regmap *regmap, const char *name,
7505                             const char *parent_name, const u32 *divisors)
7506  {
7507         struct at91rm9200_clk_usb *usb;
7508 @@ -359,7 +353,7 @@ at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7509         init.flags = CLK_SET_RATE_PARENT;
7510  
7511         usb->hw.init = &init;
7512 -       usb->pmc = pmc;
7513 +       usb->regmap = regmap;
7514         memcpy(usb->divisors, divisors, sizeof(usb->divisors));
7515  
7516         clk = clk_register(NULL, &usb->hw);
7517 @@ -369,13 +363,13 @@ at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7518         return clk;
7519  }
7520  
7521 -void __init of_at91sam9x5_clk_usb_setup(struct device_node *np,
7522 -                                       struct at91_pmc *pmc)
7523 +static void __init of_at91sam9x5_clk_usb_setup(struct device_node *np)
7524  {
7525         struct clk *clk;
7526         int num_parents;
7527         const char *parent_names[USB_SOURCE_MAX];
7528         const char *name = np->name;
7529 +       struct regmap *regmap;
7530  
7531         num_parents = of_clk_get_parent_count(np);
7532         if (num_parents <= 0 || num_parents > USB_SOURCE_MAX)
7533 @@ -385,19 +379,26 @@ void __init of_at91sam9x5_clk_usb_setup(struct device_node *np,
7534  
7535         of_property_read_string(np, "clock-output-names", &name);
7536  
7537 -       clk = at91sam9x5_clk_register_usb(pmc, name, parent_names, num_parents);
7538 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7539 +       if (IS_ERR(regmap))
7540 +               return;
7541 +
7542 +       clk = at91sam9x5_clk_register_usb(regmap, name, parent_names,
7543 +                                         num_parents);
7544         if (IS_ERR(clk))
7545                 return;
7546  
7547         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7548  }
7549 +CLK_OF_DECLARE(at91sam9x5_clk_usb, "atmel,at91sam9x5-clk-usb",
7550 +              of_at91sam9x5_clk_usb_setup);
7551  
7552 -void __init of_at91sam9n12_clk_usb_setup(struct device_node *np,
7553 -                                        struct at91_pmc *pmc)
7554 +static void __init of_at91sam9n12_clk_usb_setup(struct device_node *np)
7555  {
7556         struct clk *clk;
7557         const char *parent_name;
7558         const char *name = np->name;
7559 +       struct regmap *regmap;
7560  
7561         parent_name = of_clk_get_parent_name(np, 0);
7562         if (!parent_name)
7563 @@ -405,20 +406,26 @@ void __init of_at91sam9n12_clk_usb_setup(struct device_node *np,
7564  
7565         of_property_read_string(np, "clock-output-names", &name);
7566  
7567 -       clk = at91sam9n12_clk_register_usb(pmc, name, parent_name);
7568 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7569 +       if (IS_ERR(regmap))
7570 +               return;
7571 +
7572 +       clk = at91sam9n12_clk_register_usb(regmap, name, parent_name);
7573         if (IS_ERR(clk))
7574                 return;
7575  
7576         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7577  }
7578 +CLK_OF_DECLARE(at91sam9n12_clk_usb, "atmel,at91sam9n12-clk-usb",
7579 +              of_at91sam9n12_clk_usb_setup);
7580  
7581 -void __init of_at91rm9200_clk_usb_setup(struct device_node *np,
7582 -                                       struct at91_pmc *pmc)
7583 +static void __init of_at91rm9200_clk_usb_setup(struct device_node *np)
7584  {
7585         struct clk *clk;
7586         const char *parent_name;
7587         const char *name = np->name;
7588         u32 divisors[4] = {0, 0, 0, 0};
7589 +       struct regmap *regmap;
7590  
7591         parent_name = of_clk_get_parent_name(np, 0);
7592         if (!parent_name)
7593 @@ -430,9 +437,15 @@ void __init of_at91rm9200_clk_usb_setup(struct device_node *np,
7594  
7595         of_property_read_string(np, "clock-output-names", &name);
7596  
7597 -       clk = at91rm9200_clk_register_usb(pmc, name, parent_name, divisors);
7598 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7599 +       if (IS_ERR(regmap))
7600 +               return;
7601 +
7602 +       clk = at91rm9200_clk_register_usb(regmap, name, parent_name, divisors);
7603         if (IS_ERR(clk))
7604                 return;
7605  
7606         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7607  }
7608 +CLK_OF_DECLARE(at91rm9200_clk_usb, "atmel,at91rm9200-clk-usb",
7609 +              of_at91rm9200_clk_usb_setup);
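at91rm9200_clk_usb_set_rate() keeps its divisor-table logic and only changes how the chosen index is written (regmap_update_bits() on AT91_CKGR_PLLBR). The selection itself is a lookup of parent_rate/rate in the four-entry table read from the device tree. A standalone sketch of that lookup with hypothetical names; the rounding of the requested rate is illustrative, not the driver's exact rule (that part of the function is outside this hunk):

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/types.h>

static int rm9200_pick_usbdiv_sketch(const u32 *divisors, int n,
                                     unsigned long parent_rate,
                                     unsigned long rate)
{
        unsigned long div;
        int i;

        if (!rate)
                return -EINVAL;

        div = DIV_ROUND_CLOSEST(parent_rate, rate);     /* illustrative rounding */

        for (i = 0; i < n; i++)
                if (divisors[i] && divisors[i] == div)
                        return i;       /* index to program into AT91_PMC_USBDIV */

        return -EINVAL;
}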
7610 diff --git a/drivers/clk/at91/clk-utmi.c b/drivers/clk/at91/clk-utmi.c
7611 index ca561e90a60f..61fcf399e58c 100644
7612 --- a/drivers/clk/at91/clk-utmi.c
7613 +++ b/drivers/clk/at91/clk-utmi.c
7614 @@ -11,14 +11,9 @@
7615  #include <linux/clk-provider.h>
7616  #include <linux/clkdev.h>
7617  #include <linux/clk/at91_pmc.h>
7618 -#include <linux/interrupt.h>
7619 -#include <linux/irq.h>
7620  #include <linux/of.h>
7621 -#include <linux/of_address.h>
7622 -#include <linux/of_irq.h>
7623 -#include <linux/io.h>
7624 -#include <linux/sched.h>
7625 -#include <linux/wait.h>
7626 +#include <linux/mfd/syscon.h>
7627 +#include <linux/regmap.h>
7628  
7629  #include "pmc.h"
7630  
7631 @@ -26,37 +21,30 @@
7632  
7633  struct clk_utmi {
7634         struct clk_hw hw;
7635 -       struct at91_pmc *pmc;
7636 -       unsigned int irq;
7637 -       wait_queue_head_t wait;
7638 +       struct regmap *regmap;
7639  };
7640  
7641  #define to_clk_utmi(hw) container_of(hw, struct clk_utmi, hw)
7642  
7643 -static irqreturn_t clk_utmi_irq_handler(int irq, void *dev_id)
7644 +static inline bool clk_utmi_ready(struct regmap *regmap)
7645  {
7646 -       struct clk_utmi *utmi = (struct clk_utmi *)dev_id;
7647 +       unsigned int status;
7648  
7649 -       wake_up(&utmi->wait);
7650 -       disable_irq_nosync(utmi->irq);
7651 +       regmap_read(regmap, AT91_PMC_SR, &status);
7652  
7653 -       return IRQ_HANDLED;
7654 +       return status & AT91_PMC_LOCKU;
7655  }
7656  
7657  static int clk_utmi_prepare(struct clk_hw *hw)
7658  {
7659         struct clk_utmi *utmi = to_clk_utmi(hw);
7660 -       struct at91_pmc *pmc = utmi->pmc;
7661 -       u32 tmp = pmc_read(pmc, AT91_CKGR_UCKR) | AT91_PMC_UPLLEN |
7662 -                 AT91_PMC_UPLLCOUNT | AT91_PMC_BIASEN;
7663 +       unsigned int uckr = AT91_PMC_UPLLEN | AT91_PMC_UPLLCOUNT |
7664 +                           AT91_PMC_BIASEN;
7665  
7666 -       pmc_write(pmc, AT91_CKGR_UCKR, tmp);
7667 +       regmap_update_bits(utmi->regmap, AT91_CKGR_UCKR, uckr, uckr);
7668  
7669 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU)) {
7670 -               enable_irq(utmi->irq);
7671 -               wait_event(utmi->wait,
7672 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU);
7673 -       }
7674 +       while (!clk_utmi_ready(utmi->regmap))
7675 +               cpu_relax();
7676  
7677         return 0;
7678  }
7679 @@ -64,18 +52,15 @@ static int clk_utmi_prepare(struct clk_hw *hw)
7680  static int clk_utmi_is_prepared(struct clk_hw *hw)
7681  {
7682         struct clk_utmi *utmi = to_clk_utmi(hw);
7683 -       struct at91_pmc *pmc = utmi->pmc;
7684  
7685 -       return !!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU);
7686 +       return clk_utmi_ready(utmi->regmap);
7687  }
7688  
7689  static void clk_utmi_unprepare(struct clk_hw *hw)
7690  {
7691         struct clk_utmi *utmi = to_clk_utmi(hw);
7692 -       struct at91_pmc *pmc = utmi->pmc;
7693 -       u32 tmp = pmc_read(pmc, AT91_CKGR_UCKR) & ~AT91_PMC_UPLLEN;
7694  
7695 -       pmc_write(pmc, AT91_CKGR_UCKR, tmp);
7696 +       regmap_update_bits(utmi->regmap, AT91_CKGR_UCKR, AT91_PMC_UPLLEN, 0);
7697  }
7698  
7699  static unsigned long clk_utmi_recalc_rate(struct clk_hw *hw,
7700 @@ -93,10 +78,9 @@ static const struct clk_ops utmi_ops = {
7701  };
7702  
7703  static struct clk * __init
7704 -at91_clk_register_utmi(struct at91_pmc *pmc, unsigned int irq,
7705 +at91_clk_register_utmi(struct regmap *regmap,
7706                        const char *name, const char *parent_name)
7707  {
7708 -       int ret;
7709         struct clk_utmi *utmi;
7710         struct clk *clk = NULL;
7711         struct clk_init_data init;
7712 @@ -112,52 +96,36 @@ at91_clk_register_utmi(struct at91_pmc *pmc, unsigned int irq,
7713         init.flags = CLK_SET_RATE_GATE;
7714  
7715         utmi->hw.init = &init;
7716 -       utmi->pmc = pmc;
7717 -       utmi->irq = irq;
7718 -       init_waitqueue_head(&utmi->wait);
7719 -       irq_set_status_flags(utmi->irq, IRQ_NOAUTOEN);
7720 -       ret = request_irq(utmi->irq, clk_utmi_irq_handler,
7721 -                         IRQF_TRIGGER_HIGH, "clk-utmi", utmi);
7722 -       if (ret) {
7723 -               kfree(utmi);
7724 -               return ERR_PTR(ret);
7725 -       }
7726 +       utmi->regmap = regmap;
7727  
7728         clk = clk_register(NULL, &utmi->hw);
7729 -       if (IS_ERR(clk)) {
7730 -               free_irq(utmi->irq, utmi);
7731 +       if (IS_ERR(clk))
7732                 kfree(utmi);
7733 -       }
7734  
7735         return clk;
7736  }
7737  
7738 -static void __init
7739 -of_at91_clk_utmi_setup(struct device_node *np, struct at91_pmc *pmc)
7740 +static void __init of_at91sam9x5_clk_utmi_setup(struct device_node *np)
7741  {
7742 -       unsigned int irq;
7743         struct clk *clk;
7744         const char *parent_name;
7745         const char *name = np->name;
7746 +       struct regmap *regmap;
7747  
7748         parent_name = of_clk_get_parent_name(np, 0);
7749  
7750         of_property_read_string(np, "clock-output-names", &name);
7751  
7752 -       irq = irq_of_parse_and_map(np, 0);
7753 -       if (!irq)
7754 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7755 +       if (IS_ERR(regmap))
7756                 return;
7757  
7758 -       clk = at91_clk_register_utmi(pmc, irq, name, parent_name);
7759 +       clk = at91_clk_register_utmi(regmap, name, parent_name);
7760         if (IS_ERR(clk))
7761                 return;
7762  
7763         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7764         return;
7765  }
7766 -
7767 -void __init of_at91sam9x5_clk_utmi_setup(struct device_node *np,
7768 -                                        struct at91_pmc *pmc)
7769 -{
7770 -       of_at91_clk_utmi_setup(np, pmc);
7771 -}
7772 +CLK_OF_DECLARE(at91sam9x5_clk_utmi, "atmel,at91sam9x5-clk-utmi",
7773 +              of_at91sam9x5_clk_utmi_setup);
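clk_utmi_prepare()/clk_utmi_unprepare() show the other two regmap_update_bits() idioms used throughout this series: passing the same value as the mask sets all of those bits in one go, and passing 0 under a mask clears them. A small sketch using the UCKR bit names from the hunk above:

#include <linux/clk/at91_pmc.h>
#include <linux/regmap.h>

static void utmi_enable_sketch(struct regmap *regmap)
{
        unsigned int uckr = AT91_PMC_UPLLEN | AT91_PMC_UPLLCOUNT |
                            AT91_PMC_BIASEN;

        /* mask == value: set every bit in uckr, leave the others alone. */
        regmap_update_bits(regmap, AT91_CKGR_UCKR, uckr, uckr);
}

static void utmi_disable_sketch(struct regmap *regmap)
{
        /* value 0 under a single-bit mask: clear just UPLLEN. */
        regmap_update_bits(regmap, AT91_CKGR_UCKR, AT91_PMC_UPLLEN, 0);
}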
7774 diff --git a/drivers/clk/at91/pmc.c b/drivers/clk/at91/pmc.c
7775 index 8476b570779b..526df5ba042d 100644
7776 --- a/drivers/clk/at91/pmc.c
7777 +++ b/drivers/clk/at91/pmc.c
7778 @@ -12,36 +12,13 @@
7779  #include <linux/clkdev.h>
7780  #include <linux/clk/at91_pmc.h>
7781  #include <linux/of.h>
7782 -#include <linux/of_address.h>
7783 -#include <linux/io.h>
7784 -#include <linux/interrupt.h>
7785 -#include <linux/irq.h>
7786 -#include <linux/irqchip/chained_irq.h>
7787 -#include <linux/irqdomain.h>
7788 -#include <linux/of_irq.h>
7789 +#include <linux/mfd/syscon.h>
7790 +#include <linux/regmap.h>
7791  
7792  #include <asm/proc-fns.h>
7793  
7794  #include "pmc.h"
7795  
7796 -void __iomem *at91_pmc_base;
7797 -EXPORT_SYMBOL_GPL(at91_pmc_base);
7798 -
7799 -void at91rm9200_idle(void)
7800 -{
7801 -       /*
7802 -        * Disable the processor clock.  The processor will be automatically
7803 -        * re-enabled by an interrupt or by a reset.
7804 -        */
7805 -       at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK);
7806 -}
7807 -
7808 -void at91sam9_idle(void)
7809 -{
7810 -       at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK);
7811 -       cpu_do_idle();
7812 -}
7813 -
7814  int of_at91_get_clk_range(struct device_node *np, const char *propname,
7815                           struct clk_range *range)
7816  {
7817 @@ -64,402 +41,3 @@ int of_at91_get_clk_range(struct device_node *np, const char *propname,
7818         return 0;
7819  }
7820  EXPORT_SYMBOL_GPL(of_at91_get_clk_range);
7821 -
7822 -static void pmc_irq_mask(struct irq_data *d)
7823 -{
7824 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7825 -
7826 -       pmc_write(pmc, AT91_PMC_IDR, 1 << d->hwirq);
7827 -}
7828 -
7829 -static void pmc_irq_unmask(struct irq_data *d)
7830 -{
7831 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7832 -
7833 -       pmc_write(pmc, AT91_PMC_IER, 1 << d->hwirq);
7834 -}
7835 -
7836 -static int pmc_irq_set_type(struct irq_data *d, unsigned type)
7837 -{
7838 -       if (type != IRQ_TYPE_LEVEL_HIGH) {
7839 -               pr_warn("PMC: type not supported (support only IRQ_TYPE_LEVEL_HIGH type)\n");
7840 -               return -EINVAL;
7841 -       }
7842 -
7843 -       return 0;
7844 -}
7845 -
7846 -static void pmc_irq_suspend(struct irq_data *d)
7847 -{
7848 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7849 -
7850 -       pmc->imr = pmc_read(pmc, AT91_PMC_IMR);
7851 -       pmc_write(pmc, AT91_PMC_IDR, pmc->imr);
7852 -}
7853 -
7854 -static void pmc_irq_resume(struct irq_data *d)
7855 -{
7856 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7857 -
7858 -       pmc_write(pmc, AT91_PMC_IER, pmc->imr);
7859 -}
7860 -
7861 -static struct irq_chip pmc_irq = {
7862 -       .name = "PMC",
7863 -       .irq_disable = pmc_irq_mask,
7864 -       .irq_mask = pmc_irq_mask,
7865 -       .irq_unmask = pmc_irq_unmask,
7866 -       .irq_set_type = pmc_irq_set_type,
7867 -       .irq_suspend = pmc_irq_suspend,
7868 -       .irq_resume = pmc_irq_resume,
7869 -};
7870 -
7871 -static struct lock_class_key pmc_lock_class;
7872 -
7873 -static int pmc_irq_map(struct irq_domain *h, unsigned int virq,
7874 -                      irq_hw_number_t hw)
7875 -{
7876 -       struct at91_pmc *pmc = h->host_data;
7877 -
7878 -       irq_set_lockdep_class(virq, &pmc_lock_class);
7879 -
7880 -       irq_set_chip_and_handler(virq, &pmc_irq,
7881 -                                handle_level_irq);
7882 -       irq_set_chip_data(virq, pmc);
7883 -
7884 -       return 0;
7885 -}
7886 -
7887 -static int pmc_irq_domain_xlate(struct irq_domain *d,
7888 -                               struct device_node *ctrlr,
7889 -                               const u32 *intspec, unsigned int intsize,
7890 -                               irq_hw_number_t *out_hwirq,
7891 -                               unsigned int *out_type)
7892 -{
7893 -       struct at91_pmc *pmc = d->host_data;
7894 -       const struct at91_pmc_caps *caps = pmc->caps;
7895 -
7896 -       if (WARN_ON(intsize < 1))
7897 -               return -EINVAL;
7898 -
7899 -       *out_hwirq = intspec[0];
7900 -
7901 -       if (!(caps->available_irqs & (1 << *out_hwirq)))
7902 -               return -EINVAL;
7903 -
7904 -       *out_type = IRQ_TYPE_LEVEL_HIGH;
7905 -
7906 -       return 0;
7907 -}
7908 -
7909 -static const struct irq_domain_ops pmc_irq_ops = {
7910 -       .map    = pmc_irq_map,
7911 -       .xlate  = pmc_irq_domain_xlate,
7912 -};
7913 -
7914 -static irqreturn_t pmc_irq_handler(int irq, void *data)
7915 -{
7916 -       struct at91_pmc *pmc = (struct at91_pmc *)data;
7917 -       unsigned long sr;
7918 -       int n;
7919 -
7920 -       sr = pmc_read(pmc, AT91_PMC_SR) & pmc_read(pmc, AT91_PMC_IMR);
7921 -       if (!sr)
7922 -               return IRQ_NONE;
7923 -
7924 -       for_each_set_bit(n, &sr, BITS_PER_LONG)
7925 -               generic_handle_irq(irq_find_mapping(pmc->irqdomain, n));
7926 -
7927 -       return IRQ_HANDLED;
7928 -}
7929 -
7930 -static const struct at91_pmc_caps at91rm9200_caps = {
7931 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
7932 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
7933 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
7934 -                         AT91_PMC_PCK3RDY,
7935 -};
7936 -
7937 -static const struct at91_pmc_caps at91sam9260_caps = {
7938 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
7939 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
7940 -                         AT91_PMC_PCK1RDY,
7941 -};
7942 -
7943 -static const struct at91_pmc_caps at91sam9g45_caps = {
7944 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7945 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7946 -                         AT91_PMC_PCK1RDY,
7947 -};
7948 -
7949 -static const struct at91_pmc_caps at91sam9n12_caps = {
7950 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
7951 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
7952 -                         AT91_PMC_PCK1RDY | AT91_PMC_MOSCSELS |
7953 -                         AT91_PMC_MOSCRCS | AT91_PMC_CFDEV,
7954 -};
7955 -
7956 -static const struct at91_pmc_caps at91sam9x5_caps = {
7957 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7958 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7959 -                         AT91_PMC_PCK1RDY | AT91_PMC_MOSCSELS |
7960 -                         AT91_PMC_MOSCRCS | AT91_PMC_CFDEV,
7961 -};
7962 -
7963 -static const struct at91_pmc_caps sama5d2_caps = {
7964 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7965 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7966 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
7967 -                         AT91_PMC_MOSCSELS | AT91_PMC_MOSCRCS |
7968 -                         AT91_PMC_CFDEV | AT91_PMC_GCKRDY,
7969 -};
7970 -
7971 -static const struct at91_pmc_caps sama5d3_caps = {
7972 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7973 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7974 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
7975 -                         AT91_PMC_MOSCSELS | AT91_PMC_MOSCRCS |
7976 -                         AT91_PMC_CFDEV,
7977 -};
7978 -
7979 -static struct at91_pmc *__init at91_pmc_init(struct device_node *np,
7980 -                                            void __iomem *regbase, int virq,
7981 -                                            const struct at91_pmc_caps *caps)
7982 -{
7983 -       struct at91_pmc *pmc;
7984 -
7985 -       if (!regbase || !virq ||  !caps)
7986 -               return NULL;
7987 -
7988 -       at91_pmc_base = regbase;
7989 -
7990 -       pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
7991 -       if (!pmc)
7992 -               return NULL;
7993 -
7994 -       spin_lock_init(&pmc->lock);
7995 -       pmc->regbase = regbase;
7996 -       pmc->virq = virq;
7997 -       pmc->caps = caps;
7998 -
7999 -       pmc->irqdomain = irq_domain_add_linear(np, 32, &pmc_irq_ops, pmc);
8000 -
8001 -       if (!pmc->irqdomain)
8002 -               goto out_free_pmc;
8003 -
8004 -       pmc_write(pmc, AT91_PMC_IDR, 0xffffffff);
8005 -       if (request_irq(pmc->virq, pmc_irq_handler,
8006 -                       IRQF_SHARED | IRQF_COND_SUSPEND, "pmc", pmc))
8007 -               goto out_remove_irqdomain;
8008 -
8009 -       return pmc;
8010 -
8011 -out_remove_irqdomain:
8012 -       irq_domain_remove(pmc->irqdomain);
8013 -out_free_pmc:
8014 -       kfree(pmc);
8015 -
8016 -       return NULL;
8017 -}
8018 -
8019 -static const struct of_device_id pmc_clk_ids[] __initconst = {
8020 -       /* Slow oscillator */
8021 -       {
8022 -               .compatible = "atmel,at91sam9260-clk-slow",
8023 -               .data = of_at91sam9260_clk_slow_setup,
8024 -       },
8025 -       /* Main clock */
8026 -       {
8027 -               .compatible = "atmel,at91rm9200-clk-main-osc",
8028 -               .data = of_at91rm9200_clk_main_osc_setup,
8029 -       },
8030 -       {
8031 -               .compatible = "atmel,at91sam9x5-clk-main-rc-osc",
8032 -               .data = of_at91sam9x5_clk_main_rc_osc_setup,
8033 -       },
8034 -       {
8035 -               .compatible = "atmel,at91rm9200-clk-main",
8036 -               .data = of_at91rm9200_clk_main_setup,
8037 -       },
8038 -       {
8039 -               .compatible = "atmel,at91sam9x5-clk-main",
8040 -               .data = of_at91sam9x5_clk_main_setup,
8041 -       },
8042 -       /* PLL clocks */
8043 -       {
8044 -               .compatible = "atmel,at91rm9200-clk-pll",
8045 -               .data = of_at91rm9200_clk_pll_setup,
8046 -       },
8047 -       {
8048 -               .compatible = "atmel,at91sam9g45-clk-pll",
8049 -               .data = of_at91sam9g45_clk_pll_setup,
8050 -       },
8051 -       {
8052 -               .compatible = "atmel,at91sam9g20-clk-pllb",
8053 -               .data = of_at91sam9g20_clk_pllb_setup,
8054 -       },
8055 -       {
8056 -               .compatible = "atmel,sama5d3-clk-pll",
8057 -               .data = of_sama5d3_clk_pll_setup,
8058 -       },
8059 -       {
8060 -               .compatible = "atmel,at91sam9x5-clk-plldiv",
8061 -               .data = of_at91sam9x5_clk_plldiv_setup,
8062 -       },
8063 -       /* Master clock */
8064 -       {
8065 -               .compatible = "atmel,at91rm9200-clk-master",
8066 -               .data = of_at91rm9200_clk_master_setup,
8067 -       },
8068 -       {
8069 -               .compatible = "atmel,at91sam9x5-clk-master",
8070 -               .data = of_at91sam9x5_clk_master_setup,
8071 -       },
8072 -       /* System clocks */
8073 -       {
8074 -               .compatible = "atmel,at91rm9200-clk-system",
8075 -               .data = of_at91rm9200_clk_sys_setup,
8076 -       },
8077 -       /* Peripheral clocks */
8078 -       {
8079 -               .compatible = "atmel,at91rm9200-clk-peripheral",
8080 -               .data = of_at91rm9200_clk_periph_setup,
8081 -       },
8082 -       {
8083 -               .compatible = "atmel,at91sam9x5-clk-peripheral",
8084 -               .data = of_at91sam9x5_clk_periph_setup,
8085 -       },
8086 -       /* Programmable clocks */
8087 -       {
8088 -               .compatible = "atmel,at91rm9200-clk-programmable",
8089 -               .data = of_at91rm9200_clk_prog_setup,
8090 -       },
8091 -       {
8092 -               .compatible = "atmel,at91sam9g45-clk-programmable",
8093 -               .data = of_at91sam9g45_clk_prog_setup,
8094 -       },
8095 -       {
8096 -               .compatible = "atmel,at91sam9x5-clk-programmable",
8097 -               .data = of_at91sam9x5_clk_prog_setup,
8098 -       },
8099 -       /* UTMI clock */
8100 -#if defined(CONFIG_HAVE_AT91_UTMI)
8101 -       {
8102 -               .compatible = "atmel,at91sam9x5-clk-utmi",
8103 -               .data = of_at91sam9x5_clk_utmi_setup,
8104 -       },
8105 -#endif
8106 -       /* USB clock */
8107 -#if defined(CONFIG_HAVE_AT91_USB_CLK)
8108 -       {
8109 -               .compatible = "atmel,at91rm9200-clk-usb",
8110 -               .data = of_at91rm9200_clk_usb_setup,
8111 -       },
8112 -       {
8113 -               .compatible = "atmel,at91sam9x5-clk-usb",
8114 -               .data = of_at91sam9x5_clk_usb_setup,
8115 -       },
8116 -       {
8117 -               .compatible = "atmel,at91sam9n12-clk-usb",
8118 -               .data = of_at91sam9n12_clk_usb_setup,
8119 -       },
8120 -#endif
8121 -       /* SMD clock */
8122 -#if defined(CONFIG_HAVE_AT91_SMD)
8123 -       {
8124 -               .compatible = "atmel,at91sam9x5-clk-smd",
8125 -               .data = of_at91sam9x5_clk_smd_setup,
8126 -       },
8127 -#endif
8128 -#if defined(CONFIG_HAVE_AT91_H32MX)
8129 -       {
8130 -               .compatible = "atmel,sama5d4-clk-h32mx",
8131 -               .data = of_sama5d4_clk_h32mx_setup,
8132 -       },
8133 -#endif
8134 -#if defined(CONFIG_HAVE_AT91_GENERATED_CLK)
8135 -       {
8136 -               .compatible = "atmel,sama5d2-clk-generated",
8137 -               .data = of_sama5d2_clk_generated_setup,
8138 -       },
8139 -#endif
8140 -       { /*sentinel*/ }
8141 -};
8142 -
8143 -static void __init of_at91_pmc_setup(struct device_node *np,
8144 -                                    const struct at91_pmc_caps *caps)
8145 -{
8146 -       struct at91_pmc *pmc;
8147 -       struct device_node *childnp;
8148 -       void (*clk_setup)(struct device_node *, struct at91_pmc *);
8149 -       const struct of_device_id *clk_id;
8150 -       void __iomem *regbase = of_iomap(np, 0);
8151 -       int virq;
8152 -
8153 -       if (!regbase)
8154 -               return;
8155 -
8156 -       virq = irq_of_parse_and_map(np, 0);
8157 -       if (!virq)
8158 -               return;
8159 -
8160 -       pmc = at91_pmc_init(np, regbase, virq, caps);
8161 -       if (!pmc)
8162 -               return;
8163 -       for_each_child_of_node(np, childnp) {
8164 -               clk_id = of_match_node(pmc_clk_ids, childnp);
8165 -               if (!clk_id)
8166 -                       continue;
8167 -               clk_setup = clk_id->data;
8168 -               clk_setup(childnp, pmc);
8169 -       }
8170 -}
8171 -
8172 -static void __init of_at91rm9200_pmc_setup(struct device_node *np)
8173 -{
8174 -       of_at91_pmc_setup(np, &at91rm9200_caps);
8175 -}
8176 -CLK_OF_DECLARE(at91rm9200_clk_pmc, "atmel,at91rm9200-pmc",
8177 -              of_at91rm9200_pmc_setup);
8178 -
8179 -static void __init of_at91sam9260_pmc_setup(struct device_node *np)
8180 -{
8181 -       of_at91_pmc_setup(np, &at91sam9260_caps);
8182 -}
8183 -CLK_OF_DECLARE(at91sam9260_clk_pmc, "atmel,at91sam9260-pmc",
8184 -              of_at91sam9260_pmc_setup);
8185 -
8186 -static void __init of_at91sam9g45_pmc_setup(struct device_node *np)
8187 -{
8188 -       of_at91_pmc_setup(np, &at91sam9g45_caps);
8189 -}
8190 -CLK_OF_DECLARE(at91sam9g45_clk_pmc, "atmel,at91sam9g45-pmc",
8191 -              of_at91sam9g45_pmc_setup);
8192 -
8193 -static void __init of_at91sam9n12_pmc_setup(struct device_node *np)
8194 -{
8195 -       of_at91_pmc_setup(np, &at91sam9n12_caps);
8196 -}
8197 -CLK_OF_DECLARE(at91sam9n12_clk_pmc, "atmel,at91sam9n12-pmc",
8198 -              of_at91sam9n12_pmc_setup);
8199 -
8200 -static void __init of_at91sam9x5_pmc_setup(struct device_node *np)
8201 -{
8202 -       of_at91_pmc_setup(np, &at91sam9x5_caps);
8203 -}
8204 -CLK_OF_DECLARE(at91sam9x5_clk_pmc, "atmel,at91sam9x5-pmc",
8205 -              of_at91sam9x5_pmc_setup);
8206 -
8207 -static void __init of_sama5d2_pmc_setup(struct device_node *np)
8208 -{
8209 -       of_at91_pmc_setup(np, &sama5d2_caps);
8210 -}
8211 -CLK_OF_DECLARE(sama5d2_clk_pmc, "atmel,sama5d2-pmc",
8212 -              of_sama5d2_pmc_setup);
8213 -
8214 -static void __init of_sama5d3_pmc_setup(struct device_node *np)
8215 -{
8216 -       of_at91_pmc_setup(np, &sama5d3_caps);
8217 -}
8218 -CLK_OF_DECLARE(sama5d3_clk_pmc, "atmel,sama5d3-pmc",
8219 -              of_sama5d3_pmc_setup);
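After this hunk, pmc.c no longer carries the PMC irqchip, the per-SoC caps tables, or the child-clock dispatch table (each driver now self-registers with CLK_OF_DECLARE); the only thing left is the of_at91_get_clk_range() helper. A hedged usage sketch — the property name is an assumption for illustration, not something this patch defines:

#include <linux/kernel.h>
#include <linux/of.h>

#include "pmc.h"

static void read_range_sketch(struct device_node *np)
{
        struct clk_range range = CLK_RANGE(0, 0);

        /* Fills range.min/range.max from a two-cell DT property, if present. */
        of_at91_get_clk_range(np, "atmel,clk-output-range", &range);

        pr_debug("%s: %lu..%lu Hz\n", np->name, range.min, range.max);
}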
8220 diff --git a/drivers/clk/at91/pmc.h b/drivers/clk/at91/pmc.h
8221 index f65739272779..5771fff0ee3f 100644
8222 --- a/drivers/clk/at91/pmc.h
8223 +++ b/drivers/clk/at91/pmc.h
8224 @@ -14,8 +14,11 @@
8225  
8226  #include <linux/io.h>
8227  #include <linux/irqdomain.h>
8228 +#include <linux/regmap.h>
8229  #include <linux/spinlock.h>
8230  
8231 +extern spinlock_t pmc_pcr_lock;
8232 +
8233  struct clk_range {
8234         unsigned long min;
8235         unsigned long max;
8236 @@ -23,102 +26,7 @@ struct clk_range {
8237  
8238  #define CLK_RANGE(MIN, MAX) {.min = MIN, .max = MAX,}
8239  
8240 -struct at91_pmc_caps {
8241 -       u32 available_irqs;
8242 -};
8243 -
8244 -struct at91_pmc {
8245 -       void __iomem *regbase;
8246 -       int virq;
8247 -       spinlock_t lock;
8248 -       const struct at91_pmc_caps *caps;
8249 -       struct irq_domain *irqdomain;
8250 -       u32 imr;
8251 -};
8252 -
8253 -static inline void pmc_lock(struct at91_pmc *pmc)
8254 -{
8255 -       spin_lock(&pmc->lock);
8256 -}
8257 -
8258 -static inline void pmc_unlock(struct at91_pmc *pmc)
8259 -{
8260 -       spin_unlock(&pmc->lock);
8261 -}
8262 -
8263 -static inline u32 pmc_read(struct at91_pmc *pmc, int offset)
8264 -{
8265 -       return readl(pmc->regbase + offset);
8266 -}
8267 -
8268 -static inline void pmc_write(struct at91_pmc *pmc, int offset, u32 value)
8269 -{
8270 -       writel(value, pmc->regbase + offset);
8271 -}
8272 -
8273  int of_at91_get_clk_range(struct device_node *np, const char *propname,
8274                           struct clk_range *range);
8275  
8276 -void of_at91sam9260_clk_slow_setup(struct device_node *np,
8277 -                                  struct at91_pmc *pmc);
8278 -
8279 -void of_at91rm9200_clk_main_osc_setup(struct device_node *np,
8280 -                                     struct at91_pmc *pmc);
8281 -void of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np,
8282 -                                        struct at91_pmc *pmc);
8283 -void of_at91rm9200_clk_main_setup(struct device_node *np,
8284 -                                 struct at91_pmc *pmc);
8285 -void of_at91sam9x5_clk_main_setup(struct device_node *np,
8286 -                                 struct at91_pmc *pmc);
8287 -
8288 -void of_at91rm9200_clk_pll_setup(struct device_node *np,
8289 -                                struct at91_pmc *pmc);
8290 -void of_at91sam9g45_clk_pll_setup(struct device_node *np,
8291 -                                 struct at91_pmc *pmc);
8292 -void of_at91sam9g20_clk_pllb_setup(struct device_node *np,
8293 -                                  struct at91_pmc *pmc);
8294 -void of_sama5d3_clk_pll_setup(struct device_node *np,
8295 -                             struct at91_pmc *pmc);
8296 -void of_at91sam9x5_clk_plldiv_setup(struct device_node *np,
8297 -                                   struct at91_pmc *pmc);
8298 -
8299 -void of_at91rm9200_clk_master_setup(struct device_node *np,
8300 -                                   struct at91_pmc *pmc);
8301 -void of_at91sam9x5_clk_master_setup(struct device_node *np,
8302 -                                   struct at91_pmc *pmc);
8303 -
8304 -void of_at91rm9200_clk_sys_setup(struct device_node *np,
8305 -                                struct at91_pmc *pmc);
8306 -
8307 -void of_at91rm9200_clk_periph_setup(struct device_node *np,
8308 -                                   struct at91_pmc *pmc);
8309 -void of_at91sam9x5_clk_periph_setup(struct device_node *np,
8310 -                                   struct at91_pmc *pmc);
8311 -
8312 -void of_at91rm9200_clk_prog_setup(struct device_node *np,
8313 -                                 struct at91_pmc *pmc);
8314 -void of_at91sam9g45_clk_prog_setup(struct device_node *np,
8315 -                                  struct at91_pmc *pmc);
8316 -void of_at91sam9x5_clk_prog_setup(struct device_node *np,
8317 -                                 struct at91_pmc *pmc);
8318 -
8319 -void of_at91sam9x5_clk_utmi_setup(struct device_node *np,
8320 -                                 struct at91_pmc *pmc);
8321 -
8322 -void of_at91rm9200_clk_usb_setup(struct device_node *np,
8323 -                                struct at91_pmc *pmc);
8324 -void of_at91sam9x5_clk_usb_setup(struct device_node *np,
8325 -                                struct at91_pmc *pmc);
8326 -void of_at91sam9n12_clk_usb_setup(struct device_node *np,
8327 -                                 struct at91_pmc *pmc);
8328 -
8329 -void of_at91sam9x5_clk_smd_setup(struct device_node *np,
8330 -                                struct at91_pmc *pmc);
8331 -
8332 -void of_sama5d4_clk_h32mx_setup(struct device_node *np,
8333 -                               struct at91_pmc *pmc);
8334 -
8335 -void of_sama5d2_clk_generated_setup(struct device_node *np,
8336 -                                   struct at91_pmc *pmc);
8337 -
8338  #endif /* __PMC_H_ */
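The trimmed pmc.h now hands the individual clock drivers just two things: the clk_range helpers and a shared pmc_pcr_lock spinlock for drivers that go through an indirectly-addressed PMC register. A sketch of how such a lock would serialize a select-then-read sequence; treating AT91_PMC_PCR as a window register and the 0x3f PID mask are assumptions for illustration, not taken from this hunk:

#include <linux/clk/at91_pmc.h>
#include <linux/regmap.h>
#include <linux/spinlock.h>

#include "pmc.h"

static unsigned int pcr_read_sketch(struct regmap *regmap, unsigned int pid)
{
        unsigned int val;
        unsigned long flags;

        /*
         * Write the peripheral ID to select the slot, then read it back.
         * The two accesses must not interleave with another driver doing
         * the same, hence the lock shared through pmc.h.
         */
        spin_lock_irqsave(&pmc_pcr_lock, flags);
        regmap_write(regmap, AT91_PMC_PCR, pid & 0x3f);         /* assumed PID mask */
        regmap_read(regmap, AT91_PMC_PCR, &val);
        spin_unlock_irqrestore(&pmc_pcr_lock, flags);

        return val;
}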
8339 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
8340 index 4da2af9694a2..5b6f57f500b8 100644
8341 --- a/drivers/clocksource/tcb_clksrc.c
8342 +++ b/drivers/clocksource/tcb_clksrc.c
8343 @@ -23,8 +23,7 @@
8344   *     this 32 bit free-running counter. the second channel is not used.
8345   *
8346   *   - The third channel may be used to provide a 16-bit clockevent
8347 - *     source, used in either periodic or oneshot mode.  This runs
8348 - *     at 32 KiHZ, and can handle delays of up to two seconds.
8349 + *     source, used in either periodic or oneshot mode.
8350   *
8351   * A boot clocksource and clockevent source are also currently needed,
8352   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
8353 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
8354  struct tc_clkevt_device {
8355         struct clock_event_device       clkevt;
8356         struct clk                      *clk;
8357 +       bool                            clk_enabled;
8358 +       u32                             freq;
8359         void __iomem                    *regs;
8360  };
8361  
8362 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
8363         return container_of(clkevt, struct tc_clkevt_device, clkevt);
8364  }
8365  
8366 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
8367 - * because using one of the divided clocks would usually mean the
8368 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
8369 - *
8370 - * A divided clock could be good for high resolution timers, since
8371 - * 30.5 usec resolution can seem "low".
8372 - */
8373  static u32 timer_clock;
8374  
8375 +static void tc_clk_disable(struct clock_event_device *d)
8376 +{
8377 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8378 +
8379 +       clk_disable(tcd->clk);
8380 +       tcd->clk_enabled = false;
8381 +}
8382 +
8383 +static void tc_clk_enable(struct clock_event_device *d)
8384 +{
8385 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8386 +
8387 +       if (tcd->clk_enabled)
8388 +               return;
8389 +       clk_enable(tcd->clk);
8390 +       tcd->clk_enabled = true;
8391 +}
8392 +
8393  static int tc_shutdown(struct clock_event_device *d)
8394  {
8395         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8396 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
8397  
8398         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
8399         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
8400 +       return 0;
8401 +}
8402 +
8403 +static int tc_shutdown_clk_off(struct clock_event_device *d)
8404 +{
8405 +       tc_shutdown(d);
8406         if (!clockevent_state_detached(d))
8407 -               clk_disable(tcd->clk);
8408 +               tc_clk_disable(d);
8409  
8410         return 0;
8411  }
8412 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
8413         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
8414                 tc_shutdown(d);
8415  
8416 -       clk_enable(tcd->clk);
8417 +       tc_clk_enable(d);
8418  
8419 -       /* slow clock, count up to RC, then irq and stop */
8420 +       /* count up to RC, then irq and stop */
8421         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
8422                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
8423         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8424 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
8425         /* By not making the gentime core emulate periodic mode on top
8426          * of oneshot, we get lower overhead and improved accuracy.
8427          */
8428 -       clk_enable(tcd->clk);
8429 +       tc_clk_enable(d);
8430  
8431 -       /* slow clock, count up to RC, then irq and restart */
8432 +       /* count up to RC, then irq and restart */
8433         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
8434                      regs + ATMEL_TC_REG(2, CMR));
8435 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8436 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8437  
8438         /* Enable clock and interrupts on RC compare */
8439         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8440 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
8441                 .features               = CLOCK_EVT_FEAT_PERIODIC |
8442                                           CLOCK_EVT_FEAT_ONESHOT,
8443                 /* Should be lower than at91rm9200's system timer */
8444 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8445                 .rating                 = 125,
8446 +#else
8447 +               .rating                 = 200,
8448 +#endif
8449                 .set_next_event         = tc_next_event,
8450 -               .set_state_shutdown     = tc_shutdown,
8451 +               .set_state_shutdown     = tc_shutdown_clk_off,
8452                 .set_state_periodic     = tc_set_periodic,
8453                 .set_state_oneshot      = tc_set_oneshot,
8454         },
8455 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
8456         return IRQ_NONE;
8457  }
8458  
8459 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8460 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
8461  {
8462 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
8463         int ret;
8464         struct clk *t2_clk = tc->clk[2];
8465         int irq = tc->irq[2];
8466 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8467         clkevt.regs = tc->regs;
8468         clkevt.clk = t2_clk;
8469  
8470 -       timer_clock = clk32k_divisor_idx;
8471 +       timer_clock = divisor_idx;
8472 +       if (!divisor)
8473 +               clkevt.freq = 32768;
8474 +       else
8475 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
8476  
8477         clkevt.clkevt.cpumask = cpumask_of(0);
8478  
8479 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8480                 return ret;
8481         }
8482  
8483 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
8484 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
8485  
8486         return ret;
8487  }
8488 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
8489                 goto err_disable_t1;
8490  
8491         /* channel 2:  periodic and oneshot timer support */
8492 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8493         ret = setup_clkevents(tc, clk32k_divisor_idx);
8494 +#else
8495 +       ret = setup_clkevents(tc, best_divisor_idx);
8496 +#endif
8497         if (ret)
8498                 goto err_unregister_clksrc;
8499  
8500 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
8501 index d911c5dca8f1..7a40f7e88468 100644
8502 --- a/drivers/clocksource/timer-atmel-pit.c
8503 +++ b/drivers/clocksource/timer-atmel-pit.c
8504 @@ -46,6 +46,7 @@ struct pit_data {
8505         u32             cycle;
8506         u32             cnt;
8507         unsigned int    irq;
8508 +       bool            irq_requested;
8509         struct clk      *mck;
8510  };
8511  
8512 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
8513  
8514         /* disable irq, leaving the clocksource active */
8515         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
8516 +       if (data->irq_requested) {
8517 +               free_irq(data->irq, data);
8518 +               data->irq_requested = false;
8519 +       }
8520         return 0;
8521  }
8522  
8523 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
8524  /*
8525   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
8526   */
8527  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
8528  {
8529         struct pit_data *data = clkevt_to_pit_data(dev);
8530 +       int ret;
8531 +
8532 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8533 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8534 +                         "at91_tick", data);
8535 +       if (ret)
8536 +               panic(pr_fmt("Unable to setup IRQ\n"));
8537 +
8538 +       data->irq_requested = true;
8539  
8540         /* update clocksource counter */
8541         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
8542 @@ -181,7 +196,6 @@ static void __init at91sam926x_pit_common_init(struct pit_data *data)
8543  {
8544         unsigned long   pit_rate;
8545         unsigned        bits;
8546 -       int             ret;
8547  
8548         /*
8549          * Use our actual MCK to figure out how many MCK/16 ticks per
8550 @@ -206,13 +220,6 @@ static void __init at91sam926x_pit_common_init(struct pit_data *data)
8551         data->clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS;
8552         clocksource_register_hz(&data->clksrc, pit_rate);
8553  
8554 -       /* Set up irq handler */
8555 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8556 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8557 -                         "at91_tick", data);
8558 -       if (ret)
8559 -               panic(pr_fmt("Unable to setup IRQ\n"));
8560 -
8561         /* Set up and register clockevents */
8562         data->clkevt.name = "pit";
8563         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
8564 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
8565 index 29d21d68df5a..103d0fd70cc4 100644
8566 --- a/drivers/clocksource/timer-atmel-st.c
8567 +++ b/drivers/clocksource/timer-atmel-st.c
8568 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
8569         last_crtr = read_CRTR();
8570  }
8571  
8572 +static int atmel_st_irq;
8573 +
8574  static int clkevt32k_shutdown(struct clock_event_device *evt)
8575  {
8576         clkdev32k_disable_and_flush_irq();
8577         irqmask = 0;
8578         regmap_write(regmap_st, AT91_ST_IER, irqmask);
8579 +       free_irq(atmel_st_irq, regmap_st);
8580         return 0;
8581  }
8582  
8583  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8584  {
8585 +       int ret;
8586 +
8587         clkdev32k_disable_and_flush_irq();
8588  
8589 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8590 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8591 +                         "at91_tick", regmap_st);
8592 +       if (ret)
8593 +               panic(pr_fmt("Unable to setup IRQ\n"));
8594 +
8595         /*
8596          * ALM for oneshot irqs, set by next_event()
8597          * before 32 seconds have passed.
8598 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8599  
8600  static int clkevt32k_set_periodic(struct clock_event_device *dev)
8601  {
8602 +       int ret;
8603 +
8604         clkdev32k_disable_and_flush_irq();
8605  
8606 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8607 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8608 +                         "at91_tick", regmap_st);
8609 +       if (ret)
8610 +               panic(pr_fmt("Unable to setup IRQ\n"));
8611 +
8612         /* PIT for periodic irqs; fixed rate of 1/HZ */
8613         irqmask = AT91_ST_PITS;
8614         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
8615 @@ -198,7 +217,7 @@ static void __init atmel_st_timer_init(struct device_node *node)
8616  {
8617         struct clk *sclk;
8618         unsigned int sclk_rate, val;
8619 -       int irq, ret;
8620 +       int ret;
8621  
8622         regmap_st = syscon_node_to_regmap(node);
8623         if (IS_ERR(regmap_st))
8624 @@ -210,17 +229,10 @@ static void __init atmel_st_timer_init(struct device_node *node)
8625         regmap_read(regmap_st, AT91_ST_SR, &val);
8626  
8627         /* Get the interrupts property */
8628 -       irq  = irq_of_parse_and_map(node, 0);
8629 -       if (!irq)
8630 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
8631 +       if (!atmel_st_irq)
8632                 panic(pr_fmt("Unable to get IRQ from DT\n"));
8633  
8634 -       /* Make IRQs happen for the system timer */
8635 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
8636 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8637 -                         "at91_tick", regmap_st);
8638 -       if (ret)
8639 -               panic(pr_fmt("Unable to setup IRQ\n"));
8640 -
8641         sclk = of_clk_get(node, 0);
8642         if (IS_ERR(sclk))
8643                 panic(pr_fmt("Unable to get slow clock\n"));
8644 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
8645 index c59bdcb83217..8f23161d80be 100644
8646 --- a/drivers/cpufreq/Kconfig.x86
8647 +++ b/drivers/cpufreq/Kconfig.x86
8648 @@ -123,7 +123,7 @@ config X86_POWERNOW_K7_ACPI
8649  
8650  config X86_POWERNOW_K8
8651         tristate "AMD Opteron/Athlon64 PowerNow!"
8652 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
8653 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
8654         help
8655           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
8656           Support for K10 and newer processors is now in acpi-cpufreq.
8657 diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
8658 index 344058f8501a..d5657d50ac40 100644
8659 --- a/drivers/cpuidle/coupled.c
8660 +++ b/drivers/cpuidle/coupled.c
8661 @@ -119,7 +119,6 @@ struct cpuidle_coupled {
8662  
8663  #define CPUIDLE_COUPLED_NOT_IDLE       (-1)
8664  
8665 -static DEFINE_MUTEX(cpuidle_coupled_lock);
8666  static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
8667  
8668  /*
8669 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8670 index 6ed7d63a0688..9da7482ad256 100644
8671 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8672 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8673 @@ -1264,7 +1264,9 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
8674         if (ret)
8675                 return ret;
8676  
8677 +#ifndef CONFIG_PREEMPT_RT_BASE
8678         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
8679 +#endif
8680  
8681         i915_gem_execbuffer_move_to_active(vmas, params->request);
8682         i915_gem_execbuffer_retire_commands(params);
8683 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
8684 index c0a96f1ee18e..deb1e207fa3c 100644
8685 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
8686 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
8687 @@ -39,7 +39,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
8688         if (!mutex_is_locked(mutex))
8689                 return false;
8690  
8691 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
8692 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
8693         return mutex->owner == task;
8694  #else
8695         /* Since UP may be pre-empted, we cannot assume that we own the lock */
8696 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
8697 index 0f42a2782afc..80a1db09a379 100644
8698 --- a/drivers/gpu/drm/i915/i915_irq.c
8699 +++ b/drivers/gpu/drm/i915/i915_irq.c
8700 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8701         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
8702  
8703         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8704 +       preempt_disable_rt();
8705  
8706         /* Get optional system timestamp before query. */
8707         if (stime)
8708 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8709                 *etime = ktime_get();
8710  
8711         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8712 +       preempt_enable_rt();
8713  
8714         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
8715  
8716 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
8717 index 909d1d71d130..8688709b4ffa 100644
8718 --- a/drivers/gpu/drm/i915/intel_display.c
8719 +++ b/drivers/gpu/drm/i915/intel_display.c
8720 @@ -11400,7 +11400,7 @@ void intel_check_page_flip(struct drm_device *dev, int pipe)
8721         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
8722         struct intel_unpin_work *work;
8723  
8724 -       WARN_ON(!in_interrupt());
8725 +       WARN_ON_NONRT(!in_interrupt());
8726  
8727         if (crtc == NULL)
8728                 return;
8729 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
8730 index 2cc6aa072f4c..b79d33f14868 100644
8731 --- a/drivers/gpu/drm/i915/intel_sprite.c
8732 +++ b/drivers/gpu/drm/i915/intel_sprite.c
8733 @@ -38,6 +38,7 @@
8734  #include "intel_drv.h"
8735  #include <drm/i915_drm.h>
8736  #include "i915_drv.h"
8737 +#include <linux/locallock.h>
8738  
8739  static bool
8740  format_is_yuv(uint32_t format)
8741 @@ -64,6 +65,8 @@ static int usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
8742                             1000 * adjusted_mode->crtc_htotal);
8743  }
8744  
8745 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
8746 +
8747  /**
8748   * intel_pipe_update_start() - start update of a set of display registers
8749   * @crtc: the crtc of which the registers are going to be updated
8750 @@ -96,7 +99,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8751         min = vblank_start - usecs_to_scanlines(adjusted_mode, 100);
8752         max = vblank_start - 1;
8753  
8754 -       local_irq_disable();
8755 +       local_lock_irq(pipe_update_lock);
8756  
8757         if (min <= 0 || max <= 0)
8758                 return;
8759 @@ -126,11 +129,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8760                         break;
8761                 }
8762  
8763 -               local_irq_enable();
8764 +               local_unlock_irq(pipe_update_lock);
8765  
8766                 timeout = schedule_timeout(timeout);
8767  
8768 -               local_irq_disable();
8769 +               local_lock_irq(pipe_update_lock);
8770         }
8771  
8772         finish_wait(wq, &wait);
8773 @@ -164,7 +167,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc)
8774  
8775         trace_i915_pipe_update_end(crtc, end_vbl_count, scanline_end);
8776  
8777 -       local_irq_enable();
8778 +       local_unlock_irq(pipe_update_lock);
8779  
8780         if (crtc->debug.start_vbl_count &&
8781             crtc->debug.start_vbl_count != end_vbl_count) {
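
The intel_sprite conversion above swaps a bare local_irq_disable()/local_irq_enable() pair for a local lock. On a non-RT kernel a local lock still just disables interrupts, while on PREEMPT_RT it becomes a per-CPU sleeping lock, so the vblank-evasion loop stays preemptible yet remains serialized per CPU. The following is only a simplified conceptual sketch of such wrappers; the binding definitions are the ones this patch adds to include/linux/locallock.h and differ in detail (lock initialization, migration handling, recursion):

/* Conceptual sketch only, not the patch's locallock.h (lock init omitted). */
#ifdef CONFIG_PREEMPT_RT_FULL
# define DEFINE_LOCAL_IRQ_LOCK(lvar)    static DEFINE_PER_CPU(spinlock_t, lvar)
# define local_lock_irq(lvar)           spin_lock(this_cpu_ptr(&lvar))  /* sleeping lock on RT */
# define local_unlock_irq(lvar)         spin_unlock(this_cpu_ptr(&lvar))
#else
# define DEFINE_LOCAL_IRQ_LOCK(lvar)    int lvar        /* dummy, so declaration sites compile */
# define local_lock_irq(lvar)           local_irq_disable()
# define local_unlock_irq(lvar)         local_irq_enable()
#endif
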
8782 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
8783 index 3645b223aa37..642854b2ed2c 100644
8784 --- a/drivers/gpu/drm/radeon/radeon_display.c
8785 +++ b/drivers/gpu/drm/radeon/radeon_display.c
8786 @@ -1862,6 +1862,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8787         struct radeon_device *rdev = dev->dev_private;
8788  
8789         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8790 +       preempt_disable_rt();
8791  
8792         /* Get optional system timestamp before query. */
8793         if (stime)
8794 @@ -1954,6 +1955,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8795                 *etime = ktime_get();
8796  
8797         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8798 +       preempt_enable_rt();
8799  
8800         /* Decode into vertical and horizontal scanout position. */
8801         *vpos = position & 0x1fff;
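
The scanout-position helpers in the i915 and radeon hunks above now call preempt_disable_rt()/preempt_enable_rt() at the points the existing comments already marked. These helpers only disable preemption when PREEMPT_RT is active, so the back-to-back timestamp and register reads are not disturbed by preemption on RT while non-RT kernels are left unchanged. A simplified sketch of such wrappers (an assumption here; the real definitions are added to include/linux/preempt.h elsewhere in this patch):

/* Simplified sketch, not the patch's actual preempt.h changes. */
#ifdef CONFIG_PREEMPT_RT_BASE
# define preempt_disable_rt()   preempt_disable()
# define preempt_enable_rt()    preempt_enable()
#else
# define preempt_disable_rt()   barrier()
# define preempt_enable_rt()    barrier()
#endif
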
8802 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
8803 index 509ed9731630..b2498b270f2c 100644
8804 --- a/drivers/hv/vmbus_drv.c
8805 +++ b/drivers/hv/vmbus_drv.c
8806 @@ -820,7 +820,7 @@ static void vmbus_isr(void)
8807                         tasklet_schedule(&msg_dpc);
8808         }
8809  
8810 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
8811 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, 0);
8812  }
8813  
8814  
8815 diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c
8816 index 08d26ba61ed3..46b89dd42b10 100644
8817 --- a/drivers/i2c/busses/i2c-omap.c
8818 +++ b/drivers/i2c/busses/i2c-omap.c
8819 @@ -995,15 +995,12 @@ omap_i2c_isr(int irq, void *dev_id)
8820         u16 mask;
8821         u16 stat;
8822  
8823 -       spin_lock(&omap->lock);
8824 -       mask = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
8825         stat = omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG);
8826 +       mask = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
8827  
8828         if (stat & mask)
8829                 ret = IRQ_WAKE_THREAD;
8830  
8831 -       spin_unlock(&omap->lock);
8832 -
8833         return ret;
8834  }
8835  
8836 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
8837 index 36f76e28a0bf..394f142f90c7 100644
8838 --- a/drivers/ide/alim15x3.c
8839 +++ b/drivers/ide/alim15x3.c
8840 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8841  
8842         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
8843  
8844 -       local_irq_save(flags);
8845 +       local_irq_save_nort(flags);
8846  
8847         if (m5229_revision < 0xC2) {
8848                 /*
8849 @@ -325,7 +325,7 @@ out:
8850         }
8851         pci_dev_put(north);
8852         pci_dev_put(isa_dev);
8853 -       local_irq_restore(flags);
8854 +       local_irq_restore_nort(flags);
8855         return 0;
8856  }
8857  
8858 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
8859 index 696b6c1ec940..0d0a96629b73 100644
8860 --- a/drivers/ide/hpt366.c
8861 +++ b/drivers/ide/hpt366.c
8862 @@ -1241,7 +1241,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8863  
8864         dma_old = inb(base + 2);
8865  
8866 -       local_irq_save(flags);
8867 +       local_irq_save_nort(flags);
8868  
8869         dma_new = dma_old;
8870         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
8871 @@ -1252,7 +1252,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8872         if (dma_new != dma_old)
8873                 outb(dma_new, base + 2);
8874  
8875 -       local_irq_restore(flags);
8876 +       local_irq_restore_nort(flags);
8877  
8878         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
8879                          hwif->name, base, base + 7);
8880 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
8881 index 19763977568c..4169433faab5 100644
8882 --- a/drivers/ide/ide-io-std.c
8883 +++ b/drivers/ide/ide-io-std.c
8884 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8885                 unsigned long uninitialized_var(flags);
8886  
8887                 if ((io_32bit & 2) && !mmio) {
8888 -                       local_irq_save(flags);
8889 +                       local_irq_save_nort(flags);
8890                         ata_vlb_sync(io_ports->nsect_addr);
8891                 }
8892  
8893 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8894                         insl(data_addr, buf, words);
8895  
8896                 if ((io_32bit & 2) && !mmio)
8897 -                       local_irq_restore(flags);
8898 +                       local_irq_restore_nort(flags);
8899  
8900                 if (((len + 1) & 3) < 2)
8901                         return;
8902 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8903                 unsigned long uninitialized_var(flags);
8904  
8905                 if ((io_32bit & 2) && !mmio) {
8906 -                       local_irq_save(flags);
8907 +                       local_irq_save_nort(flags);
8908                         ata_vlb_sync(io_ports->nsect_addr);
8909                 }
8910  
8911 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8912                         outsl(data_addr, buf, words);
8913  
8914                 if ((io_32bit & 2) && !mmio)
8915 -                       local_irq_restore(flags);
8916 +                       local_irq_restore_nort(flags);
8917  
8918                 if (((len + 1) & 3) < 2)
8919                         return;
8920 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
8921 index 669ea1e45795..e12e43e62245 100644
8922 --- a/drivers/ide/ide-io.c
8923 +++ b/drivers/ide/ide-io.c
8924 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
8925                 /* disable_irq_nosync ?? */
8926                 disable_irq(hwif->irq);
8927                 /* local CPU only, as if we were handling an interrupt */
8928 -               local_irq_disable();
8929 +               local_irq_disable_nort();
8930                 if (hwif->polling) {
8931                         startstop = handler(drive);
8932                 } else if (drive_is_ready(drive)) {
8933 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
8934 index 376f2dc410c5..f014dd1b73dc 100644
8935 --- a/drivers/ide/ide-iops.c
8936 +++ b/drivers/ide/ide-iops.c
8937 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
8938                                 if ((stat & ATA_BUSY) == 0)
8939                                         break;
8940  
8941 -                               local_irq_restore(flags);
8942 +                               local_irq_restore_nort(flags);
8943                                 *rstat = stat;
8944                                 return -EBUSY;
8945                         }
8946                 }
8947 -               local_irq_restore(flags);
8948 +               local_irq_restore_nort(flags);
8949         }
8950         /*
8951          * Allow status to settle, then read it again.
8952 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
8953 index 0b63facd1d87..4ceba37afc0c 100644
8954 --- a/drivers/ide/ide-probe.c
8955 +++ b/drivers/ide/ide-probe.c
8956 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
8957         int bswap = 1;
8958  
8959         /* local CPU only; some systems need this */
8960 -       local_irq_save(flags);
8961 +       local_irq_save_nort(flags);
8962         /* read 512 bytes of id info */
8963         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
8964 -       local_irq_restore(flags);
8965 +       local_irq_restore_nort(flags);
8966  
8967         drive->dev_flags |= IDE_DFLAG_ID_READ;
8968  #ifdef DEBUG
8969 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
8970 index a716693417a3..be0568c722d6 100644
8971 --- a/drivers/ide/ide-taskfile.c
8972 +++ b/drivers/ide/ide-taskfile.c
8973 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8974  
8975                 page_is_high = PageHighMem(page);
8976                 if (page_is_high)
8977 -                       local_irq_save(flags);
8978 +                       local_irq_save_nort(flags);
8979  
8980                 buf = kmap_atomic(page) + offset;
8981  
8982 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8983                 kunmap_atomic(buf);
8984  
8985                 if (page_is_high)
8986 -                       local_irq_restore(flags);
8987 +                       local_irq_restore_nort(flags);
8988  
8989                 len -= nr_bytes;
8990         }
8991 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
8992         }
8993  
8994         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
8995 -               local_irq_disable();
8996 +               local_irq_disable_nort();
8997  
8998         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
8999  
9000 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9001 index 87799de90a1d..66cdd37f8605 100644
9002 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9003 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9004 @@ -857,7 +857,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
9005  
9006         ipoib_dbg_mcast(priv, "restarting multicast task\n");
9007  
9008 -       local_irq_save(flags);
9009 +       local_irq_save_nort(flags);
9010         netif_addr_lock(dev);
9011         spin_lock(&priv->lock);
9012  
9013 @@ -939,7 +939,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
9014  
9015         spin_unlock(&priv->lock);
9016         netif_addr_unlock(dev);
9017 -       local_irq_restore(flags);
9018 +       local_irq_restore_nort(flags);
9019  
9020         /*
9021          * make sure the in-flight joins have finished before we attempt
9022 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
9023 index 4a2a9e370be7..e970d9afd179 100644
9024 --- a/drivers/input/gameport/gameport.c
9025 +++ b/drivers/input/gameport/gameport.c
9026 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
9027         tx = ~0;
9028  
9029         for (i = 0; i < 50; i++) {
9030 -               local_irq_save(flags);
9031 +               local_irq_save_nort(flags);
9032                 t1 = ktime_get_ns();
9033                 for (t = 0; t < 50; t++)
9034                         gameport_read(gameport);
9035                 t2 = ktime_get_ns();
9036                 t3 = ktime_get_ns();
9037 -               local_irq_restore(flags);
9038 +               local_irq_restore_nort(flags);
9039                 udelay(i * 10);
9040                 t = (t2 - t1) - (t3 - t2);
9041                 if (t < tx)
9042 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
9043         tx = 1 << 30;
9044  
9045         for(i = 0; i < 50; i++) {
9046 -               local_irq_save(flags);
9047 +               local_irq_save_nort(flags);
9048                 GET_TIME(t1);
9049                 for (t = 0; t < 50; t++) gameport_read(gameport);
9050                 GET_TIME(t2);
9051                 GET_TIME(t3);
9052 -               local_irq_restore(flags);
9053 +               local_irq_restore_nort(flags);
9054                 udelay(i * 10);
9055                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
9056         }
9057 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
9058         tx = 1 << 30;
9059  
9060         for(i = 0; i < 50; i++) {
9061 -               local_irq_save(flags);
9062 +               local_irq_save_nort(flags);
9063                 t1 = rdtsc();
9064                 for (t = 0; t < 50; t++) gameport_read(gameport);
9065                 t2 = rdtsc();
9066 -               local_irq_restore(flags);
9067 +               local_irq_restore_nort(flags);
9068                 udelay(i * 10);
9069                 if (t2 - t1 < tx) tx = t2 - t1;
9070         }
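
The IDE, IPoIB and gameport hunks above switch hot paths from local_irq_save()/local_irq_restore() to the _nort ("not on RT") variants: on a non-RT kernel they behave exactly like the originals, while on PREEMPT_RT they leave interrupts enabled so these (now preemptible) sections cannot turn into long IRQ-off regions. A simplified sketch of what such wrappers look like (assumed; the binding definitions are the ones this patch adds to include/linux/irqflags.h):

/* Simplified sketch, not the patch's actual irqflags.h changes. */
#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_disable_nort()       barrier()
# define local_irq_enable_nort()        barrier()
# define local_irq_save_nort(flags)     do { local_save_flags(flags); } while (0)
# define local_irq_restore_nort(flags)  do { (void)(flags); } while (0)
#else
# define local_irq_disable_nort()       local_irq_disable()
# define local_irq_enable_nort()        local_irq_enable()
# define local_irq_save_nort(flags)     local_irq_save(flags)
# define local_irq_restore_nort(flags)  local_irq_restore(flags)
#endif
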
9071 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
9072 index 0397985a2601..bc0e7d2c5cff 100644
9073 --- a/drivers/iommu/amd_iommu.c
9074 +++ b/drivers/iommu/amd_iommu.c
9075 @@ -2019,10 +2019,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
9076         int ret;
9077  
9078         /*
9079 -        * Must be called with IRQs disabled. Warn here to detect early
9080 -        * when its not.
9081 +        * Must be called with IRQs disabled on a non RT kernel. Warn here to
9082 +        * detect early when its not.
9083          */
9084 -       WARN_ON(!irqs_disabled());
9085 +       WARN_ON_NONRT(!irqs_disabled());
9086  
9087         /* lock domain */
9088         spin_lock(&domain->lock);
9089 @@ -2185,10 +2185,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
9090         struct protection_domain *domain;
9091  
9092         /*
9093 -        * Must be called with IRQs disabled. Warn here to detect early
9094 -        * when its not.
9095 +        * Must be called with IRQs disabled on a non RT kernel. Warn here to
9096 +        * detect early when its not.
9097          */
9098 -       WARN_ON(!irqs_disabled());
9099 +       WARN_ON_NONRT(!irqs_disabled());
9100  
9101         if (WARN_ON(!dev_data->domain))
9102                 return;
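
WARN_ON_NONRT() here, like the earlier intel_display change and the BUG_ON_NONRT() in the dm hunk below, keeps the "interrupts must be disabled" assertion for non-RT kernels only: with PREEMPT_RT the surrounding spinlocks are sleeping locks that no longer disable interrupts, so the unconditional check would always fire. A simplified sketch of the wrappers (assumed; the real ones are added to include/asm-generic/bug.h elsewhere in this patch):

/* Simplified sketch, not the patch's actual bug.h changes. */
#ifdef CONFIG_PREEMPT_RT_BASE
# define WARN_ON_NONRT(condition)       do { } while (0)
# define BUG_ON_NONRT(condition)        do { } while (0)
#else
# define WARN_ON_NONRT(condition)       WARN_ON(condition)
# define BUG_ON_NONRT(condition)        BUG_ON(condition)
#endif
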
9103 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
9104 index 5bda6a9b56bb..d6286584c807 100644
9105 --- a/drivers/leds/trigger/Kconfig
9106 +++ b/drivers/leds/trigger/Kconfig
9107 @@ -61,7 +61,7 @@ config LEDS_TRIGGER_BACKLIGHT
9108  
9109  config LEDS_TRIGGER_CPU
9110         bool "LED CPU Trigger"
9111 -       depends on LEDS_TRIGGERS
9112 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
9113         help
9114           This allows LEDs to be controlled by active CPUs. This shows
9115           the active CPUs across an array of LEDs so you can see which
9116 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
9117 index 4d200883c505..98b64ed5cb81 100644
9118 --- a/drivers/md/bcache/Kconfig
9119 +++ b/drivers/md/bcache/Kconfig
9120 @@ -1,6 +1,7 @@
9121  
9122  config BCACHE
9123         tristate "Block device as cache"
9124 +       depends on !PREEMPT_RT_FULL
9125         ---help---
9126         Allows a block device to be used as cache for other devices; uses
9127         a btree for indexing and the layout is optimized for SSDs.
9128 diff --git a/drivers/md/dm.c b/drivers/md/dm.c
9129 index 84aa8b1d0480..b7f070e3698e 100644
9130 --- a/drivers/md/dm.c
9131 +++ b/drivers/md/dm.c
9132 @@ -2127,7 +2127,7 @@ static void dm_request_fn(struct request_queue *q)
9133                 /* Establish tio->ti before queuing work (map_tio_request) */
9134                 tio->ti = ti;
9135                 queue_kthread_work(&md->kworker, &tio->work);
9136 -               BUG_ON(!irqs_disabled());
9137 +               BUG_ON_NONRT(!irqs_disabled());
9138         }
9139  
9140         goto out;
9141 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
9142 index 10ce885445f6..76f71791361c 100644
9143 --- a/drivers/md/raid5.c
9144 +++ b/drivers/md/raid5.c
9145 @@ -1920,8 +1920,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9146         struct raid5_percpu *percpu;
9147         unsigned long cpu;
9148  
9149 -       cpu = get_cpu();
9150 +       cpu = get_cpu_light();
9151         percpu = per_cpu_ptr(conf->percpu, cpu);
9152 +       spin_lock(&percpu->lock);
9153         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
9154                 ops_run_biofill(sh);
9155                 overlap_clear++;
9156 @@ -1977,7 +1978,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9157                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
9158                                 wake_up(&sh->raid_conf->wait_for_overlap);
9159                 }
9160 -       put_cpu();
9161 +       spin_unlock(&percpu->lock);
9162 +       put_cpu_light();
9163  }
9164  
9165  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
9166 @@ -6414,6 +6416,7 @@ static int raid5_alloc_percpu(struct r5conf *conf)
9167                                __func__, cpu);
9168                         break;
9169                 }
9170 +               spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
9171         }
9172         put_online_cpus();
9173  
9174 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
9175 index 517d4b68a1be..efe91887ecd7 100644
9176 --- a/drivers/md/raid5.h
9177 +++ b/drivers/md/raid5.h
9178 @@ -504,6 +504,7 @@ struct r5conf {
9179         int                     recovery_disabled;
9180         /* per cpu variables */
9181         struct raid5_percpu {
9182 +               spinlock_t      lock;           /* Protection for -RT */
9183                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
9184                 struct flex_array *scribble;   /* space for constructing buffer
9185                                               * lists and performing address
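
The raid5 change avoids relying on get_cpu()'s preempt-disabled section, which on RT would cover far too much work: get_cpu_light() only pins the task to its CPU, and the newly added per-CPU spinlock provides the actual serialization of the scratch buffers, so the stripe operations may be preempted. Roughly, the access pattern looks like this hedged sketch (struct r5conf/raid5_percpu come from the hunks above; get_cpu_light()/put_cpu_light() are helpers introduced elsewhere in this patch):

/* Hedged sketch of the per-CPU access pattern used above. */
static void use_percpu_scratch(struct r5conf *conf)
{
        struct raid5_percpu *percpu;
        unsigned long cpu;

        cpu = get_cpu_light();                  /* pin to this CPU, stay preemptible on RT */
        percpu = per_cpu_ptr(conf->percpu, cpu);
        spin_lock(&percpu->lock);               /* serialize users of the scratch data */

        /* ... operate on percpu->spare_page / percpu->scribble ... */

        spin_unlock(&percpu->lock);
        put_cpu_light();
}
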
9186 diff --git a/drivers/media/platform/vsp1/vsp1_video.c b/drivers/media/platform/vsp1/vsp1_video.c
9187 index 5ce88e1f5d71..b4f8cd74ecb8 100644
9188 --- a/drivers/media/platform/vsp1/vsp1_video.c
9189 +++ b/drivers/media/platform/vsp1/vsp1_video.c
9190 @@ -520,7 +520,7 @@ static bool vsp1_pipeline_stopped(struct vsp1_pipeline *pipe)
9191         bool stopped;
9192  
9193         spin_lock_irqsave(&pipe->irqlock, flags);
9194 -       stopped = pipe->state == VSP1_PIPELINE_STOPPED,
9195 +       stopped = pipe->state == VSP1_PIPELINE_STOPPED;
9196         spin_unlock_irqrestore(&pipe->irqlock, flags);
9197  
9198         return stopped;
9199 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
9200 index 4bf7d50b1bc7..6f7e99ad6e29 100644
9201 --- a/drivers/misc/Kconfig
9202 +++ b/drivers/misc/Kconfig
9203 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
9204  config ATMEL_TCLIB
9205         bool "Atmel AT32/AT91 Timer/Counter Library"
9206         depends on (AVR32 || ARCH_AT91)
9207 +       default y if PREEMPT_RT_FULL
9208         help
9209           Select this if you want a library to allocate the Timer/Counter
9210           blocks found on many Atmel processors.  This facilitates using
9211 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
9212           are combined to make a single 32-bit timer.
9213  
9214           When GENERIC_CLOCKEVENTS is defined, the third timer channel
9215 -         may be used as a clock event device supporting oneshot mode
9216 -         (delays of up to two seconds) based on the 32 KiHz clock.
9217 +         may be used as a clock event device supporting oneshot mode.
9218  
9219  config ATMEL_TCB_CLKSRC_BLOCK
9220         int
9221 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
9222           TC can be used for other purposes, such as PWM generation and
9223           interval timing.
9224  
9225 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
9226 +       bool "TC Block use 32 KiHz clock"
9227 +       depends on ATMEL_TCB_CLKSRC
9228 +       default y if !PREEMPT_RT_FULL
9229 +       help
9230 +         Select this to use 32 KiHz base clock rate as TC block clock
9231 +         source for clock events.
9232 +
9233 +
9234  config DUMMY_IRQ
9235         tristate "Dummy IRQ handler"
9236         default n
9237 @@ -113,6 +122,35 @@ config IBM_ASM
9238           for information on the specific driver level and support statement
9239           for your IBM server.
9240  
9241 +config HWLAT_DETECTOR
9242 +       tristate "Testing module to detect hardware-induced latencies"
9243 +       depends on DEBUG_FS
9244 +       depends on RING_BUFFER
9245 +       default m
9246 +       ---help---
9247 +         A simple hardware latency detector. Use this module to detect
9248 +         large latencies introduced by the behavior of the underlying
9249 +         system firmware external to Linux. We do this using periodic
9250 +         use of stop_machine to grab all available CPUs and measure
9251 +         for unexplainable gaps in the CPU timestamp counter(s). By
9252 +         default, the module is not enabled until the "enable" file
9253 +         within the "hwlat_detector" debugfs directory is toggled.
9254 +
9255 +         This module is often used to detect SMI (System Management
9256 +         Interrupts) on x86 systems, though is not x86 specific. To
9257 +         this end, we default to using a sample window of 1 second,
9258 +         during which we will sample for 0.5 seconds. If an SMI or
9259 +         similar event occurs during that time, it is recorded
9260 +         into an 8K-sample global ring buffer until retrieved.
9261 +
9262 +         WARNING: This software should never be enabled (it can be built
9263 +         but should not be turned on after it is loaded) in a production
9264 +         environment where high latencies are a concern since the
9265 +         sampling mechanism actually introduces latencies for
9266 +         regular tasks while the CPU(s) are being held.
9267 +
9268 +         If unsure, say N
9269 +
9270  config PHANTOM
9271         tristate "Sensable PHANToM (PCI)"
9272         depends on PCI
9273 diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
9274 index 537d7f3b78da..ec4aecba0656 100644
9275 --- a/drivers/misc/Makefile
9276 +++ b/drivers/misc/Makefile
9277 @@ -39,6 +39,7 @@ obj-$(CONFIG_C2PORT)          += c2port/
9278  obj-$(CONFIG_HMC6352)          += hmc6352.o
9279  obj-y                          += eeprom/
9280  obj-y                          += cb710/
9281 +obj-$(CONFIG_HWLAT_DETECTOR)   += hwlat_detector.o
9282  obj-$(CONFIG_SPEAR13XX_PCIE_GADGET)    += spear13xx_pcie_gadget.o
9283  obj-$(CONFIG_VMWARE_BALLOON)   += vmw_balloon.o
9284  obj-$(CONFIG_ARM_CHARLCD)      += arm-charlcd.o
9285 diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c
9286 new file mode 100644
9287 index 000000000000..52f5ad5fd9c0
9288 --- /dev/null
9289 +++ b/drivers/misc/hwlat_detector.c
9290 @@ -0,0 +1,1240 @@
9291 +/*
9292 + * hwlat_detector.c - A simple Hardware Latency detector.
9293 + *
9294 + * Use this module to detect large system latencies induced by the behavior of
9295 + * certain underlying system hardware or firmware, independent of Linux itself.
9296 + * The code was developed originally to detect the presence of SMIs on Intel
9297 + * and AMD systems, although there is no dependency upon x86 herein.
9298 + *
9299 + * The classical example usage of this module is in detecting the presence of
9300 + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
9301 + * somewhat special form of hardware interrupt spawned from earlier CPU debug
9302 + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
9303 + * LPC (or other device) to generate a special interrupt under certain
9304 + * circumstances, for example, upon expiration of a special SMI timer device,
9305 + * due to certain external thermal readings, on certain I/O address accesses,
9306 + * and other situations. An SMI hits a special CPU pin, triggers a special
9307 + * SMI mode (complete with special memory map), and the OS is unaware.
9308 + *
9309 + * Although certain hardware-induced latencies are necessary (for example,
9310 + * a modern system often requires an SMI handler for correct thermal control
9311 + * and remote management) they can wreak havoc upon any OS-level performance
9312 + * guarantees toward low-latency, especially when the OS is not even made
9313 + * aware of the presence of these interrupts. For this reason, we need a
9314 + * somewhat brute force mechanism to detect these interrupts. In this case,
9315 + * we do it by hogging all of the CPU(s) for configurable timer intervals,
9316 + * sampling the built-in CPU timer, looking for discontiguous readings.
9317 + *
9318 + * WARNING: This implementation necessarily introduces latencies. Therefore,
9319 + *          you should NEVER use this module in a production environment
9320 + *          requiring any kind of low-latency performance guarantee(s).
9321 + *
9322 + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
9323 + *
9324 + * Includes useful feedback from Clark Williams <clark@redhat.com>
9325 + *
9326 + * This file is licensed under the terms of the GNU General Public
9327 + * License version 2. This program is licensed "as is" without any
9328 + * warranty of any kind, whether express or implied.
9329 + */
9330 +
9331 +#include <linux/module.h>
9332 +#include <linux/init.h>
9333 +#include <linux/ring_buffer.h>
9334 +#include <linux/time.h>
9335 +#include <linux/hrtimer.h>
9336 +#include <linux/kthread.h>
9337 +#include <linux/debugfs.h>
9338 +#include <linux/seq_file.h>
9339 +#include <linux/uaccess.h>
9340 +#include <linux/version.h>
9341 +#include <linux/delay.h>
9342 +#include <linux/slab.h>
9343 +#include <linux/trace_clock.h>
9344 +
9345 +#define BUF_SIZE_DEFAULT       262144UL                /* 8K*(sizeof(entry)) */
9346 +#define BUF_FLAGS              (RB_FL_OVERWRITE)       /* no block on full */
9347 +#define U64STR_SIZE            22                      /* 20 digits max */
9348 +
9349 +#define VERSION                        "1.0.0"
9350 +#define BANNER                 "hwlat_detector: "
9351 +#define DRVNAME                        "hwlat_detector"
9352 +#define DEFAULT_SAMPLE_WINDOW  1000000                 /* 1s */
9353 +#define DEFAULT_SAMPLE_WIDTH   500000                  /* 0.5s */
9354 +#define DEFAULT_LAT_THRESHOLD  10                      /* 10us */
9355 +
9356 +/* Module metadata */
9357 +
9358 +MODULE_LICENSE("GPL");
9359 +MODULE_AUTHOR("Jon Masters <jcm@redhat.com>");
9360 +MODULE_DESCRIPTION("A simple hardware latency detector");
9361 +MODULE_VERSION(VERSION);
9362 +
9363 +/* Module parameters */
9364 +
9365 +static int debug;
9366 +static int enabled;
9367 +static int threshold;
9368 +
9369 +module_param(debug, int, 0);                   /* enable debug */
9370 +module_param(enabled, int, 0);                 /* enable detector */
9371 +module_param(threshold, int, 0);               /* latency threshold */
9372 +
9373 +/* Buffering and sampling */
9374 +
9375 +static struct ring_buffer *ring_buffer;                /* sample buffer */
9376 +static DEFINE_MUTEX(ring_buffer_mutex);                /* lock changes */
9377 +static unsigned long buf_size = BUF_SIZE_DEFAULT;
9378 +static struct task_struct *kthread;            /* sampling thread */
9379 +
9380 +/* DebugFS filesystem entries */
9381 +
9382 +static struct dentry *debug_dir;               /* debugfs directory */
9383 +static struct dentry *debug_max;               /* maximum TSC delta */
9384 +static struct dentry *debug_count;             /* total detect count */
9385 +static struct dentry *debug_sample_width;      /* sample width us */
9386 +static struct dentry *debug_sample_window;     /* sample window us */
9387 +static struct dentry *debug_sample;            /* raw samples us */
9388 +static struct dentry *debug_threshold;         /* threshold us */
9389 +static struct dentry *debug_enable;            /* enable/disable */
9390 +
9391 +/* Individual samples and global state */
9392 +
9393 +struct sample;                                 /* latency sample */
9394 +struct data;                                   /* Global state */
9395 +
9396 +/* Sampling functions */
9397 +static int __buffer_add_sample(struct sample *sample);
9398 +static struct sample *buffer_get_sample(struct sample *sample);
9399 +
9400 +/* Threading and state */
9401 +static int kthread_fn(void *unused);
9402 +static int start_kthread(void);
9403 +static int stop_kthread(void);
9404 +static void __reset_stats(void);
9405 +static int init_stats(void);
9406 +
9407 +/* Debugfs interface */
9408 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
9409 +                               size_t cnt, loff_t *ppos, const u64 *entry);
9410 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
9411 +                                size_t cnt, loff_t *ppos, u64 *entry);
9412 +static int debug_sample_fopen(struct inode *inode, struct file *filp);
9413 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
9414 +                                 size_t cnt, loff_t *ppos);
9415 +static int debug_sample_release(struct inode *inode, struct file *filp);
9416 +static int debug_enable_fopen(struct inode *inode, struct file *filp);
9417 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
9418 +                                 size_t cnt, loff_t *ppos);
9419 +static ssize_t debug_enable_fwrite(struct file *file,
9420 +                                  const char __user *user_buffer,
9421 +                                  size_t user_size, loff_t *offset);
9422 +
9423 +/* Initialization functions */
9424 +static int init_debugfs(void);
9425 +static void free_debugfs(void);
9426 +static int detector_init(void);
9427 +static void detector_exit(void);
9428 +
9429 +/* Individual latency samples are stored here when detected and packed into
9430 + * the ring_buffer circular buffer, where they are overwritten when
9431 + * more than buf_size/sizeof(sample) samples are received. */
9432 +struct sample {
9433 +       u64             seqnum;         /* unique sequence */
9434 +       u64             duration;       /* ktime delta */
9435 +       u64             outer_duration; /* ktime delta (outer loop) */
9436 +       struct timespec timestamp;      /* wall time */
9437 +       unsigned long   lost;
9438 +};
9439 +
9440 +/* keep the global state somewhere. */
9441 +static struct data {
9442 +
9443 +       struct mutex lock;              /* protect changes */
9444 +
9445 +       u64     count;                  /* total since reset */
9446 +       u64     max_sample;             /* max hardware latency */
9447 +       u64     threshold;              /* sample threshold level */
9448 +
9449 +       u64     sample_window;          /* total sampling window (on+off) */
9450 +       u64     sample_width;           /* active sampling portion of window */
9451 +
9452 +       atomic_t sample_open;           /* whether the sample file is open */
9453 +
9454 +       wait_queue_head_t wq;           /* waitqueue for new sample values */
9455 +
9456 +} data;
9457 +
9458 +/**
9459 + * __buffer_add_sample - add a new latency sample recording to the ring buffer
9460 + * @sample: The new latency sample value
9461 + *
9462 + * This receives a new latency sample and records it in a global ring buffer.
9463 + * No additional locking is used in this case.
9464 + */
9465 +static int __buffer_add_sample(struct sample *sample)
9466 +{
9467 +       return ring_buffer_write(ring_buffer,
9468 +                                sizeof(struct sample), sample);
9469 +}
9470 +
9471 +/**
9472 + * buffer_get_sample - remove a hardware latency sample from the ring buffer
9473 + * @sample: Pre-allocated storage for the sample
9474 + *
9475 + * This retrieves a hardware latency sample from the global circular buffer
9476 + */
9477 +static struct sample *buffer_get_sample(struct sample *sample)
9478 +{
9479 +       struct ring_buffer_event *e = NULL;
9480 +       struct sample *s = NULL;
9481 +       unsigned int cpu = 0;
9482 +
9483 +       if (!sample)
9484 +               return NULL;
9485 +
9486 +       mutex_lock(&ring_buffer_mutex);
9487 +       for_each_online_cpu(cpu) {
9488 +               e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost);
9489 +               if (e)
9490 +                       break;
9491 +       }
9492 +
9493 +       if (e) {
9494 +               s = ring_buffer_event_data(e);
9495 +               memcpy(sample, s, sizeof(struct sample));
9496 +       } else
9497 +               sample = NULL;
9498 +       mutex_unlock(&ring_buffer_mutex);
9499 +
9500 +       return sample;
9501 +}
9502 +
9503 +#ifndef CONFIG_TRACING
9504 +#define time_type      ktime_t
9505 +#define time_get()     ktime_get()
9506 +#define time_to_us(x)  ktime_to_us(x)
9507 +#define time_sub(a, b) ktime_sub(a, b)
9508 +#define init_time(a, b)        (a).tv64 = b
9509 +#define time_u64(a)    ((a).tv64)
9510 +#else
9511 +#define time_type      u64
9512 +#define time_get()     trace_clock_local()
9513 +#define time_to_us(x)  div_u64(x, 1000)
9514 +#define time_sub(a, b) ((a) - (b))
9515 +#define init_time(a, b)        (a = b)
9516 +#define time_u64(a)    a
9517 +#endif
9518 +/**
9519 + * get_sample - sample the CPU TSC and look for likely hardware latencies
9520 + *
9521 + * Used to repeatedly capture the CPU TSC (or similar), looking for potential
9522 + * hardware-induced latency. Called with interrupts disabled and with
9523 + * data.lock held.
9524 + */
9525 +static int get_sample(void)
9526 +{
9527 +       time_type start, t1, t2, last_t2;
9528 +       s64 diff, total = 0;
9529 +       u64 sample = 0;
9530 +       u64 outer_sample = 0;
9531 +       int ret = -1;
9532 +
9533 +       init_time(last_t2, 0);
9534 +       start = time_get(); /* start timestamp */
9535 +
9536 +       do {
9537 +
9538 +               t1 = time_get();        /* we'll look for a discontinuity */
9539 +               t2 = time_get();
9540 +
9541 +               if (time_u64(last_t2)) {
9542 +                       /* Check the delta from outer loop (t2 to next t1) */
9543 +                       diff = time_to_us(time_sub(t1, last_t2));
9544 +                       /* This shouldn't happen */
9545 +                       if (diff < 0) {
9546 +                               pr_err(BANNER "time running backwards\n");
9547 +                               goto out;
9548 +                       }
9549 +                       if (diff > outer_sample)
9550 +                               outer_sample = diff;
9551 +               }
9552 +               last_t2 = t2;
9553 +
9554 +               total = time_to_us(time_sub(t2, start)); /* sample width */
9555 +
9556 +               /* This checks the inner loop (t1 to t2) */
9557 +               diff = time_to_us(time_sub(t2, t1));     /* current diff */
9558 +
9559 +               /* This shouldn't happen */
9560 +               if (diff < 0) {
9561 +                       pr_err(BANNER "time running backwards\n");
9562 +                       goto out;
9563 +               }
9564 +
9565 +               if (diff > sample)
9566 +                       sample = diff; /* only want highest value */
9567 +
9568 +       } while (total <= data.sample_width);
9569 +
9570 +       ret = 0;
9571 +
9572 +       /* If we exceed the threshold value, we have found a hardware latency */
9573 +       if (sample > data.threshold || outer_sample > data.threshold) {
9574 +               struct sample s;
9575 +
9576 +               ret = 1;
9577 +
9578 +               data.count++;
9579 +               s.seqnum = data.count;
9580 +               s.duration = sample;
9581 +               s.outer_duration = outer_sample;
9582 +               s.timestamp = CURRENT_TIME;
9583 +               __buffer_add_sample(&s);
9584 +
9585 +               /* Keep a running maximum ever recorded hardware latency */
9586 +               if (sample > data.max_sample)
9587 +                       data.max_sample = sample;
9588 +       }
9589 +
9590 +out:
9591 +       return ret;
9592 +}
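
The core of get_sample() above is two back-to-back timestamps per iteration: the inner delta (t2 - t1) and the outer delta (t1 minus the previous t2) should both be tiny, so any large gap must have been stolen from the CPU behind the kernel's back. A rough user-space analogue of the same bookkeeping, shown only to illustrate the inner/outer deltas; in user space the gaps mostly reflect scheduling and interrupt noise rather than firmware latency, and this program is not part of the patch:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t now_us(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000ULL + ts.tv_nsec / 1000;
}

int main(void)
{
        uint64_t start = now_us(), last_t2 = 0;
        uint64_t inner_max = 0, outer_max = 0;

        do {
                uint64_t t1 = now_us();
                uint64_t t2 = now_us();

                if (last_t2 && t1 - last_t2 > outer_max)
                        outer_max = t1 - last_t2;       /* gap between iterations */
                if (t2 - t1 > inner_max)
                        inner_max = t2 - t1;            /* gap inside one iteration */
                last_t2 = t2;
        } while (now_us() - start < 500000);            /* ~0.5 s sample width */

        printf("inner max %llu us, outer max %llu us\n",
               (unsigned long long)inner_max, (unsigned long long)outer_max);
        return 0;
}
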
9593 +
9594 +/*
9595 + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
9596 + * @unused: A required part of the kthread API.
9597 + *
9598 + * Used to periodically sample the CPU TSC via a call to get_sample. We
9599 + * disable interrupts, which does (intentionally) introduce latency since we
9600 + * need to ensure nothing else might be running (and thus pre-empting).
9601 + * Obviously this should never be used in production environments.
9602 + *
9603 + * Currently this runs on whichever CPU it was scheduled on, but most
9604 + * real-world hardware latency situations occur across several CPUs,
9605 + * so we might later generalize this if we find there are any actual
9606 + * systems with alternate SMI delivery or other hardware latencies.
9607 + */
9608 +static int kthread_fn(void *unused)
9609 +{
9610 +       int ret;
9611 +       u64 interval;
9612 +
9613 +       while (!kthread_should_stop()) {
9614 +
9615 +               mutex_lock(&data.lock);
9616 +
9617 +               local_irq_disable();
9618 +               ret = get_sample();
9619 +               local_irq_enable();
9620 +
9621 +               if (ret > 0)
9622 +                       wake_up(&data.wq); /* wake up reader(s) */
9623 +
9624 +               interval = data.sample_window - data.sample_width;
9625 +               do_div(interval, USEC_PER_MSEC); /* modifies interval value */
9626 +
9627 +               mutex_unlock(&data.lock);
9628 +
9629 +               if (msleep_interruptible(interval))
9630 +                       break;
9631 +       }
9632 +
9633 +       return 0;
9634 +}
9635 +
9636 +/**
9637 + * start_kthread - Kick off the hardware latency sampling/detector kthread
9638 + *
9639 + * This starts a kernel thread that will sit and sample the CPU timestamp
9640 + * counter (TSC or similar) and look for potential hardware latencies.
9641 + */
9642 +static int start_kthread(void)
9643 +{
9644 +       kthread = kthread_run(kthread_fn, NULL,
9645 +                                       DRVNAME);
9646 +       if (IS_ERR(kthread)) {
9647 +               pr_err(BANNER "could not start sampling thread\n");
9648 +               enabled = 0;
9649 +               return -ENOMEM;
9650 +       }
9651 +
9652 +       return 0;
9653 +}
9654 +
9655 +/**
9656 + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
9657 + *
9658 + * This kicks the running hardware latency sampling/detector kernel thread and
9659 + * tells it to stop sampling now. Use this on unload and at system shutdown.
9660 + */
9661 +static int stop_kthread(void)
9662 +{
9663 +       int ret;
9664 +
9665 +       ret = kthread_stop(kthread);
9666 +
9667 +       return ret;
9668 +}
9669 +
9670 +/**
9671 + * __reset_stats - Reset statistics for the hardware latency detector
9672 + *
9673 + * We use data to store various statistics and global state. We call this
9674 + * function in order to reset those when "enable" is toggled on or off, and
9675 + * also at initialization. Should be called with data.lock held.
9676 + */
9677 +static void __reset_stats(void)
9678 +{
9679 +       data.count = 0;
9680 +       data.max_sample = 0;
9681 +       ring_buffer_reset(ring_buffer); /* flush out old sample entries */
9682 +}
9683 +
9684 +/**
9685 + * init_stats - Setup global state statistics for the hardware latency detector
9686 + *
9687 + * We use data to store various statistics and global state. We also use
9688 + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware
9689 + * induced system latencies. This function initializes these structures and
9690 + * allocates the global ring buffer also.
9691 + */
9692 +static int init_stats(void)
9693 +{
9694 +       int ret = -ENOMEM;
9695 +
9696 +       mutex_init(&data.lock);
9697 +       init_waitqueue_head(&data.wq);
9698 +       atomic_set(&data.sample_open, 0);
9699 +
9700 +       ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS);
9701 +
9702 +       if (WARN(!ring_buffer, KERN_ERR BANNER
9703 +                              "failed to allocate ring buffer!\n"))
9704 +               goto out;
9705 +
9706 +       __reset_stats();
9707 +       data.threshold = threshold ?: DEFAULT_LAT_THRESHOLD; /* threshold us */
9708 +       data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */
9709 +       data.sample_width = DEFAULT_SAMPLE_WIDTH;   /* width us */
9710 +
9711 +       ret = 0;
9712 +
9713 +out:
9714 +       return ret;
9715 +
9716 +}
9717 +
9718 +/*
9719 + * simple_data_read - Wrapper read function for global state debugfs entries
9720 + * @filp: The active open file structure for the debugfs "file"
9721 + * @ubuf: The userspace provided buffer to read value into
9722 + * @cnt: The maximum number of bytes to read
9723 + * @ppos: The current "file" position
9724 + * @entry: The entry to read from
9725 + *
9726 + * This function provides a generic read implementation for the global state
9727 + * "data" structure debugfs filesystem entries. It would be nice to use
9728 + * simple_attr_read directly, but we need to make sure that the data.lock
9729 + * is held during the actual read.
9730 + */
9731 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
9732 +                               size_t cnt, loff_t *ppos, const u64 *entry)
9733 +{
9734 +       char buf[U64STR_SIZE];
9735 +       u64 val = 0;
9736 +       int len = 0;
9737 +
9738 +       memset(buf, 0, sizeof(buf));
9739 +
9740 +       if (!entry)
9741 +               return -EFAULT;
9742 +
9743 +       mutex_lock(&data.lock);
9744 +       val = *entry;
9745 +       mutex_unlock(&data.lock);
9746 +
9747 +       len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val);
9748 +
9749 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
9750 +
9751 +}
9752 +
9753 +/*
9754 + * simple_data_write - Wrapper write function for global state debugfs entries
9755 + * @filp: The active open file structure for the debugfs "file"
9756 + * @ubuf: The userspace provided buffer to write value from
9757 + * @cnt: The maximum number of bytes to write
9758 + * @ppos: The current "file" position
9759 + * @entry: The entry to write to
9760 + *
9761 + * This function provides a generic write implementation for the global state
9762 + * "data" structure debugfs filesystem entries. It would be nice to use
9763 + * simple_attr_write directly, but we need to make sure that the data.lock
9764 + * is held during the actual write.
9765 + */
9766 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
9767 +                                size_t cnt, loff_t *ppos, u64 *entry)
9768 +{
9769 +       char buf[U64STR_SIZE];
9770 +       int csize = min(cnt, sizeof(buf));
9771 +       u64 val = 0;
9772 +       int err = 0;
9773 +
9774 +       memset(buf, '\0', sizeof(buf));
9775 +       if (copy_from_user(buf, ubuf, csize))
9776 +               return -EFAULT;
9777 +
9778 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
9779 +       err = kstrtoull(buf, 10, &val);
9780 +       if (err)
9781 +               return -EINVAL;
9782 +
9783 +       mutex_lock(&data.lock);
9784 +       *entry = val;
9785 +       mutex_unlock(&data.lock);
9786 +
9787 +       return csize;
9788 +}
9789 +
9790 +/**
9791 + * debug_count_fopen - Open function for "count" debugfs entry
9792 + * @inode: The in-kernel inode representation of the debugfs "file"
9793 + * @filp: The active open file structure for the debugfs "file"
9794 + *
9795 + * This function provides an open implementation for the "count" debugfs
9796 + * interface to the hardware latency detector.
9797 + */
9798 +static int debug_count_fopen(struct inode *inode, struct file *filp)
9799 +{
9800 +       return 0;
9801 +}
9802 +
9803 +/**
9804 + * debug_count_fread - Read function for "count" debugfs entry
9805 + * @filp: The active open file structure for the debugfs "file"
9806 + * @ubuf: The userspace provided buffer to read value into
9807 + * @cnt: The maximum number of bytes to read
9808 + * @ppos: The current "file" position
9809 + *
9810 + * This function provides a read implementation for the "count" debugfs
9811 + * interface to the hardware latency detector. Can be used to read the
9812 + * number of latency readings exceeding the configured threshold since
9813 + * the detector was last reset (e.g. by writing a zero into "count").
9814 + */
9815 +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf,
9816 +                                    size_t cnt, loff_t *ppos)
9817 +{
9818 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.count);
9819 +}
9820 +
9821 +/**
9822 + * debug_count_fwrite - Write function for "count" debugfs entry
9823 + * @filp: The active open file structure for the debugfs "file"
9824 + * @ubuf: The user buffer that contains the value to write
9825 + * @cnt: The maximum number of bytes to write to "file"
9826 + * @ppos: The current position in the debugfs "file"
9827 + *
9828 + * This function provides a write implementation for the "count" debugfs
9829 + * interface to the hardware latency detector. Can be used to write a
9830 + * desired value, especially to zero the total count.
9831 + */
9832 +static ssize_t  debug_count_fwrite(struct file *filp,
9833 +                                      const char __user *ubuf,
9834 +                                      size_t cnt,
9835 +                                      loff_t *ppos)
9836 +{
9837 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.count);
9838 +}
9839 +
9840 +/**
9841 + * debug_enable_fopen - Dummy open function for "enable" debugfs interface
9842 + * @inode: The in-kernel inode representation of the debugfs "file"
9843 + * @filp: The active open file structure for the debugfs "file"
9844 + *
9845 + * This function provides an open implementation for the "enable" debugfs
9846 + * interface to the hardware latency detector.
9847 + */
9848 +static int debug_enable_fopen(struct inode *inode, struct file *filp)
9849 +{
9850 +       return 0;
9851 +}
9852 +
9853 +/**
9854 + * debug_enable_fread - Read function for "enable" debugfs interface
9855 + * @filp: The active open file structure for the debugfs "file"
9856 + * @ubuf: The userspace provided buffer to read value into
9857 + * @cnt: The maximum number of bytes to read
9858 + * @ppos: The current "file" position
9859 + *
9860 + * This function provides a read implementation for the "enable" debugfs
9861 + * interface to the hardware latency detector. Can be used to determine
9862 + * whether the detector is currently enabled ("0\n" or "1\n" returned).
9863 + */
9864 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
9865 +                                     size_t cnt, loff_t *ppos)
9866 +{
9867 +       char buf[4];
9868 +
9869 +       if ((cnt < sizeof(buf)) || (*ppos))
9870 +               return 0;
9871 +
9872 +       buf[0] = enabled ? '1' : '0';
9873 +       buf[1] = '\n';
9874 +       buf[2] = '\0';
9875 +       if (copy_to_user(ubuf, buf, strlen(buf)))
9876 +               return -EFAULT;
9877 +       return *ppos = strlen(buf);
9878 +}
9879 +
9880 +/**
9881 + * debug_enable_fwrite - Write function for "enable" debugfs interface
9882 + * @filp: The active open file structure for the debugfs "file"
9883 + * @ubuf: The user buffer that contains the value to write
9884 + * @cnt: The maximum number of bytes to write to "file"
9885 + * @ppos: The current position in the debugfs "file"
9886 + *
9887 + * This function provides a write implementation for the "enable" debugfs
9888 + * interface to the hardware latency detector. Can be used to enable or
9889 + * disable the detector, which will have the side-effect of possibly
9890 + * also resetting the global stats and kicking off the measuring
9891 + * kthread (on an enable) or the converse (upon a disable).
9892 + */
9893 +static ssize_t  debug_enable_fwrite(struct file *filp,
9894 +                                       const char __user *ubuf,
9895 +                                       size_t cnt,
9896 +                                       loff_t *ppos)
9897 +{
9898 +       char buf[4];
9899 +       int csize = min(cnt, sizeof(buf));
9900 +       long val = 0;
9901 +       int err = 0;
9902 +
9903 +       memset(buf, '\0', sizeof(buf));
9904 +       if (copy_from_user(buf, ubuf, csize))
9905 +               return -EFAULT;
9906 +
9907 +       buf[sizeof(buf)-1] = '\0';                      /* just in case */
9908 +       err = kstrtoul(buf, 10, &val);
9909 +       if (err)
9910 +               return -EINVAL;
9911 +
9912 +       if (val) {
9913 +               if (enabled)
9914 +                       goto unlock;
9915 +               enabled = 1;
9916 +               __reset_stats();
9917 +               if (start_kthread())
9918 +                       return -EFAULT;
9919 +       } else {
9920 +               if (!enabled)
9921 +                       goto unlock;
9922 +               enabled = 0;
9923 +               err = stop_kthread();
9924 +               if (err) {
9925 +                       pr_err(BANNER "cannot stop kthread\n");
9926 +                       return -EFAULT;
9927 +               }
9928 +               wake_up(&data.wq);              /* reader(s) should return */
9929 +       }
9930 +unlock:
9931 +       return csize;
9932 +}
9933 +
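As a quick illustration of the toggle described above, a user-space caller only needs to write an ASCII "0" or "1" into the "enable" file. The sketch below is hypothetical (the path assumes debugfs mounted at /sys/kernel/debug) and simply drives this handler:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* hypothetical helper: write "1" to start sampling, "0" to stop */
	static int hwlat_set_enable(int on)
	{
		int fd = open("/sys/kernel/debug/hwlat_detector/enable", O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, on ? "1" : "0", 1) != 1) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		if (hwlat_set_enable(1)) {
			perror("hwlat enable");
			return 1;
		}
		return 0;
	}

Enabling resets the stats and starts the sampling kthread; disabling stops the kthread and wakes any reader blocked on the "sample" file, as implemented above.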
9934 +/**
9935 + * debug_max_fopen - Open function for "max" debugfs entry
9936 + * @inode: The in-kernel inode representation of the debugfs "file"
9937 + * @filp: The active open file structure for the debugfs "file"
9938 + *
9939 + * This function provides an open implementation for the "max" debugfs
9940 + * interface to the hardware latency detector.
9941 + */
9942 +static int debug_max_fopen(struct inode *inode, struct file *filp)
9943 +{
9944 +       return 0;
9945 +}
9946 +
9947 +/**
9948 + * debug_max_fread - Read function for "max" debugfs entry
9949 + * @filp: The active open file structure for the debugfs "file"
9950 + * @ubuf: The userspace provided buffer to read value into
9951 + * @cnt: The maximum number of bytes to read
9952 + * @ppos: The current "file" position
9953 + *
9954 + * This function provides a read implementation for the "max" debugfs
9955 + * interface to the hardware latency detector. Can be used to determine
9956 + * the maximum latency value observed since it was last reset.
9957 + */
9958 +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf,
9959 +                                  size_t cnt, loff_t *ppos)
9960 +{
9961 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample);
9962 +}
9963 +
9964 +/**
9965 + * debug_max_fwrite - Write function for "max" debugfs entry
9966 + * @filp: The active open file structure for the debugfs "file"
9967 + * @ubuf: The user buffer that contains the value to write
9968 + * @cnt: The maximum number of bytes to write to "file"
9969 + * @ppos: The current position in the debugfs "file"
9970 + *
9971 + * This function provides a write implementation for the "max" debugfs
9972 + * interface to the hardware latency detector. Can be used to reset the
9973 + * maximum or set it to some other desired value - if, then, subsequent
9974 + * measurements exceed this value, the maximum will be updated.
9975 + */
9976 +static ssize_t  debug_max_fwrite(struct file *filp,
9977 +                                    const char __user *ubuf,
9978 +                                    size_t cnt,
9979 +                                    loff_t *ppos)
9980 +{
9981 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample);
9982 +}
9983 +
9984 +
9985 +/**
9986 + * debug_sample_fopen - An open function for "sample" debugfs interface
9987 + * @inode: The in-kernel inode representation of this debugfs "file"
9988 + * @filp: The active open file structure for the debugfs "file"
9989 + *
9990 + * This function handles opening the "sample" file within the hardware
9991 + * latency detector debugfs directory interface. This file is used to read
9992 + * raw samples from the global ring_buffer and allows the user to see a
9993 + * running latency history. Can be opened blocking or non-blocking,
9994 + * which determines whether a read waits for new samples or returns immediately.
9995 + * Implements simple locking to prevent multiple simultaneous use.
9996 + */
9997 +static int debug_sample_fopen(struct inode *inode, struct file *filp)
9998 +{
9999 +       if (!atomic_add_unless(&data.sample_open, 1, 1))
10000 +               return -EBUSY;
10001 +       else
10002 +               return 0;
10003 +}
10004 +
10005 +/**
10006 + * debug_sample_fread - A read function for "sample" debugfs interface
10007 + * @filp: The active open file structure for the debugfs "file"
10008 + * @ubuf: The user buffer that will contain the samples read
10009 + * @cnt: The maximum bytes to read from the debugfs "file"
10010 + * @ppos: The current position in the debugfs "file"
10011 + *
10012 + * This function handles reading from the "sample" file within the hardware
10013 + * latency detector debugfs directory interface. This file is used to read
10014 + * raw samples from the global ring_buffer and allows the user to see a
10015 + * running latency history. By default this will block pending a new
10016 + * value written into the sample buffer, unless there are already a
10017 + * number of value(s) waiting in the buffer, or the sample file was
10018 + * previously opened in a non-blocking mode of operation.
10019 + */
10020 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
10021 +                                       size_t cnt, loff_t *ppos)
10022 +{
10023 +       int len = 0;
10024 +       char buf[64];
10025 +       struct sample *sample = NULL;
10026 +
10027 +       if (!enabled)
10028 +               return 0;
10029 +
10030 +       sample = kzalloc(sizeof(struct sample), GFP_KERNEL);
10031 +       if (!sample)
10032 +               return -ENOMEM;
10033 +
10034 +       while (!buffer_get_sample(sample)) {
10035 +
10036 +               DEFINE_WAIT(wait);
10037 +
10038 +               if (filp->f_flags & O_NONBLOCK) {
10039 +                       len = -EAGAIN;
10040 +                       goto out;
10041 +               }
10042 +
10043 +               prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE);
10044 +               schedule();
10045 +               finish_wait(&data.wq, &wait);
10046 +
10047 +               if (signal_pending(current)) {
10048 +                       len = -EINTR;
10049 +                       goto out;
10050 +               }
10051 +
10052 +               if (!enabled) {                 /* enable was toggled */
10053 +                       len = 0;
10054 +                       goto out;
10055 +               }
10056 +       }
10057 +
10058 +       len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\t%llu\n",
10059 +                      sample->timestamp.tv_sec,
10060 +                      sample->timestamp.tv_nsec,
10061 +                      sample->duration,
10062 +                      sample->outer_duration);
10063 +
10064 +
10065 +       /* handling partial reads is more trouble than it's worth */
10066 +       if (len > cnt)
10067 +               goto out;
10068 +
10069 +       if (copy_to_user(ubuf, buf, len))
10070 +               len = -EFAULT;
10071 +
10072 +out:
10073 +       kfree(sample);
10074 +       return len;
10075 +}
10076 +
10077 +/**
10078 + * debug_sample_release - Release function for "sample" debugfs interface
10079 + * @inode: The in-kernel inode representation of the debugfs "file"
10080 + * @filp: The active open file structure for the debugfs "file"
10081 + *
10082 + * This function completes the close of the debugfs interface "sample" file.
10083 + * Frees the sample_open "lock" so that other users may open the interface.
10084 + */
10085 +static int debug_sample_release(struct inode *inode, struct file *filp)
10086 +{
10087 +       atomic_dec(&data.sample_open);
10088 +
10089 +       return 0;
10090 +}
10091 +
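The "sample" read path above emits one "<sec>.<nsec>\t<inner>\t<outer>" line per read and blocks until a sample arrives (or fails with -EAGAIN when opened O_NONBLOCK and the buffer is empty). A minimal user-space reader might look like the following sketch, assuming debugfs is mounted at /sys/kernel/debug and the detector has been enabled:

	#include <stdio.h>

	int main(void)
	{
		unsigned long sec, nsec;
		unsigned long long inner, outer;
		FILE *f = fopen("/sys/kernel/debug/hwlat_detector/sample", "r");

		if (!f) {
			perror("open sample");	/* only one opener is allowed */
			return 1;
		}
		/* each kernel read returns one formatted sample line */
		while (fscanf(f, "%lu.%lu %llu %llu", &sec, &nsec, &inner, &outer) == 4)
			printf("ts=%lu.%09lu inner=%llu us outer=%llu us\n",
			       sec, nsec, inner, outer);
		fclose(f);
		return 0;
	}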
10092 +/**
10093 + * debug_threshold_fopen - Open function for "threshold" debugfs entry
10094 + * @inode: The in-kernel inode representation of the debugfs "file"
10095 + * @filp: The active open file structure for the debugfs "file"
10096 + *
10097 + * This function provides an open implementation for the "threshold" debugfs
10098 + * interface to the hardware latency detector.
10099 + */
10100 +static int debug_threshold_fopen(struct inode *inode, struct file *filp)
10101 +{
10102 +       return 0;
10103 +}
10104 +
10105 +/**
10106 + * debug_threshold_fread - Read function for "threshold" debugfs entry
10107 + * @filp: The active open file structure for the debugfs "file"
10108 + * @ubuf: The userspace provided buffer to read value into
10109 + * @cnt: The maximum number of bytes to read
10110 + * @ppos: The current "file" position
10111 + *
10112 + * This function provides a read implementation for the "threshold" debugfs
10113 + * interface to the hardware latency detector. It can be used to determine
10114 + * the current threshold level at which a latency will be recorded in the
10115 + * global ring buffer, typically on the order of 10us.
10116 + */
10117 +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf,
10118 +                                        size_t cnt, loff_t *ppos)
10119 +{
10120 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold);
10121 +}
10122 +
10123 +/**
10124 + * debug_threshold_fwrite - Write function for "threshold" debugfs entry
10125 + * @filp: The active open file structure for the debugfs "file"
10126 + * @ubuf: The user buffer that contains the value to write
10127 + * @cnt: The maximum number of bytes to write to "file"
10128 + * @ppos: The current position in the debugfs "file"
10129 + *
10130 + * This function provides a write implementation for the "threshold" debugfs
10131 + * interface to the hardware latency detector. It can be used to configure
10132 + * the threshold level at which any subsequently detected latencies will
10133 + * be recorded into the global ring buffer.
10134 + */
10135 +static ssize_t  debug_threshold_fwrite(struct file *filp,
10136 +                                       const char __user *ubuf,
10137 +                                       size_t cnt,
10138 +                                       loff_t *ppos)
10139 +{
10140 +       int ret;
10141 +
10142 +       ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold);
10143 +
10144 +       if (enabled)
10145 +               wake_up_process(kthread);
10146 +
10147 +       return ret;
10148 +}
10149 +
10150 +/**
10151 + * debug_width_fopen - Open function for "width" debugfs entry
10152 + * @inode: The in-kernel inode representation of the debugfs "file"
10153 + * @filp: The active open file structure for the debugfs "file"
10154 + *
10155 + * This function provides an open implementation for the "width" debugfs
10156 + * interface to the hardware latency detector.
10157 + */
10158 +static int debug_width_fopen(struct inode *inode, struct file *filp)
10159 +{
10160 +       return 0;
10161 +}
10162 +
10163 +/**
10164 + * debug_width_fread - Read function for "width" debugfs entry
10165 + * @filp: The active open file structure for the debugfs "file"
10166 + * @ubuf: The userspace provided buffer to read value into
10167 + * @cnt: The maximum number of bytes to read
10168 + * @ppos: The current "file" position
10169 + *
10170 + * This function provides a read implementation for the "width" debugfs
10171 + * interface to the hardware latency detector. It can be used to determine
10172 + * for how many us of the total window we will actively sample for any
10173 + * hardware-induced latency periods. Obviously, it is not possible to
10174 + * sample constantly and have the system respond to a sample reader, or,
10175 + * worse, without having the system appear to have gone out to lunch.
10176 + */
10177 +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf,
10178 +                                    size_t cnt, loff_t *ppos)
10179 +{
10180 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width);
10181 +}
10182 +
10183 +/**
10184 + * debug_width_fwrite - Write function for "width" debugfs entry
10185 + * @filp: The active open file structure for the debugfs "file"
10186 + * @ubuf: The user buffer that contains the value to write
10187 + * @cnt: The maximum number of bytes to write to "file"
10188 + * @ppos: The current position in the debugfs "file"
10189 + *
10190 + * This function provides a write implementation for the "width" debugfs
10191 + * interface to the hardware latency detector. It can be used to configure
10192 + * for how many us of the total window we will actively sample for any
10193 + * hardware-induced latency periods. Obviously, it is not possible to
10194 + * sample constantly and have the system respond to a sample reader, or,
10195 + * worse, without having the system appear to have gone out to lunch. It
10196 + * is enforced that width is less than the total window size.
10197 + */
10198 +static ssize_t  debug_width_fwrite(struct file *filp,
10199 +                                      const char __user *ubuf,
10200 +                                      size_t cnt,
10201 +                                      loff_t *ppos)
10202 +{
10203 +       char buf[U64STR_SIZE];
10204 +       int csize = min(cnt, sizeof(buf));
10205 +       u64 val = 0;
10206 +       int err = 0;
10207 +
10208 +       memset(buf, '\0', sizeof(buf));
10209 +       if (copy_from_user(buf, ubuf, csize))
10210 +               return -EFAULT;
10211 +
10212 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
10213 +       err = kstrtoull(buf, 10, &val);
10214 +       if (err)
10215 +               return -EINVAL;
10216 +
10217 +       mutex_lock(&data.lock);
10218 +       if (val < data.sample_window)
10219 +               data.sample_width = val;
10220 +       else {
10221 +               mutex_unlock(&data.lock);
10222 +               return -EINVAL;
10223 +       }
10224 +       mutex_unlock(&data.lock);
10225 +
10226 +       if (enabled)
10227 +               wake_up_process(kthread);
10228 +
10229 +       return csize;
10230 +}
10231 +
10232 +/**
10233 + * debug_window_fopen - Open function for "window" debugfs entry
10234 + * @inode: The in-kernel inode representation of the debugfs "file"
10235 + * @filp: The active open file structure for the debugfs "file"
10236 + *
10237 + * This function provides an open implementation for the "window" debugfs
10238 + * interface to the hardware latency detector. The window is the total time
10239 + * in us that will be considered one sample period. Conceptually, windows
10240 + * occur back-to-back and contain a sample width period during which
10241 + * actual sampling occurs.
10242 + */
10243 +static int debug_window_fopen(struct inode *inode, struct file *filp)
10244 +{
10245 +       return 0;
10246 +}
10247 +
10248 +/**
10249 + * debug_window_fread - Read function for "window" debugfs entry
10250 + * @filp: The active open file structure for the debugfs "file"
10251 + * @ubuf: The userspace provided buffer to read value into
10252 + * @cnt: The maximum number of bytes to read
10253 + * @ppos: The current "file" position
10254 + *
10255 + * This function provides a read implementation for the "window" debugfs
10256 + * interface to the hardware latency detector. The window is the total time
10257 + * in us that will be considered one sample period. Conceptually, windows
10258 + * occur back-to-back and contain a sample width period during which
10259 + * actual sampling occurs. Can be used to read the total window size.
10260 + */
10261 +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf,
10262 +                                     size_t cnt, loff_t *ppos)
10263 +{
10264 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window);
10265 +}
10266 +
10267 +/**
10268 + * debug_window_fwrite - Write function for "window" debugfs entry
10269 + * @filp: The active open file structure for the debugfs "file"
10270 + * @ubuf: The user buffer that contains the value to write
10271 + * @cnt: The maximum number of bytes to write to "file"
10272 + * @ppos: The current position in the debugfs "file"
10273 + *
10274 + * This function provides a write implementation for the "window" debugfs
10275 + * interface to the hardware latency detector. The window is the total time
10276 + * in us that will be considered one sample period. Conceptually, windows
10277 + * occur back-to-back and contain a sample width period during which
10278 + * actual sampling occurs. Can be used to write a new total window size. It
10279 + * is enforced that any value written must be greater than the sample width
10280 + * size, or an error results.
10281 + */
10282 +static ssize_t  debug_window_fwrite(struct file *filp,
10283 +                                       const char __user *ubuf,
10284 +                                       size_t cnt,
10285 +                                       loff_t *ppos)
10286 +{
10287 +       char buf[U64STR_SIZE];
10288 +       int csize = min(cnt, sizeof(buf));
10289 +       u64 val = 0;
10290 +       int err = 0;
10291 +
10292 +       memset(buf, '\0', sizeof(buf));
10293 +       if (copy_from_user(buf, ubuf, csize))
10294 +               return -EFAULT;
10295 +
10296 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
10297 +       err = kstrtoull(buf, 10, &val);
10298 +       if (err)
10299 +               return -EINVAL;
10300 +
10301 +       mutex_lock(&data.lock);
10302 +       if (data.sample_width < val)
10303 +               data.sample_window = val;
10304 +       else {
10305 +               mutex_unlock(&data.lock);
10306 +               return -EINVAL;
10307 +       }
10308 +       mutex_unlock(&data.lock);
10309 +
10310 +       return csize;
10311 +}
10312 +
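Because both fwrite handlers check the width < window invariant under data.lock, the two values have to be updated in a safe order: grow the window before growing the width, and shrink the width before shrinking the window, otherwise the write fails with -EINVAL. A hypothetical user-space sketch (path assumes the usual debugfs mount point):

	#include <stdio.h>

	/* hypothetical helper: write a decimal value into a debugfs file */
	static int write_u64(const char *path, unsigned long long val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fprintf(f, "%llu\n", val);
		return fclose(f);
	}

	int main(void)
	{
		const char *dir = "/sys/kernel/debug/hwlat_detector";
		char window[128], width[128];

		snprintf(window, sizeof(window), "%s/window", dir);
		snprintf(width, sizeof(width), "%s/width", dir);

		/* move to a 2 s window with 1 s of sampling: window first */
		if (write_u64(window, 2000000) || write_u64(width, 1000000)) {
			perror("hwlat debugfs write");
			return 1;
		}
		return 0;
	}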
10313 +/*
10314 + * Function pointers for the "count" debugfs file operations
10315 + */
10316 +static const struct file_operations count_fops = {
10317 +       .open           = debug_count_fopen,
10318 +       .read           = debug_count_fread,
10319 +       .write          = debug_count_fwrite,
10320 +       .owner          = THIS_MODULE,
10321 +};
10322 +
10323 +/*
10324 + * Function pointers for the "enable" debugfs file operations
10325 + */
10326 +static const struct file_operations enable_fops = {
10327 +       .open           = debug_enable_fopen,
10328 +       .read           = debug_enable_fread,
10329 +       .write          = debug_enable_fwrite,
10330 +       .owner          = THIS_MODULE,
10331 +};
10332 +
10333 +/*
10334 + * Function pointers for the "max" debugfs file operations
10335 + */
10336 +static const struct file_operations max_fops = {
10337 +       .open           = debug_max_fopen,
10338 +       .read           = debug_max_fread,
10339 +       .write          = debug_max_fwrite,
10340 +       .owner          = THIS_MODULE,
10341 +};
10342 +
10343 +/*
10344 + * Function pointers for the "sample" debugfs file operations
10345 + */
10346 +static const struct file_operations sample_fops = {
10347 +       .open           = debug_sample_fopen,
10348 +       .read           = debug_sample_fread,
10349 +       .release        = debug_sample_release,
10350 +       .owner          = THIS_MODULE,
10351 +};
10352 +
10353 +/*
10354 + * Function pointers for the "threshold" debugfs file operations
10355 + */
10356 +static const struct file_operations threshold_fops = {
10357 +       .open           = debug_threshold_fopen,
10358 +       .read           = debug_threshold_fread,
10359 +       .write          = debug_threshold_fwrite,
10360 +       .owner          = THIS_MODULE,
10361 +};
10362 +
10363 +/*
10364 + * Function pointers for the "width" debugfs file operations
10365 + */
10366 +static const struct file_operations width_fops = {
10367 +       .open           = debug_width_fopen,
10368 +       .read           = debug_width_fread,
10369 +       .write          = debug_width_fwrite,
10370 +       .owner          = THIS_MODULE,
10371 +};
10372 +
10373 +/*
10374 + * Function pointers for the "window" debugfs file operations
10375 + */
10376 +static const struct file_operations window_fops = {
10377 +       .open           = debug_window_fopen,
10378 +       .read           = debug_window_fread,
10379 +       .write          = debug_window_fwrite,
10380 +       .owner          = THIS_MODULE,
10381 +};
10382 +
10383 +/**
10384 + * init_debugfs - A function to initialize the debugfs interface files
10385 + *
10386 + * This function creates entries in debugfs for "hwlat_detector", including
10387 + * files to read values from the detector, current samples, and the
10388 + * maximum sample that has been captured since the hardware latency
10389 + * detector was started.
10390 + */
10391 +static int init_debugfs(void)
10392 +{
10393 +       int ret = -ENOMEM;
10394 +
10395 +       debug_dir = debugfs_create_dir(DRVNAME, NULL);
10396 +       if (!debug_dir)
10397 +               goto err_debug_dir;
10398 +
10399 +       debug_sample = debugfs_create_file("sample", 0444,
10400 +                                              debug_dir, NULL,
10401 +                                              &sample_fops);
10402 +       if (!debug_sample)
10403 +               goto err_sample;
10404 +
10405 +       debug_count = debugfs_create_file("count", 0444,
10406 +                                             debug_dir, NULL,
10407 +                                             &count_fops);
10408 +       if (!debug_count)
10409 +               goto err_count;
10410 +
10411 +       debug_max = debugfs_create_file("max", 0444,
10412 +                                           debug_dir, NULL,
10413 +                                           &max_fops);
10414 +       if (!debug_max)
10415 +               goto err_max;
10416 +
10417 +       debug_sample_window = debugfs_create_file("window", 0644,
10418 +                                                     debug_dir, NULL,
10419 +                                                     &window_fops);
10420 +       if (!debug_sample_window)
10421 +               goto err_window;
10422 +
10423 +       debug_sample_width = debugfs_create_file("width", 0644,
10424 +                                                    debug_dir, NULL,
10425 +                                                    &width_fops);
10426 +       if (!debug_sample_width)
10427 +               goto err_width;
10428 +
10429 +       debug_threshold = debugfs_create_file("threshold", 0644,
10430 +                                                 debug_dir, NULL,
10431 +                                                 &threshold_fops);
10432 +       if (!debug_threshold)
10433 +               goto err_threshold;
10434 +
10435 +       debug_enable = debugfs_create_file("enable", 0644,
10436 +                                              debug_dir, &enabled,
10437 +                                              &enable_fops);
10438 +       if (!debug_enable)
10439 +               goto err_enable;
10440 +
10441 +       else {
10442 +               ret = 0;
10443 +               goto out;
10444 +       }
10445 +
10446 +err_enable:
10447 +       debugfs_remove(debug_threshold);
10448 +err_threshold:
10449 +       debugfs_remove(debug_sample_width);
10450 +err_width:
10451 +       debugfs_remove(debug_sample_window);
10452 +err_window:
10453 +       debugfs_remove(debug_max);
10454 +err_max:
10455 +       debugfs_remove(debug_count);
10456 +err_count:
10457 +       debugfs_remove(debug_sample);
10458 +err_sample:
10459 +       debugfs_remove(debug_dir);
10460 +err_debug_dir:
10461 +out:
10462 +       return ret;
10463 +}
10464 +
10465 +/**
10466 + * free_debugfs - A function to cleanup the debugfs file interface
10467 + */
10468 +static void free_debugfs(void)
10469 +{
10470 +       /* could also use a debugfs_remove_recursive */
10471 +       debugfs_remove(debug_enable);
10472 +       debugfs_remove(debug_threshold);
10473 +       debugfs_remove(debug_sample_width);
10474 +       debugfs_remove(debug_sample_window);
10475 +       debugfs_remove(debug_max);
10476 +       debugfs_remove(debug_count);
10477 +       debugfs_remove(debug_sample);
10478 +       debugfs_remove(debug_dir);
10479 +}
10480 +
10481 +/**
10482 + * detector_init - Standard module initialization code
10483 + */
10484 +static int detector_init(void)
10485 +{
10486 +       int ret = -ENOMEM;
10487 +
10488 +       pr_info(BANNER "version %s\n", VERSION);
10489 +
10490 +       ret = init_stats();
10491 +       if (ret)
10492 +               goto out;
10493 +
10494 +       ret = init_debugfs();
10495 +       if (ret)
10496 +               goto err_stats;
10497 +
10498 +       if (enabled)
10499 +               ret = start_kthread();
10500 +
10501 +       goto out;
10502 +
10503 +err_stats:
10504 +       ring_buffer_free(ring_buffer);
10505 +out:
10506 +       return ret;
10507 +
10508 +}
10509 +
10510 +/**
10511 + * detector_exit - Standard module cleanup code
10512 + */
10513 +static void detector_exit(void)
10514 +{
10515 +       int err;
10516 +
10517 +       if (enabled) {
10518 +               enabled = 0;
10519 +               err = stop_kthread();
10520 +               if (err)
10521 +                       pr_err(BANNER "cannot stop kthread\n");
10522 +       }
10523 +
10524 +       free_debugfs();
10525 +       ring_buffer_free(ring_buffer);  /* free up the ring buffer */
10526 +
10527 +}
10528 +
10529 +module_init(detector_init);
10530 +module_exit(detector_exit);
10531 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
10532 index acece3299756..58ea04a03fa9 100644
10533 --- a/drivers/mmc/host/mmci.c
10534 +++ b/drivers/mmc/host/mmci.c
10535 @@ -1155,15 +1155,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
10536         struct sg_mapping_iter *sg_miter = &host->sg_miter;
10537         struct variant_data *variant = host->variant;
10538         void __iomem *base = host->base;
10539 -       unsigned long flags;
10540         u32 status;
10541  
10542         status = readl(base + MMCISTATUS);
10543  
10544         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
10545  
10546 -       local_irq_save(flags);
10547 -
10548         do {
10549                 unsigned int remain, len;
10550                 char *buffer;
10551 @@ -1203,8 +1200,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
10552  
10553         sg_miter_stop(sg_miter);
10554  
10555 -       local_irq_restore(flags);
10556 -
10557         /*
10558          * If we have less than the fifo 'half-full' threshold to transfer,
10559          * trigger a PIO interrupt as soon as any data is available.
10560 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
10561 index 2839af00f20c..4348b9c850d3 100644
10562 --- a/drivers/net/ethernet/3com/3c59x.c
10563 +++ b/drivers/net/ethernet/3com/3c59x.c
10564 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
10565  {
10566         struct vortex_private *vp = netdev_priv(dev);
10567         unsigned long flags;
10568 -       local_irq_save(flags);
10569 +       local_irq_save_nort(flags);
10570         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
10571 -       local_irq_restore(flags);
10572 +       local_irq_restore_nort(flags);
10573  }
10574  #endif
10575  
10576 @@ -1916,12 +1916,12 @@ static void vortex_tx_timeout(struct net_device *dev)
10577                          * Block interrupts because vortex_interrupt does a bare spin_lock()
10578                          */
10579                         unsigned long flags;
10580 -                       local_irq_save(flags);
10581 +                       local_irq_save_nort(flags);
10582                         if (vp->full_bus_master_tx)
10583                                 boomerang_interrupt(dev->irq, dev);
10584                         else
10585                                 vortex_interrupt(dev->irq, dev);
10586 -                       local_irq_restore(flags);
10587 +                       local_irq_restore_nort(flags);
10588                 }
10589         }
10590  
10591 diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10592 index 8b5988e210d5..cf9928ccdd7e 100644
10593 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10594 +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10595 @@ -2221,11 +2221,7 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb,
10596         }
10597  
10598         tpd_req = atl1c_cal_tpd_req(skb);
10599 -       if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) {
10600 -               if (netif_msg_pktdata(adapter))
10601 -                       dev_info(&adapter->pdev->dev, "tx locked\n");
10602 -               return NETDEV_TX_LOCKED;
10603 -       }
10604 +       spin_lock_irqsave(&adapter->tx_lock, flags);
10605  
10606         if (atl1c_tpd_avail(adapter, type) < tpd_req) {
10607                 /* no enough descriptor, just stop queue */
10608 diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10609 index 59a03a193e83..734f7a7ad2c3 100644
10610 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10611 +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10612 @@ -1880,8 +1880,7 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb,
10613                 return NETDEV_TX_OK;
10614         }
10615         tpd_req = atl1e_cal_tdp_req(skb);
10616 -       if (!spin_trylock_irqsave(&adapter->tx_lock, flags))
10617 -               return NETDEV_TX_LOCKED;
10618 +       spin_lock_irqsave(&adapter->tx_lock, flags);
10619  
10620         if (atl1e_tpd_avail(adapter) < tpd_req) {
10621                 /* no enough descriptor, just stop queue */
10622 diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c
10623 index 526ea74e82d9..86f467a2c485 100644
10624 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c
10625 +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c
10626 @@ -1664,8 +1664,7 @@ static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter,
10627         struct cmdQ *q = &sge->cmdQ[qid];
10628         unsigned int credits, pidx, genbit, count, use_sched_skb = 0;
10629  
10630 -       if (!spin_trylock(&q->lock))
10631 -               return NETDEV_TX_LOCKED;
10632 +       spin_lock(&q->lock);
10633  
10634         reclaim_completed_tx(sge, q);
10635  
10636 diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
10637 index 9ba975853ec6..813cfa698160 100644
10638 --- a/drivers/net/ethernet/neterion/s2io.c
10639 +++ b/drivers/net/ethernet/neterion/s2io.c
10640 @@ -4084,12 +4084,7 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev)
10641                         [skb->priority & (MAX_TX_FIFOS - 1)];
10642         fifo = &mac_control->fifos[queue];
10643  
10644 -       if (do_spin_lock)
10645 -               spin_lock_irqsave(&fifo->tx_lock, flags);
10646 -       else {
10647 -               if (unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags)))
10648 -                       return NETDEV_TX_LOCKED;
10649 -       }
10650 +       spin_lock_irqsave(&fifo->tx_lock, flags);
10651  
10652         if (sp->config.multiq) {
10653                 if (__netif_subqueue_stopped(dev, fifo->fifo_no)) {
10654 diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10655 index 3b98b263bad0..ca4add749410 100644
10656 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10657 +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10658 @@ -2137,10 +2137,8 @@ static int pch_gbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
10659         struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring;
10660         unsigned long flags;
10661  
10662 -       if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) {
10663 -               /* Collision - tell upper layer to requeue */
10664 -               return NETDEV_TX_LOCKED;
10665 -       }
10666 +       spin_lock_irqsave(&tx_ring->tx_lock, flags);
10667 +
10668         if (unlikely(!PCH_GBE_DESC_UNUSED(tx_ring))) {
10669                 netif_stop_queue(netdev);
10670                 spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
10671 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
10672 index ef668d300800..d987d571fdd6 100644
10673 --- a/drivers/net/ethernet/realtek/8139too.c
10674 +++ b/drivers/net/ethernet/realtek/8139too.c
10675 @@ -2229,7 +2229,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
10676         struct rtl8139_private *tp = netdev_priv(dev);
10677         const int irq = tp->pci_dev->irq;
10678  
10679 -       disable_irq(irq);
10680 +       disable_irq_nosync(irq);
10681         rtl8139_interrupt(irq, dev);
10682         enable_irq(irq);
10683  }
10684 diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c
10685 index 14c9d1baa85c..e1a5305418a8 100644
10686 --- a/drivers/net/ethernet/tehuti/tehuti.c
10687 +++ b/drivers/net/ethernet/tehuti/tehuti.c
10688 @@ -1629,13 +1629,8 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb,
10689         unsigned long flags;
10690  
10691         ENTER;
10692 -       local_irq_save(flags);
10693 -       if (!spin_trylock(&priv->tx_lock)) {
10694 -               local_irq_restore(flags);
10695 -               DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n",
10696 -                   BDX_DRV_NAME, ndev->name);
10697 -               return NETDEV_TX_LOCKED;
10698 -       }
10699 +
10700 +       spin_lock_irqsave(&priv->tx_lock, flags);
10701  
10702         /* build tx descriptor */
10703         BDX_ASSERT(f->m.wptr >= f->m.memsz);    /* started with valid wptr */
10704 diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
10705 index e7034c55e796..2e4ee0f912bf 100644
10706 --- a/drivers/net/rionet.c
10707 +++ b/drivers/net/rionet.c
10708 @@ -174,11 +174,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
10709         unsigned long flags;
10710         int add_num = 1;
10711  
10712 -       local_irq_save(flags);
10713 -       if (!spin_trylock(&rnet->tx_lock)) {
10714 -               local_irq_restore(flags);
10715 -               return NETDEV_TX_LOCKED;
10716 -       }
10717 +       spin_lock_irqsave(&rnet->tx_lock, flags);
10718  
10719         if (is_multicast_ether_addr(eth->h_dest))
10720                 add_num = nets[rnet->mport->id].nact;
10721 diff --git a/drivers/net/wireless/orinoco/orinoco_usb.c b/drivers/net/wireless/orinoco/orinoco_usb.c
10722 index f2cd513d54b2..6c0f4c9638a2 100644
10723 --- a/drivers/net/wireless/orinoco/orinoco_usb.c
10724 +++ b/drivers/net/wireless/orinoco/orinoco_usb.c
10725 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
10726                         while (!ctx->done.done && msecs--)
10727                                 udelay(1000);
10728                 } else {
10729 -                       wait_event_interruptible(ctx->done.wait,
10730 +                       swait_event_interruptible(ctx->done.wait,
10731                                                  ctx->done.done);
10732                 }
10733                 break;
10734 diff --git a/drivers/pci/access.c b/drivers/pci/access.c
10735 index 59ac36fe7c42..7a45a20af78a 100644
10736 --- a/drivers/pci/access.c
10737 +++ b/drivers/pci/access.c
10738 @@ -561,7 +561,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev)
10739         WARN_ON(!dev->block_cfg_access);
10740  
10741         dev->block_cfg_access = 0;
10742 -       wake_up_all(&pci_cfg_wait);
10743 +       wake_up_all_locked(&pci_cfg_wait);
10744         raw_spin_unlock_irqrestore(&pci_lock, flags);
10745  }
10746  EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
10747 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
10748 index f4424063b860..cbbbebd86c6e 100644
10749 --- a/drivers/scsi/fcoe/fcoe.c
10750 +++ b/drivers/scsi/fcoe/fcoe.c
10751 @@ -1286,7 +1286,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu)
10752         struct sk_buff *skb;
10753  #ifdef CONFIG_SMP
10754         struct fcoe_percpu_s *p0;
10755 -       unsigned targ_cpu = get_cpu();
10756 +       unsigned targ_cpu = get_cpu_light();
10757  #endif /* CONFIG_SMP */
10758  
10759         FCOE_DBG("Destroying receive thread for CPU %d\n", cpu);
10760 @@ -1342,7 +1342,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu)
10761                         kfree_skb(skb);
10762                 spin_unlock_bh(&p->fcoe_rx_list.lock);
10763         }
10764 -       put_cpu();
10765 +       put_cpu_light();
10766  #else
10767         /*
10768          * This a non-SMP scenario where the singular Rx thread is
10769 @@ -1566,11 +1566,11 @@ err2:
10770  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
10771  {
10772         struct fcoe_percpu_s *fps;
10773 -       int rc;
10774 +       int rc, cpu = get_cpu_light();
10775  
10776 -       fps = &get_cpu_var(fcoe_percpu);
10777 +       fps = &per_cpu(fcoe_percpu, cpu);
10778         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
10779 -       put_cpu_var(fcoe_percpu);
10780 +       put_cpu_light();
10781  
10782         return rc;
10783  }
10784 @@ -1766,11 +1766,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
10785                 return 0;
10786         }
10787  
10788 -       stats = per_cpu_ptr(lport->stats, get_cpu());
10789 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
10790         stats->InvalidCRCCount++;
10791         if (stats->InvalidCRCCount < 5)
10792                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
10793 -       put_cpu();
10794 +       put_cpu_light();
10795         return -EINVAL;
10796  }
10797  
10798 @@ -1814,7 +1814,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10799          */
10800         hp = (struct fcoe_hdr *) skb_network_header(skb);
10801  
10802 -       stats = per_cpu_ptr(lport->stats, get_cpu());
10803 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
10804         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
10805                 if (stats->ErrorFrames < 5)
10806                         printk(KERN_WARNING "fcoe: FCoE version "
10807 @@ -1846,13 +1846,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10808                 goto drop;
10809  
10810         if (!fcoe_filter_frames(lport, fp)) {
10811 -               put_cpu();
10812 +               put_cpu_light();
10813                 fc_exch_recv(lport, fp);
10814                 return;
10815         }
10816  drop:
10817         stats->ErrorFrames++;
10818 -       put_cpu();
10819 +       put_cpu_light();
10820         kfree_skb(skb);
10821  }
10822  
10823 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
10824 index 34a1b1f333b4..d91131210695 100644
10825 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
10826 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
10827 @@ -831,7 +831,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10828  
10829         INIT_LIST_HEAD(&del_list);
10830  
10831 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
10832 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
10833  
10834         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
10835                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
10836 @@ -867,7 +867,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10837                                 sel_time = fcf->time;
10838                 }
10839         }
10840 -       put_cpu();
10841 +       put_cpu_light();
10842  
10843         list_for_each_entry_safe(fcf, next, &del_list, list) {
10844                 /* Removes fcf from current list */
10845 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
10846 index 30f9ef0c0d4f..6c686bc01a82 100644
10847 --- a/drivers/scsi/libfc/fc_exch.c
10848 +++ b/drivers/scsi/libfc/fc_exch.c
10849 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
10850         }
10851         memset(ep, 0, sizeof(*ep));
10852  
10853 -       cpu = get_cpu();
10854 +       cpu = get_cpu_light();
10855         pool = per_cpu_ptr(mp->pool, cpu);
10856         spin_lock_bh(&pool->lock);
10857 -       put_cpu();
10858 +       put_cpu_light();
10859  
10860         /* peek cache of free slot */
10861         if (pool->left != FC_XID_UNKNOWN) {
10862 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
10863 index 9c706d8c1441..d968ffc79c08 100644
10864 --- a/drivers/scsi/libsas/sas_ata.c
10865 +++ b/drivers/scsi/libsas/sas_ata.c
10866 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10867         /* TODO: audit callers to ensure they are ready for qc_issue to
10868          * unconditionally re-enable interrupts
10869          */
10870 -       local_irq_save(flags);
10871 +       local_irq_save_nort(flags);
10872         spin_unlock(ap->lock);
10873  
10874         /* If the device fell off, no sense in issuing commands */
10875 @@ -255,7 +255,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10876  
10877   out:
10878         spin_lock(ap->lock);
10879 -       local_irq_restore(flags);
10880 +       local_irq_restore_nort(flags);
10881         return ret;
10882  }
10883  
10884 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
10885 index fee9eb7c8a60..b42d4adc42dc 100644
10886 --- a/drivers/scsi/qla2xxx/qla_inline.h
10887 +++ b/drivers/scsi/qla2xxx/qla_inline.h
10888 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
10889  {
10890         unsigned long flags;
10891         struct qla_hw_data *ha = rsp->hw;
10892 -       local_irq_save(flags);
10893 +       local_irq_save_nort(flags);
10894         if (IS_P3P_TYPE(ha))
10895                 qla82xx_poll(0, rsp);
10896         else
10897                 ha->isp_ops->intr_handler(0, rsp);
10898 -       local_irq_restore(flags);
10899 +       local_irq_restore_nort(flags);
10900  }
10901  
10902  static inline uint8_t *
10903 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
10904 index 7fc919f7da4d..e03fa17b8670 100644
10905 --- a/drivers/thermal/x86_pkg_temp_thermal.c
10906 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
10907 @@ -29,6 +29,7 @@
10908  #include <linux/pm.h>
10909  #include <linux/thermal.h>
10910  #include <linux/debugfs.h>
10911 +#include <linux/swork.h>
10912  #include <asm/cpu_device_id.h>
10913  #include <asm/mce.h>
10914  
10915 @@ -352,7 +353,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
10916         }
10917  }
10918  
10919 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10920 +static void platform_thermal_notify_work(struct swork_event *event)
10921  {
10922         unsigned long flags;
10923         int cpu = smp_processor_id();
10924 @@ -369,7 +370,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10925                         pkg_work_scheduled[phy_id]) {
10926                 disable_pkg_thres_interrupt();
10927                 spin_unlock_irqrestore(&pkg_work_lock, flags);
10928 -               return -EINVAL;
10929 +               return;
10930         }
10931         pkg_work_scheduled[phy_id] = 1;
10932         spin_unlock_irqrestore(&pkg_work_lock, flags);
10933 @@ -378,9 +379,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10934         schedule_delayed_work_on(cpu,
10935                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
10936                                 msecs_to_jiffies(notify_delay_ms));
10937 +}
10938 +
10939 +#ifdef CONFIG_PREEMPT_RT_FULL
10940 +static struct swork_event notify_work;
10941 +
10942 +static int thermal_notify_work_init(void)
10943 +{
10944 +       int err;
10945 +
10946 +       err = swork_get();
10947 +       if (err)
10948 +               return err;
10949 +
10950 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
10951         return 0;
10952  }
10953  
10954 +static void thermal_notify_work_cleanup(void)
10955 +{
10956 +       swork_put();
10957 +}
10958 +
10959 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10960 +{
10961 +       swork_queue(&notify_work);
10962 +       return 0;
10963 +}
10964 +
10965 +#else  /* !CONFIG_PREEMPT_RT_FULL */
10966 +
10967 +static int thermal_notify_work_init(void) { return 0; }
10968 +
10969 +static void thermal_notify_work_cleanup(void) {  }
10970 +
10971 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
10972 +{
10973 +       platform_thermal_notify_work(NULL);
10974 +
10975 +       return 0;
10976 +}
10977 +#endif /* CONFIG_PREEMPT_RT_FULL */
10978 +
10979  static int find_siblings_cpu(int cpu)
10980  {
10981         int i;
10982 @@ -584,6 +624,9 @@ static int __init pkg_temp_thermal_init(void)
10983         if (!x86_match_cpu(pkg_temp_thermal_ids))
10984                 return -ENODEV;
10985  
10986 +       if (!thermal_notify_work_init())
10987 +               return -ENODEV;
10988 +
10989         spin_lock_init(&pkg_work_lock);
10990         platform_thermal_package_notify =
10991                         pkg_temp_thermal_platform_thermal_notify;
10992 @@ -608,7 +651,7 @@ err_ret:
10993         kfree(pkg_work_scheduled);
10994         platform_thermal_package_notify = NULL;
10995         platform_thermal_package_rate_control = NULL;
10996 -
10997 +       thermal_notify_work_cleanup();
10998         return -ENODEV;
10999  }
11000  
11001 @@ -633,6 +676,7 @@ static void __exit pkg_temp_thermal_exit(void)
11002         mutex_unlock(&phy_dev_list_mutex);
11003         platform_thermal_package_notify = NULL;
11004         platform_thermal_package_rate_control = NULL;
11005 +       thermal_notify_work_cleanup();
11006         for_each_online_cpu(i)
11007                 cancel_delayed_work_sync(
11008                         &per_cpu(pkg_temp_thermal_threshold_work, i));
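Taken on its own, the deferral pattern introduced in this hunk is: keep the hard-interrupt notifier trivial and push the real work into schedulable context through the RT "simple work" (swork) helper. A minimal sketch using only the swork calls visible above (the my_* names are made up for illustration):

	#include <linux/swork.h>

	static struct swork_event my_event;

	/* runs in thread context; may sleep and take sleeping locks */
	static void my_work_fn(struct swork_event *event)
	{
	}

	static int my_init(void)
	{
		int err = swork_get();	/* bring up the swork worker */

		if (err)
			return err;
		INIT_SWORK(&my_event, my_work_fn);
		return 0;
	}

	/* called from hard interrupt context: just queue and return */
	static int my_notify(void)
	{
		swork_queue(&my_event);
		return 0;
	}

	static void my_exit(void)
	{
		swork_put();
	}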
11009 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
11010 index 39126460c1f5..af7701ca4d48 100644
11011 --- a/drivers/tty/serial/8250/8250_core.c
11012 +++ b/drivers/tty/serial/8250/8250_core.c
11013 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
11014  
11015  static unsigned int skip_txen_test; /* force skip of txen test at init time */
11016  
11017 -#define PASS_LIMIT     512
11018 +/*
11019 + * On -rt we can have more delays, and legitimately
11020 + * so - so don't drop work spuriously and spam the
11021 + * syslog:
11022 + */
11023 +#ifdef CONFIG_PREEMPT_RT_FULL
11024 +# define PASS_LIMIT    1000000
11025 +#else
11026 +# define PASS_LIMIT    512
11027 +#endif
11028  
11029  #include <asm/serial.h>
11030  /*
11031 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
11032 index 56ccbcefdd85..a0b9e854672c 100644
11033 --- a/drivers/tty/serial/8250/8250_port.c
11034 +++ b/drivers/tty/serial/8250/8250_port.c
11035 @@ -35,6 +35,7 @@
11036  #include <linux/nmi.h>
11037  #include <linux/mutex.h>
11038  #include <linux/slab.h>
11039 +#include <linux/kdb.h>
11040  #include <linux/uaccess.h>
11041  #include <linux/pm_runtime.h>
11042  
11043 @@ -2843,9 +2844,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
11044  
11045         serial8250_rpm_get(up);
11046  
11047 -       if (port->sysrq)
11048 +       if (port->sysrq || oops_in_progress)
11049                 locked = 0;
11050 -       else if (oops_in_progress)
11051 +       else if (in_kdb_printk())
11052                 locked = spin_trylock_irqsave(&port->lock, flags);
11053         else
11054                 spin_lock_irqsave(&port->lock, flags);
11055 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
11056 index 899a77187bde..3ff6363b3751 100644
11057 --- a/drivers/tty/serial/amba-pl011.c
11058 +++ b/drivers/tty/serial/amba-pl011.c
11059 @@ -2067,13 +2067,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
11060  
11061         clk_enable(uap->clk);
11062  
11063 -       local_irq_save(flags);
11064 +       /*
11065 +        * local_irq_save(flags);
11066 +        *
11067 +        * This local_irq_save() is nonsense. If we come in via sysrq
11068 +        * handling then interrupts are already disabled. Aside from
11069 +        * that, the port.sysrq check is racy on SMP regardless.
11070 +       */
11071         if (uap->port.sysrq)
11072                 locked = 0;
11073         else if (oops_in_progress)
11074 -               locked = spin_trylock(&uap->port.lock);
11075 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
11076         else
11077 -               spin_lock(&uap->port.lock);
11078 +               spin_lock_irqsave(&uap->port.lock, flags);
11079  
11080         /*
11081          *      First save the CR then disable the interrupts
11082 @@ -2098,8 +2104,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
11083                 writew(old_cr, uap->port.membase + UART011_CR);
11084  
11085         if (locked)
11086 -               spin_unlock(&uap->port.lock);
11087 -       local_irq_restore(flags);
11088 +               spin_unlock_irqrestore(&uap->port.lock, flags);
11089  
11090         clk_disable(uap->clk);
11091  }
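
The pl011 hunk above and the omap-serial hunk below fold the separate local_irq_save() into spin_lock_irqsave() on port->lock and only trylock when a sysrq or oops is in flight, so the console can still print if the lock is already held on this CPU. A minimal sketch of that shape; my_console_write() and the FIFO write are placeholders, not part of the patch:

#include <linux/kernel.h>
#include <linux/serial_core.h>
#include <linux/spinlock.h>

static void my_console_write(struct uart_port *port, const char *s,
			     unsigned int count)
{
	unsigned long flags;
	int locked = 1;

	if (port->sysrq || oops_in_progress)
		/* Best effort: the lock may already be held on this CPU. */
		locked = spin_trylock_irqsave(&port->lock, flags);
	else
		spin_lock_irqsave(&port->lock, flags);

	/* ... push 's' into the hardware FIFO here ... */

	if (locked)
		spin_unlock_irqrestore(&port->lock, flags);
}
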
11092 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
11093 index 24280d9a05e9..9745fb8b7abb 100644
11094 --- a/drivers/tty/serial/omap-serial.c
11095 +++ b/drivers/tty/serial/omap-serial.c
11096 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
11097  
11098         pm_runtime_get_sync(up->dev);
11099  
11100 -       local_irq_save(flags);
11101 -       if (up->port.sysrq)
11102 -               locked = 0;
11103 -       else if (oops_in_progress)
11104 -               locked = spin_trylock(&up->port.lock);
11105 +       if (up->port.sysrq || oops_in_progress)
11106 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
11107         else
11108 -               spin_lock(&up->port.lock);
11109 +               spin_lock_irqsave(&up->port.lock, flags);
11110  
11111         /*
11112          * First save the IER then disable the interrupts
11113 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
11114         pm_runtime_mark_last_busy(up->dev);
11115         pm_runtime_put_autosuspend(up->dev);
11116         if (locked)
11117 -               spin_unlock(&up->port.lock);
11118 -       local_irq_restore(flags);
11119 +               spin_unlock_irqrestore(&up->port.lock, flags);
11120  }
11121  
11122  static int __init
11123 diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
11124 index edb5305b9d4d..7d5ee8a13ac6 100644
11125 --- a/drivers/tty/serial/sc16is7xx.c
11126 +++ b/drivers/tty/serial/sc16is7xx.c
11127 @@ -1230,7 +1230,7 @@ static int sc16is7xx_probe(struct device *dev,
11128  
11129         /* Setup interrupt */
11130         ret = devm_request_irq(dev, irq, sc16is7xx_irq,
11131 -                              IRQF_ONESHOT | flags, dev_name(dev), s);
11132 +                              flags, dev_name(dev), s);
11133         if (!ret)
11134                 return 0;
11135  
11136 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
11137 index f44ce09367bc..5fc9a14721bd 100644
11138 --- a/drivers/usb/core/hcd.c
11139 +++ b/drivers/usb/core/hcd.c
11140 @@ -1735,9 +1735,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
11141          * and no one may trigger the above deadlock situation when
11142          * running complete() in tasklet.
11143          */
11144 -       local_irq_save(flags);
11145 +       local_irq_save_nort(flags);
11146         urb->complete(urb);
11147 -       local_irq_restore(flags);
11148 +       local_irq_restore_nort(flags);
11149  
11150         usb_anchor_resume_wakeups(anchor);
11151         atomic_dec(&urb->use_count);
11152 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
11153 index 803c503a2e3d..8dd2720aab64 100644
11154 --- a/drivers/usb/gadget/function/f_fs.c
11155 +++ b/drivers/usb/gadget/function/f_fs.c
11156 @@ -1404,7 +1404,7 @@ static void ffs_data_put(struct ffs_data *ffs)
11157                 pr_info("%s(): freeing\n", __func__);
11158                 ffs_data_clear(ffs);
11159                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
11160 -                      waitqueue_active(&ffs->ep0req_completion.wait));
11161 +                      swait_active(&ffs->ep0req_completion.wait));
11162                 kfree(ffs->dev_name);
11163                 kfree(ffs);
11164         }
11165 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
11166 index e57f48f9528f..7544a54056e4 100644
11167 --- a/drivers/usb/gadget/legacy/inode.c
11168 +++ b/drivers/usb/gadget/legacy/inode.c
11169 @@ -345,7 +345,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
11170         spin_unlock_irq (&epdata->dev->lock);
11171  
11172         if (likely (value == 0)) {
11173 -               value = wait_event_interruptible (done.wait, done.done);
11174 +               value = swait_event_interruptible (done.wait, done.done);
11175                 if (value != 0) {
11176                         spin_lock_irq (&epdata->dev->lock);
11177                         if (likely (epdata->ep != NULL)) {
11178 @@ -354,7 +354,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
11179                                 usb_ep_dequeue (epdata->ep, epdata->req);
11180                                 spin_unlock_irq (&epdata->dev->lock);
11181  
11182 -                               wait_event (done.wait, done.done);
11183 +                               swait_event (done.wait, done.done);
11184                                 if (epdata->status == -ECONNRESET)
11185                                         epdata->status = -EINTR;
11186                         } else {
11187 diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c
11188 index f92f5aff0dd5..f9bba26e3655 100644
11189 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c
11190 +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c
11191 @@ -17,7 +17,9 @@
11192  #include <linux/device.h>
11193  #include <linux/dma-mapping.h>
11194  #include <linux/list.h>
11195 +#include <linux/mfd/syscon.h>
11196  #include <linux/platform_device.h>
11197 +#include <linux/regmap.h>
11198  #include <linux/usb/ch9.h>
11199  #include <linux/usb/gadget.h>
11200  #include <linux/usb/atmel_usba_udc.h>
11201 @@ -1888,20 +1890,15 @@ static int atmel_usba_stop(struct usb_gadget *gadget)
11202  #ifdef CONFIG_OF
11203  static void at91sam9rl_toggle_bias(struct usba_udc *udc, int is_on)
11204  {
11205 -       unsigned int uckr = at91_pmc_read(AT91_CKGR_UCKR);
11206 -
11207 -       if (is_on)
11208 -               at91_pmc_write(AT91_CKGR_UCKR, uckr | AT91_PMC_BIASEN);
11209 -       else
11210 -               at91_pmc_write(AT91_CKGR_UCKR, uckr & ~(AT91_PMC_BIASEN));
11211 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
11212 +                          is_on ? AT91_PMC_BIASEN : 0);
11213  }
11214  
11215  static void at91sam9g45_pulse_bias(struct usba_udc *udc)
11216  {
11217 -       unsigned int uckr = at91_pmc_read(AT91_CKGR_UCKR);
11218 -
11219 -       at91_pmc_write(AT91_CKGR_UCKR, uckr & ~(AT91_PMC_BIASEN));
11220 -       at91_pmc_write(AT91_CKGR_UCKR, uckr | AT91_PMC_BIASEN);
11221 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN, 0);
11222 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
11223 +                          AT91_PMC_BIASEN);
11224  }
11225  
11226  static const struct usba_udc_errata at91sam9rl_errata = {
11227 @@ -1938,6 +1935,9 @@ static struct usba_ep * atmel_udc_of_init(struct platform_device *pdev,
11228                 return ERR_PTR(-EINVAL);
11229  
11230         udc->errata = match->data;
11231 +       udc->pmc = syscon_regmap_lookup_by_compatible("atmel,at91sam9g45-pmc");
11232 +       if (udc->errata && IS_ERR(udc->pmc))
11233 +               return ERR_CAST(udc->pmc);
11234  
11235         udc->num_ep = 0;
11236  
11237 diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.h b/drivers/usb/gadget/udc/atmel_usba_udc.h
11238 index ea448a344767..3e1c9d589dfa 100644
11239 --- a/drivers/usb/gadget/udc/atmel_usba_udc.h
11240 +++ b/drivers/usb/gadget/udc/atmel_usba_udc.h
11241 @@ -354,6 +354,8 @@ struct usba_udc {
11242         struct dentry *debugfs_root;
11243         struct dentry *debugfs_regs;
11244  #endif
11245 +
11246 +       struct regmap *pmc;
11247  };
11248  
11249  static inline struct usba_ep *to_usba_ep(struct usb_ep *ep)
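
The Atmel UDC hunks drop the private at91_pmc_read()/at91_pmc_write() accessors in favour of a regmap looked up through syscon, which serialises access to the shared PMC registers. A hedged sketch of that lookup-plus-update pattern; my_probe() and my_toggle_bias() are illustrative names, while the compatible string and bit definitions mirror the hunk:

#include <linux/clk/at91_pmc.h>
#include <linux/err.h>
#include <linux/mfd/syscon.h>
#include <linux/regmap.h>
#include <linux/types.h>

static struct regmap *pmc;

static int my_probe(void)
{
	pmc = syscon_regmap_lookup_by_compatible("atmel,at91sam9g45-pmc");
	if (IS_ERR(pmc))
		return PTR_ERR(pmc);
	return 0;
}

static void my_toggle_bias(bool is_on)
{
	/* Read-modify-write of a single bit through the shared regmap. */
	regmap_update_bits(pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
			   is_on ? AT91_PMC_BIASEN : 0);
}
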
11250 diff --git a/fs/aio.c b/fs/aio.c
11251 index fe4f49212b99..c3194afdc3df 100644
11252 --- a/fs/aio.c
11253 +++ b/fs/aio.c
11254 @@ -40,6 +40,7 @@
11255  #include <linux/ramfs.h>
11256  #include <linux/percpu-refcount.h>
11257  #include <linux/mount.h>
11258 +#include <linux/swork.h>
11259  
11260  #include <asm/kmap_types.h>
11261  #include <asm/uaccess.h>
11262 @@ -115,7 +116,7 @@ struct kioctx {
11263         struct page             **ring_pages;
11264         long                    nr_pages;
11265  
11266 -       struct work_struct      free_work;
11267 +       struct swork_event      free_work;
11268  
11269         /*
11270          * signals when all in-flight requests are done
11271 @@ -258,6 +259,7 @@ static int __init aio_setup(void)
11272                 .mount          = aio_mount,
11273                 .kill_sb        = kill_anon_super,
11274         };
11275 +       BUG_ON(swork_get());
11276         aio_mnt = kern_mount(&aio_fs);
11277         if (IS_ERR(aio_mnt))
11278                 panic("Failed to create aio fs mount.");
11279 @@ -573,9 +575,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
11280         return cancel(&kiocb->common);
11281  }
11282  
11283 -static void free_ioctx(struct work_struct *work)
11284 +static void free_ioctx(struct swork_event *sev)
11285  {
11286 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
11287 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
11288  
11289         pr_debug("freeing %p\n", ctx);
11290  
11291 @@ -594,8 +596,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
11292         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
11293                 complete(&ctx->rq_wait->comp);
11294  
11295 -       INIT_WORK(&ctx->free_work, free_ioctx);
11296 -       schedule_work(&ctx->free_work);
11297 +       INIT_SWORK(&ctx->free_work, free_ioctx);
11298 +       swork_queue(&ctx->free_work);
11299  }
11300  
11301  /*
11302 @@ -603,9 +605,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
11303   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
11304   * now it's safe to cancel any that need to be.
11305   */
11306 -static void free_ioctx_users(struct percpu_ref *ref)
11307 +static void free_ioctx_users_work(struct swork_event *sev)
11308  {
11309 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
11310 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
11311         struct aio_kiocb *req;
11312  
11313         spin_lock_irq(&ctx->ctx_lock);
11314 @@ -624,6 +626,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
11315         percpu_ref_put(&ctx->reqs);
11316  }
11317  
11318 +static void free_ioctx_users(struct percpu_ref *ref)
11319 +{
11320 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
11321 +
11322 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
11323 +       swork_queue(&ctx->free_work);
11324 +}
11325 +
11326  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
11327  {
11328         unsigned i, new_nr;
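
free_ioctx_users() runs from the percpu_ref release path, where on RT it must not take sleeping locks such as ctx->ctx_lock, so the hunk pushes the teardown out to the RT patch's "swork" (simple work) facility. A minimal sketch of that deferral pattern; swork_get(), INIT_SWORK() and swork_queue() exist only with this patch applied, and my_ctx/my_ctx_free() are made-up names:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/swork.h>	/* provided by this patch */

struct my_ctx {
	struct swork_event	free_work;
	/* ... payload ... */
};

static void my_ctx_free(struct swork_event *sev)
{
	struct my_ctx *ctx = container_of(sev, struct my_ctx, free_work);

	/* Runs later in the swork kernel thread, fully preemptible. */
	pr_debug("freeing %p\n", ctx);
}

static int __init my_init(void)
{
	/* Make sure the swork thread exists before anything can queue. */
	return swork_get();
}

static void my_ctx_release(struct my_ctx *ctx)
{
	INIT_SWORK(&ctx->free_work, my_ctx_free);
	swork_queue(&ctx->free_work);
}
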
11329 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
11330 index 502d3892d8a4..05af8d3e6e88 100644
11331 --- a/fs/autofs4/autofs_i.h
11332 +++ b/fs/autofs4/autofs_i.h
11333 @@ -34,6 +34,7 @@
11334  #include <linux/sched.h>
11335  #include <linux/mount.h>
11336  #include <linux/namei.h>
11337 +#include <linux/delay.h>
11338  #include <asm/current.h>
11339  #include <asm/uaccess.h>
11340  
11341 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
11342 index 7a5a598a2d94..d08bcdc30566 100644
11343 --- a/fs/autofs4/expire.c
11344 +++ b/fs/autofs4/expire.c
11345 @@ -150,7 +150,7 @@ again:
11346                         parent = p->d_parent;
11347                         if (!spin_trylock(&parent->d_lock)) {
11348                                 spin_unlock(&p->d_lock);
11349 -                               cpu_relax();
11350 +                               cpu_chill();
11351                                 goto relock;
11352                         }
11353                         spin_unlock(&p->d_lock);
11354 diff --git a/fs/buffer.c b/fs/buffer.c
11355 index 4f4cd959da7c..72b27e17b907 100644
11356 --- a/fs/buffer.c
11357 +++ b/fs/buffer.c
11358 @@ -305,8 +305,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11359          * decide that the page is now completely done.
11360          */
11361         first = page_buffers(page);
11362 -       local_irq_save(flags);
11363 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11364 +       flags = bh_uptodate_lock_irqsave(first);
11365         clear_buffer_async_read(bh);
11366         unlock_buffer(bh);
11367         tmp = bh;
11368 @@ -319,8 +318,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11369                 }
11370                 tmp = tmp->b_this_page;
11371         } while (tmp != bh);
11372 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11373 -       local_irq_restore(flags);
11374 +       bh_uptodate_unlock_irqrestore(first, flags);
11375  
11376         /*
11377          * If none of the buffers had errors and they are all
11378 @@ -332,9 +330,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11379         return;
11380  
11381  still_busy:
11382 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11383 -       local_irq_restore(flags);
11384 -       return;
11385 +       bh_uptodate_unlock_irqrestore(first, flags);
11386  }
11387  
11388  /*
11389 @@ -362,8 +358,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
11390         }
11391  
11392         first = page_buffers(page);
11393 -       local_irq_save(flags);
11394 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11395 +       flags = bh_uptodate_lock_irqsave(first);
11396  
11397         clear_buffer_async_write(bh);
11398         unlock_buffer(bh);
11399 @@ -375,15 +370,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
11400                 }
11401                 tmp = tmp->b_this_page;
11402         }
11403 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11404 -       local_irq_restore(flags);
11405 +       bh_uptodate_unlock_irqrestore(first, flags);
11406         end_page_writeback(page);
11407         return;
11408  
11409  still_busy:
11410 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11411 -       local_irq_restore(flags);
11412 -       return;
11413 +       bh_uptodate_unlock_irqrestore(first, flags);
11414  }
11415  EXPORT_SYMBOL(end_buffer_async_write);
11416  
11417 @@ -3325,6 +3317,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
11418         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
11419         if (ret) {
11420                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
11421 +               buffer_head_init_locks(ret);
11422                 preempt_disable();
11423                 __this_cpu_inc(bh_accounting.nr);
11424                 recalc_bh_state();
11425 diff --git a/fs/dcache.c b/fs/dcache.c
11426 index 71b6056ad35d..e80471cbfc19 100644
11427 --- a/fs/dcache.c
11428 +++ b/fs/dcache.c
11429 @@ -19,6 +19,7 @@
11430  #include <linux/mm.h>
11431  #include <linux/fs.h>
11432  #include <linux/fsnotify.h>
11433 +#include <linux/delay.h>
11434  #include <linux/slab.h>
11435  #include <linux/init.h>
11436  #include <linux/hash.h>
11437 @@ -747,6 +748,8 @@ static inline bool fast_dput(struct dentry *dentry)
11438   */
11439  void dput(struct dentry *dentry)
11440  {
11441 +       struct dentry *parent;
11442 +
11443         if (unlikely(!dentry))
11444                 return;
11445  
11446 @@ -783,9 +786,18 @@ repeat:
11447         return;
11448  
11449  kill_it:
11450 -       dentry = dentry_kill(dentry);
11451 -       if (dentry) {
11452 -               cond_resched();
11453 +       parent = dentry_kill(dentry);
11454 +       if (parent) {
11455 +               int r;
11456 +
11457 +               if (parent == dentry) {
11458 +                       /* the task with the highest priority won't schedule */
11459 +                       r = cond_resched();
11460 +                       if (!r)
11461 +                               cpu_chill();
11462 +               } else {
11463 +                       dentry = parent;
11464 +               }
11465                 goto repeat;
11466         }
11467  }
11468 @@ -2394,7 +2406,7 @@ again:
11469         if (dentry->d_lockref.count == 1) {
11470                 if (!spin_trylock(&inode->i_lock)) {
11471                         spin_unlock(&dentry->d_lock);
11472 -                       cpu_relax();
11473 +                       cpu_chill();
11474                         goto again;
11475                 }
11476                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
11477 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
11478 index 1e009cad8d5c..d0c12504d3b4 100644
11479 --- a/fs/eventpoll.c
11480 +++ b/fs/eventpoll.c
11481 @@ -505,12 +505,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
11482   */
11483  static void ep_poll_safewake(wait_queue_head_t *wq)
11484  {
11485 -       int this_cpu = get_cpu();
11486 +       int this_cpu = get_cpu_light();
11487  
11488         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
11489                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
11490  
11491 -       put_cpu();
11492 +       put_cpu_light();
11493  }
11494  
11495  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
11496 diff --git a/fs/exec.c b/fs/exec.c
11497 index b06623a9347f..e7760b7b692c 100644
11498 --- a/fs/exec.c
11499 +++ b/fs/exec.c
11500 @@ -865,12 +865,14 @@ static int exec_mmap(struct mm_struct *mm)
11501                 }
11502         }
11503         task_lock(tsk);
11504 +       preempt_disable_rt();
11505         active_mm = tsk->active_mm;
11506         tsk->mm = mm;
11507         tsk->active_mm = mm;
11508         activate_mm(active_mm, mm);
11509         tsk->mm->vmacache_seqnum = 0;
11510         vmacache_flush(tsk);
11511 +       preempt_enable_rt();
11512         task_unlock(tsk);
11513         if (old_mm) {
11514                 up_read(&old_mm->mmap_sem);
11515 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
11516 index 9db5500d63d9..5951c495d124 100644
11517 --- a/fs/f2fs/f2fs.h
11518 +++ b/fs/f2fs/f2fs.h
11519 @@ -24,7 +24,6 @@
11520  
11521  #ifdef CONFIG_F2FS_CHECK_FS
11522  #define f2fs_bug_on(sbi, condition)    BUG_ON(condition)
11523 -#define f2fs_down_write(x, y)  down_write_nest_lock(x, y)
11524  #else
11525  #define f2fs_bug_on(sbi, condition)                                    \
11526         do {                                                            \
11527 @@ -33,7 +32,6 @@
11528                         set_sbi_flag(sbi, SBI_NEED_FSCK);               \
11529                 }                                                       \
11530         } while (0)
11531 -#define f2fs_down_write(x, y)  down_write(x)
11532  #endif
11533  
11534  /*
11535 @@ -959,7 +957,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
11536  
11537  static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
11538  {
11539 -       f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
11540 +       down_write(&sbi->cp_rwsem);
11541  }
11542  
11543  static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
11544 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
11545 index 684996c8a3a4..6e18a06aaabe 100644
11546 --- a/fs/jbd2/checkpoint.c
11547 +++ b/fs/jbd2/checkpoint.c
11548 @@ -116,6 +116,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
11549         nblocks = jbd2_space_needed(journal);
11550         while (jbd2_log_space_left(journal) < nblocks) {
11551                 write_unlock(&journal->j_state_lock);
11552 +               if (current->plug)
11553 +                       io_schedule();
11554                 mutex_lock(&journal->j_checkpoint_mutex);
11555  
11556                 /*
11557 diff --git a/fs/namespace.c b/fs/namespace.c
11558 index 5be02a0635be..1f3725bbd04b 100644
11559 --- a/fs/namespace.c
11560 +++ b/fs/namespace.c
11561 @@ -14,6 +14,7 @@
11562  #include <linux/mnt_namespace.h>
11563  #include <linux/user_namespace.h>
11564  #include <linux/namei.h>
11565 +#include <linux/delay.h>
11566  #include <linux/security.h>
11567  #include <linux/idr.h>
11568  #include <linux/init.h>                /* init_rootfs */
11569 @@ -353,8 +354,11 @@ int __mnt_want_write(struct vfsmount *m)
11570          * incremented count after it has set MNT_WRITE_HOLD.
11571          */
11572         smp_mb();
11573 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
11574 -               cpu_relax();
11575 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
11576 +               preempt_enable();
11577 +               cpu_chill();
11578 +               preempt_disable();
11579 +       }
11580         /*
11581          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
11582          * be set to match its requirements. So we must not load that until
11583 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
11584 index 7521e11db728..f0de4b6b8bf3 100644
11585 --- a/fs/ntfs/aops.c
11586 +++ b/fs/ntfs/aops.c
11587 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11588                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
11589         }
11590         first = page_buffers(page);
11591 -       local_irq_save(flags);
11592 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11593 +       flags = bh_uptodate_lock_irqsave(first);
11594         clear_buffer_async_read(bh);
11595         unlock_buffer(bh);
11596         tmp = bh;
11597 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11598                 }
11599                 tmp = tmp->b_this_page;
11600         } while (tmp != bh);
11601 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11602 -       local_irq_restore(flags);
11603 +       bh_uptodate_unlock_irqrestore(first, flags);
11604         /*
11605          * If none of the buffers had errors then we can set the page uptodate,
11606          * but we first have to perform the post read mst fixups, if the
11607 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11608                 recs = PAGE_CACHE_SIZE / rec_size;
11609                 /* Should have been verified before we got here... */
11610                 BUG_ON(!recs);
11611 -               local_irq_save(flags);
11612 +               local_irq_save_nort(flags);
11613                 kaddr = kmap_atomic(page);
11614                 for (i = 0; i < recs; i++)
11615                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
11616                                         i * rec_size), rec_size);
11617                 kunmap_atomic(kaddr);
11618 -               local_irq_restore(flags);
11619 +               local_irq_restore_nort(flags);
11620                 flush_dcache_page(page);
11621                 if (likely(page_uptodate && !PageError(page)))
11622                         SetPageUptodate(page);
11623 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11624         unlock_page(page);
11625         return;
11626  still_busy:
11627 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11628 -       local_irq_restore(flags);
11629 -       return;
11630 +       bh_uptodate_unlock_irqrestore(first, flags);
11631  }
11632  
11633  /**
11634 diff --git a/fs/timerfd.c b/fs/timerfd.c
11635 index 053818dd6c18..c4bc14fe0085 100644
11636 --- a/fs/timerfd.c
11637 +++ b/fs/timerfd.c
11638 @@ -450,7 +450,10 @@ static int do_timerfd_settime(int ufd, int flags,
11639                                 break;
11640                 }
11641                 spin_unlock_irq(&ctx->wqh.lock);
11642 -               cpu_relax();
11643 +               if (isalarm(ctx))
11644 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
11645 +               else
11646 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
11647         }
11648  
11649         /*
11650 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
11651 index 323e5daece54..cc5fbd534fd4 100644
11652 --- a/include/acpi/platform/aclinux.h
11653 +++ b/include/acpi/platform/aclinux.h
11654 @@ -127,6 +127,7 @@
11655  
11656  #define acpi_cache_t                        struct kmem_cache
11657  #define acpi_spinlock                       spinlock_t *
11658 +#define acpi_raw_spinlock              raw_spinlock_t *
11659  #define acpi_cpu_flags                      unsigned long
11660  
11661  /* Use native linux version of acpi_os_allocate_zeroed */
11662 @@ -145,6 +146,20 @@
11663  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
11664  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
11665  
11666 +#define acpi_os_create_raw_lock(__handle)                      \
11667 +({                                                             \
11668 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
11669 +                                                               \
11670 +        if (lock) {                                            \
11671 +               *(__handle) = lock;                             \
11672 +               raw_spin_lock_init(*(__handle));                \
11673 +        }                                                      \
11674 +        lock ? AE_OK : AE_NO_MEMORY;                           \
11675 + })
11676 +
11677 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
11678 +
11679 +
11680  /*
11681   * OSL interfaces used by debugger/disassembler
11682   */
11683 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
11684 index 630dd2372238..850e4d993a88 100644
11685 --- a/include/asm-generic/bug.h
11686 +++ b/include/asm-generic/bug.h
11687 @@ -206,6 +206,20 @@ extern void warn_slowpath_null(const char *file, const int line);
11688  # define WARN_ON_SMP(x)                        ({0;})
11689  #endif
11690  
11691 +#ifdef CONFIG_PREEMPT_RT_BASE
11692 +# define BUG_ON_RT(c)                  BUG_ON(c)
11693 +# define BUG_ON_NONRT(c)               do { } while (0)
11694 +# define WARN_ON_RT(condition)         WARN_ON(condition)
11695 +# define WARN_ON_NONRT(condition)      do { } while (0)
11696 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
11697 +#else
11698 +# define BUG_ON_RT(c)                  do { } while (0)
11699 +# define BUG_ON_NONRT(c)               BUG_ON(c)
11700 +# define WARN_ON_RT(condition)         do { } while (0)
11701 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
11702 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
11703 +#endif
11704 +
11705  #endif /* __ASSEMBLY__ */
11706  
11707  #endif
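
BUG_ON_RT(), WARN_ON_NONRT() and friends let an assertion apply to only one of the two configurations, since many irqs-off or preempt-off invariants no longer hold once spinlocks become sleeping locks. A small illustrative use, assuming a hypothetical my_handler(); these macros exist only with this patch applied:

#include <linux/bug.h>
#include <linux/irqflags.h>

static void my_handler(void)
{
	/*
	 * Mainline requires interrupts off here; on RT the lock that
	 * protects this path is preemptible, so the same check would be
	 * a false positive and is compiled away.
	 */
	WARN_ON_NONRT(!irqs_disabled());
}
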
11708 diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
11709 index 5d8ffa3e6f8c..c1cde3577551 100644
11710 --- a/include/asm-generic/preempt.h
11711 +++ b/include/asm-generic/preempt.h
11712 @@ -7,10 +7,10 @@
11713  
11714  static __always_inline int preempt_count(void)
11715  {
11716 -       return current_thread_info()->preempt_count;
11717 +       return READ_ONCE(current_thread_info()->preempt_count);
11718  }
11719  
11720 -static __always_inline int *preempt_count_ptr(void)
11721 +static __always_inline volatile int *preempt_count_ptr(void)
11722  {
11723         return &current_thread_info()->preempt_count;
11724  }
11725 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
11726 index daf17d70aeca..463df8954255 100644
11727 --- a/include/linux/blk-mq.h
11728 +++ b/include/linux/blk-mq.h
11729 @@ -212,6 +212,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
11730  
11731  struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
11732  struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
11733 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
11734  
11735  int blk_mq_request_started(struct request *rq);
11736  void blk_mq_start_request(struct request *rq);
11737 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
11738 index fe14382f9664..a82143ad6702 100644
11739 --- a/include/linux/blkdev.h
11740 +++ b/include/linux/blkdev.h
11741 @@ -89,6 +89,7 @@ struct request {
11742         struct list_head queuelist;
11743         union {
11744                 struct call_single_data csd;
11745 +               struct work_struct work;
11746                 unsigned long fifo_time;
11747         };
11748  
11749 @@ -455,7 +456,7 @@ struct request_queue {
11750         struct throtl_data *td;
11751  #endif
11752         struct rcu_head         rcu_head;
11753 -       wait_queue_head_t       mq_freeze_wq;
11754 +       struct swait_queue_head mq_freeze_wq;
11755         struct percpu_ref       q_usage_counter;
11756         struct list_head        all_q_node;
11757  
11758 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
11759 index 8fdcb783197d..d07dbeec7bc1 100644
11760 --- a/include/linux/bottom_half.h
11761 +++ b/include/linux/bottom_half.h
11762 @@ -3,6 +3,39 @@
11763  
11764  #include <linux/preempt.h>
11765  
11766 +#ifdef CONFIG_PREEMPT_RT_FULL
11767 +
11768 +extern void __local_bh_disable(void);
11769 +extern void _local_bh_enable(void);
11770 +extern void __local_bh_enable(void);
11771 +
11772 +static inline void local_bh_disable(void)
11773 +{
11774 +       __local_bh_disable();
11775 +}
11776 +
11777 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
11778 +{
11779 +       __local_bh_disable();
11780 +}
11781 +
11782 +static inline void local_bh_enable(void)
11783 +{
11784 +       __local_bh_enable();
11785 +}
11786 +
11787 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
11788 +{
11789 +       __local_bh_enable();
11790 +}
11791 +
11792 +static inline void local_bh_enable_ip(unsigned long ip)
11793 +{
11794 +       __local_bh_enable();
11795 +}
11796 +
11797 +#else
11798 +
11799  #ifdef CONFIG_TRACE_IRQFLAGS
11800  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
11801  #else
11802 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
11803  {
11804         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
11805  }
11806 +#endif
11807  
11808  #endif /* _LINUX_BH_H */
11809 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
11810 index 89d9aa9e79bf..4a201008b02d 100644
11811 --- a/include/linux/buffer_head.h
11812 +++ b/include/linux/buffer_head.h
11813 @@ -75,8 +75,50 @@ struct buffer_head {
11814         struct address_space *b_assoc_map;      /* mapping this buffer is
11815                                                    associated with */
11816         atomic_t b_count;               /* users using this buffer_head */
11817 +#ifdef CONFIG_PREEMPT_RT_BASE
11818 +       spinlock_t b_uptodate_lock;
11819 +#if IS_ENABLED(CONFIG_JBD2)
11820 +       spinlock_t b_state_lock;
11821 +       spinlock_t b_journal_head_lock;
11822 +#endif
11823 +#endif
11824  };
11825  
11826 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
11827 +{
11828 +       unsigned long flags;
11829 +
11830 +#ifndef CONFIG_PREEMPT_RT_BASE
11831 +       local_irq_save(flags);
11832 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
11833 +#else
11834 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
11835 +#endif
11836 +       return flags;
11837 +}
11838 +
11839 +static inline void
11840 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
11841 +{
11842 +#ifndef CONFIG_PREEMPT_RT_BASE
11843 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
11844 +       local_irq_restore(flags);
11845 +#else
11846 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
11847 +#endif
11848 +}
11849 +
11850 +static inline void buffer_head_init_locks(struct buffer_head *bh)
11851 +{
11852 +#ifdef CONFIG_PREEMPT_RT_BASE
11853 +       spin_lock_init(&bh->b_uptodate_lock);
11854 +#if IS_ENABLED(CONFIG_JBD2)
11855 +       spin_lock_init(&bh->b_state_lock);
11856 +       spin_lock_init(&bh->b_journal_head_lock);
11857 +#endif
11858 +#endif
11859 +}
11860 +
11861  /*
11862   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
11863   * and buffer_foo() functions.
11864 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
11865 index 8da263299754..0cc474291e08 100644
11866 --- a/include/linux/cgroup-defs.h
11867 +++ b/include/linux/cgroup-defs.h
11868 @@ -16,6 +16,7 @@
11869  #include <linux/percpu-refcount.h>
11870  #include <linux/percpu-rwsem.h>
11871  #include <linux/workqueue.h>
11872 +#include <linux/swork.h>
11873  
11874  #ifdef CONFIG_CGROUPS
11875  
11876 @@ -142,6 +143,7 @@ struct cgroup_subsys_state {
11877         /* percpu_ref killing and RCU release */
11878         struct rcu_head rcu_head;
11879         struct work_struct destroy_work;
11880 +       struct swork_event destroy_swork;
11881  };
11882  
11883  /*
11884 diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h
11885 index 1e6932222e11..17f413bbbedf 100644
11886 --- a/include/linux/clk/at91_pmc.h
11887 +++ b/include/linux/clk/at91_pmc.h
11888 @@ -16,18 +16,6 @@
11889  #ifndef AT91_PMC_H
11890  #define AT91_PMC_H
11891  
11892 -#ifndef __ASSEMBLY__
11893 -extern void __iomem *at91_pmc_base;
11894 -
11895 -#define at91_pmc_read(field) \
11896 -       readl_relaxed(at91_pmc_base + field)
11897 -
11898 -#define at91_pmc_write(field, value) \
11899 -       writel_relaxed(value, at91_pmc_base + field)
11900 -#else
11901 -.extern at91_pmc_base
11902 -#endif
11903 -
11904  #define        AT91_PMC_SCER           0x00                    /* System Clock Enable Register */
11905  #define        AT91_PMC_SCDR           0x04                    /* System Clock Disable Register */
11906  
11907 diff --git a/include/linux/completion.h b/include/linux/completion.h
11908 index 5d5aaae3af43..3bca1590e29f 100644
11909 --- a/include/linux/completion.h
11910 +++ b/include/linux/completion.h
11911 @@ -7,8 +7,7 @@
11912   * Atomic wait-for-completion handler data structures.
11913   * See kernel/sched/completion.c for details.
11914   */
11915 -
11916 -#include <linux/wait.h>
11917 +#include <linux/swait.h>
11918  
11919  /*
11920   * struct completion - structure used to maintain state for a "completion"
11921 @@ -24,11 +23,11 @@
11922   */
11923  struct completion {
11924         unsigned int done;
11925 -       wait_queue_head_t wait;
11926 +       struct swait_queue_head wait;
11927  };
11928  
11929  #define COMPLETION_INITIALIZER(work) \
11930 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11931 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11932  
11933  #define COMPLETION_INITIALIZER_ONSTACK(work) \
11934         ({ init_completion(&work); work; })
11935 @@ -73,7 +72,7 @@ struct completion {
11936  static inline void init_completion(struct completion *x)
11937  {
11938         x->done = 0;
11939 -       init_waitqueue_head(&x->wait);
11940 +       init_swait_queue_head(&x->wait);
11941  }
11942  
11943  /**
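
Because struct completion now embeds a struct swait_queue_head, only code that reached into the internal waitqueue (like the USB gadget hunks above) needs the swait_* calls; the public completion API is unchanged. A quick reminder of the unaffected caller-side usage, with my_done, producer() and consumer() as illustrative names:

#include <linux/completion.h>

static DECLARE_COMPLETION(my_done);

static void producer(void)
{
	/* ... finish the event, then wake the waiter ... */
	complete(&my_done);
}

static void consumer(void)
{
	/* Sleeps until producer() calls complete(); same as before. */
	wait_for_completion(&my_done);
}
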
11944 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
11945 index d2ca8c38f9c4..94041d803d0b 100644
11946 --- a/include/linux/cpu.h
11947 +++ b/include/linux/cpu.h
11948 @@ -231,6 +231,8 @@ extern void get_online_cpus(void);
11949  extern void put_online_cpus(void);
11950  extern void cpu_hotplug_disable(void);
11951  extern void cpu_hotplug_enable(void);
11952 +extern void pin_current_cpu(void);
11953 +extern void unpin_current_cpu(void);
11954  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
11955  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
11956  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
11957 @@ -248,6 +250,8 @@ static inline void cpu_hotplug_done(void) {}
11958  #define put_online_cpus()      do { } while (0)
11959  #define cpu_hotplug_disable()  do { } while (0)
11960  #define cpu_hotplug_enable()   do { } while (0)
11961 +static inline void pin_current_cpu(void) { }
11962 +static inline void unpin_current_cpu(void) { }
11963  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
11964  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
11965  /* These aren't inline functions due to a GCC bug. */
11966 diff --git a/include/linux/delay.h b/include/linux/delay.h
11967 index a6ecb34cf547..37caab306336 100644
11968 --- a/include/linux/delay.h
11969 +++ b/include/linux/delay.h
11970 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
11971         msleep(seconds * 1000);
11972  }
11973  
11974 +#ifdef CONFIG_PREEMPT_RT_FULL
11975 +extern void cpu_chill(void);
11976 +#else
11977 +# define cpu_chill()   cpu_relax()
11978 +#endif
11979 +
11980  #endif /* defined(_LINUX_DELAY_H) */
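
cpu_chill() is what the autofs, dcache and namespace hunks substitute for cpu_relax() in trylock retry loops: on RT the lock holder is a preemptible task, so busy-spinning could starve it, and cpu_chill() sleeps briefly instead. A minimal sketch of the retry idiom; take_both() is a made-up helper:

#include <linux/delay.h>
#include <linux/spinlock.h>

static void take_both(spinlock_t *a, spinlock_t *b)
{
retry:
	spin_lock(a);
	if (!spin_trylock(b)) {
		spin_unlock(a);
		/* cpu_relax() on mainline, a short sleep on RT. */
		cpu_chill();
		goto retry;
	}
	/* ... both locks held, do the work ... */
	spin_unlock(b);
	spin_unlock(a);
}
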
11981 diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
11982 index 60048c50404e..f2cd67624f18 100644
11983 --- a/include/linux/ftrace.h
11984 +++ b/include/linux/ftrace.h
11985 @@ -694,6 +694,18 @@ static inline void __ftrace_enabled_restore(int enabled)
11986  #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
11987  #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
11988  
11989 +static inline unsigned long get_lock_parent_ip(void)
11990 +{
11991 +       unsigned long addr = CALLER_ADDR0;
11992 +
11993 +       if (!in_lock_functions(addr))
11994 +               return addr;
11995 +       addr = CALLER_ADDR1;
11996 +       if (!in_lock_functions(addr))
11997 +               return addr;
11998 +       return CALLER_ADDR2;
11999 +}
12000 +
12001  #ifdef CONFIG_IRQSOFF_TRACER
12002    extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
12003    extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
12004 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
12005 index bb3f3297062a..a117a33ef72c 100644
12006 --- a/include/linux/highmem.h
12007 +++ b/include/linux/highmem.h
12008 @@ -7,6 +7,7 @@
12009  #include <linux/mm.h>
12010  #include <linux/uaccess.h>
12011  #include <linux/hardirq.h>
12012 +#include <linux/sched.h>
12013  
12014  #include <asm/cacheflush.h>
12015  
12016 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
12017  
12018  static inline void *kmap_atomic(struct page *page)
12019  {
12020 -       preempt_disable();
12021 +       preempt_disable_nort();
12022         pagefault_disable();
12023         return page_address(page);
12024  }
12025 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
12026  static inline void __kunmap_atomic(void *addr)
12027  {
12028         pagefault_enable();
12029 -       preempt_enable();
12030 +       preempt_enable_nort();
12031  }
12032  
12033  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
12034 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
12035  
12036  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
12037  
12038 +#ifndef CONFIG_PREEMPT_RT_FULL
12039  DECLARE_PER_CPU(int, __kmap_atomic_idx);
12040 +#endif
12041  
12042  static inline int kmap_atomic_idx_push(void)
12043  {
12044 +#ifndef CONFIG_PREEMPT_RT_FULL
12045         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
12046  
12047 -#ifdef CONFIG_DEBUG_HIGHMEM
12048 +# ifdef CONFIG_DEBUG_HIGHMEM
12049         WARN_ON_ONCE(in_irq() && !irqs_disabled());
12050         BUG_ON(idx >= KM_TYPE_NR);
12051 -#endif
12052 +# endif
12053         return idx;
12054 +#else
12055 +       current->kmap_idx++;
12056 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
12057 +       return current->kmap_idx - 1;
12058 +#endif
12059  }
12060  
12061  static inline int kmap_atomic_idx(void)
12062  {
12063 +#ifndef CONFIG_PREEMPT_RT_FULL
12064         return __this_cpu_read(__kmap_atomic_idx) - 1;
12065 +#else
12066 +       return current->kmap_idx - 1;
12067 +#endif
12068  }
12069  
12070  static inline void kmap_atomic_idx_pop(void)
12071  {
12072 -#ifdef CONFIG_DEBUG_HIGHMEM
12073 +#ifndef CONFIG_PREEMPT_RT_FULL
12074 +# ifdef CONFIG_DEBUG_HIGHMEM
12075         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
12076  
12077         BUG_ON(idx < 0);
12078 -#else
12079 +# else
12080         __this_cpu_dec(__kmap_atomic_idx);
12081 +# endif
12082 +#else
12083 +       current->kmap_idx--;
12084 +# ifdef CONFIG_DEBUG_HIGHMEM
12085 +       BUG_ON(current->kmap_idx < 0);
12086 +# endif
12087  #endif
12088  }
12089  
12090 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
12091 index 2ead22dd74a0..8fbcdfa5dc77 100644
12092 --- a/include/linux/hrtimer.h
12093 +++ b/include/linux/hrtimer.h
12094 @@ -87,6 +87,9 @@ enum hrtimer_restart {
12095   * @function:  timer expiry callback function
12096   * @base:      pointer to the timer base (per cpu and per clock)
12097   * @state:     state information (See bit values above)
12098 + * @cb_entry:  list entry to defer timers from hardirq context
12099 + * @irqsafe:   timer can run in hardirq context
12100 + * @praecox:   timer expiry time if expired at the time of programming
12101   * @is_rel:    Set if the timer was armed relative
12102   * @start_pid:  timer statistics field to store the pid of the task which
12103   *             started the timer
12104 @@ -103,6 +106,11 @@ struct hrtimer {
12105         enum hrtimer_restart            (*function)(struct hrtimer *);
12106         struct hrtimer_clock_base       *base;
12107         u8                              state;
12108 +       struct list_head                cb_entry;
12109 +       int                             irqsafe;
12110 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
12111 +       ktime_t                         praecox;
12112 +#endif
12113         u8                              is_rel;
12114  #ifdef CONFIG_TIMER_STATS
12115         int                             start_pid;
12116 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
12117         struct task_struct *task;
12118  };
12119  
12120 -#ifdef CONFIG_64BIT
12121  # define HRTIMER_CLOCK_BASE_ALIGN      64
12122 -#else
12123 -# define HRTIMER_CLOCK_BASE_ALIGN      32
12124 -#endif
12125  
12126  /**
12127   * struct hrtimer_clock_base - the timer base for a specific clock
12128 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
12129   *                     timer to a base on another cpu.
12130   * @clockid:           clock id for per_cpu support
12131   * @active:            red black tree root node for the active timers
12132 + * @expired:           list head for deferred timers.
12133   * @get_time:          function to retrieve the current time of the clock
12134   * @offset:            offset of this clock to the monotonic base
12135   */
12136 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
12137         int                     index;
12138         clockid_t               clockid;
12139         struct timerqueue_head  active;
12140 +       struct list_head        expired;
12141         ktime_t                 (*get_time)(void);
12142         ktime_t                 offset;
12143  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
12144 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
12145         raw_spinlock_t                  lock;
12146         seqcount_t                      seq;
12147         struct hrtimer                  *running;
12148 +       struct hrtimer                  *running_soft;
12149         unsigned int                    cpu;
12150         unsigned int                    active_bases;
12151         unsigned int                    clock_was_set_seq;
12152 @@ -203,6 +210,9 @@ struct hrtimer_cpu_base {
12153         unsigned int                    nr_hangs;
12154         unsigned int                    max_hang_time;
12155  #endif
12156 +#ifdef CONFIG_PREEMPT_RT_BASE
12157 +       wait_queue_head_t               wait;
12158 +#endif
12159         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
12160  } ____cacheline_aligned;
12161  
12162 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
12163         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
12164  }
12165  
12166 +/* Softirq preemption could deadlock timer removal */
12167 +#ifdef CONFIG_PREEMPT_RT_BASE
12168 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
12169 +#else
12170 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
12171 +#endif
12172 +
12173  /* Query timers: */
12174  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
12175  
12176 @@ -436,7 +453,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
12177   * Helper function to check, whether the timer is running the callback
12178   * function
12179   */
12180 -static inline int hrtimer_callback_running(struct hrtimer *timer)
12181 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
12182  {
12183         return timer->base->cpu_base->running == timer;
12184  }
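
hrtimer_wait_for_timer() is what the timerfd hunk above waits on: when the callback runs from the softirq thread on RT, the remover sleeps on cpu_base->wait instead of burning CPU, and on !RT it falls back to cpu_relax(). A hedged teardown sketch, with my_timer_stop() as an illustrative name:

#include <linux/hrtimer.h>

static void my_timer_stop(struct hrtimer *t)
{
	/*
	 * -1 from hrtimer_try_to_cancel() means the callback is running;
	 * wait for it to finish without busy-spinning against a
	 * preemptible softirq thread.
	 */
	while (hrtimer_try_to_cancel(t) < 0)
		hrtimer_wait_for_timer(t);
}
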
12185 diff --git a/include/linux/idr.h b/include/linux/idr.h
12186 index 013fd9bc4cb6..f62be0aec911 100644
12187 --- a/include/linux/idr.h
12188 +++ b/include/linux/idr.h
12189 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
12190   * Each idr_preload() should be matched with an invocation of this
12191   * function.  See idr_preload() for details.
12192   */
12193 +#ifdef CONFIG_PREEMPT_RT_FULL
12194 +void idr_preload_end(void);
12195 +#else
12196  static inline void idr_preload_end(void)
12197  {
12198         preempt_enable();
12199  }
12200 +#endif
12201  
12202  /**
12203   * idr_find - return pointer for given id
12204 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
12205 index 1c1ff7e4faa4..60fadde71a44 100644
12206 --- a/include/linux/init_task.h
12207 +++ b/include/linux/init_task.h
12208 @@ -148,9 +148,15 @@ extern struct task_group root_task_group;
12209  # define INIT_PERF_EVENTS(tsk)
12210  #endif
12211  
12212 +#ifdef CONFIG_PREEMPT_RT_BASE
12213 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
12214 +#else
12215 +# define INIT_TIMER_LIST
12216 +#endif
12217 +
12218  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
12219  # define INIT_VTIME(tsk)                                               \
12220 -       .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \
12221 +       .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
12222         .vtime_snap = 0,                                \
12223         .vtime_snap_whence = VTIME_SYS,
12224  #else
12225 @@ -239,6 +245,7 @@ extern struct task_group root_task_group;
12226         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
12227         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
12228         .timer_slack_ns = 50000, /* 50 usec default slack */            \
12229 +       INIT_TIMER_LIST                                                 \
12230         .pids = {                                                       \
12231                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
12232                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
12233 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
12234 index ad16809c8596..655cee096aed 100644
12235 --- a/include/linux/interrupt.h
12236 +++ b/include/linux/interrupt.h
12237 @@ -61,6 +61,7 @@
12238   *                interrupt handler after suspending interrupts. For system
12239   *                wakeup devices users need to implement wakeup detection in
12240   *                their interrupt handlers.
12241 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
12242   */
12243  #define IRQF_SHARED            0x00000080
12244  #define IRQF_PROBE_SHARED      0x00000100
12245 @@ -74,6 +75,7 @@
12246  #define IRQF_NO_THREAD         0x00010000
12247  #define IRQF_EARLY_RESUME      0x00020000
12248  #define IRQF_COND_SUSPEND      0x00040000
12249 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
12250  
12251  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
12252  
12253 @@ -186,7 +188,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
12254  #ifdef CONFIG_LOCKDEP
12255  # define local_irq_enable_in_hardirq() do { } while (0)
12256  #else
12257 -# define local_irq_enable_in_hardirq() local_irq_enable()
12258 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
12259  #endif
12260  
12261  extern void disable_irq_nosync(unsigned int irq);
12262 @@ -206,6 +208,7 @@ extern void resume_device_irqs(void);
12263   * @irq:               Interrupt to which notification applies
12264   * @kref:              Reference count, for internal use
12265   * @work:              Work item, for internal use
12266 + * @list:              List item for deferred callbacks
12267   * @notify:            Function to be called on change.  This will be
12268   *                     called in process context.
12269   * @release:           Function to be called on release.  This will be
12270 @@ -217,6 +220,7 @@ struct irq_affinity_notify {
12271         unsigned int irq;
12272         struct kref kref;
12273         struct work_struct work;
12274 +       struct list_head list;
12275         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
12276         void (*release)(struct kref *ref);
12277  };
12278 @@ -379,9 +383,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
12279                                  bool state);
12280  
12281  #ifdef CONFIG_IRQ_FORCED_THREADING
12282 +# ifndef CONFIG_PREEMPT_RT_BASE
12283  extern bool force_irqthreads;
12284 +# else
12285 +#  define force_irqthreads     (true)
12286 +# endif
12287  #else
12288 -#define force_irqthreads       (0)
12289 +#define force_irqthreads       (false)
12290  #endif
12291  
12292  #ifndef __ARCH_SET_SOFTIRQ_PENDING
12293 @@ -438,9 +446,10 @@ struct softirq_action
12294         void    (*action)(struct softirq_action *);
12295  };
12296  
12297 +#ifndef CONFIG_PREEMPT_RT_FULL
12298  asmlinkage void do_softirq(void);
12299  asmlinkage void __do_softirq(void);
12300 -
12301 +static inline void thread_do_softirq(void) { do_softirq(); }
12302  #ifdef __ARCH_HAS_DO_SOFTIRQ
12303  void do_softirq_own_stack(void);
12304  #else
12305 @@ -449,13 +458,25 @@ static inline void do_softirq_own_stack(void)
12306         __do_softirq();
12307  }
12308  #endif
12309 +#else
12310 +extern void thread_do_softirq(void);
12311 +#endif
12312  
12313  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
12314  extern void softirq_init(void);
12315  extern void __raise_softirq_irqoff(unsigned int nr);
12316 +#ifdef CONFIG_PREEMPT_RT_FULL
12317 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
12318 +#else
12319 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
12320 +{
12321 +       __raise_softirq_irqoff(nr);
12322 +}
12323 +#endif
12324  
12325  extern void raise_softirq_irqoff(unsigned int nr);
12326  extern void raise_softirq(unsigned int nr);
12327 +extern void softirq_check_pending_idle(void);
12328  
12329  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
12330  
12331 @@ -477,8 +498,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
12332       to be executed on some cpu at least once after this.
12333     * If the tasklet is already scheduled, but its execution is still not
12334       started, it will be executed only once.
12335 -   * If this tasklet is already running on another CPU (or schedule is called
12336 -     from tasklet itself), it is rescheduled for later.
12337 +   * If this tasklet is already running on another CPU, it is rescheduled
12338 +     for later.
12339 +   * Schedule must not be called from the tasklet itself (a lockup occurs).
12340     * Tasklet is strictly serialized wrt itself, but not
12341       wrt another tasklets. If client needs some intertask synchronization,
12342       he makes it with spinlocks.
12343 @@ -503,27 +525,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
12344  enum
12345  {
12346         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
12347 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
12348 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
12349 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
12350  };
12351  
12352 -#ifdef CONFIG_SMP
12353 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
12354 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
12355 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
12356 +
12357 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
12358  static inline int tasklet_trylock(struct tasklet_struct *t)
12359  {
12360         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
12361  }
12362  
12363 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
12364 +{
12365 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
12366 +}
12367 +
12368  static inline void tasklet_unlock(struct tasklet_struct *t)
12369  {
12370         smp_mb__before_atomic();
12371         clear_bit(TASKLET_STATE_RUN, &(t)->state);
12372  }
12373  
12374 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
12375 -{
12376 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
12377 -}
12378 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
12379 +
12380  #else
12381  #define tasklet_trylock(t) 1
12382 +#define tasklet_tryunlock(t)   1
12383  #define tasklet_unlock_wait(t) do { } while (0)
12384  #define tasklet_unlock(t) do { } while (0)
12385  #endif
12386 @@ -572,12 +603,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
12387         smp_mb();
12388  }
12389  
12390 -static inline void tasklet_enable(struct tasklet_struct *t)
12391 -{
12392 -       smp_mb__before_atomic();
12393 -       atomic_dec(&t->count);
12394 -}
12395 -
12396 +extern void tasklet_enable(struct tasklet_struct *t);
12397  extern void tasklet_kill(struct tasklet_struct *t);
12398  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
12399  extern void tasklet_init(struct tasklet_struct *t,
12400 @@ -608,6 +634,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
12401         tasklet_kill(&ttimer->tasklet);
12402  }
12403  
12404 +#ifdef CONFIG_PREEMPT_RT_FULL
12405 +extern void softirq_early_init(void);
12406 +#else
12407 +static inline void softirq_early_init(void) { }
12408 +#endif
12409 +
12410  /*
12411   * Autoprobing for irqs:
12412   *
12413 diff --git a/include/linux/irq.h b/include/linux/irq.h
12414 index f7cade00c525..dac9e11ba037 100644
12415 --- a/include/linux/irq.h
12416 +++ b/include/linux/irq.h
12417 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
12418   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
12419   *                               it from the spurious interrupt detection
12420   *                               mechanism and from core side polling.
12421 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
12422   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
12423   */
12424  enum {
12425 @@ -99,13 +100,14 @@ enum {
12426         IRQ_PER_CPU_DEVID       = (1 << 17),
12427         IRQ_IS_POLLED           = (1 << 18),
12428         IRQ_DISABLE_UNLAZY      = (1 << 19),
12429 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
12430  };
12431  
12432  #define IRQF_MODIFY_MASK       \
12433         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
12434          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
12435          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
12436 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
12437 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
12438  
12439  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
12440  
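
IRQF_NO_SOFTIRQ_CALL and the matching IRQ_NO_SOFTIRQ_CALL state bit are RT-only: they keep a forced-threaded handler from processing pending softirqs after the primary handler, and force_irqthreads becomes a compile-time true on RT_BASE. A hedged sketch of requesting an interrupt with the new flag; my_irq_handler(), my_setup_irq() and "my-dev" are made up, and whether a given driver should set the flag depends on its softirq interaction:

#include <linux/interrupt.h>

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	/* With forced threading this runs in the per-IRQ kernel thread. */
	return IRQ_HANDLED;
}

static int my_setup_irq(unsigned int irq, void *dev)
{
	/* RT-only flag: skip softirq processing in this irq thread. */
	return request_irq(irq, my_irq_handler,
			   IRQF_NO_SOFTIRQ_CALL, "my-dev", dev);
}
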
12441 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
12442 index 47b9ebd4a74f..2543aab05daa 100644
12443 --- a/include/linux/irq_work.h
12444 +++ b/include/linux/irq_work.h
12445 @@ -16,6 +16,7 @@
12446  #define IRQ_WORK_BUSY          2UL
12447  #define IRQ_WORK_FLAGS         3UL
12448  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
12449 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
12450  
12451  struct irq_work {
12452         unsigned long flags;
12453 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
12454  static inline void irq_work_run(void) { }
12455  #endif
12456  
12457 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12458 +void irq_work_tick_soft(void);
12459 +#else
12460 +static inline void irq_work_tick_soft(void) { }
12461 +#endif
12462 +
12463  #endif /* _LINUX_IRQ_WORK_H */
12464 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
12465 index a587a33363c7..ad57402a242d 100644
12466 --- a/include/linux/irqdesc.h
12467 +++ b/include/linux/irqdesc.h
12468 @@ -61,6 +61,7 @@ struct irq_desc {
12469         unsigned int            irqs_unhandled;
12470         atomic_t                threads_handled;
12471         int                     threads_handled_last;
12472 +       u64                     random_ip;
12473         raw_spinlock_t          lock;
12474         struct cpumask          *percpu_enabled;
12475  #ifdef CONFIG_SMP
12476 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
12477 index 5dd1272d1ab2..9b77034f7c5e 100644
12478 --- a/include/linux/irqflags.h
12479 +++ b/include/linux/irqflags.h
12480 @@ -25,8 +25,6 @@
12481  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
12482  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
12483  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
12484 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
12485 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
12486  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
12487  #else
12488  # define trace_hardirqs_on()           do { } while (0)
12489 @@ -39,9 +37,15 @@
12490  # define trace_softirqs_enabled(p)     0
12491  # define trace_hardirq_enter()         do { } while (0)
12492  # define trace_hardirq_exit()          do { } while (0)
12493 +# define INIT_TRACE_IRQFLAGS
12494 +#endif
12495 +
12496 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
12497 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
12498 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
12499 +#else
12500  # define lockdep_softirq_enter()       do { } while (0)
12501  # define lockdep_softirq_exit()                do { } while (0)
12502 -# define INIT_TRACE_IRQFLAGS
12503  #endif
12504  
12505  #if defined(CONFIG_IRQSOFF_TRACER) || \
12506 @@ -148,4 +152,23 @@
12507  
12508  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
12509  
12510 +/*
12511 + * local_irq* variants depending on RT/!RT
12512 + */
12513 +#ifdef CONFIG_PREEMPT_RT_FULL
12514 +# define local_irq_disable_nort()      do { } while (0)
12515 +# define local_irq_enable_nort()       do { } while (0)
12516 +# define local_irq_save_nort(flags)    local_save_flags(flags)
12517 +# define local_irq_restore_nort(flags) (void)(flags)
12518 +# define local_irq_disable_rt()                local_irq_disable()
12519 +# define local_irq_enable_rt()         local_irq_enable()
12520 +#else
12521 +# define local_irq_disable_nort()      local_irq_disable()
12522 +# define local_irq_enable_nort()       local_irq_enable()
12523 +# define local_irq_save_nort(flags)    local_irq_save(flags)
12524 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
12525 +# define local_irq_disable_rt()                do { } while (0)
12526 +# define local_irq_enable_rt()         do { } while (0)
12527 +#endif
12528 +
12529  #endif
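The local_irq_*_nort()/_rt() wrappers added above let shared code say "disable hard interrupts only on non-RT" (or only on RT): on PREEMPT_RT_FULL the _nort variants merely save the flags word, because the section is expected to be serialized by a sleeping lock instead, while on non-RT kernels they map to the plain local_irq_* primitives. A minimal usage sketch, with a made-up lock and counter, of the pattern the RT series applies to many drivers:

        #include <linux/irqflags.h>
        #include <linux/spinlock.h>

        static DEFINE_SPINLOCK(demo_lock);      /* hypothetical lock */
        static unsigned long demo_events;       /* hypothetical counter */

        static void demo_account_event(void)
        {
                unsigned long flags;

                spin_lock(&demo_lock);
                /*
                 * On !RT this still hard-disables interrupts as before.
                 * On RT the spinlock (a sleeping lock there) is considered
                 * sufficient and the call degrades to a flags save only.
                 */
                local_irq_save_nort(flags);
                demo_events++;
                local_irq_restore_nort(flags);
                spin_unlock(&demo_lock);
        }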
12530 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
12531 index 65407f6c9120..eb5aabe4e18c 100644
12532 --- a/include/linux/jbd2.h
12533 +++ b/include/linux/jbd2.h
12534 @@ -352,32 +352,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
12535  
12536  static inline void jbd_lock_bh_state(struct buffer_head *bh)
12537  {
12538 +#ifndef CONFIG_PREEMPT_RT_BASE
12539         bit_spin_lock(BH_State, &bh->b_state);
12540 +#else
12541 +       spin_lock(&bh->b_state_lock);
12542 +#endif
12543  }
12544  
12545  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
12546  {
12547 +#ifndef CONFIG_PREEMPT_RT_BASE
12548         return bit_spin_trylock(BH_State, &bh->b_state);
12549 +#else
12550 +       return spin_trylock(&bh->b_state_lock);
12551 +#endif
12552  }
12553  
12554  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
12555  {
12556 +#ifndef CONFIG_PREEMPT_RT_BASE
12557         return bit_spin_is_locked(BH_State, &bh->b_state);
12558 +#else
12559 +       return spin_is_locked(&bh->b_state_lock);
12560 +#endif
12561  }
12562  
12563  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
12564  {
12565 +#ifndef CONFIG_PREEMPT_RT_BASE
12566         bit_spin_unlock(BH_State, &bh->b_state);
12567 +#else
12568 +       spin_unlock(&bh->b_state_lock);
12569 +#endif
12570  }
12571  
12572  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
12573  {
12574 +#ifndef CONFIG_PREEMPT_RT_BASE
12575         bit_spin_lock(BH_JournalHead, &bh->b_state);
12576 +#else
12577 +       spin_lock(&bh->b_journal_head_lock);
12578 +#endif
12579  }
12580  
12581  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
12582  {
12583 +#ifndef CONFIG_PREEMPT_RT_BASE
12584         bit_spin_unlock(BH_JournalHead, &bh->b_state);
12585 +#else
12586 +       spin_unlock(&bh->b_journal_head_lock);
12587 +#endif
12588  }
12589  
12590  #define J_ASSERT(assert)       BUG_ON(!(assert))
12591 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
12592 index a19bcf9e762e..897495386446 100644
12593 --- a/include/linux/kdb.h
12594 +++ b/include/linux/kdb.h
12595 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
12596  extern __printf(1, 2) int kdb_printf(const char *, ...);
12597  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
12598  
12599 +#define in_kdb_printk()        (kdb_trap_printk)
12600  extern void kdb_init(int level);
12601  
12602  /* Access to kdb specific polling devices */
12603 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
12604  extern int kdb_unregister(char *);
12605  #else /* ! CONFIG_KGDB_KDB */
12606  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
12607 +#define in_kdb_printk() (0)
12608  static inline void kdb_init(int level) {}
12609  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
12610                                char *help, short minlen) { return 0; }
12611 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
12612 index 50220cab738c..d68f639f7330 100644
12613 --- a/include/linux/kernel.h
12614 +++ b/include/linux/kernel.h
12615 @@ -188,6 +188,9 @@ extern int _cond_resched(void);
12616   */
12617  # define might_sleep() \
12618         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12619 +
12620 +# define might_sleep_no_state_check() \
12621 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12622  # define sched_annotate_sleep()        (current->task_state_change = 0)
12623  #else
12624    static inline void ___might_sleep(const char *file, int line,
12625 @@ -195,6 +198,7 @@ extern int _cond_resched(void);
12626    static inline void __might_sleep(const char *file, int line,
12627                                    int preempt_offset) { }
12628  # define might_sleep() do { might_resched(); } while (0)
12629 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
12630  # define sched_annotate_sleep() do { } while (0)
12631  #endif
12632  
12633 @@ -255,6 +259,7 @@ extern long (*panic_blink)(int state);
12634  __printf(1, 2)
12635  void panic(const char *fmt, ...)
12636         __noreturn __cold;
12637 +void nmi_panic(struct pt_regs *regs, const char *msg);
12638  extern void oops_enter(void);
12639  extern void oops_exit(void);
12640  void print_oops_end_marker(void);
12641 @@ -448,6 +453,14 @@ extern int sysctl_panic_on_stackoverflow;
12642  extern bool crash_kexec_post_notifiers;
12643  
12644  /*
12645 + * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
12646 + * holds a CPU number which is executing panic() currently. A value of
12647 + * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
12648 + */
12649 +extern atomic_t panic_cpu;
12650 +#define PANIC_CPU_INVALID      -1
12651 +
12652 +/*
12653   * Only to be used by arch init code. If the user over-wrote the default
12654   * CONFIG_PANIC_TIMEOUT, honor it.
12655   */
12656 @@ -475,6 +488,7 @@ extern enum system_states {
12657         SYSTEM_HALT,
12658         SYSTEM_POWER_OFF,
12659         SYSTEM_RESTART,
12660 +       SYSTEM_SUSPEND,
12661  } system_state;
12662  
12663  #define TAINT_PROPRIETARY_MODULE       0
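For orientation, panic_cpu declared above is the arbiter between concurrent panic() and crash_kexec() callers: the first CPU to swap its own ID into panic_cpu proceeds, later CPUs back off. A rough sketch of that cmpxchg idiom (not the actual nmi_panic() body, which is not shown in this hunk; names below are illustrative):

        #include <linux/atomic.h>
        #include <linux/kernel.h>
        #include <linux/smp.h>

        static void demo_fatal(void)
        {
                int old_cpu, this_cpu = raw_smp_processor_id();

                old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
                if (old_cpu == PANIC_CPU_INVALID)
                        panic("demo: fatal error");     /* this CPU owns the panic path */
                /* otherwise another CPU is already panicking; do not re-enter panic() */
        }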
12664 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
12665 index c923350ca20a..c690acc6900e 100644
12666 --- a/include/linux/kvm_host.h
12667 +++ b/include/linux/kvm_host.h
12668 @@ -25,6 +25,7 @@
12669  #include <linux/irqflags.h>
12670  #include <linux/context_tracking.h>
12671  #include <linux/irqbypass.h>
12672 +#include <linux/swait.h>
12673  #include <asm/signal.h>
12674  
12675  #include <linux/kvm.h>
12676 @@ -243,7 +244,7 @@ struct kvm_vcpu {
12677         int fpu_active;
12678         int guest_fpu_loaded, guest_xcr0_loaded;
12679         unsigned char fpu_counter;
12680 -       wait_queue_head_t wq;
12681 +       struct swait_queue_head wq;
12682         struct pid *pid;
12683         int sigset_active;
12684         sigset_t sigset;
12685 @@ -794,7 +795,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
12686  }
12687  #endif
12688  
12689 -static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
12690 +static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
12691  {
12692  #ifdef __KVM_HAVE_ARCH_WQP
12693         return vcpu->arch.wqp;
12694 diff --git a/include/linux/lglock.h b/include/linux/lglock.h
12695 index c92ebd100d9b..6f035f635d0e 100644
12696 --- a/include/linux/lglock.h
12697 +++ b/include/linux/lglock.h
12698 @@ -34,13 +34,30 @@
12699  #endif
12700  
12701  struct lglock {
12702 +#ifdef CONFIG_PREEMPT_RT_FULL
12703 +       struct rt_mutex __percpu *lock;
12704 +#else
12705         arch_spinlock_t __percpu *lock;
12706 +#endif
12707  #ifdef CONFIG_DEBUG_LOCK_ALLOC
12708         struct lock_class_key lock_key;
12709         struct lockdep_map    lock_dep_map;
12710  #endif
12711  };
12712  
12713 +#ifdef CONFIG_PREEMPT_RT_FULL
12714 +# define DEFINE_LGLOCK(name)                                           \
12715 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
12716 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
12717 +       struct lglock name = { .lock = &name ## _lock }
12718 +
12719 +# define DEFINE_STATIC_LGLOCK(name)                                    \
12720 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
12721 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
12722 +       static struct lglock name = { .lock = &name ## _lock }
12723 +
12724 +#else
12725 +
12726  #define DEFINE_LGLOCK(name)                                            \
12727         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
12728         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
12729 @@ -50,6 +67,7 @@ struct lglock {
12730         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
12731         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
12732         static struct lglock name = { .lock = &name ## _lock }
12733 +#endif
12734  
12735  void lg_lock_init(struct lglock *lg, char *name);
12736  
12737 @@ -64,6 +82,12 @@ void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
12738  void lg_global_lock(struct lglock *lg);
12739  void lg_global_unlock(struct lglock *lg);
12740  
12741 +#ifndef CONFIG_PREEMPT_RT_FULL
12742 +#define lg_global_trylock_relax(name)  lg_global_lock(name)
12743 +#else
12744 +void lg_global_trylock_relax(struct lglock *lg);
12745 +#endif
12746 +
12747  #else
12748  /* When !CONFIG_SMP, map lglock to spinlock */
12749  #define lglock spinlock
12750 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
12751 index 8132214e8efd..89ffaa7bd342 100644
12752 --- a/include/linux/list_bl.h
12753 +++ b/include/linux/list_bl.h
12754 @@ -2,6 +2,7 @@
12755  #define _LINUX_LIST_BL_H
12756  
12757  #include <linux/list.h>
12758 +#include <linux/spinlock.h>
12759  #include <linux/bit_spinlock.h>
12760  
12761  /*
12762 @@ -32,13 +33,24 @@
12763  
12764  struct hlist_bl_head {
12765         struct hlist_bl_node *first;
12766 +#ifdef CONFIG_PREEMPT_RT_BASE
12767 +       raw_spinlock_t lock;
12768 +#endif
12769  };
12770  
12771  struct hlist_bl_node {
12772         struct hlist_bl_node *next, **pprev;
12773  };
12774 -#define INIT_HLIST_BL_HEAD(ptr) \
12775 -       ((ptr)->first = NULL)
12776 +
12777 +#ifdef CONFIG_PREEMPT_RT_BASE
12778 +#define INIT_HLIST_BL_HEAD(h)          \
12779 +do {                                   \
12780 +       (h)->first = NULL;              \
12781 +       raw_spin_lock_init(&(h)->lock); \
12782 +} while (0)
12783 +#else
12784 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
12785 +#endif
12786  
12787  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
12788  {
12789 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
12790  
12791  static inline void hlist_bl_lock(struct hlist_bl_head *b)
12792  {
12793 +#ifndef CONFIG_PREEMPT_RT_BASE
12794         bit_spin_lock(0, (unsigned long *)b);
12795 +#else
12796 +       raw_spin_lock(&b->lock);
12797 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12798 +       __set_bit(0, (unsigned long *)b);
12799 +#endif
12800 +#endif
12801  }
12802  
12803  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
12804  {
12805 +#ifndef CONFIG_PREEMPT_RT_BASE
12806         __bit_spin_unlock(0, (unsigned long *)b);
12807 +#else
12808 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12809 +       __clear_bit(0, (unsigned long *)b);
12810 +#endif
12811 +       raw_spin_unlock(&b->lock);
12812 +#endif
12813  }
12814  
12815  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
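Because hlist_bl users never touch the lock bit directly, the change above is transparent to callers: the usual locking pattern works whether bit 0 of ->first or the new raw spinlock provides the serialization. A short sketch with a hypothetical hash bucket:

        #include <linux/list_bl.h>

        static struct hlist_bl_head demo_bucket;        /* hypothetical bucket */

        static void demo_setup(void)
        {
                INIT_HLIST_BL_HEAD(&demo_bucket);       /* also inits the RT lock */
        }

        static void demo_insert(struct hlist_bl_node *n)
        {
                hlist_bl_lock(&demo_bucket);    /* bit spinlock, or raw_spinlock on RT */
                hlist_bl_add_head(n, &demo_bucket);
                hlist_bl_unlock(&demo_bucket);
        }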
12816 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
12817 new file mode 100644
12818 index 000000000000..e572a3971631
12819 --- /dev/null
12820 +++ b/include/linux/locallock.h
12821 @@ -0,0 +1,276 @@
12822 +#ifndef _LINUX_LOCALLOCK_H
12823 +#define _LINUX_LOCALLOCK_H
12824 +
12825 +#include <linux/percpu.h>
12826 +#include <linux/spinlock.h>
12827 +
12828 +#ifdef CONFIG_PREEMPT_RT_BASE
12829 +
12830 +#ifdef CONFIG_DEBUG_SPINLOCK
12831 +# define LL_WARN(cond) WARN_ON(cond)
12832 +#else
12833 +# define LL_WARN(cond) do { } while (0)
12834 +#endif
12835 +
12836 +/*
12837 + * per cpu lock based substitute for local_irq_*()
12838 + */
12839 +struct local_irq_lock {
12840 +       spinlock_t              lock;
12841 +       struct task_struct      *owner;
12842 +       int                     nestcnt;
12843 +       unsigned long           flags;
12844 +};
12845 +
12846 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
12847 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
12848 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
12849 +
12850 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
12851 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
12852 +
12853 +#define local_irq_lock_init(lvar)                                      \
12854 +       do {                                                            \
12855 +               int __cpu;                                              \
12856 +               for_each_possible_cpu(__cpu)                            \
12857 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
12858 +       } while (0)
12859 +
12860 +/*
12861 + * spin_lock|trylock|unlock_local flavour that does not migrate disable
12862 + * used for __local_lock|trylock|unlock where get_local_var/put_local_var
12863 + * already takes care of the migrate_disable/enable
12864 + * for CONFIG_PREEMPT_BASE map to the normal spin_* calls.
12865 + */
12866 +#ifdef CONFIG_PREEMPT_RT_FULL
12867 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
12868 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
12869 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
12870 +#else
12871 +# define spin_lock_local(lock)                 spin_lock(lock)
12872 +# define spin_trylock_local(lock)              spin_trylock(lock)
12873 +# define spin_unlock_local(lock)               spin_unlock(lock)
12874 +#endif
12875 +
12876 +static inline void __local_lock(struct local_irq_lock *lv)
12877 +{
12878 +       if (lv->owner != current) {
12879 +               spin_lock_local(&lv->lock);
12880 +               LL_WARN(lv->owner);
12881 +               LL_WARN(lv->nestcnt);
12882 +               lv->owner = current;
12883 +       }
12884 +       lv->nestcnt++;
12885 +}
12886 +
12887 +#define local_lock(lvar)                                       \
12888 +       do { __local_lock(&get_local_var(lvar)); } while (0)
12889 +
12890 +#define local_lock_on(lvar, cpu)                               \
12891 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
12892 +
12893 +static inline int __local_trylock(struct local_irq_lock *lv)
12894 +{
12895 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
12896 +               LL_WARN(lv->owner);
12897 +               LL_WARN(lv->nestcnt);
12898 +               lv->owner = current;
12899 +               lv->nestcnt = 1;
12900 +               return 1;
12901 +       }
12902 +       return 0;
12903 +}
12904 +
12905 +#define local_trylock(lvar)                                            \
12906 +       ({                                                              \
12907 +               int __locked;                                           \
12908 +               __locked = __local_trylock(&get_local_var(lvar));       \
12909 +               if (!__locked)                                          \
12910 +                       put_local_var(lvar);                            \
12911 +               __locked;                                               \
12912 +       })
12913 +
12914 +static inline void __local_unlock(struct local_irq_lock *lv)
12915 +{
12916 +       LL_WARN(lv->nestcnt == 0);
12917 +       LL_WARN(lv->owner != current);
12918 +       if (--lv->nestcnt)
12919 +               return;
12920 +
12921 +       lv->owner = NULL;
12922 +       spin_unlock_local(&lv->lock);
12923 +}
12924 +
12925 +#define local_unlock(lvar)                                     \
12926 +       do {                                                    \
12927 +               __local_unlock(this_cpu_ptr(&lvar));            \
12928 +               put_local_var(lvar);                            \
12929 +       } while (0)
12930 +
12931 +#define local_unlock_on(lvar, cpu)                       \
12932 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
12933 +
12934 +static inline void __local_lock_irq(struct local_irq_lock *lv)
12935 +{
12936 +       spin_lock_irqsave(&lv->lock, lv->flags);
12937 +       LL_WARN(lv->owner);
12938 +       LL_WARN(lv->nestcnt);
12939 +       lv->owner = current;
12940 +       lv->nestcnt = 1;
12941 +}
12942 +
12943 +#define local_lock_irq(lvar)                                           \
12944 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
12945 +
12946 +#define local_lock_irq_on(lvar, cpu)                                   \
12947 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
12948 +
12949 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
12950 +{
12951 +       LL_WARN(!lv->nestcnt);
12952 +       LL_WARN(lv->owner != current);
12953 +       lv->owner = NULL;
12954 +       lv->nestcnt = 0;
12955 +       spin_unlock_irq(&lv->lock);
12956 +}
12957 +
12958 +#define local_unlock_irq(lvar)                                         \
12959 +       do {                                                            \
12960 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
12961 +               put_local_var(lvar);                                    \
12962 +       } while (0)
12963 +
12964 +#define local_unlock_irq_on(lvar, cpu)                                 \
12965 +       do {                                                            \
12966 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
12967 +       } while (0)
12968 +
12969 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
12970 +{
12971 +       if (lv->owner != current) {
12972 +               __local_lock_irq(lv);
12973 +               return 0;
12974 +       } else {
12975 +               lv->nestcnt++;
12976 +               return 1;
12977 +       }
12978 +}
12979 +
12980 +#define local_lock_irqsave(lvar, _flags)                               \
12981 +       do {                                                            \
12982 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
12983 +                       put_local_var(lvar);                            \
12984 +               _flags = __this_cpu_read(lvar.flags);                   \
12985 +       } while (0)
12986 +
12987 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
12988 +       do {                                                            \
12989 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
12990 +               _flags = per_cpu(lvar, cpu).flags;                      \
12991 +       } while (0)
12992 +
12993 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
12994 +                                           unsigned long flags)
12995 +{
12996 +       LL_WARN(!lv->nestcnt);
12997 +       LL_WARN(lv->owner != current);
12998 +       if (--lv->nestcnt)
12999 +               return 0;
13000 +
13001 +       lv->owner = NULL;
13002 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
13003 +       return 1;
13004 +}
13005 +
13006 +#define local_unlock_irqrestore(lvar, flags)                           \
13007 +       do {                                                            \
13008 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
13009 +                       put_local_var(lvar);                            \
13010 +       } while (0)
13011 +
13012 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
13013 +       do {                                                            \
13014 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
13015 +       } while (0)
13016 +
13017 +#define local_spin_trylock_irq(lvar, lock)                             \
13018 +       ({                                                              \
13019 +               int __locked;                                           \
13020 +               local_lock_irq(lvar);                                   \
13021 +               __locked = spin_trylock(lock);                          \
13022 +               if (!__locked)                                          \
13023 +                       local_unlock_irq(lvar);                         \
13024 +               __locked;                                               \
13025 +       })
13026 +
13027 +#define local_spin_lock_irq(lvar, lock)                                        \
13028 +       do {                                                            \
13029 +               local_lock_irq(lvar);                                   \
13030 +               spin_lock(lock);                                        \
13031 +       } while (0)
13032 +
13033 +#define local_spin_unlock_irq(lvar, lock)                              \
13034 +       do {                                                            \
13035 +               spin_unlock(lock);                                      \
13036 +               local_unlock_irq(lvar);                                 \
13037 +       } while (0)
13038 +
13039 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
13040 +       do {                                                            \
13041 +               local_lock_irqsave(lvar, flags);                        \
13042 +               spin_lock(lock);                                        \
13043 +       } while (0)
13044 +
13045 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
13046 +       do {                                                            \
13047 +               spin_unlock(lock);                                      \
13048 +               local_unlock_irqrestore(lvar, flags);                   \
13049 +       } while (0)
13050 +
13051 +#define get_locked_var(lvar, var)                                      \
13052 +       (*({                                                            \
13053 +               local_lock(lvar);                                       \
13054 +               this_cpu_ptr(&var);                                     \
13055 +       }))
13056 +
13057 +#define put_locked_var(lvar, var)      local_unlock(lvar);
13058 +
13059 +#define local_lock_cpu(lvar)                                           \
13060 +       ({                                                              \
13061 +               local_lock(lvar);                                       \
13062 +               smp_processor_id();                                     \
13063 +       })
13064 +
13065 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
13066 +
13067 +#else /* PREEMPT_RT_BASE */
13068 +
13069 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
13070 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
13071 +
13072 +static inline void local_irq_lock_init(int lvar) { }
13073 +
13074 +#define local_lock(lvar)                       preempt_disable()
13075 +#define local_unlock(lvar)                     preempt_enable()
13076 +#define local_lock_irq(lvar)                   local_irq_disable()
13077 +#define local_unlock_irq(lvar)                 local_irq_enable()
13078 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
13079 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
13080 +
13081 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
13082 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
13083 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
13084 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
13085 +       spin_lock_irqsave(lock, flags)
13086 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
13087 +       spin_unlock_irqrestore(lock, flags)
13088 +
13089 +#define get_locked_var(lvar, var)              get_cpu_var(var)
13090 +#define put_locked_var(lvar, var)              put_cpu_var(var)
13091 +
13092 +#define local_lock_cpu(lvar)                   get_cpu()
13093 +#define local_unlock_cpu(lvar)                 put_cpu()
13094 +
13095 +#endif
13096 +
13097 +#endif
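locallock.h is the central RT substitute for "disable interrupts or preemption to protect per-CPU data": on non-RT kernels the macros collapse to the familiar local_irq_save()/preempt_disable() calls, on RT they take a per-CPU spinlock so the section stays preemptible. A minimal sketch with a hypothetical lock and per-CPU counter:

        #include <linux/locallock.h>
        #include <linux/percpu.h>

        static DEFINE_LOCAL_IRQ_LOCK(demo_lock);        /* hypothetical lock */
        static DEFINE_PER_CPU(unsigned long, demo_count);

        static void demo_inc(void)
        {
                unsigned long flags;

                /* local_irq_save() on !RT, per-CPU sleeping spinlock on RT */
                local_lock_irqsave(demo_lock, flags);
                __this_cpu_inc(demo_count);
                local_unlock_irqrestore(demo_lock, flags);
        }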
13098 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
13099 index f8d1492a114f..b238ebfbb4d6 100644
13100 --- a/include/linux/mm_types.h
13101 +++ b/include/linux/mm_types.h
13102 @@ -11,6 +11,7 @@
13103  #include <linux/completion.h>
13104  #include <linux/cpumask.h>
13105  #include <linux/uprobes.h>
13106 +#include <linux/rcupdate.h>
13107  #include <linux/page-flags-layout.h>
13108  #include <asm/page.h>
13109  #include <asm/mmu.h>
13110 @@ -504,6 +505,9 @@ struct mm_struct {
13111         bool tlb_flush_pending;
13112  #endif
13113         struct uprobes_state uprobes_state;
13114 +#ifdef CONFIG_PREEMPT_RT_BASE
13115 +       struct rcu_head delayed_drop;
13116 +#endif
13117  #ifdef CONFIG_X86_INTEL_MPX
13118         /* address of the bounds directory */
13119         void __user *bd_addr;
13120 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
13121 index 2cb7531e7d7a..b3fdfc820216 100644
13122 --- a/include/linux/mutex.h
13123 +++ b/include/linux/mutex.h
13124 @@ -19,6 +19,17 @@
13125  #include <asm/processor.h>
13126  #include <linux/osq_lock.h>
13127  
13128 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13129 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13130 +       , .dep_map = { .name = #lockname }
13131 +#else
13132 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13133 +#endif
13134 +
13135 +#ifdef CONFIG_PREEMPT_RT_FULL
13136 +# include <linux/mutex_rt.h>
13137 +#else
13138 +
13139  /*
13140   * Simple, straightforward mutexes with strict semantics:
13141   *
13142 @@ -99,13 +110,6 @@ do {                                                        \
13143  static inline void mutex_destroy(struct mutex *lock) {}
13144  #endif
13145  
13146 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
13147 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13148 -               , .dep_map = { .name = #lockname }
13149 -#else
13150 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13151 -#endif
13152 -
13153  #define __MUTEX_INITIALIZER(lockname) \
13154                 { .count = ATOMIC_INIT(1) \
13155                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
13156 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
13157  extern int mutex_trylock(struct mutex *lock);
13158  extern void mutex_unlock(struct mutex *lock);
13159  
13160 +#endif /* !PREEMPT_RT_FULL */
13161 +
13162  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
13163  
13164  #endif /* __LINUX_MUTEX_H */
13165 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
13166 new file mode 100644
13167 index 000000000000..c38a44b14da5
13168 --- /dev/null
13169 +++ b/include/linux/mutex_rt.h
13170 @@ -0,0 +1,84 @@
13171 +#ifndef __LINUX_MUTEX_RT_H
13172 +#define __LINUX_MUTEX_RT_H
13173 +
13174 +#ifndef __LINUX_MUTEX_H
13175 +#error "Please include mutex.h"
13176 +#endif
13177 +
13178 +#include <linux/rtmutex.h>
13179 +
13180 +/* FIXME: Just for __lockfunc */
13181 +#include <linux/spinlock.h>
13182 +
13183 +struct mutex {
13184 +       struct rt_mutex         lock;
13185 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13186 +       struct lockdep_map      dep_map;
13187 +#endif
13188 +};
13189 +
13190 +#define __MUTEX_INITIALIZER(mutexname)                                 \
13191 +       {                                                               \
13192 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
13193 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
13194 +       }
13195 +
13196 +#define DEFINE_MUTEX(mutexname)                                                \
13197 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
13198 +
13199 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
13200 +extern void __lockfunc _mutex_lock(struct mutex *lock);
13201 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
13202 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
13203 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
13204 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
13205 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
13206 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
13207 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
13208 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
13209 +
13210 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
13211 +#define mutex_lock(l)                  _mutex_lock(l)
13212 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
13213 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
13214 +#define mutex_trylock(l)               _mutex_trylock(l)
13215 +#define mutex_unlock(l)                        _mutex_unlock(l)
13216 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
13217 +
13218 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13219 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
13220 +# define mutex_lock_interruptible_nested(l, s) \
13221 +                                       _mutex_lock_interruptible_nested(l, s)
13222 +# define mutex_lock_killable_nested(l, s) \
13223 +                                       _mutex_lock_killable_nested(l, s)
13224 +
13225 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
13226 +do {                                                                   \
13227 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
13228 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
13229 +} while (0)
13230 +
13231 +#else
13232 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
13233 +# define mutex_lock_interruptible_nested(l, s) \
13234 +                                       _mutex_lock_interruptible(l)
13235 +# define mutex_lock_killable_nested(l, s) \
13236 +                                       _mutex_lock_killable(l)
13237 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
13238 +#endif
13239 +
13240 +# define mutex_init(mutex)                             \
13241 +do {                                                   \
13242 +       static struct lock_class_key __key;             \
13243 +                                                       \
13244 +       rt_mutex_init(&(mutex)->lock);                  \
13245 +       __mutex_do_init((mutex), #mutex, &__key);       \
13246 +} while (0)
13247 +
13248 +# define __mutex_init(mutex, name, key)                        \
13249 +do {                                                   \
13250 +       rt_mutex_init(&(mutex)->lock);                  \
13251 +       __mutex_do_init((mutex), name, key);            \
13252 +} while (0)
13253 +
13254 +#endif
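Note that mutex_rt.h swaps only the implementation, not the API: struct mutex becomes a wrapper around rt_mutex (gaining priority inheritance) and the usual entry points are redirected to the _mutex_* functions, so existing users compile unchanged. Under either configuration the following sketch (demo_mutex is made up) looks the same:

        #include <linux/mutex.h>

        static DEFINE_MUTEX(demo_mutex);        /* plain mutex on !RT, rt_mutex-backed on RT */

        static void demo(void)
        {
                mutex_lock(&demo_mutex);
                /* ... critical section, may sleep ... */
                mutex_unlock(&demo_mutex);
        }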
13255 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
13256 index 12b4d54a8ffa..a2e7d1816b4c 100644
13257 --- a/include/linux/netdevice.h
13258 +++ b/include/linux/netdevice.h
13259 @@ -2248,11 +2248,20 @@ void netdev_freemem(struct net_device *dev);
13260  void synchronize_net(void);
13261  int init_dummy_netdev(struct net_device *dev);
13262  
13263 +#ifdef CONFIG_PREEMPT_RT_FULL
13264 +static inline int dev_recursion_level(void)
13265 +{
13266 +       return current->xmit_recursion;
13267 +}
13268 +
13269 +#else
13270 +
13271  DECLARE_PER_CPU(int, xmit_recursion);
13272  static inline int dev_recursion_level(void)
13273  {
13274         return this_cpu_read(xmit_recursion);
13275  }
13276 +#endif
13277  
13278  struct net_device *dev_get_by_index(struct net *net, int ifindex);
13279  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
13280 @@ -2563,6 +2572,7 @@ struct softnet_data {
13281         unsigned int            dropped;
13282         struct sk_buff_head     input_pkt_queue;
13283         struct napi_struct      backlog;
13284 +       struct sk_buff_head     tofree_queue;
13285  
13286  };
13287  
13288 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
13289 index 04078e8a4803..a61c9609e32f 100644
13290 --- a/include/linux/netfilter/x_tables.h
13291 +++ b/include/linux/netfilter/x_tables.h
13292 @@ -4,6 +4,7 @@
13293  
13294  #include <linux/netdevice.h>
13295  #include <linux/static_key.h>
13296 +#include <linux/locallock.h>
13297  #include <uapi/linux/netfilter/x_tables.h>
13298  
13299  /**
13300 @@ -289,6 +290,8 @@ void xt_free_table_info(struct xt_table_info *info);
13301   */
13302  DECLARE_PER_CPU(seqcount_t, xt_recseq);
13303  
13304 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
13305 +
13306  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
13307   *
13308   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
13309 @@ -309,6 +312,9 @@ static inline unsigned int xt_write_recseq_begin(void)
13310  {
13311         unsigned int addend;
13312  
13313 +       /* RT protection */
13314 +       local_lock(xt_write_lock);
13315 +
13316         /*
13317          * Low order bit of sequence is set if we already
13318          * called xt_write_recseq_begin().
13319 @@ -339,6 +345,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
13320         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
13321         smp_wmb();
13322         __this_cpu_add(xt_recseq.sequence, addend);
13323 +       local_unlock(xt_write_lock);
13324  }
13325  
13326  /*
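The local lock added above serializes xt_write_recseq_begin()/end() on RT, where per-CPU exclusion can no longer rely on softirqs or preemption being disabled. Callers keep the existing calling convention; roughly, along the lines of the ip_tables traversal code (details omitted, function name invented):

        #include <linux/bottom_half.h>
        #include <linux/netfilter/x_tables.h>

        static void demo_traverse(void)
        {
                unsigned int addend;

                local_bh_disable();
                addend = xt_write_recseq_begin();       /* takes xt_write_lock on RT */
                /* ... walk the ruleset, update per-CPU counters ... */
                xt_write_recseq_end(addend);            /* releases xt_write_lock on RT */
                local_bh_enable();
        }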
13327 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
13328 index d14a4c362465..2e4414a0c1c4 100644
13329 --- a/include/linux/notifier.h
13330 +++ b/include/linux/notifier.h
13331 @@ -6,7 +6,7 @@
13332   *
13333   *                             Alan Cox <Alan.Cox@linux.org>
13334   */
13335
13335 - 
13336 +
13337  #ifndef _LINUX_NOTIFIER_H
13338  #define _LINUX_NOTIFIER_H
13339  #include <linux/errno.h>
13340 @@ -42,9 +42,7 @@
13341   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
13342   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
13343   * SRCU notifier chains should be used when the chain will be called very
13344 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
13345 - * chains are slightly more difficult to use because they require special
13346 - * runtime initialization.
13347 + * often but notifier_blocks will seldom be removed.
13348   */
13349  
13350  typedef        int (*notifier_fn_t)(struct notifier_block *nb,
13351 @@ -88,7 +86,7 @@ struct srcu_notifier_head {
13352                 (name)->head = NULL;            \
13353         } while (0)
13354  
13355 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
13356 +/* srcu_notifier_heads must be cleaned up dynamically */
13357  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13358  #define srcu_cleanup_notifier_head(name)       \
13359                 cleanup_srcu_struct(&(name)->srcu);
13360 @@ -101,7 +99,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13361                 .head = NULL }
13362  #define RAW_NOTIFIER_INIT(name)        {                               \
13363                 .head = NULL }
13364 -/* srcu_notifier_heads cannot be initialized statically */
13365 +
13366 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
13367 +       {                                                       \
13368 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
13369 +               .head = NULL,                                   \
13370 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
13371 +       }
13372  
13373  #define ATOMIC_NOTIFIER_HEAD(name)                             \
13374         struct atomic_notifier_head name =                      \
13375 @@ -113,6 +117,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13376         struct raw_notifier_head name =                         \
13377                 RAW_NOTIFIER_INIT(name)
13378  
13379 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
13380 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
13381 +                       name##_head_srcu_array);                \
13382 +       mod struct srcu_notifier_head name =                    \
13383 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
13384 +
13385 +#define SRCU_NOTIFIER_HEAD(name)                               \
13386 +       _SRCU_NOTIFIER_HEAD(name, )
13387 +
13388 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
13389 +       _SRCU_NOTIFIER_HEAD(name, static)
13390 +
13391  #ifdef __KERNEL__
13392  
13393  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
13394 @@ -182,12 +198,12 @@ static inline int notifier_to_errno(int ret)
13395  
13396  /*
13397   *     Declared notifiers so far. I can imagine quite a few more chains
13398 - *     over time (eg laptop power reset chains, reboot chain (to clean 
13399 + *     over time (eg laptop power reset chains, reboot chain (to clean
13400   *     device units up), device [un]mount chain, module load/unload chain,
13401 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
13402 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
13403   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
13404   */
13405
13405 - 
13406 +
13407  /* CPU notfiers are defined in include/linux/cpu.h. */
13408  
13409  /* netdevice notifiers are defined in include/linux/netdevice.h */
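With SRCU_NOTIFIER_HEAD()/SRCU_NOTIFIER_HEAD_STATIC() available, an SRCU notifier chain can be instantiated at build time instead of requiring srcu_init_notifier_head() at runtime, which is what the deleted comment text was about. A hypothetical chain and listener might look like this:

        #include <linux/init.h>
        #include <linux/notifier.h>

        SRCU_NOTIFIER_HEAD_STATIC(demo_chain);  /* hypothetical, statically initialized chain */

        static int demo_event(struct notifier_block *nb, unsigned long action, void *data)
        {
                return NOTIFY_OK;
        }

        static struct notifier_block demo_nb = {
                .notifier_call = demo_event,
        };

        static int __init demo_init(void)
        {
                srcu_notifier_chain_register(&demo_chain, &demo_nb);
                srcu_notifier_call_chain(&demo_chain, 0, NULL);
                return 0;
        }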
13410 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
13411 index caebf2a758dc..53a60a51c758 100644
13412 --- a/include/linux/percpu.h
13413 +++ b/include/linux/percpu.h
13414 @@ -24,6 +24,35 @@
13415          PERCPU_MODULE_RESERVE)
13416  #endif
13417  
13418 +#ifdef CONFIG_PREEMPT_RT_FULL
13419 +
13420 +#define get_local_var(var) (*({                \
13421 +              migrate_disable();       \
13422 +              this_cpu_ptr(&var);      }))
13423 +
13424 +#define put_local_var(var) do {        \
13425 +       (void)&(var);           \
13426 +       migrate_enable();       \
13427 +} while (0)
13428 +
13429 +# define get_local_ptr(var) ({         \
13430 +               migrate_disable();      \
13431 +               this_cpu_ptr(var);      })
13432 +
13433 +# define put_local_ptr(var) do {       \
13434 +       (void)(var);                    \
13435 +       migrate_enable();               \
13436 +} while (0)
13437 +
13438 +#else
13439 +
13440 +#define get_local_var(var)     get_cpu_var(var)
13441 +#define put_local_var(var)     put_cpu_var(var)
13442 +#define get_local_ptr(var)     get_cpu_ptr(var)
13443 +#define put_local_ptr(var)     put_cpu_ptr(var)
13444 +
13445 +#endif
13446 +
13447  /* minimum unit size, also is the maximum supported allocation size */
13448  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
13449  
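get_local_var()/put_local_var() are the RT-aware replacements for get_cpu_var()/put_cpu_var(): instead of disabling preemption they only disable migration on RT, so the per-CPU section may take sleeping locks. A small sketch (the per-CPU variable is invented for illustration; real users pair this with a local lock or atomic op, the plain increment just keeps the sketch short):

        #include <linux/percpu.h>

        static DEFINE_PER_CPU(unsigned long, demo_stat);        /* hypothetical */

        static void demo_bump(void)
        {
                unsigned long *p;

                p = &get_local_var(demo_stat);  /* migrate_disable() on RT, get_cpu_var() otherwise */
                (*p)++;
                put_local_var(demo_stat);
        }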
13450 diff --git a/include/linux/pid.h b/include/linux/pid.h
13451 index 23705a53abba..2cc64b779f03 100644
13452 --- a/include/linux/pid.h
13453 +++ b/include/linux/pid.h
13454 @@ -2,6 +2,7 @@
13455  #define _LINUX_PID_H
13456  
13457  #include <linux/rcupdate.h>
13458 +#include <linux/atomic.h>
13459  
13460  enum pid_type
13461  {
13462 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
13463 index 75e4e30677f1..1cfb1cb72354 100644
13464 --- a/include/linux/preempt.h
13465 +++ b/include/linux/preempt.h
13466 @@ -50,7 +50,11 @@
13467  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
13468  #define NMI_OFFSET     (1UL << NMI_SHIFT)
13469  
13470 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13471 +#ifndef CONFIG_PREEMPT_RT_FULL
13472 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
13473 +#else
13474 +# define SOFTIRQ_DISABLE_OFFSET                (0)
13475 +#endif
13476  
13477  /* We use the MSB mostly because its available */
13478  #define PREEMPT_NEED_RESCHED   0x80000000
13479 @@ -59,9 +63,15 @@
13480  #include <asm/preempt.h>
13481  
13482  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
13483 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
13484  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
13485                                  | NMI_MASK))
13486 +#ifndef CONFIG_PREEMPT_RT_FULL
13487 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
13488 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
13489 +#else
13490 +# define softirq_count()       (0UL)
13491 +extern int in_serving_softirq(void);
13492 +#endif
13493  
13494  /*
13495   * Are we doing bottom half or hardware interrupt processing?
13496 @@ -72,7 +82,6 @@
13497  #define in_irq()               (hardirq_count())
13498  #define in_softirq()           (softirq_count())
13499  #define in_interrupt()         (irq_count())
13500 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
13501  
13502  /*
13503   * Are we in NMI context?
13504 @@ -91,7 +100,11 @@
13505  /*
13506   * The preempt_count offset after spin_lock()
13507   */
13508 +#if !defined(CONFIG_PREEMPT_RT_FULL)
13509  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
13510 +#else
13511 +#define PREEMPT_LOCK_OFFSET    0
13512 +#endif
13513  
13514  /*
13515   * The preempt_count offset needed for things like:
13516 @@ -140,6 +153,20 @@ extern void preempt_count_sub(int val);
13517  #define preempt_count_inc() preempt_count_add(1)
13518  #define preempt_count_dec() preempt_count_sub(1)
13519  
13520 +#ifdef CONFIG_PREEMPT_LAZY
13521 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
13522 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
13523 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
13524 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
13525 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
13526 +#else
13527 +#define add_preempt_lazy_count(val)    do { } while (0)
13528 +#define sub_preempt_lazy_count(val)    do { } while (0)
13529 +#define inc_preempt_lazy_count()       do { } while (0)
13530 +#define dec_preempt_lazy_count()       do { } while (0)
13531 +#define preempt_lazy_count()           (0)
13532 +#endif
13533 +
13534  #ifdef CONFIG_PREEMPT_COUNT
13535  
13536  #define preempt_disable() \
13537 @@ -148,13 +175,25 @@ do { \
13538         barrier(); \
13539  } while (0)
13540  
13541 +#define preempt_lazy_disable() \
13542 +do { \
13543 +       inc_preempt_lazy_count(); \
13544 +       barrier(); \
13545 +} while (0)
13546 +
13547  #define sched_preempt_enable_no_resched() \
13548  do { \
13549         barrier(); \
13550         preempt_count_dec(); \
13551  } while (0)
13552  
13553 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13554 +#ifdef CONFIG_PREEMPT_RT_BASE
13555 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13556 +# define preempt_check_resched_rt() preempt_check_resched()
13557 +#else
13558 +# define preempt_enable_no_resched() preempt_enable()
13559 +# define preempt_check_resched_rt() barrier();
13560 +#endif
13561  
13562  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
13563  
13564 @@ -179,6 +218,13 @@ do { \
13565                 __preempt_schedule(); \
13566  } while (0)
13567  
13568 +#define preempt_lazy_enable() \
13569 +do { \
13570 +       dec_preempt_lazy_count(); \
13571 +       barrier(); \
13572 +       preempt_check_resched(); \
13573 +} while (0)
13574 +
13575  #else /* !CONFIG_PREEMPT */
13576  #define preempt_enable() \
13577  do { \
13578 @@ -224,6 +270,7 @@ do { \
13579  #define preempt_disable_notrace()              barrier()
13580  #define preempt_enable_no_resched_notrace()    barrier()
13581  #define preempt_enable_notrace()               barrier()
13582 +#define preempt_check_resched_rt()             barrier()
13583  #define preemptible()                          0
13584  
13585  #endif /* CONFIG_PREEMPT_COUNT */
13586 @@ -244,10 +291,31 @@ do { \
13587  } while (0)
13588  #define preempt_fold_need_resched() \
13589  do { \
13590 -       if (tif_need_resched()) \
13591 +       if (tif_need_resched_now()) \
13592                 set_preempt_need_resched(); \
13593  } while (0)
13594  
13595 +#ifdef CONFIG_PREEMPT_RT_FULL
13596 +# define preempt_disable_rt()          preempt_disable()
13597 +# define preempt_enable_rt()           preempt_enable()
13598 +# define preempt_disable_nort()                barrier()
13599 +# define preempt_enable_nort()         barrier()
13600 +# ifdef CONFIG_SMP
13601 +   extern void migrate_disable(void);
13602 +   extern void migrate_enable(void);
13603 +# else /* CONFIG_SMP */
13604 +#  define migrate_disable()            barrier()
13605 +#  define migrate_enable()             barrier()
13606 +# endif /* CONFIG_SMP */
13607 +#else
13608 +# define preempt_disable_rt()          barrier()
13609 +# define preempt_enable_rt()           barrier()
13610 +# define preempt_disable_nort()                preempt_disable()
13611 +# define preempt_enable_nort()         preempt_enable()
13612 +# define migrate_disable()             preempt_disable()
13613 +# define migrate_enable()              preempt_enable()
13614 +#endif
13615 +
13616  #ifdef CONFIG_PREEMPT_NOTIFIERS
13617  
13618  struct preempt_notifier;
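migrate_disable()/migrate_enable() pin the current task to its CPU without (on RT) disabling preemption, while the *_nort/_rt helpers let shared code request preemption control only on one of the two configurations. Typical use, sketched with an invented function:

        #include <linux/preempt.h>

        static void demo_percpu_work(void)
        {
                migrate_disable();      /* stay on this CPU; plain preempt_disable() on !RT */
                /*
                 * Per-CPU work goes here. On RT this section is still
                 * preemptible, so shared per-CPU data additionally needs
                 * a local lock or similar if other tasks can touch it.
                 */
                migrate_enable();
        }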
13619 diff --git a/include/linux/printk.h b/include/linux/printk.h
13620 index 9729565c25ff..9cdca696b718 100644
13621 --- a/include/linux/printk.h
13622 +++ b/include/linux/printk.h
13623 @@ -117,9 +117,11 @@ int no_printk(const char *fmt, ...)
13624  #ifdef CONFIG_EARLY_PRINTK
13625  extern asmlinkage __printf(1, 2)
13626  void early_printk(const char *fmt, ...);
13627 +extern void printk_kill(void);
13628  #else
13629  static inline __printf(1, 2) __cold
13630  void early_printk(const char *s, ...) { }
13631 +static inline void printk_kill(void) { }
13632  #endif
13633  
13634  typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
13635 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
13636 index 5d5174b59802..8ddbd6e15a3c 100644
13637 --- a/include/linux/radix-tree.h
13638 +++ b/include/linux/radix-tree.h
13639 @@ -277,8 +277,13 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
13640  unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
13641                         void ***results, unsigned long *indices,
13642                         unsigned long first_index, unsigned int max_items);
13643 +#ifndef CONFIG_PREEMPT_RT_FULL
13644  int radix_tree_preload(gfp_t gfp_mask);
13645  int radix_tree_maybe_preload(gfp_t gfp_mask);
13646 +#else
13647 +static inline int radix_tree_preload(gfp_t gm) { return 0; }
13648 +static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
13649 +#endif
13650  void radix_tree_init(void);
13651  void *radix_tree_tag_set(struct radix_tree_root *root,
13652                         unsigned long index, unsigned int tag);
13653 @@ -303,7 +308,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
13654  
13655  static inline void radix_tree_preload_end(void)
13656  {
13657 -       preempt_enable();
13658 +       preempt_enable_nort();
13659  }
13660  
13661  /**
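On RT the preload step above is compiled out (it relies on per-CPU pools guarded by disabled preemption) and radix_tree_preload_end() becomes preempt_enable_nort(), so the calling convention is unchanged on both configurations. A rough sketch with a hypothetical tree and lock:

        #include <linux/gfp.h>
        #include <linux/radix-tree.h>
        #include <linux/spinlock.h>

        static RADIX_TREE(demo_tree, GFP_ATOMIC);       /* hypothetical tree */
        static DEFINE_SPINLOCK(demo_tree_lock);

        static int demo_store(unsigned long index, void *item)
        {
                int err;

                err = radix_tree_preload(GFP_KERNEL);   /* returns 0 without preloading on RT */
                if (err)
                        return err;

                spin_lock(&demo_tree_lock);
                err = radix_tree_insert(&demo_tree, index, item);
                spin_unlock(&demo_tree_lock);

                radix_tree_preload_end();               /* preempt_enable_nort() */
                return err;
        }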
13662 diff --git a/include/linux/random.h b/include/linux/random.h
13663 index a75840c1aa71..1a804361670c 100644
13664 --- a/include/linux/random.h
13665 +++ b/include/linux/random.h
13666 @@ -20,7 +20,7 @@ struct random_ready_callback {
13667  extern void add_device_randomness(const void *, unsigned int);
13668  extern void add_input_randomness(unsigned int type, unsigned int code,
13669                                  unsigned int value);
13670 -extern void add_interrupt_randomness(int irq, int irq_flags);
13671 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip);
13672  
13673  extern void get_random_bytes(void *buf, int nbytes);
13674  extern int add_random_ready_callback(struct random_ready_callback *rdy);
13675 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
13676 index a5aa7ae671f4..24ddffd25492 100644
13677 --- a/include/linux/rbtree.h
13678 +++ b/include/linux/rbtree.h
13679 @@ -31,7 +31,6 @@
13680  
13681  #include <linux/kernel.h>
13682  #include <linux/stddef.h>
13683 -#include <linux/rcupdate.h>
13684  
13685  struct rb_node {
13686         unsigned long  __rb_parent_color;
13687 @@ -86,14 +85,8 @@ static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
13688         *rb_link = node;
13689  }
13690  
13691 -static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
13692 -                                   struct rb_node **rb_link)
13693 -{
13694 -       node->__rb_parent_color = (unsigned long)parent;
13695 -       node->rb_left = node->rb_right = NULL;
13696 -
13697 -       rcu_assign_pointer(*rb_link, node);
13698 -}
13699 +void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
13700 +                     struct rb_node **rb_link);
13701  
13702  #define rb_entry_safe(ptr, type, member) \
13703         ({ typeof(ptr) ____ptr = (ptr); \
13704 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
13705 index a0189ba67fde..c2f5f955163d 100644
13706 --- a/include/linux/rcupdate.h
13707 +++ b/include/linux/rcupdate.h
13708 @@ -169,6 +169,9 @@ void call_rcu(struct rcu_head *head,
13709  
13710  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
13711  
13712 +#ifdef CONFIG_PREEMPT_RT_FULL
13713 +#define call_rcu_bh    call_rcu
13714 +#else
13715  /**
13716   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
13717   * @head: structure to be used for queueing the RCU updates.
13718 @@ -192,6 +195,7 @@ void call_rcu(struct rcu_head *head,
13719   */
13720  void call_rcu_bh(struct rcu_head *head,
13721                  rcu_callback_t func);
13722 +#endif
13723  
13724  /**
13725   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
13726 @@ -292,6 +296,11 @@ void synchronize_rcu(void);
13727   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
13728   */
13729  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
13730 +#ifndef CONFIG_PREEMPT_RT_FULL
13731 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
13732 +#else
13733 +static inline int sched_rcu_preempt_depth(void) { return 0; }
13734 +#endif
13735  
13736  #else /* #ifdef CONFIG_PREEMPT_RCU */
13737  
13738 @@ -317,6 +326,8 @@ static inline int rcu_preempt_depth(void)
13739         return 0;
13740  }
13741  
13742 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
13743 +
13744  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
13745  
13746  /* Internal to kernel */
13747 @@ -489,7 +500,14 @@ extern struct lockdep_map rcu_callback_map;
13748  int debug_lockdep_rcu_enabled(void);
13749  
13750  int rcu_read_lock_held(void);
13751 +#ifdef CONFIG_PREEMPT_RT_FULL
13752 +static inline int rcu_read_lock_bh_held(void)
13753 +{
13754 +       return rcu_read_lock_held();
13755 +}
13756 +#else
13757  int rcu_read_lock_bh_held(void);
13758 +#endif
13759  
13760  /**
13761   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
13762 @@ -937,10 +955,14 @@ static inline void rcu_read_unlock(void)
13763  static inline void rcu_read_lock_bh(void)
13764  {
13765         local_bh_disable();
13766 +#ifdef CONFIG_PREEMPT_RT_FULL
13767 +       rcu_read_lock();
13768 +#else
13769         __acquire(RCU_BH);
13770         rcu_lock_acquire(&rcu_bh_lock_map);
13771         RCU_LOCKDEP_WARN(!rcu_is_watching(),
13772                          "rcu_read_lock_bh() used illegally while idle");
13773 +#endif
13774  }
13775  
13776  /*
13777 @@ -950,10 +972,14 @@ static inline void rcu_read_lock_bh(void)
13778   */
13779  static inline void rcu_read_unlock_bh(void)
13780  {
13781 +#ifdef CONFIG_PREEMPT_RT_FULL
13782 +       rcu_read_unlock();
13783 +#else
13784         RCU_LOCKDEP_WARN(!rcu_is_watching(),
13785                          "rcu_read_unlock_bh() used illegally while idle");
13786         rcu_lock_release(&rcu_bh_lock_map);
13787         __release(RCU_BH);
13788 +#endif
13789         local_bh_enable();
13790  }
13791  
13792 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
13793 index 60d15a080d7c..436c9e62bfc6 100644
13794 --- a/include/linux/rcutree.h
13795 +++ b/include/linux/rcutree.h
13796 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
13797         rcu_note_context_switch();
13798  }
13799  
13800 +#ifdef CONFIG_PREEMPT_RT_FULL
13801 +# define synchronize_rcu_bh    synchronize_rcu
13802 +#else
13803  void synchronize_rcu_bh(void);
13804 +#endif
13805  void synchronize_sched_expedited(void);
13806  void synchronize_rcu_expedited(void);
13807  
13808 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
13809  }
13810  
13811  void rcu_barrier(void);
13812 +#ifdef CONFIG_PREEMPT_RT_FULL
13813 +# define rcu_barrier_bh                rcu_barrier
13814 +#else
13815  void rcu_barrier_bh(void);
13816 +#endif
13817  void rcu_barrier_sched(void);
13818  unsigned long get_state_synchronize_rcu(void);
13819  void cond_synchronize_rcu(unsigned long oldstate);
13820 @@ -85,12 +93,10 @@ unsigned long rcu_batches_started(void);
13821  unsigned long rcu_batches_started_bh(void);
13822  unsigned long rcu_batches_started_sched(void);
13823  unsigned long rcu_batches_completed(void);
13824 -unsigned long rcu_batches_completed_bh(void);
13825  unsigned long rcu_batches_completed_sched(void);
13826  void show_rcu_gp_kthreads(void);
13827  
13828  void rcu_force_quiescent_state(void);
13829 -void rcu_bh_force_quiescent_state(void);
13830  void rcu_sched_force_quiescent_state(void);
13831  
13832  void rcu_idle_enter(void);
13833 @@ -105,6 +111,14 @@ extern int rcu_scheduler_active __read_mostly;
13834  
13835  bool rcu_is_watching(void);
13836  
13837 +#ifndef CONFIG_PREEMPT_RT_FULL
13838 +void rcu_bh_force_quiescent_state(void);
13839 +unsigned long rcu_batches_completed_bh(void);
13840 +#else
13841 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
13842 +# define rcu_batches_completed_bh      rcu_batches_completed
13843 +#endif
13844 +
13845  void rcu_all_qs(void);
13846  
13847  #endif /* __LINUX_RCUTREE_H */
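[Editor sketch] With the rcutree.h change above, synchronize_rcu_bh() and rcu_barrier_bh() become aliases of synchronize_rcu() and rcu_barrier() on PREEMPT_RT_FULL, so existing teardown code keeps working. A hedged sketch of such a teardown path, reusing the hypothetical my_entry/my_list from the previous sketch; real code would unlink entries under its writer-side lock, which is not shown here.

#include <linux/rculist.h>
#include <linux/slab.h>

static void my_free_entry_rcu(struct rcu_head *head)
{
        struct my_entry *e = container_of(head, struct my_entry, rcu);

        kfree(e);
}

static void my_flush_and_wait(void)
{
        struct my_entry *e, *tmp;

        list_for_each_entry_safe(e, tmp, &my_list, node) {
                list_del_rcu(&e->node);
                call_rcu_bh(&e->rcu, my_free_entry_rcu);
        }
        /*
         * Wait for all queued RCU-bh callbacks; per the #define above
         * this is the same as rcu_barrier() on an RT kernel.
         */
        rcu_barrier_bh();
}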
13848 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
13849 index 1abba5ce2a2f..30211c627511 100644
13850 --- a/include/linux/rtmutex.h
13851 +++ b/include/linux/rtmutex.h
13852 @@ -13,11 +13,15 @@
13853  #define __LINUX_RT_MUTEX_H
13854  
13855  #include <linux/linkage.h>
13856 +#include <linux/spinlock_types_raw.h>
13857  #include <linux/rbtree.h>
13858 -#include <linux/spinlock_types.h>
13859  
13860  extern int max_lock_depth; /* for sysctl */
13861  
13862 +#ifdef CONFIG_DEBUG_MUTEXES
13863 +#include <linux/debug_locks.h>
13864 +#endif
13865 +
13866  /**
13867   * The rt_mutex structure
13868   *
13869 @@ -31,8 +35,8 @@ struct rt_mutex {
13870         struct rb_root          waiters;
13871         struct rb_node          *waiters_leftmost;
13872         struct task_struct      *owner;
13873 -#ifdef CONFIG_DEBUG_RT_MUTEXES
13874         int                     save_state;
13875 +#ifdef CONFIG_DEBUG_RT_MUTEXES
13876         const char              *name, *file;
13877         int                     line;
13878         void                    *magic;
13879 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
13880  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
13881  #endif
13882  
13883 +# define rt_mutex_init(mutex)                                  \
13884 +       do {                                                    \
13885 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
13886 +               __rt_mutex_init(mutex, #mutex);                 \
13887 +       } while (0)
13888 +
13889  #ifdef CONFIG_DEBUG_RT_MUTEXES
13890  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
13891         , .name = #mutexname, .file = __FILE__, .line = __LINE__
13892 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
13893   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
13894  #else
13895  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
13896 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
13897  # define rt_mutex_debug_task_free(t)                   do { } while (0)
13898  #endif
13899  
13900 -#define __RT_MUTEX_INITIALIZER(mutexname) \
13901 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
13902 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
13903 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
13904         , .waiters = RB_ROOT \
13905         , .owner = NULL \
13906 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
13907 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
13908 +
13909 +#define __RT_MUTEX_INITIALIZER(mutexname) \
13910 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
13911 +
13912 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
13913 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
13914 +       , .save_state = 1 }
13915  
13916  #define DEFINE_RT_MUTEX(mutexname) \
13917         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
13918 @@ -91,6 +106,7 @@ extern void rt_mutex_destroy(struct rt_mutex *lock);
13919  
13920  extern void rt_mutex_lock(struct rt_mutex *lock);
13921  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
13922 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
13923  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
13924                                struct hrtimer_sleeper *timeout);
13925  
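[Editor sketch] The rtmutex.h hunk above moves wait_lock initialization into rt_mutex_init(), adds __RT_MUTEX_INITIALIZER_SAVE_STATE for the sleeping-spinlock conversions, and declares rt_mutex_lock_killable(). A minimal sketch of the basic API as it looks with this patch applied; my_lock and my_work are illustrative names, and the killable variant exists only in the patched tree.

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(my_lock);

static int my_work(void)
{
        int ret;

        /* Block until the lock is acquired or a fatal signal arrives. */
        ret = rt_mutex_lock_killable(&my_lock);
        if (ret)
                return ret;

        /* ... critical section, may sleep ... */

        rt_mutex_unlock(&my_lock);
        return 0;
}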
13926 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
13927 new file mode 100644
13928 index 000000000000..49ed2d45d3be
13929 --- /dev/null
13930 +++ b/include/linux/rwlock_rt.h
13931 @@ -0,0 +1,99 @@
13932 +#ifndef __LINUX_RWLOCK_RT_H
13933 +#define __LINUX_RWLOCK_RT_H
13934 +
13935 +#ifndef __LINUX_SPINLOCK_H
13936 +#error Do not include directly. Use spinlock.h
13937 +#endif
13938 +
13939 +#define rwlock_init(rwl)                               \
13940 +do {                                                   \
13941 +       static struct lock_class_key __key;             \
13942 +                                                       \
13943 +       rt_mutex_init(&(rwl)->lock);                    \
13944 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
13945 +} while (0)
13946 +
13947 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
13948 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
13949 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
13950 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
13951 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
13952 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
13953 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
13954 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
13955 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
13956 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
13957 +
13958 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
13959 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
13960 +
13961 +#define write_trylock_irqsave(lock, flags)     \
13962 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
13963 +
13964 +#define read_lock_irqsave(lock, flags)                 \
13965 +       do {                                            \
13966 +               typecheck(unsigned long, flags);        \
13967 +               flags = rt_read_lock_irqsave(lock);     \
13968 +       } while (0)
13969 +
13970 +#define write_lock_irqsave(lock, flags)                        \
13971 +       do {                                            \
13972 +               typecheck(unsigned long, flags);        \
13973 +               flags = rt_write_lock_irqsave(lock);    \
13974 +       } while (0)
13975 +
13976 +#define read_lock(lock)                rt_read_lock(lock)
13977 +
13978 +#define read_lock_bh(lock)                             \
13979 +       do {                                            \
13980 +               local_bh_disable();                     \
13981 +               rt_read_lock(lock);                     \
13982 +       } while (0)
13983 +
13984 +#define read_lock_irq(lock)    read_lock(lock)
13985 +
13986 +#define write_lock(lock)       rt_write_lock(lock)
13987 +
13988 +#define write_lock_bh(lock)                            \
13989 +       do {                                            \
13990 +               local_bh_disable();                     \
13991 +               rt_write_lock(lock);                    \
13992 +       } while (0)
13993 +
13994 +#define write_lock_irq(lock)   write_lock(lock)
13995 +
13996 +#define read_unlock(lock)      rt_read_unlock(lock)
13997 +
13998 +#define read_unlock_bh(lock)                           \
13999 +       do {                                            \
14000 +               rt_read_unlock(lock);                   \
14001 +               local_bh_enable();                      \
14002 +       } while (0)
14003 +
14004 +#define read_unlock_irq(lock)  read_unlock(lock)
14005 +
14006 +#define write_unlock(lock)     rt_write_unlock(lock)
14007 +
14008 +#define write_unlock_bh(lock)                          \
14009 +       do {                                            \
14010 +               rt_write_unlock(lock);                  \
14011 +               local_bh_enable();                      \
14012 +       } while (0)
14013 +
14014 +#define write_unlock_irq(lock) write_unlock(lock)
14015 +
14016 +#define read_unlock_irqrestore(lock, flags)            \
14017 +       do {                                            \
14018 +               typecheck(unsigned long, flags);        \
14019 +               (void) flags;                           \
14020 +               rt_read_unlock(lock);                   \
14021 +       } while (0)
14022 +
14023 +#define write_unlock_irqrestore(lock, flags) \
14024 +       do {                                            \
14025 +               typecheck(unsigned long, flags);        \
14026 +               (void) flags;                           \
14027 +               rt_write_unlock(lock);                  \
14028 +       } while (0)
14029 +
14030 +#endif
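[Editor sketch] The new rwlock_rt.h above reroutes the generic rwlock_t API onto rtmutex-based rt_read_lock()/rt_write_lock(); the _irqsave variants no longer disable interrupts and merely carry the flags value through. Generic callers compile unchanged on RT and non-RT kernels, as in this minimal sketch with illustrative names.

#include <linux/spinlock.h>     /* pulls in rwlock.h or rwlock_rt.h */

static DEFINE_RWLOCK(my_rwlock);
static int my_counter;

static int my_read_counter(void)
{
        int val;

        read_lock(&my_rwlock);          /* rt_read_lock() on PREEMPT_RT_FULL */
        val = my_counter;
        read_unlock(&my_rwlock);
        return val;
}

static void my_bump_counter(void)
{
        unsigned long flags;

        write_lock_irqsave(&my_rwlock, flags);  /* no real IRQ-off on RT */
        my_counter++;
        write_unlock_irqrestore(&my_rwlock, flags);
}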
14031 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
14032 index cc0072e93e36..d0da966ad7a0 100644
14033 --- a/include/linux/rwlock_types.h
14034 +++ b/include/linux/rwlock_types.h
14035 @@ -1,6 +1,10 @@
14036  #ifndef __LINUX_RWLOCK_TYPES_H
14037  #define __LINUX_RWLOCK_TYPES_H
14038  
14039 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
14040 +# error "Do not include directly, include spinlock_types.h"
14041 +#endif
14042 +
14043  /*
14044   * include/linux/rwlock_types.h - generic rwlock type definitions
14045   *                               and initializers
14046 @@ -43,6 +47,7 @@ typedef struct {
14047                                 RW_DEP_MAP_INIT(lockname) }
14048  #endif
14049  
14050 -#define DEFINE_RWLOCK(x)       rwlock_t x = __RW_LOCK_UNLOCKED(x)
14051 +#define DEFINE_RWLOCK(name) \
14052 +       rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
14053  
14054  #endif /* __LINUX_RWLOCK_TYPES_H */
14055 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
14056 new file mode 100644
14057 index 000000000000..b13832119591
14058 --- /dev/null
14059 +++ b/include/linux/rwlock_types_rt.h
14060 @@ -0,0 +1,33 @@
14061 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
14062 +#define __LINUX_RWLOCK_TYPES_RT_H
14063 +
14064 +#ifndef __LINUX_SPINLOCK_TYPES_H
14065 +#error "Do not include directly. Include spinlock_types.h instead"
14066 +#endif
14067 +
14068 +/*
14069 + * rwlocks - rtmutex which allows single reader recursion
14070 + */
14071 +typedef struct {
14072 +       struct rt_mutex         lock;
14073 +       int                     read_depth;
14074 +       unsigned int            break_lock;
14075 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14076 +       struct lockdep_map      dep_map;
14077 +#endif
14078 +} rwlock_t;
14079 +
14080 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14081 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
14082 +#else
14083 +# define RW_DEP_MAP_INIT(lockname)
14084 +#endif
14085 +
14086 +#define __RW_LOCK_UNLOCKED(name) \
14087 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
14088 +         RW_DEP_MAP_INIT(name) }
14089 +
14090 +#define DEFINE_RWLOCK(name) \
14091 +       rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
14092 +
14093 +#endif
14094 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
14095 index 8f498cdde280..2b2148431f14 100644
14096 --- a/include/linux/rwsem.h
14097 +++ b/include/linux/rwsem.h
14098 @@ -18,6 +18,10 @@
14099  #include <linux/osq_lock.h>
14100  #endif
14101  
14102 +#ifdef CONFIG_PREEMPT_RT_FULL
14103 +#include <linux/rwsem_rt.h>
14104 +#else /* PREEMPT_RT_FULL */
14105 +
14106  struct rw_semaphore;
14107  
14108  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
14109 @@ -177,4 +181,6 @@ extern void up_read_non_owner(struct rw_semaphore *sem);
14110  # define up_read_non_owner(sem)                        up_read(sem)
14111  #endif
14112  
14113 +#endif /* !PREEMPT_RT_FULL */
14114 +
14115  #endif /* _LINUX_RWSEM_H */
14116 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
14117 new file mode 100644
14118 index 000000000000..f97860b2e2a4
14119 --- /dev/null
14120 +++ b/include/linux/rwsem_rt.h
14121 @@ -0,0 +1,152 @@
14122 +#ifndef _LINUX_RWSEM_RT_H
14123 +#define _LINUX_RWSEM_RT_H
14124 +
14125 +#ifndef _LINUX_RWSEM_H
14126 +#error "Include rwsem.h"
14127 +#endif
14128 +
14129 +/*
14130 + * RW-semaphores are a spinlock plus a reader-depth count.
14131 + *
14132 + * Note that the semantics are different from the usual
14133 + * Linux rw-sems, in PREEMPT_RT mode we do not allow
14134 + * multiple readers to hold the lock at once, we only allow
14135 + * a read-lock owner to read-lock recursively. This is
14136 + * better for latency, makes the implementation inherently
14137 + * fair and makes it simpler as well.
14138 + */
14139 +
14140 +#include <linux/rtmutex.h>
14141 +
14142 +struct rw_semaphore {
14143 +       struct rt_mutex         lock;
14144 +       int                     read_depth;
14145 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14146 +       struct lockdep_map      dep_map;
14147 +#endif
14148 +};
14149 +
14150 +#define __RWSEM_INITIALIZER(name) \
14151 +       { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
14152 +         RW_DEP_MAP_INIT(name) }
14153 +
14154 +#define DECLARE_RWSEM(lockname) \
14155 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
14156 +
14157 +extern void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
14158 +                                    struct lock_class_key *key);
14159 +
14160 +#define __rt_init_rwsem(sem, name, key)                        \
14161 +       do {                                            \
14162 +               rt_mutex_init(&(sem)->lock);            \
14163 +               __rt_rwsem_init((sem), (name), (key));\
14164 +       } while (0)
14165 +
14166 +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
14167 +
14168 +# define rt_init_rwsem(sem)                            \
14169 +do {                                                   \
14170 +       static struct lock_class_key __key;             \
14171 +                                                       \
14172 +       __rt_init_rwsem((sem), #sem, &__key);           \
14173 +} while (0)
14174 +
14175 +extern void rt_down_write(struct rw_semaphore *rwsem);
14176 +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
14177 +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
14178 +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
14179 +                                     struct lockdep_map *nest);
14180 +extern void rt__down_read(struct rw_semaphore *rwsem);
14181 +extern void rt_down_read(struct rw_semaphore *rwsem);
14182 +extern int  rt_down_write_trylock(struct rw_semaphore *rwsem);
14183 +extern int  rt__down_read_trylock(struct rw_semaphore *rwsem);
14184 +extern int  rt_down_read_trylock(struct rw_semaphore *rwsem);
14185 +extern void __rt_up_read(struct rw_semaphore *rwsem);
14186 +extern void rt_up_read(struct rw_semaphore *rwsem);
14187 +extern void rt_up_write(struct rw_semaphore *rwsem);
14188 +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
14189 +
14190 +#define init_rwsem(sem)                rt_init_rwsem(sem)
14191 +#define rwsem_is_locked(s)     rt_mutex_is_locked(&(s)->lock)
14192 +
14193 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
14194 +{
14195 +       /* rt_mutex_has_waiters() */
14196 +       return !RB_EMPTY_ROOT(&sem->lock.waiters);
14197 +}
14198 +
14199 +static inline void __down_read(struct rw_semaphore *sem)
14200 +{
14201 +       rt__down_read(sem);
14202 +}
14203 +
14204 +static inline void down_read(struct rw_semaphore *sem)
14205 +{
14206 +       rt_down_read(sem);
14207 +}
14208 +
14209 +static inline int __down_read_trylock(struct rw_semaphore *sem)
14210 +{
14211 +       return rt__down_read_trylock(sem);
14212 +}
14213 +
14214 +static inline int down_read_trylock(struct rw_semaphore *sem)
14215 +{
14216 +       return rt_down_read_trylock(sem);
14217 +}
14218 +
14219 +static inline void down_write(struct rw_semaphore *sem)
14220 +{
14221 +       rt_down_write(sem);
14222 +}
14223 +
14224 +static inline int down_write_trylock(struct rw_semaphore *sem)
14225 +{
14226 +       return rt_down_write_trylock(sem);
14227 +}
14228 +
14229 +static inline void __up_read(struct rw_semaphore *sem)
14230 +{
14231 +       __rt_up_read(sem);
14232 +}
14233 +
14234 +static inline void up_read(struct rw_semaphore *sem)
14235 +{
14236 +       rt_up_read(sem);
14237 +}
14238 +
14239 +static inline void up_write(struct rw_semaphore *sem)
14240 +{
14241 +       rt_up_write(sem);
14242 +}
14243 +
14244 +static inline void downgrade_write(struct rw_semaphore *sem)
14245 +{
14246 +       rt_downgrade_write(sem);
14247 +}
14248 +
14249 +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
14250 +{
14251 +       return rt_down_read_nested(sem, subclass);
14252 +}
14253 +
14254 +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
14255 +{
14256 +       rt_down_write_nested(sem, subclass);
14257 +}
14258 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14259 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
14260 +               struct rw_semaphore *nest_lock)
14261 +{
14262 +       rt_down_write_nested_lock(sem, &nest_lock->dep_map);
14263 +}
14264 +
14265 +#else
14266 +
14267 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
14268 +               struct rw_semaphore *nest_lock)
14269 +{
14270 +       rt_down_write_nested_lock(sem, NULL);
14271 +}
14272 +#endif
14273 +#endif
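[Editor sketch] rwsem_rt.h above replaces struct rw_semaphore with an rtmutex plus a read_depth counter; as its header comment states, only one reader may hold the lock at a time, but that owner may read-lock recursively. The generic down_read()/up_read()/down_write()/up_write() calls are unchanged, as in this minimal sketch with illustrative names.

#include <linux/rwsem.h>

static DECLARE_RWSEM(my_sem);
static int my_cfg;

static int my_get_cfg(void)
{
        int val;

        down_read(&my_sem);     /* rt_down_read(): single reader, recursion allowed */
        val = my_cfg;
        up_read(&my_sem);
        return val;
}

static void my_set_cfg(int val)
{
        down_write(&my_sem);
        my_cfg = val;
        up_write(&my_sem);
}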
14274 diff --git a/include/linux/sched.h b/include/linux/sched.h
14275 index 1c0193baea2a..0570d8e022ec 100644
14276 --- a/include/linux/sched.h
14277 +++ b/include/linux/sched.h
14278 @@ -26,6 +26,7 @@ struct sched_param {
14279  #include <linux/nodemask.h>
14280  #include <linux/mm_types.h>
14281  #include <linux/preempt.h>
14282 +#include <asm/kmap_types.h>
14283  
14284  #include <asm/page.h>
14285  #include <asm/ptrace.h>
14286 @@ -182,8 +183,6 @@ extern void update_cpu_load_nohz(void);
14287  static inline void update_cpu_load_nohz(void) { }
14288  #endif
14289  
14290 -extern unsigned long get_parent_ip(unsigned long addr);
14291 -
14292  extern void dump_cpu_task(int cpu);
14293  
14294  struct seq_file;
14295 @@ -242,10 +241,7 @@ extern char ___assert_task_state[1 - 2*!!(
14296                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
14297                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
14298  
14299 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
14300  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
14301 -#define task_is_stopped_or_traced(task)        \
14302 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
14303  #define task_contributes_to_load(task) \
14304                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
14305                                  (task->flags & PF_FROZEN) == 0 && \
14306 @@ -311,6 +307,11 @@ extern char ___assert_task_state[1 - 2*!!(
14307  
14308  #endif
14309  
14310 +#define __set_current_state_no_track(state_value)      \
14311 +       do { current->state = (state_value); } while (0)
14312 +#define set_current_state_no_track(state_value)                \
14313 +       set_mb(current->state, (state_value))
14314 +
14315  /* Task command name length */
14316  #define TASK_COMM_LEN 16
14317  
14318 @@ -970,8 +971,18 @@ struct wake_q_head {
14319         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
14320  
14321  extern void wake_q_add(struct wake_q_head *head,
14322 -                      struct task_struct *task);
14323 -extern void wake_up_q(struct wake_q_head *head);
14324 +                             struct task_struct *task);
14325 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
14326 +
14327 +static inline void wake_up_q(struct wake_q_head *head)
14328 +{
14329 +       __wake_up_q(head, false);
14330 +}
14331 +
14332 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
14333 +{
14334 +       __wake_up_q(head, true);
14335 +}
14336  
14337  /*
14338   * sched-domains (multiprocessor balancing) declarations:
14339 @@ -1379,6 +1390,7 @@ struct tlbflush_unmap_batch {
14340  
14341  struct task_struct {
14342         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
14343 +       volatile long saved_state;      /* saved state for "spinlock sleepers" */
14344         void *stack;
14345         atomic_t usage;
14346         unsigned int flags;     /* per process flags, defined below */
14347 @@ -1415,6 +1427,12 @@ struct task_struct {
14348  #endif
14349  
14350         unsigned int policy;
14351 +#ifdef CONFIG_PREEMPT_RT_FULL
14352 +       int migrate_disable;
14353 +# ifdef CONFIG_SCHED_DEBUG
14354 +       int migrate_disable_atomic;
14355 +# endif
14356 +#endif
14357         int nr_cpus_allowed;
14358         cpumask_t cpus_allowed;
14359  
14360 @@ -1522,11 +1540,14 @@ struct task_struct {
14361         cputime_t gtime;
14362         struct prev_cputime prev_cputime;
14363  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
14364 -       seqlock_t vtime_seqlock;
14365 +       seqcount_t vtime_seqcount;
14366         unsigned long long vtime_snap;
14367         enum {
14368 -               VTIME_SLEEPING = 0,
14369 +               /* Task is sleeping or running in a CPU with VTIME inactive */
14370 +               VTIME_INACTIVE = 0,
14371 +               /* Task runs in userspace in a CPU with VTIME active */
14372                 VTIME_USER,
14373 +               /* Task runs in kernelspace in a CPU with VTIME active */
14374                 VTIME_SYS,
14375         } vtime_snap_whence;
14376  #endif
14377 @@ -1538,6 +1559,9 @@ struct task_struct {
14378  
14379         struct task_cputime cputime_expires;
14380         struct list_head cpu_timers[3];
14381 +#ifdef CONFIG_PREEMPT_RT_BASE
14382 +       struct task_struct *posix_timer_list;
14383 +#endif
14384  
14385  /* process credentials */
14386         const struct cred __rcu *real_cred; /* objective and real subjective task
14387 @@ -1568,10 +1592,15 @@ struct task_struct {
14388  /* signal handlers */
14389         struct signal_struct *signal;
14390         struct sighand_struct *sighand;
14391 +       struct sigqueue *sigqueue_cache;
14392  
14393         sigset_t blocked, real_blocked;
14394         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
14395         struct sigpending pending;
14396 +#ifdef CONFIG_PREEMPT_RT_FULL
14397 +       /* TODO: move me into ->restart_block ? */
14398 +       struct siginfo forced_info;
14399 +#endif
14400  
14401         unsigned long sas_ss_sp;
14402         size_t sas_ss_size;
14403 @@ -1795,6 +1824,12 @@ struct task_struct {
14404         unsigned long trace;
14405         /* bitmask and counter of trace recursion */
14406         unsigned long trace_recursion;
14407 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
14408 +       u64 preempt_timestamp_hist;
14409 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
14410 +       long timer_offset;
14411 +#endif
14412 +#endif
14413  #endif /* CONFIG_TRACING */
14414  #ifdef CONFIG_MEMCG
14415         struct mem_cgroup *memcg_in_oom;
14416 @@ -1811,9 +1846,23 @@ struct task_struct {
14417         unsigned int    sequential_io;
14418         unsigned int    sequential_io_avg;
14419  #endif
14420 +#ifdef CONFIG_PREEMPT_RT_BASE
14421 +       struct rcu_head put_rcu;
14422 +       int softirq_nestcnt;
14423 +       unsigned int softirqs_raised;
14424 +#endif
14425 +#ifdef CONFIG_PREEMPT_RT_FULL
14426 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
14427 +       int kmap_idx;
14428 +       pte_t kmap_pte[KM_TYPE_NR];
14429 +# endif
14430 +#endif
14431  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
14432         unsigned long   task_state_change;
14433  #endif
14434 +#ifdef CONFIG_PREEMPT_RT_FULL
14435 +       int xmit_recursion;
14436 +#endif
14437         int pagefault_disabled;
14438  /* CPU-specific state of this task */
14439         struct thread_struct thread;
14440 @@ -1831,9 +1880,6 @@ extern int arch_task_struct_size __read_mostly;
14441  # define arch_task_struct_size (sizeof(struct task_struct))
14442  #endif
14443  
14444 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
14445 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
14446 -
14447  #define TNF_MIGRATED   0x01
14448  #define TNF_NO_GROUP   0x02
14449  #define TNF_SHARED     0x04
14450 @@ -2023,6 +2069,15 @@ extern struct pid *cad_pid;
14451  extern void free_task(struct task_struct *tsk);
14452  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
14453  
14454 +#ifdef CONFIG_PREEMPT_RT_BASE
14455 +extern void __put_task_struct_cb(struct rcu_head *rhp);
14456 +
14457 +static inline void put_task_struct(struct task_struct *t)
14458 +{
14459 +       if (atomic_dec_and_test(&t->usage))
14460 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
14461 +}
14462 +#else
14463  extern void __put_task_struct(struct task_struct *t);
14464  
14465  static inline void put_task_struct(struct task_struct *t)
14466 @@ -2030,6 +2085,7 @@ static inline void put_task_struct(struct task_struct *t)
14467         if (atomic_dec_and_test(&t->usage))
14468                 __put_task_struct(t);
14469  }
14470 +#endif
14471  
14472  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
14473  extern void task_cputime(struct task_struct *t,
14474 @@ -2068,6 +2124,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
14475  /*
14476   * Per process flags
14477   */
14478 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
14479  #define PF_EXITING     0x00000004      /* getting shut down */
14480  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
14481  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
14482 @@ -2232,6 +2289,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
14483  
14484  extern int set_cpus_allowed_ptr(struct task_struct *p,
14485                                 const struct cpumask *new_mask);
14486 +int migrate_me(void);
14487 +void tell_sched_cpu_down_begin(int cpu);
14488 +void tell_sched_cpu_down_done(int cpu);
14489 +
14490  #else
14491  static inline void do_set_cpus_allowed(struct task_struct *p,
14492                                       const struct cpumask *new_mask)
14493 @@ -2244,6 +2305,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
14494                 return -EINVAL;
14495         return 0;
14496  }
14497 +static inline int migrate_me(void) { return 0; }
14498 +static inline void tell_sched_cpu_down_begin(int cpu) { }
14499 +static inline void tell_sched_cpu_down_done(int cpu) { }
14500  #endif
14501  
14502  #ifdef CONFIG_NO_HZ_COMMON
14503 @@ -2453,6 +2517,7 @@ extern void xtime_update(unsigned long ticks);
14504  
14505  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
14506  extern int wake_up_process(struct task_struct *tsk);
14507 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
14508  extern void wake_up_new_task(struct task_struct *tsk);
14509  #ifdef CONFIG_SMP
14510   extern void kick_process(struct task_struct *tsk);
14511 @@ -2576,12 +2641,24 @@ extern struct mm_struct * mm_alloc(void);
14512  
14513  /* mmdrop drops the mm and the page tables */
14514  extern void __mmdrop(struct mm_struct *);
14515 +
14516  static inline void mmdrop(struct mm_struct * mm)
14517  {
14518         if (unlikely(atomic_dec_and_test(&mm->mm_count)))
14519                 __mmdrop(mm);
14520  }
14521  
14522 +#ifdef CONFIG_PREEMPT_RT_BASE
14523 +extern void __mmdrop_delayed(struct rcu_head *rhp);
14524 +static inline void mmdrop_delayed(struct mm_struct *mm)
14525 +{
14526 +       if (atomic_dec_and_test(&mm->mm_count))
14527 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
14528 +}
14529 +#else
14530 +# define mmdrop_delayed(mm)    mmdrop(mm)
14531 +#endif
14532 +
14533  /* mmput gets rid of the mappings and all user-space */
14534  extern void mmput(struct mm_struct *);
14535  /* Grab a reference to a task's mm, if it is not already going away */
14536 @@ -2891,6 +2968,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
14537         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
14538  }
14539  
14540 +#ifdef CONFIG_PREEMPT_LAZY
14541 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
14542 +{
14543 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14544 +}
14545 +
14546 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
14547 +{
14548 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14549 +}
14550 +
14551 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
14552 +{
14553 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
14554 +}
14555 +
14556 +static inline int need_resched_lazy(void)
14557 +{
14558 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
14559 +}
14560 +
14561 +static inline int need_resched_now(void)
14562 +{
14563 +       return test_thread_flag(TIF_NEED_RESCHED);
14564 +}
14565 +
14566 +#else
14567 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
14568 +static inline int need_resched_lazy(void) { return 0; }
14569 +
14570 +static inline int need_resched_now(void)
14571 +{
14572 +       return test_thread_flag(TIF_NEED_RESCHED);
14573 +}
14574 +
14575 +#endif
14576 +
14577  static inline int restart_syscall(void)
14578  {
14579         set_tsk_thread_flag(current, TIF_SIGPENDING);
14580 @@ -2922,6 +3036,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
14581         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
14582  }
14583  
14584 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
14585 +{
14586 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
14587 +               return true;
14588 +#ifdef CONFIG_PREEMPT_RT_FULL
14589 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
14590 +               return true;
14591 +#endif
14592 +       return false;
14593 +}
14594 +
14595 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
14596 +{
14597 +       bool traced_stopped;
14598 +
14599 +#ifdef CONFIG_PREEMPT_RT_FULL
14600 +       unsigned long flags;
14601 +
14602 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
14603 +       traced_stopped = __task_is_stopped_or_traced(task);
14604 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14605 +#else
14606 +       traced_stopped = __task_is_stopped_or_traced(task);
14607 +#endif
14608 +       return traced_stopped;
14609 +}
14610 +
14611 +static inline bool task_is_traced(struct task_struct *task)
14612 +{
14613 +       bool traced = false;
14614 +
14615 +       if (task->state & __TASK_TRACED)
14616 +               return true;
14617 +#ifdef CONFIG_PREEMPT_RT_FULL
14618 +       /* in case the task is sleeping on tasklist_lock */
14619 +       raw_spin_lock_irq(&task->pi_lock);
14620 +       if (task->state & __TASK_TRACED)
14621 +               traced = true;
14622 +       else if (task->saved_state & __TASK_TRACED)
14623 +               traced = true;
14624 +       raw_spin_unlock_irq(&task->pi_lock);
14625 +#endif
14626 +       return traced;
14627 +}
14628 +
14629  /*
14630   * cond_resched() and cond_resched_lock(): latency reduction via
14631   * explicit rescheduling in places that are safe. The return
14632 @@ -2943,12 +3102,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
14633         __cond_resched_lock(lock);                              \
14634  })
14635  
14636 +#ifndef CONFIG_PREEMPT_RT_FULL
14637  extern int __cond_resched_softirq(void);
14638  
14639  #define cond_resched_softirq() ({                                      \
14640         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
14641         __cond_resched_softirq();                                       \
14642  })
14643 +#else
14644 +# define cond_resched_softirq()                cond_resched()
14645 +#endif
14646  
14647  static inline void cond_resched_rcu(void)
14648  {
14649 @@ -3110,6 +3273,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
14650  
14651  #endif /* CONFIG_SMP */
14652  
14653 +static inline int __migrate_disabled(struct task_struct *p)
14654 +{
14655 +#ifdef CONFIG_PREEMPT_RT_FULL
14656 +       return p->migrate_disable;
14657 +#else
14658 +       return 0;
14659 +#endif
14660 +}
14661 +
14662 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
14663 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
14664 +{
14665 +       if (__migrate_disabled(p))
14666 +               return cpumask_of(task_cpu(p));
14667 +
14668 +       return &p->cpus_allowed;
14669 +}
14670 +
14671 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
14672 +{
14673 +       if (__migrate_disabled(p))
14674 +               return 1;
14675 +       return p->nr_cpus_allowed;
14676 +}
14677 +
14678  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
14679  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
14680  
14681 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
14682 index e0582106ef4f..b14f4d2368aa 100644
14683 --- a/include/linux/seqlock.h
14684 +++ b/include/linux/seqlock.h
14685 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
14686         return __read_seqcount_retry(s, start);
14687  }
14688  
14689 -
14690 -
14691 -static inline void raw_write_seqcount_begin(seqcount_t *s)
14692 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
14693  {
14694         s->sequence++;
14695         smp_wmb();
14696  }
14697  
14698 -static inline void raw_write_seqcount_end(seqcount_t *s)
14699 +static inline void raw_write_seqcount_begin(seqcount_t *s)
14700 +{
14701 +       preempt_disable_rt();
14702 +       __raw_write_seqcount_begin(s);
14703 +}
14704 +
14705 +static inline void __raw_write_seqcount_end(seqcount_t *s)
14706  {
14707         smp_wmb();
14708         s->sequence++;
14709  }
14710  
14711 +static inline void raw_write_seqcount_end(seqcount_t *s)
14712 +{
14713 +       __raw_write_seqcount_end(s);
14714 +       preempt_enable_rt();
14715 +}
14716 +
14717  /**
14718   * raw_write_seqcount_barrier - do a seq write barrier
14719   * @s: pointer to seqcount_t
14720 @@ -425,10 +435,32 @@ typedef struct {
14721  /*
14722   * Read side functions for starting and finalizing a read side section.
14723   */
14724 +#ifndef CONFIG_PREEMPT_RT_FULL
14725  static inline unsigned read_seqbegin(const seqlock_t *sl)
14726  {
14727         return read_seqcount_begin(&sl->seqcount);
14728  }
14729 +#else
14730 +/*
14731 + * Starvation safe read side for RT
14732 + */
14733 +static inline unsigned read_seqbegin(seqlock_t *sl)
14734 +{
14735 +       unsigned ret;
14736 +
14737 +repeat:
14738 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
14739 +       if (unlikely(ret & 1)) {
14740 +               /*
14740 +                * Take the lock and let the writer proceed (i.e. possibly
14741 +                * Take the lock and let the writer proceed (i.e. evtl
14742 +                * boost it), otherwise we could loop here forever.
14743 +                */
14744 +               spin_unlock_wait(&sl->lock);
14745 +               goto repeat;
14746 +       }
14747 +       return ret;
14748 +}
14749 +#endif
14750  
14751  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
14752  {
14753 @@ -443,36 +475,36 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
14754  static inline void write_seqlock(seqlock_t *sl)
14755  {
14756         spin_lock(&sl->lock);
14757 -       write_seqcount_begin(&sl->seqcount);
14758 +       __raw_write_seqcount_begin(&sl->seqcount);
14759  }
14760  
14761  static inline void write_sequnlock(seqlock_t *sl)
14762  {
14763 -       write_seqcount_end(&sl->seqcount);
14764 +       __raw_write_seqcount_end(&sl->seqcount);
14765         spin_unlock(&sl->lock);
14766  }
14767  
14768  static inline void write_seqlock_bh(seqlock_t *sl)
14769  {
14770         spin_lock_bh(&sl->lock);
14771 -       write_seqcount_begin(&sl->seqcount);
14772 +       __raw_write_seqcount_begin(&sl->seqcount);
14773  }
14774  
14775  static inline void write_sequnlock_bh(seqlock_t *sl)
14776  {
14777 -       write_seqcount_end(&sl->seqcount);
14778 +       __raw_write_seqcount_end(&sl->seqcount);
14779         spin_unlock_bh(&sl->lock);
14780  }
14781  
14782  static inline void write_seqlock_irq(seqlock_t *sl)
14783  {
14784         spin_lock_irq(&sl->lock);
14785 -       write_seqcount_begin(&sl->seqcount);
14786 +       __raw_write_seqcount_begin(&sl->seqcount);
14787  }
14788  
14789  static inline void write_sequnlock_irq(seqlock_t *sl)
14790  {
14791 -       write_seqcount_end(&sl->seqcount);
14792 +       __raw_write_seqcount_end(&sl->seqcount);
14793         spin_unlock_irq(&sl->lock);
14794  }
14795  
14796 @@ -481,7 +513,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
14797         unsigned long flags;
14798  
14799         spin_lock_irqsave(&sl->lock, flags);
14800 -       write_seqcount_begin(&sl->seqcount);
14801 +       __raw_write_seqcount_begin(&sl->seqcount);
14802         return flags;
14803  }
14804  
14805 @@ -491,7 +523,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
14806  static inline void
14807  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
14808  {
14809 -       write_seqcount_end(&sl->seqcount);
14810 +       __raw_write_seqcount_end(&sl->seqcount);
14811         spin_unlock_irqrestore(&sl->lock, flags);
14812  }
14813  
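[Editor sketch] The seqlock.h changes above split out __raw_write_seqcount_begin()/__raw_write_seqcount_end() and give write_seqlock() and read_seqbegin() RT-aware behaviour: the RT reader waits on the spinlock via spin_unlock_wait() instead of spinning on the sequence count, so it cannot starve behind a preempted writer. The usual reader/writer pattern is unchanged; a minimal sketch with illustrative names.

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(my_seqlock);
static u64 my_time_ns;

static void my_update_time(u64 now)
{
        write_seqlock(&my_seqlock);     /* takes sl->lock, then bumps the seqcount */
        my_time_ns = now;
        write_sequnlock(&my_seqlock);
}

static u64 my_read_time(void)
{
        unsigned int seq;
        u64 val;

        do {
                seq = read_seqbegin(&my_seqlock);
                val = my_time_ns;
        } while (read_seqretry(&my_seqlock, seq));

        return val;
}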
14814 diff --git a/include/linux/signal.h b/include/linux/signal.h
14815 index d80259afb9e5..ddd1e6866a54 100644
14816 --- a/include/linux/signal.h
14817 +++ b/include/linux/signal.h
14818 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
14819  }
14820  
14821  extern void flush_sigqueue(struct sigpending *queue);
14822 +extern void flush_task_sigqueue(struct task_struct *tsk);
14823  
14824  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
14825  static inline int valid_signal(unsigned long sig)
14826 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
14827 index d443d9ab0236..2d1c7f9b7fd0 100644
14828 --- a/include/linux/skbuff.h
14829 +++ b/include/linux/skbuff.h
14830 @@ -203,6 +203,7 @@ struct sk_buff_head {
14831  
14832         __u32           qlen;
14833         spinlock_t      lock;
14834 +       raw_spinlock_t  raw_lock;
14835  };
14836  
14837  struct sk_buff;
14838 @@ -1465,6 +1466,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
14839         __skb_queue_head_init(list);
14840  }
14841  
14842 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
14843 +{
14844 +       raw_spin_lock_init(&list->raw_lock);
14845 +       __skb_queue_head_init(list);
14846 +}
14847 +
14848  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
14849                 struct lock_class_key *class)
14850  {
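[Editor sketch] The skbuff.h hunk adds a raw_lock next to the regular lock in sk_buff_head, plus skb_queue_head_init_raw(), so RT code that must queue skbs from truly atomic context can pair the raw spinlock with the unlocked __skb_* helpers. A hedged sketch; the queue name is illustrative, GFP choices and error handling are omitted, and skb_queue_head_init_raw() exists only with this patch applied.

#include <linux/skbuff.h>
#include <linux/spinlock.h>

static struct sk_buff_head my_queue;

static void my_queue_init(void)
{
        skb_queue_head_init_raw(&my_queue);     /* initializes ->raw_lock */
}

static void my_enqueue(struct sk_buff *skb)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&my_queue.raw_lock, flags);
        __skb_queue_tail(&my_queue, skb);       /* lockless helper; caller holds raw_lock */
        raw_spin_unlock_irqrestore(&my_queue.raw_lock, flags);
}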
14851 diff --git a/include/linux/smp.h b/include/linux/smp.h
14852 index c4414074bd88..e6ab36aeaaab 100644
14853 --- a/include/linux/smp.h
14854 +++ b/include/linux/smp.h
14855 @@ -185,6 +185,9 @@ static inline void smp_init(void) { }
14856  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
14857  #define put_cpu()              preempt_enable()
14858  
14859 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
14860 +#define put_cpu_light()                migrate_enable()
14861 +
14862  /*
14863   * Callback to arch code if there's nosmp or maxcpus=0 on the
14864   * boot command line:
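[Editor sketch] smp.h gains get_cpu_light()/put_cpu_light(), which pin the task with migrate_disable() instead of disabling preemption, keeping the per-CPU section preemptible on RT. A minimal sketch with a hypothetical per-CPU counter; the helpers exist only with this patch applied.

#include <linux/smp.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, my_hits);

static void my_count_hit(void)
{
        int cpu;

        cpu = get_cpu_light();          /* migrate_disable() + smp_processor_id() */
        /*
         * Other tasks on this CPU may still preempt us here; real code
         * would also serialize access to my_hits (e.g. with a local lock)
         * if they touch the same data.
         */
        per_cpu(my_hits, cpu)++;
        put_cpu_light();                /* migrate_enable() */
}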
14865 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
14866 index 47dd0cebd204..02928fa5499d 100644
14867 --- a/include/linux/spinlock.h
14868 +++ b/include/linux/spinlock.h
14869 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
14870  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
14871  
14872  /* Include rwlock functions */
14873 -#include <linux/rwlock.h>
14874 +#ifdef CONFIG_PREEMPT_RT_FULL
14875 +# include <linux/rwlock_rt.h>
14876 +#else
14877 +# include <linux/rwlock.h>
14878 +#endif
14879  
14880  /*
14881   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
14882 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
14883  # include <linux/spinlock_api_up.h>
14884  #endif
14885  
14886 +#ifdef CONFIG_PREEMPT_RT_FULL
14887 +# include <linux/spinlock_rt.h>
14888 +#else /* PREEMPT_RT_FULL */
14889 +
14890  /*
14891   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
14892   */
14893 @@ -347,6 +355,12 @@ static __always_inline void spin_unlock(spinlock_t *lock)
14894         raw_spin_unlock(&lock->rlock);
14895  }
14896  
14897 +static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
14898 +{
14899 +       raw_spin_unlock(&lock->rlock);
14900 +       return 0;
14901 +}
14902 +
14903  static __always_inline void spin_unlock_bh(spinlock_t *lock)
14904  {
14905         raw_spin_unlock_bh(&lock->rlock);
14906 @@ -416,4 +430,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
14907  #define atomic_dec_and_lock(atomic, lock) \
14908                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
14909  
14910 +#endif /* !PREEMPT_RT_FULL */
14911 +
14912  #endif /* __LINUX_SPINLOCK_H */
14913 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
14914 index 5344268e6e62..043263f30e81 100644
14915 --- a/include/linux/spinlock_api_smp.h
14916 +++ b/include/linux/spinlock_api_smp.h
14917 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
14918         return 0;
14919  }
14920  
14921 -#include <linux/rwlock_api_smp.h>
14922 +#ifndef CONFIG_PREEMPT_RT_FULL
14923 +# include <linux/rwlock_api_smp.h>
14924 +#endif
14925  
14926  #endif /* __LINUX_SPINLOCK_API_SMP_H */
14927 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
14928 new file mode 100644
14929 index 000000000000..7eb87584e843
14930 --- /dev/null
14931 +++ b/include/linux/spinlock_rt.h
14932 @@ -0,0 +1,165 @@
14933 +#ifndef __LINUX_SPINLOCK_RT_H
14934 +#define __LINUX_SPINLOCK_RT_H
14935 +
14936 +#ifndef __LINUX_SPINLOCK_H
14937 +#error Do not include directly. Use spinlock.h
14938 +#endif
14939 +
14940 +#include <linux/bug.h>
14941 +
14942 +extern void
14943 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
14944 +
14945 +#define spin_lock_init(slock)                          \
14946 +do {                                                   \
14947 +       static struct lock_class_key __key;             \
14948 +                                                       \
14949 +       rt_mutex_init(&(slock)->lock);                  \
14950 +       __rt_spin_lock_init(slock, #slock, &__key);     \
14951 +} while (0)
14952 +
14953 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
14954 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
14955 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
14956 +
14957 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
14958 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
14959 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
14960 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
14961 +extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
14962 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
14963 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
14964 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
14965 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
14966 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
14967 +
14968 +/*
14969 + * lockdep-less calls, for derived types like rwlock:
14970 + * (for trylock they can use rt_mutex_trylock() directly.
14971 + */
14972 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
14973 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
14974 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
14975 +extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
14976 +
14977 +#define spin_lock(lock)                        rt_spin_lock(lock)
14978 +
14979 +#define spin_lock_bh(lock)                     \
14980 +       do {                                    \
14981 +               local_bh_disable();             \
14982 +               rt_spin_lock(lock);             \
14983 +       } while (0)
14984 +
14985 +#define spin_lock_irq(lock)            spin_lock(lock)
14986 +
14987 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
14988 +
14989 +#define spin_trylock(lock)                     \
14990 +({                                             \
14991 +       int __locked;                           \
14992 +       __locked = spin_do_trylock(lock);       \
14993 +       __locked;                               \
14994 +})
14995 +
14996 +#ifdef CONFIG_LOCKDEP
14997 +# define spin_lock_nested(lock, subclass)              \
14998 +       do {                                            \
14999 +               rt_spin_lock_nested(lock, subclass);    \
15000 +       } while (0)
15001 +
15002 +#define spin_lock_bh_nested(lock, subclass)            \
15003 +       do {                                            \
15004 +               local_bh_disable();                     \
15005 +               rt_spin_lock_nested(lock, subclass);    \
15006 +       } while (0)
15007 +
15008 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15009 +       do {                                             \
15010 +               typecheck(unsigned long, flags);         \
15011 +               flags = 0;                               \
15012 +               rt_spin_lock_nested(lock, subclass);     \
15013 +       } while (0)
15014 +#else
15015 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
15016 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
15017 +
15018 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15019 +       do {                                             \
15020 +               typecheck(unsigned long, flags);         \
15021 +               flags = 0;                               \
15022 +               spin_lock(lock);                         \
15023 +       } while (0)
15024 +#endif
15025 +
15026 +#define spin_lock_irqsave(lock, flags)                  \
15027 +       do {                                             \
15028 +               typecheck(unsigned long, flags);         \
15029 +               flags = 0;                               \
15030 +               spin_lock(lock);                         \
15031 +       } while (0)
15032 +
15033 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
15034 +{
15035 +       unsigned long flags = 0;
15036 +#ifdef CONFIG_TRACE_IRQFLAGS
15037 +       flags = rt_spin_lock_trace_flags(lock);
15038 +#else
15039 +       spin_lock(lock); /* lock_local */
15040 +#endif
15041 +       return flags;
15042 +}
15043 +
15044 +/* FIXME: we need rt_spin_lock_nest_lock */
15045 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
15046 +
15047 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
15048 +#define spin_unlock_no_deboost(lock)           rt_spin_unlock_no_deboost(lock)
15049 +
15050 +#define spin_unlock_bh(lock)                           \
15051 +       do {                                            \
15052 +               rt_spin_unlock(lock);                   \
15053 +               local_bh_enable();                      \
15054 +       } while (0)
15055 +
15056 +#define spin_unlock_irq(lock)          spin_unlock(lock)
15057 +
15058 +#define spin_unlock_irqrestore(lock, flags)            \
15059 +       do {                                            \
15060 +               typecheck(unsigned long, flags);        \
15061 +               (void) flags;                           \
15062 +               spin_unlock(lock);                      \
15063 +       } while (0)
15064 +
15065 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
15066 +#define spin_trylock_irq(lock) spin_trylock(lock)
15067 +
15068 +#define spin_trylock_irqsave(lock, flags)      \
15069 +       rt_spin_trylock_irqsave(lock, &(flags))
15070 +
15071 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
15072 +
15073 +#ifdef CONFIG_GENERIC_LOCKBREAK
15074 +# define spin_is_contended(lock)       ((lock)->break_lock)
15075 +#else
15076 +# define spin_is_contended(lock)       (((void)(lock), 0))
15077 +#endif
15078 +
15079 +static inline int spin_can_lock(spinlock_t *lock)
15080 +{
15081 +       return !rt_mutex_is_locked(&lock->lock);
15082 +}
15083 +
15084 +static inline int spin_is_locked(spinlock_t *lock)
15085 +{
15086 +       return rt_mutex_is_locked(&lock->lock);
15087 +}
15088 +
15089 +static inline void assert_spin_locked(spinlock_t *lock)
15090 +{
15091 +       BUG_ON(!spin_is_locked(lock));
15092 +}
15093 +
15094 +#define atomic_dec_and_lock(atomic, lock) \
15095 +       atomic_dec_and_spin_lock(atomic, lock)
15096 +
15097 +#endif
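[Editor sketch] spinlock_rt.h above maps the whole spinlock_t API onto rtmutex-backed rt_spin_lock() calls; spin_lock_irq() and spin_lock_irqsave() no longer disable interrupts, and the flags argument is only type-checked and zeroed. Generic code keeps its familiar shape, as in this minimal sketch with illustrative names.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);
static int my_state;

static void my_set_state(int state)
{
        unsigned long flags;

        /* On RT this may sleep if contended and leaves interrupts enabled. */
        spin_lock_irqsave(&my_lock, flags);
        my_state = state;
        spin_unlock_irqrestore(&my_lock, flags);
}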
15098 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
15099 index 73548eb13a5d..10bac715ea96 100644
15100 --- a/include/linux/spinlock_types.h
15101 +++ b/include/linux/spinlock_types.h
15102 @@ -9,80 +9,15 @@
15103   * Released under the General Public License (GPL).
15104   */
15105  
15106 -#if defined(CONFIG_SMP)
15107 -# include <asm/spinlock_types.h>
15108 -#else
15109 -# include <linux/spinlock_types_up.h>
15110 -#endif
15111 -
15112 -#include <linux/lockdep.h>
15113 -
15114 -typedef struct raw_spinlock {
15115 -       arch_spinlock_t raw_lock;
15116 -#ifdef CONFIG_GENERIC_LOCKBREAK
15117 -       unsigned int break_lock;
15118 -#endif
15119 -#ifdef CONFIG_DEBUG_SPINLOCK
15120 -       unsigned int magic, owner_cpu;
15121 -       void *owner;
15122 -#endif
15123 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15124 -       struct lockdep_map dep_map;
15125 -#endif
15126 -} raw_spinlock_t;
15127 -
15128 -#define SPINLOCK_MAGIC         0xdead4ead
15129 -
15130 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15131 -
15132 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15133 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15134 -#else
15135 -# define SPIN_DEP_MAP_INIT(lockname)
15136 -#endif
15137 +#include <linux/spinlock_types_raw.h>
15138  
15139 -#ifdef CONFIG_DEBUG_SPINLOCK
15140 -# define SPIN_DEBUG_INIT(lockname)             \
15141 -       .magic = SPINLOCK_MAGIC,                \
15142 -       .owner_cpu = -1,                        \
15143 -       .owner = SPINLOCK_OWNER_INIT,
15144 +#ifndef CONFIG_PREEMPT_RT_FULL
15145 +# include <linux/spinlock_types_nort.h>
15146 +# include <linux/rwlock_types.h>
15147  #else
15148 -# define SPIN_DEBUG_INIT(lockname)
15149 +# include <linux/rtmutex.h>
15150 +# include <linux/spinlock_types_rt.h>
15151 +# include <linux/rwlock_types_rt.h>
15152  #endif
15153  
15154 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15155 -       {                                       \
15156 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15157 -       SPIN_DEBUG_INIT(lockname)               \
15158 -       SPIN_DEP_MAP_INIT(lockname) }
15159 -
15160 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15161 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15162 -
15163 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15164 -
15165 -typedef struct spinlock {
15166 -       union {
15167 -               struct raw_spinlock rlock;
15168 -
15169 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15170 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15171 -               struct {
15172 -                       u8 __padding[LOCK_PADSIZE];
15173 -                       struct lockdep_map dep_map;
15174 -               };
15175 -#endif
15176 -       };
15177 -} spinlock_t;
15178 -
15179 -#define __SPIN_LOCK_INITIALIZER(lockname) \
15180 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15181 -
15182 -#define __SPIN_LOCK_UNLOCKED(lockname) \
15183 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15184 -
15185 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15186 -
15187 -#include <linux/rwlock_types.h>
15188 -
15189  #endif /* __LINUX_SPINLOCK_TYPES_H */
15190 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
15191 new file mode 100644
15192 index 000000000000..f1dac1fb1d6a
15193 --- /dev/null
15194 +++ b/include/linux/spinlock_types_nort.h
15195 @@ -0,0 +1,33 @@
15196 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
15197 +#define __LINUX_SPINLOCK_TYPES_NORT_H
15198 +
15199 +#ifndef __LINUX_SPINLOCK_TYPES_H
15200 +#error "Do not include directly. Include spinlock_types.h instead"
15201 +#endif
15202 +
15203 +/*
15204 + * The non RT version maps spinlocks to raw_spinlocks
15205 + */
15206 +typedef struct spinlock {
15207 +       union {
15208 +               struct raw_spinlock rlock;
15209 +
15210 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15211 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15212 +               struct {
15213 +                       u8 __padding[LOCK_PADSIZE];
15214 +                       struct lockdep_map dep_map;
15215 +               };
15216 +#endif
15217 +       };
15218 +} spinlock_t;
15219 +
15220 +#define __SPIN_LOCK_INITIALIZER(lockname) \
15221 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15222 +
15223 +#define __SPIN_LOCK_UNLOCKED(lockname) \
15224 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15225 +
15226 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15227 +
15228 +#endif
15229 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
15230 new file mode 100644
15231 index 000000000000..edffc4d53fc9
15232 --- /dev/null
15233 +++ b/include/linux/spinlock_types_raw.h
15234 @@ -0,0 +1,56 @@
15235 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
15236 +#define __LINUX_SPINLOCK_TYPES_RAW_H
15237 +
15238 +#if defined(CONFIG_SMP)
15239 +# include <asm/spinlock_types.h>
15240 +#else
15241 +# include <linux/spinlock_types_up.h>
15242 +#endif
15243 +
15244 +#include <linux/lockdep.h>
15245 +
15246 +typedef struct raw_spinlock {
15247 +       arch_spinlock_t raw_lock;
15248 +#ifdef CONFIG_GENERIC_LOCKBREAK
15249 +       unsigned int break_lock;
15250 +#endif
15251 +#ifdef CONFIG_DEBUG_SPINLOCK
15252 +       unsigned int magic, owner_cpu;
15253 +       void *owner;
15254 +#endif
15255 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15256 +       struct lockdep_map dep_map;
15257 +#endif
15258 +} raw_spinlock_t;
15259 +
15260 +#define SPINLOCK_MAGIC         0xdead4ead
15261 +
15262 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15263 +
15264 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15265 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15266 +#else
15267 +# define SPIN_DEP_MAP_INIT(lockname)
15268 +#endif
15269 +
15270 +#ifdef CONFIG_DEBUG_SPINLOCK
15271 +# define SPIN_DEBUG_INIT(lockname)             \
15272 +       .magic = SPINLOCK_MAGIC,                \
15273 +       .owner_cpu = -1,                        \
15274 +       .owner = SPINLOCK_OWNER_INIT,
15275 +#else
15276 +# define SPIN_DEBUG_INIT(lockname)
15277 +#endif
15278 +
15279 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15280 +       {                                       \
15281 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15282 +       SPIN_DEBUG_INIT(lockname)               \
15283 +       SPIN_DEP_MAP_INIT(lockname) }
15284 +
15285 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15286 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15287 +
15288 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15289 +
15290 +#endif
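[Editor sketch] spinlock_types_raw.h above moves the raw_spinlock_t definition into its own header so rtmutex.h can include it without pulling in the (now sleeping) spinlock_t types. raw_spinlock_t keeps its usual semantics on RT: it really spins and really disables interrupts. A minimal sketch with illustrative names.

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(my_raw_lock);
static unsigned int my_hw_shadow;

static void my_touch_hw(unsigned int val)
{
        unsigned long flags;

        /* Raw locks stay non-sleeping even on PREEMPT_RT_FULL. */
        raw_spin_lock_irqsave(&my_raw_lock, flags);
        my_hw_shadow = val;
        raw_spin_unlock_irqrestore(&my_raw_lock, flags);
}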
15291 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
15292 new file mode 100644
15293 index 000000000000..9fd431967abc
15294 --- /dev/null
15295 +++ b/include/linux/spinlock_types_rt.h
15296 @@ -0,0 +1,51 @@
15297 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
15298 +#define __LINUX_SPINLOCK_TYPES_RT_H
15299 +
15300 +#ifndef __LINUX_SPINLOCK_TYPES_H
15301 +#error "Do not include directly. Include spinlock_types.h instead"
15302 +#endif
15303 +
15304 +#include <linux/cache.h>
15305 +
15306 +/*
15307 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
15308 + */
15309 +typedef struct spinlock {
15310 +       struct rt_mutex         lock;
15311 +       unsigned int            break_lock;
15312 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15313 +       struct lockdep_map      dep_map;
15314 +#endif
15315 +} spinlock_t;
15316 +
15317 +#ifdef CONFIG_DEBUG_RT_MUTEXES
15318 +# define __RT_SPIN_INITIALIZER(name) \
15319 +       { \
15320 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15321 +       .save_state = 1, \
15322 +       .file = __FILE__, \
15323 +       .line = __LINE__ , \
15324 +       }
15325 +#else
15326 +# define __RT_SPIN_INITIALIZER(name) \
15327 +       {                                                               \
15328 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
15329 +       .save_state = 1, \
15330 +       }
15331 +#endif
15332 +
15333 +/*
15334 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
15335 +*/
15336 +
15337 +#define __SPIN_LOCK_UNLOCKED(name)                     \
15338 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
15339 +         SPIN_DEP_MAP_INIT(name) }
15340 +
15341 +#define __DEFINE_SPINLOCK(name) \
15342 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
15343 +
15344 +#define DEFINE_SPINLOCK(name) \
15345 +       spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name)
15346 +
15347 +#endif
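
For orientation, a minimal caller-side sketch (hypothetical names, not part of
this patch): consumers of spinlock_t keep using the ordinary spin_lock() /
spin_unlock() API; only the type definitions above decide whether that maps to
a raw_spinlock (!RT) or to an rt_mutex based sleeping lock (PREEMPT_RT).

	/* Illustrative only; "graph_lock"/"graph_count" are made-up names. */
	static DEFINE_SPINLOCK(graph_lock);
	static unsigned long graph_count;

	static void graph_inc(void)
	{
		spin_lock(&graph_lock);		/* may sleep on contention under RT */
		graph_count++;
		spin_unlock(&graph_lock);
	}
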
15348 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
15349 index f5f80c5643ac..ec1a8f01563c 100644
15350 --- a/include/linux/srcu.h
15351 +++ b/include/linux/srcu.h
15352 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
15353  
15354  void process_srcu(struct work_struct *work);
15355  
15356 -#define __SRCU_STRUCT_INIT(name)                                       \
15357 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
15358         {                                                               \
15359                 .completed = -300,                                      \
15360 -               .per_cpu_ref = &name##_srcu_array,                      \
15361 +               .per_cpu_ref = &pcpu_name,                              \
15362                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
15363                 .running = false,                                       \
15364                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
15365 @@ -104,7 +104,7 @@ void process_srcu(struct work_struct *work);
15366   */
15367  #define __DEFINE_SRCU(name, is_static)                                 \
15368         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
15369 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15370 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
15371  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
15372  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
15373  
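
The two-argument __SRCU_STRUCT_INIT() above only changes how the per-CPU array
is passed in; DEFINE_SRCU()/DEFINE_STATIC_SRCU() users are unaffected. A short
usage sketch with the standard SRCU API (hypothetical names "my_srcu",
"my_data", read_side(), update_side()):

	DEFINE_STATIC_SRCU(my_srcu);
	static int my_data;

	static int read_side(void)
	{
		int idx, val;

		idx = srcu_read_lock(&my_srcu);
		val = READ_ONCE(my_data);	/* SRCU-protected value */
		srcu_read_unlock(&my_srcu, idx);
		return val;
	}

	static void update_side(int new)
	{
		WRITE_ONCE(my_data, new);
		synchronize_srcu(&my_srcu);	/* wait out pre-existing readers */
	}
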
15374 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
15375 index 8b6ec7ef0854..9b77d4cc929f 100644
15376 --- a/include/linux/suspend.h
15377 +++ b/include/linux/suspend.h
15378 @@ -194,6 +194,12 @@ struct platform_freeze_ops {
15379         void (*end)(void);
15380  };
15381  
15382 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
15383 +extern bool pm_in_action;
15384 +#else
15385 +# define pm_in_action false
15386 +#endif
15387 +
15388  #ifdef CONFIG_SUSPEND
15389  /**
15390   * suspend_set_ops - set platform dependent suspend operations
15391 diff --git a/include/linux/swait.h b/include/linux/swait.h
15392 new file mode 100644
15393 index 000000000000..83f004a72320
15394 --- /dev/null
15395 +++ b/include/linux/swait.h
15396 @@ -0,0 +1,173 @@
15397 +#ifndef _LINUX_SWAIT_H
15398 +#define _LINUX_SWAIT_H
15399 +
15400 +#include <linux/list.h>
15401 +#include <linux/stddef.h>
15402 +#include <linux/spinlock.h>
15403 +#include <asm/current.h>
15404 +
15405 +/*
15406 + * Simple wait queues
15407 + *
15408 + * While these are very similar to the other/complex wait queues (wait.h) the
15409 + * most important difference is that the simple waitqueue allows for
15410 + * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
15411 + * times.
15412 + *
15413 + * In order to make this so, we had to drop a fair number of features of the
15414 + * other waitqueue code; notably:
15415 + *
15416 + *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
15417 + *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
15418 + *    sleeper state.
15419 + *
15420 + *  - the exclusive mode; because this requires preserving the list order
15421 + *    and this is hard.
15422 + *
15423 + *  - custom wake functions; because you cannot give any guarantees about
15424 + *    random code.
15425 + *
15426 + * As a side effect of this, the data structures are slimmer.
15427 + *
15428 + * Use of this wait queue is recommended where possible.
15429 + */
15430 +
15431 +struct task_struct;
15432 +
15433 +struct swait_queue_head {
15434 +       raw_spinlock_t          lock;
15435 +       struct list_head        task_list;
15436 +};
15437 +
15438 +struct swait_queue {
15439 +       struct task_struct      *task;
15440 +       struct list_head        task_list;
15441 +};
15442 +
15443 +#define __SWAITQUEUE_INITIALIZER(name) {                               \
15444 +       .task           = current,                                      \
15445 +       .task_list      = LIST_HEAD_INIT((name).task_list),             \
15446 +}
15447 +
15448 +#define DECLARE_SWAITQUEUE(name)                                       \
15449 +       struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
15450 +
15451 +#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) {                         \
15452 +       .lock           = __RAW_SPIN_LOCK_UNLOCKED(name.lock),          \
15453 +       .task_list      = LIST_HEAD_INIT((name).task_list),             \
15454 +}
15455 +
15456 +#define DECLARE_SWAIT_QUEUE_HEAD(name)                                 \
15457 +       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
15458 +
15459 +extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
15460 +                                   struct lock_class_key *key);
15461 +
15462 +#define init_swait_queue_head(q)                               \
15463 +       do {                                                    \
15464 +               static struct lock_class_key __key;             \
15465 +               __init_swait_queue_head((q), #q, &__key);       \
15466 +       } while (0)
15467 +
15468 +#ifdef CONFIG_LOCKDEP
15469 +# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)                 \
15470 +       ({ init_swait_queue_head(&name); name; })
15471 +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
15472 +       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
15473 +#else
15474 +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
15475 +       DECLARE_SWAIT_QUEUE_HEAD(name)
15476 +#endif
15477 +
15478 +static inline int swait_active(struct swait_queue_head *q)
15479 +{
15480 +       return !list_empty(&q->task_list);
15481 +}
15482 +
15483 +extern void swake_up(struct swait_queue_head *q);
15484 +extern void swake_up_all(struct swait_queue_head *q);
15485 +extern void swake_up_locked(struct swait_queue_head *q);
15486 +extern void swake_up_all_locked(struct swait_queue_head *q);
15487 +
15488 +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
15489 +extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
15490 +extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
15491 +
15492 +extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
15493 +extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
15494 +
15495 +/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
15496 +#define ___swait_event(wq, condition, state, ret, cmd)                 \
15497 +({                                                                     \
15498 +       struct swait_queue __wait;                                      \
15499 +       long __ret = ret;                                               \
15500 +                                                                       \
15501 +       INIT_LIST_HEAD(&__wait.task_list);                              \
15502 +       for (;;) {                                                      \
15503 +               long __int = prepare_to_swait_event(&wq, &__wait, state);\
15504 +                                                                       \
15505 +               if (condition)                                          \
15506 +                       break;                                          \
15507 +                                                                       \
15508 +               if (___wait_is_interruptible(state) && __int) {         \
15509 +                       __ret = __int;                                  \
15510 +                       break;                                          \
15511 +               }                                                       \
15512 +                                                                       \
15513 +               cmd;                                                    \
15514 +       }                                                               \
15515 +       finish_swait(&wq, &__wait);                                     \
15516 +       __ret;                                                          \
15517 +})
15518 +
15519 +#define __swait_event(wq, condition)                                   \
15520 +       (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,    \
15521 +                           schedule())
15522 +
15523 +#define swait_event(wq, condition)                                     \
15524 +do {                                                                   \
15525 +       if (condition)                                                  \
15526 +               break;                                                  \
15527 +       __swait_event(wq, condition);                                   \
15528 +} while (0)
15529 +
15530 +#define __swait_event_timeout(wq, condition, timeout)                  \
15531 +       ___swait_event(wq, ___wait_cond_timeout(condition),             \
15532 +                     TASK_UNINTERRUPTIBLE, timeout,                    \
15533 +                     __ret = schedule_timeout(__ret))
15534 +
15535 +#define swait_event_timeout(wq, condition, timeout)                    \
15536 +({                                                                     \
15537 +       long __ret = timeout;                                           \
15538 +       if (!___wait_cond_timeout(condition))                           \
15539 +               __ret = __swait_event_timeout(wq, condition, timeout);  \
15540 +       __ret;                                                          \
15541 +})
15542 +
15543 +#define __swait_event_interruptible(wq, condition)                     \
15544 +       ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,            \
15545 +                     schedule())
15546 +
15547 +#define swait_event_interruptible(wq, condition)                       \
15548 +({                                                                     \
15549 +       int __ret = 0;                                                  \
15550 +       if (!(condition))                                               \
15551 +               __ret = __swait_event_interruptible(wq, condition);     \
15552 +       __ret;                                                          \
15553 +})
15554 +
15555 +#define __swait_event_interruptible_timeout(wq, condition, timeout)    \
15556 +       ___swait_event(wq, ___wait_cond_timeout(condition),             \
15557 +                     TASK_INTERRUPTIBLE, timeout,                      \
15558 +                     __ret = schedule_timeout(__ret))
15559 +
15560 +#define swait_event_interruptible_timeout(wq, condition, timeout)      \
15561 +({                                                                     \
15562 +       long __ret = timeout;                                           \
15563 +       if (!___wait_cond_timeout(condition))                           \
15564 +               __ret = __swait_event_interruptible_timeout(wq,         \
15565 +                                               condition, timeout);    \
15566 +       __ret;                                                          \
15567 +})
15568 +
15569 +#endif /* _LINUX_SWAIT_H */
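
A short usage sketch of the simple-waitqueue API above (hypothetical names
"done"/"done_wq"/waiter()/completer(); only the swait calls themselves come
from this header):

	static DECLARE_SWAIT_QUEUE_HEAD(done_wq);
	static bool done;

	static int waiter(void *unused)
	{
		/* sleeps TASK_INTERRUPTIBLE until "done" turns true */
		return swait_event_interruptible(done_wq, done);
	}

	static void completer(void)
	{
		done = true;
		swake_up(&done_wq);	/* swake_up_all() would wake every waiter */
	}
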
15570 diff --git a/include/linux/swap.h b/include/linux/swap.h
15571 index d8ca2eaa3a8b..19e038054914 100644
15572 --- a/include/linux/swap.h
15573 +++ b/include/linux/swap.h
15574 @@ -11,6 +11,7 @@
15575  #include <linux/fs.h>
15576  #include <linux/atomic.h>
15577  #include <linux/page-flags.h>
15578 +#include <linux/locallock.h>
15579  #include <asm/page.h>
15580  
15581  struct notifier_block;
15582 @@ -252,7 +253,8 @@ struct swap_info_struct {
15583  void *workingset_eviction(struct address_space *mapping, struct page *page);
15584  bool workingset_refault(void *shadow);
15585  void workingset_activation(struct page *page);
15586 -extern struct list_lru workingset_shadow_nodes;
15587 +extern struct list_lru __workingset_shadow_nodes;
15588 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
15589  
15590  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
15591  {
15592 @@ -298,6 +300,7 @@ extern unsigned long nr_free_pagecache_pages(void);
15593  
15594  
15595  /* linux/mm/swap.c */
15596 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
15597  extern void lru_cache_add(struct page *);
15598  extern void lru_cache_add_anon(struct page *page);
15599  extern void lru_cache_add_file(struct page *page);
15600 diff --git a/include/linux/swork.h b/include/linux/swork.h
15601 new file mode 100644
15602 index 000000000000..f175fa9a6016
15603 --- /dev/null
15604 +++ b/include/linux/swork.h
15605 @@ -0,0 +1,24 @@
15606 +#ifndef _LINUX_SWORK_H
15607 +#define _LINUX_SWORK_H
15608 +
15609 +#include <linux/list.h>
15610 +
15611 +struct swork_event {
15612 +       struct list_head item;
15613 +       unsigned long flags;
15614 +       void (*func)(struct swork_event *);
15615 +};
15616 +
15617 +static inline void INIT_SWORK(struct swork_event *event,
15618 +                             void (*func)(struct swork_event *))
15619 +{
15620 +       event->flags = 0;
15621 +       event->func = func;
15622 +}
15623 +
15624 +bool swork_queue(struct swork_event *sev);
15625 +
15626 +int swork_get(void);
15627 +void swork_put(void);
15628 +
15629 +#endif /* _LINUX_SWORK_H */
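
A sketch of how the swork interface above might be used (illustrative names;
swork_get()/swork_put() manage the single worker-thread reference, and
swork_queue() is intended to be callable from contexts that cannot sleep):

	static struct swork_event my_event;	/* hypothetical example event */

	static void my_work_fn(struct swork_event *sev)
	{
		/* runs later, in the swork kernel thread */
	}

	static int my_init(void)
	{
		int ret = swork_get();		/* create/reference the worker */

		if (ret)
			return ret;
		INIT_SWORK(&my_event, my_work_fn);
		return 0;
	}

	static void my_trigger(void)
	{
		swork_queue(&my_event);		/* defer my_work_fn() */
	}

	static void my_exit(void)
	{
		swork_put();			/* drop the worker reference */
	}
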
15630 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
15631 index ff307b548ed3..be9f9dc6a4e1 100644
15632 --- a/include/linux/thread_info.h
15633 +++ b/include/linux/thread_info.h
15634 @@ -102,7 +102,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
15635  #define test_thread_flag(flag) \
15636         test_ti_thread_flag(current_thread_info(), flag)
15637  
15638 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
15639 +#ifdef CONFIG_PREEMPT_LAZY
15640 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
15641 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
15642 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
15643 +#define tif_need_resched_lazy()        (test_thread_flag(TIF_NEED_RESCHED_LAZY))
15644 +
15645 +#else
15646 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
15647 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
15648 +#define tif_need_resched_lazy()        0
15649 +#endif
15650  
15651  #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
15652  /*
15653 diff --git a/include/linux/timer.h b/include/linux/timer.h
15654 index 61aa61dc410c..299d2b78591f 100644
15655 --- a/include/linux/timer.h
15656 +++ b/include/linux/timer.h
15657 @@ -225,7 +225,7 @@ extern void add_timer(struct timer_list *timer);
15658  
15659  extern int try_to_del_timer_sync(struct timer_list *timer);
15660  
15661 -#ifdef CONFIG_SMP
15662 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
15663    extern int del_timer_sync(struct timer_list *timer);
15664  #else
15665  # define del_timer_sync(t)             del_timer(t)
15666 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
15667 index 925730bc9fc1..a591f414da6f 100644
15668 --- a/include/linux/trace_events.h
15669 +++ b/include/linux/trace_events.h
15670 @@ -66,6 +66,9 @@ struct trace_entry {
15671         unsigned char           flags;
15672         unsigned char           preempt_count;
15673         int                     pid;
15674 +       unsigned short          migrate_disable;
15675 +       unsigned short          padding;
15676 +       unsigned char           preempt_lazy_count;
15677  };
15678  
15679  #define TRACE_EVENT_TYPE_MAX                                           \
15680 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
15681 index 558129af828a..cf5c472bbc79 100644
15682 --- a/include/linux/uaccess.h
15683 +++ b/include/linux/uaccess.h
15684 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
15685   */
15686  static inline void pagefault_disable(void)
15687  {
15688 +       migrate_disable();
15689         pagefault_disabled_inc();
15690         /*
15691          * make sure to have issued the store before a pagefault
15692 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
15693          */
15694         barrier();
15695         pagefault_disabled_dec();
15696 +       migrate_enable();
15697  }
15698  
15699  /*
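
With migrate_disable()/migrate_enable() added above, a pagefault-disabled
region also stays on one CPU under PREEMPT_RT. Callers do not change; the
usual shape is roughly as follows (hedged sketch, "peek_user_word" is a
made-up helper):

	static bool peek_user_word(const unsigned long __user *src,
				   unsigned long *dst)
	{
		unsigned long left;

		pagefault_disable();	/* now also disables migration on RT */
		left = __copy_from_user_inatomic(dst, src, sizeof(*dst));
		pagefault_enable();
		return left == 0;	/* 0 bytes left uncopied == success */
	}
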
15700 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
15701 index 4a29c75b146e..0a294e950df8 100644
15702 --- a/include/linux/uprobes.h
15703 +++ b/include/linux/uprobes.h
15704 @@ -27,6 +27,7 @@
15705  #include <linux/errno.h>
15706  #include <linux/rbtree.h>
15707  #include <linux/types.h>
15708 +#include <linux/wait.h>
15709  
15710  struct vm_area_struct;
15711  struct mm_struct;
15712 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
15713 index 3e5d9075960f..7eaa847cd5a5 100644
15714 --- a/include/linux/vmstat.h
15715 +++ b/include/linux/vmstat.h
15716 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
15717   */
15718  static inline void __count_vm_event(enum vm_event_item item)
15719  {
15720 +       preempt_disable_rt();
15721         raw_cpu_inc(vm_event_states.event[item]);
15722 +       preempt_enable_rt();
15723  }
15724  
15725  static inline void count_vm_event(enum vm_event_item item)
15726 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
15727  
15728  static inline void __count_vm_events(enum vm_event_item item, long delta)
15729  {
15730 +       preempt_disable_rt();
15731         raw_cpu_add(vm_event_states.event[item], delta);
15732 +       preempt_enable_rt();
15733  }
15734  
15735  static inline void count_vm_events(enum vm_event_item item, long delta)
15736 diff --git a/include/linux/wait.h b/include/linux/wait.h
15737 index 513b36f04dfd..981c8a840f96 100644
15738 --- a/include/linux/wait.h
15739 +++ b/include/linux/wait.h
15740 @@ -8,6 +8,7 @@
15741  #include <linux/spinlock.h>
15742  #include <asm/current.h>
15743  #include <uapi/linux/wait.h>
15744 +#include <linux/atomic.h>
15745  
15746  typedef struct __wait_queue wait_queue_t;
15747  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
15748 diff --git a/include/net/dst.h b/include/net/dst.h
15749 index c7329dcd90cc..35c3dba16728 100644
15750 --- a/include/net/dst.h
15751 +++ b/include/net/dst.h
15752 @@ -437,7 +437,7 @@ static inline void dst_confirm(struct dst_entry *dst)
15753  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
15754                                    struct sk_buff *skb)
15755  {
15756 -       const struct hh_cache *hh;
15757 +       struct hh_cache *hh;
15758  
15759         if (dst->pending_confirm) {
15760                 unsigned long now = jiffies;
15761 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
15762 index 8b683841e574..bf656008f6e7 100644
15763 --- a/include/net/neighbour.h
15764 +++ b/include/net/neighbour.h
15765 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
15766  }
15767  #endif
15768  
15769 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
15770 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
15771  {
15772         unsigned int seq;
15773         int hh_len;
15774 @@ -501,7 +501,7 @@ struct neighbour_cb {
15775  
15776  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
15777  
15778 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
15779 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
15780                                      const struct net_device *dev)
15781  {
15782         unsigned int seq;
15783 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
15784 index c68926b4899c..dd0751e76065 100644
15785 --- a/include/net/netns/ipv4.h
15786 +++ b/include/net/netns/ipv4.h
15787 @@ -70,6 +70,7 @@ struct netns_ipv4 {
15788  
15789         int sysctl_icmp_echo_ignore_all;
15790         int sysctl_icmp_echo_ignore_broadcasts;
15791 +       int sysctl_icmp_echo_sysrq;
15792         int sysctl_icmp_ignore_bogus_error_responses;
15793         int sysctl_icmp_ratelimit;
15794         int sysctl_icmp_ratemask;
15795 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
15796 new file mode 100644
15797 index 000000000000..f7710de1b1f3
15798 --- /dev/null
15799 +++ b/include/trace/events/hist.h
15800 @@ -0,0 +1,73 @@
15801 +#undef TRACE_SYSTEM
15802 +#define TRACE_SYSTEM hist
15803 +
15804 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
15805 +#define _TRACE_HIST_H
15806 +
15807 +#include "latency_hist.h"
15808 +#include <linux/tracepoint.h>
15809 +
15810 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
15811 +#define trace_preemptirqsoff_hist(a, b)
15812 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
15813 +#else
15814 +TRACE_EVENT(preemptirqsoff_hist,
15815 +
15816 +       TP_PROTO(int reason, int starthist),
15817 +
15818 +       TP_ARGS(reason, starthist),
15819 +
15820 +       TP_STRUCT__entry(
15821 +               __field(int,    reason)
15822 +               __field(int,    starthist)
15823 +       ),
15824 +
15825 +       TP_fast_assign(
15826 +               __entry->reason         = reason;
15827 +               __entry->starthist      = starthist;
15828 +       ),
15829 +
15830 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
15831 +                 __entry->starthist ? "start" : "stop")
15832 +);
15833 +#endif
15834 +
15835 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
15836 +#define trace_hrtimer_interrupt(a, b, c, d)
15837 +#else
15838 +TRACE_EVENT(hrtimer_interrupt,
15839 +
15840 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
15841 +               struct task_struct *task),
15842 +
15843 +       TP_ARGS(cpu, offset, curr, task),
15844 +
15845 +       TP_STRUCT__entry(
15846 +               __field(int,            cpu)
15847 +               __field(long long,      offset)
15848 +               __array(char,           ccomm,  TASK_COMM_LEN)
15849 +               __field(int,            cprio)
15850 +               __array(char,           tcomm,  TASK_COMM_LEN)
15851 +               __field(int,            tprio)
15852 +       ),
15853 +
15854 +       TP_fast_assign(
15855 +               __entry->cpu    = cpu;
15856 +               __entry->offset = offset;
15857 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
15858 +               __entry->cprio  = curr->prio;
15859 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
15860 +                       task != NULL ? TASK_COMM_LEN : 7);
15861 +               __entry->tprio  = task != NULL ? task->prio : -1;
15862 +       ),
15863 +
15864 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
15865 +               __entry->cpu, __entry->offset, __entry->ccomm,
15866 +               __entry->cprio, __entry->tcomm, __entry->tprio)
15867 +);
15868 +#endif
15869 +
15870 +#endif /* _TRACE_HIST_H */
15871 +
15872 +/* This part must be outside protection */
15873 +#include <trace/define_trace.h>
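
The tracepoints above are meant to be emitted from the irqsoff/preemptoff and
hrtimer instrumentation. A rough, hypothetical call site for the first one
(reason values come from latency_hist.h below; starthist is 1 for "start",
0 for "stop"):

	static void example_irqs_off_section(void)
	{
		trace_preemptirqsoff_hist(IRQS_OFF, 1);	/* section begins */
		/* critical, interrupts-off work */
		trace_preemptirqsoff_hist(IRQS_ON, 0);	/* section ends */
	}
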
15874 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
15875 new file mode 100644
15876 index 000000000000..d3f2fbd560b1
15877 --- /dev/null
15878 +++ b/include/trace/events/latency_hist.h
15879 @@ -0,0 +1,29 @@
15880 +#ifndef _LATENCY_HIST_H
15881 +#define _LATENCY_HIST_H
15882 +
15883 +enum hist_action {
15884 +       IRQS_ON,
15885 +       PREEMPT_ON,
15886 +       TRACE_STOP,
15887 +       IRQS_OFF,
15888 +       PREEMPT_OFF,
15889 +       TRACE_START,
15890 +};
15891 +
15892 +static char *actions[] = {
15893 +       "IRQS_ON",
15894 +       "PREEMPT_ON",
15895 +       "TRACE_STOP",
15896 +       "IRQS_OFF",
15897 +       "PREEMPT_OFF",
15898 +       "TRACE_START",
15899 +};
15900 +
15901 +static inline char *getaction(int action)
15902 +{
15903 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
15904 +               return actions[action];
15905 +       return "unknown";
15906 +}
15907 +
15908 +#endif /* _LATENCY_HIST_H */
15909 diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
15910 index fff846b512e6..73614ce1d204 100644
15911 --- a/include/trace/events/writeback.h
15912 +++ b/include/trace/events/writeback.h
15913 @@ -134,58 +134,28 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
15914  #ifdef CREATE_TRACE_POINTS
15915  #ifdef CONFIG_CGROUP_WRITEBACK
15916  
15917 -static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
15918 +static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
15919  {
15920 -       return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
15921 +       return wb->memcg_css->cgroup->kn->ino;
15922  }
15923  
15924 -static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
15925 -{
15926 -       struct cgroup *cgrp = wb->memcg_css->cgroup;
15927 -       char *path;
15928 -
15929 -       path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
15930 -       WARN_ON_ONCE(path != buf);
15931 -}
15932 -
15933 -static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
15934 -{
15935 -       if (wbc->wb)
15936 -               return __trace_wb_cgroup_size(wbc->wb);
15937 -       else
15938 -               return 2;
15939 -}
15940 -
15941 -static inline void __trace_wbc_assign_cgroup(char *buf,
15942 -                                            struct writeback_control *wbc)
15943 +static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
15944  {
15945         if (wbc->wb)
15946 -               __trace_wb_assign_cgroup(buf, wbc->wb);
15947 +               return __trace_wb_assign_cgroup(wbc->wb);
15948         else
15949 -               strcpy(buf, "/");
15950 +               return -1U;
15951  }
15952 -
15953  #else  /* CONFIG_CGROUP_WRITEBACK */
15954  
15955 -static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
15956 -{
15957 -       return 2;
15958 -}
15959 -
15960 -static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
15961 -{
15962 -       strcpy(buf, "/");
15963 -}
15964 -
15965 -static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
15966 +static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
15967  {
15968 -       return 2;
15969 +       return -1U;
15970  }
15971  
15972 -static inline void __trace_wbc_assign_cgroup(char *buf,
15973 -                                            struct writeback_control *wbc)
15974 +static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
15975  {
15976 -       strcpy(buf, "/");
15977 +       return -1U;
15978  }
15979  
15980  #endif /* CONFIG_CGROUP_WRITEBACK */
15981 @@ -201,7 +171,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
15982                 __array(char, name, 32)
15983                 __field(unsigned long, ino)
15984                 __field(int, sync_mode)
15985 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
15986 +               __field(unsigned int, cgroup_ino)
15987         ),
15988  
15989         TP_fast_assign(
15990 @@ -209,14 +179,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
15991                         dev_name(inode_to_bdi(inode)->dev), 32);
15992                 __entry->ino            = inode->i_ino;
15993                 __entry->sync_mode      = wbc->sync_mode;
15994 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
15995 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
15996         ),
15997  
15998 -       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
15999 +       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%u",
16000                 __entry->name,
16001                 __entry->ino,
16002                 __entry->sync_mode,
16003 -               __get_str(cgroup)
16004 +               __entry->cgroup_ino
16005         )
16006  );
16007  
16008 @@ -246,7 +216,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16009                 __field(int, range_cyclic)
16010                 __field(int, for_background)
16011                 __field(int, reason)
16012 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16013 +               __field(unsigned int, cgroup_ino)
16014         ),
16015         TP_fast_assign(
16016                 strncpy(__entry->name,
16017 @@ -258,10 +228,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16018                 __entry->range_cyclic = work->range_cyclic;
16019                 __entry->for_background = work->for_background;
16020                 __entry->reason = work->reason;
16021 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16022 +               __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
16023         ),
16024         TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
16025 -                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
16026 +                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%u",
16027                   __entry->name,
16028                   MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
16029                   __entry->nr_pages,
16030 @@ -270,7 +240,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16031                   __entry->range_cyclic,
16032                   __entry->for_background,
16033                   __print_symbolic(__entry->reason, WB_WORK_REASON),
16034 -                 __get_str(cgroup)
16035 +                 __entry->cgroup_ino
16036         )
16037  );
16038  #define DEFINE_WRITEBACK_WORK_EVENT(name) \
16039 @@ -300,15 +270,15 @@ DECLARE_EVENT_CLASS(writeback_class,
16040         TP_ARGS(wb),
16041         TP_STRUCT__entry(
16042                 __array(char, name, 32)
16043 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16044 +               __field(unsigned int, cgroup_ino)
16045         ),
16046         TP_fast_assign(
16047                 strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
16048 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16049 +               __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
16050         ),
16051 -       TP_printk("bdi %s: cgroup=%s",
16052 +       TP_printk("bdi %s: cgroup_ino=%u",
16053                   __entry->name,
16054 -                 __get_str(cgroup)
16055 +                 __entry->cgroup_ino
16056         )
16057  );
16058  #define DEFINE_WRITEBACK_EVENT(name) \
16059 @@ -347,7 +317,7 @@ DECLARE_EVENT_CLASS(wbc_class,
16060                 __field(int, range_cyclic)
16061                 __field(long, range_start)
16062                 __field(long, range_end)
16063 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16064 +               __field(unsigned int, cgroup_ino)
16065         ),
16066  
16067         TP_fast_assign(
16068 @@ -361,12 +331,12 @@ DECLARE_EVENT_CLASS(wbc_class,
16069                 __entry->range_cyclic   = wbc->range_cyclic;
16070                 __entry->range_start    = (long)wbc->range_start;
16071                 __entry->range_end      = (long)wbc->range_end;
16072 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16073 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16074         ),
16075  
16076         TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
16077                 "bgrd=%d reclm=%d cyclic=%d "
16078 -               "start=0x%lx end=0x%lx cgroup=%s",
16079 +               "start=0x%lx end=0x%lx cgroup_ino=%u",
16080                 __entry->name,
16081                 __entry->nr_to_write,
16082                 __entry->pages_skipped,
16083 @@ -377,7 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
16084                 __entry->range_cyclic,
16085                 __entry->range_start,
16086                 __entry->range_end,
16087 -               __get_str(cgroup)
16088 +               __entry->cgroup_ino
16089         )
16090  )
16091  
16092 @@ -398,7 +368,7 @@ TRACE_EVENT(writeback_queue_io,
16093                 __field(long,           age)
16094                 __field(int,            moved)
16095                 __field(int,            reason)
16096 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16097 +               __field(unsigned int,   cgroup_ino)
16098         ),
16099         TP_fast_assign(
16100                 unsigned long *older_than_this = work->older_than_this;
16101 @@ -408,15 +378,15 @@ TRACE_EVENT(writeback_queue_io,
16102                                   (jiffies - *older_than_this) * 1000 / HZ : -1;
16103                 __entry->moved  = moved;
16104                 __entry->reason = work->reason;
16105 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16106 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16107         ),
16108 -       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
16109 +       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%u",
16110                 __entry->name,
16111                 __entry->older, /* older_than_this in jiffies */
16112                 __entry->age,   /* older_than_this in relative milliseconds */
16113                 __entry->moved,
16114                 __print_symbolic(__entry->reason, WB_WORK_REASON),
16115 -               __get_str(cgroup)
16116 +               __entry->cgroup_ino
16117         )
16118  );
16119  
16120 @@ -484,7 +454,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16121                 __field(unsigned long,  dirty_ratelimit)
16122                 __field(unsigned long,  task_ratelimit)
16123                 __field(unsigned long,  balanced_dirty_ratelimit)
16124 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16125 +               __field(unsigned int,   cgroup_ino)
16126         ),
16127  
16128         TP_fast_assign(
16129 @@ -496,13 +466,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16130                 __entry->task_ratelimit = KBps(task_ratelimit);
16131                 __entry->balanced_dirty_ratelimit =
16132                                         KBps(wb->balanced_dirty_ratelimit);
16133 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16134 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16135         ),
16136  
16137         TP_printk("bdi %s: "
16138                   "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
16139                   "dirty_ratelimit=%lu task_ratelimit=%lu "
16140 -                 "balanced_dirty_ratelimit=%lu cgroup=%s",
16141 +                 "balanced_dirty_ratelimit=%lu cgroup_ino=%u",
16142                   __entry->bdi,
16143                   __entry->write_bw,            /* write bandwidth */
16144                   __entry->avg_write_bw,        /* avg write bandwidth */
16145 @@ -510,7 +480,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16146                   __entry->dirty_ratelimit,     /* base ratelimit */
16147                   __entry->task_ratelimit, /* ratelimit with position control */
16148                   __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
16149 -                 __get_str(cgroup)
16150 +                 __entry->cgroup_ino
16151         )
16152  );
16153  
16154 @@ -548,7 +518,7 @@ TRACE_EVENT(balance_dirty_pages,
16155                 __field(         long,  pause)
16156                 __field(unsigned long,  period)
16157                 __field(         long,  think)
16158 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16159 +               __field(unsigned int,   cgroup_ino)
16160         ),
16161  
16162         TP_fast_assign(
16163 @@ -571,7 +541,7 @@ TRACE_EVENT(balance_dirty_pages,
16164                 __entry->period         = period * 1000 / HZ;
16165                 __entry->pause          = pause * 1000 / HZ;
16166                 __entry->paused         = (jiffies - start_time) * 1000 / HZ;
16167 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16168 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16169         ),
16170  
16171  
16172 @@ -580,7 +550,7 @@ TRACE_EVENT(balance_dirty_pages,
16173                   "bdi_setpoint=%lu bdi_dirty=%lu "
16174                   "dirty_ratelimit=%lu task_ratelimit=%lu "
16175                   "dirtied=%u dirtied_pause=%u "
16176 -                 "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
16177 +                 "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%u",
16178                   __entry->bdi,
16179                   __entry->limit,
16180                   __entry->setpoint,
16181 @@ -595,7 +565,7 @@ TRACE_EVENT(balance_dirty_pages,
16182                   __entry->pause,       /* ms */
16183                   __entry->period,      /* ms */
16184                   __entry->think,       /* ms */
16185 -                 __get_str(cgroup)
16186 +                 __entry->cgroup_ino
16187           )
16188  );
16189  
16190 @@ -609,8 +579,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
16191                 __field(unsigned long, ino)
16192                 __field(unsigned long, state)
16193                 __field(unsigned long, dirtied_when)
16194 -               __dynamic_array(char, cgroup,
16195 -                               __trace_wb_cgroup_size(inode_to_wb(inode)))
16196 +               __field(unsigned int, cgroup_ino)
16197         ),
16198  
16199         TP_fast_assign(
16200 @@ -619,16 +588,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
16201                 __entry->ino            = inode->i_ino;
16202                 __entry->state          = inode->i_state;
16203                 __entry->dirtied_when   = inode->dirtied_when;
16204 -               __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
16205 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(inode_to_wb(inode));
16206         ),
16207  
16208 -       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
16209 +       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%u",
16210                   __entry->name,
16211                   __entry->ino,
16212                   show_inode_state(__entry->state),
16213                   __entry->dirtied_when,
16214                   (jiffies - __entry->dirtied_when) / HZ,
16215 -                 __get_str(cgroup)
16216 +                 __entry->cgroup_ino
16217         )
16218  );
16219  
16220 @@ -684,7 +653,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16221                 __field(unsigned long, writeback_index)
16222                 __field(long, nr_to_write)
16223                 __field(unsigned long, wrote)
16224 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16225 +               __field(unsigned int, cgroup_ino)
16226         ),
16227  
16228         TP_fast_assign(
16229 @@ -696,11 +665,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16230                 __entry->writeback_index = inode->i_mapping->writeback_index;
16231                 __entry->nr_to_write    = nr_to_write;
16232                 __entry->wrote          = nr_to_write - wbc->nr_to_write;
16233 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16234 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16235         ),
16236  
16237         TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
16238 -                 "index=%lu to_write=%ld wrote=%lu cgroup=%s",
16239 +                 "index=%lu to_write=%ld wrote=%lu cgroup_ino=%u",
16240                   __entry->name,
16241                   __entry->ino,
16242                   show_inode_state(__entry->state),
16243 @@ -709,7 +678,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16244                   __entry->writeback_index,
16245                   __entry->nr_to_write,
16246                   __entry->wrote,
16247 -                 __get_str(cgroup)
16248 +                 __entry->cgroup_ino
16249         )
16250  );
16251  
16252 diff --git a/init/Kconfig b/init/Kconfig
16253 index 235c7a2c0d20..a7c81c0911da 100644
16254 --- a/init/Kconfig
16255 +++ b/init/Kconfig
16256 @@ -498,7 +498,7 @@ config TINY_RCU
16257  
16258  config RCU_EXPERT
16259         bool "Make expert-level adjustments to RCU configuration"
16260 -       default n
16261 +       default y if PREEMPT_RT_FULL
16262         help
16263           This option needs to be enabled if you wish to make
16264           expert-level adjustments to RCU configuration.  By default,
16265 @@ -614,7 +614,7 @@ config RCU_FANOUT_LEAF
16266  
16267  config RCU_FAST_NO_HZ
16268         bool "Accelerate last non-dyntick-idle CPU's grace periods"
16269 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
16270 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
16271         default n
16272         help
16273           This option permits CPUs to enter dynticks-idle state even if
16274 @@ -641,7 +641,7 @@ config TREE_RCU_TRACE
16275  config RCU_BOOST
16276         bool "Enable RCU priority boosting"
16277         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
16278 -       default n
16279 +       default y if PREEMPT_RT_FULL
16280         help
16281           This option boosts the priority of preempted RCU readers that
16282           block the current preemptible RCU grace period for too long.
16283 @@ -1106,6 +1106,7 @@ config CFS_BANDWIDTH
16284  config RT_GROUP_SCHED
16285         bool "Group scheduling for SCHED_RR/FIFO"
16286         depends on CGROUP_SCHED
16287 +       depends on !PREEMPT_RT_FULL
16288         default n
16289         help
16290           This feature lets you explicitly allocate real CPU bandwidth
16291 @@ -1719,6 +1720,7 @@ choice
16292  
16293  config SLAB
16294         bool "SLAB"
16295 +       depends on !PREEMPT_RT_FULL
16296         help
16297           The regular slab allocator that is established and known to work
16298           well in all environments. It organizes cache hot objects in
16299 @@ -1737,6 +1739,7 @@ config SLUB
16300  config SLOB
16301         depends on EXPERT
16302         bool "SLOB (Simple Allocator)"
16303 +       depends on !PREEMPT_RT_FULL
16304         help
16305            SLOB replaces the stock allocator with a drastically simpler
16306            allocator. SLOB is generally more space efficient but
16307 @@ -1746,7 +1749,7 @@ endchoice
16308  
16309  config SLUB_CPU_PARTIAL
16310         default y
16311 -       depends on SLUB && SMP
16312 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
16313         bool "SLUB per cpu partial cache"
16314         help
16315           Per cpu partial caches accelerate object allocation and freeing
16316 diff --git a/init/Makefile b/init/Makefile
16317 index 7bc47ee31c36..88cf473554e0 100644
16318 --- a/init/Makefile
16319 +++ b/init/Makefile
16320 @@ -33,4 +33,4 @@ silent_chk_compile.h = :
16321  include/generated/compile.h: FORCE
16322         @$($(quiet)chk_compile.h)
16323         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
16324 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
16325 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
16326 diff --git a/init/main.c b/init/main.c
16327 index 9e64d7097f1a..4a76e629c137 100644
16328 --- a/init/main.c
16329 +++ b/init/main.c
16330 @@ -530,6 +530,7 @@ asmlinkage __visible void __init start_kernel(void)
16331         setup_command_line(command_line);
16332         setup_nr_cpu_ids();
16333         setup_per_cpu_areas();
16334 +       softirq_early_init();
16335         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
16336  
16337         build_all_zonelists(NULL, NULL);
16338 diff --git a/ipc/msg.c b/ipc/msg.c
16339 index c6521c205cb4..996d89023552 100644
16340 --- a/ipc/msg.c
16341 +++ b/ipc/msg.c
16342 @@ -183,20 +183,14 @@ static void ss_wakeup(struct list_head *h, int kill)
16343         }
16344  }
16345  
16346 -static void expunge_all(struct msg_queue *msq, int res)
16347 +static void expunge_all(struct msg_queue *msq, int res,
16348 +                       struct wake_q_head *wake_q)
16349  {
16350         struct msg_receiver *msr, *t;
16351  
16352         list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
16353 -               msr->r_msg = NULL; /* initialize expunge ordering */
16354 -               wake_up_process(msr->r_tsk);
16355 -               /*
16356 -                * Ensure that the wakeup is visible before setting r_msg as
16357 -                * the receiving end depends on it: either spinning on a nil,
16358 -                * or dealing with -EAGAIN cases. See lockless receive part 1
16359 -                * and 2 in do_msgrcv().
16360 -                */
16361 -               smp_wmb(); /* barrier (B) */
16362 +
16363 +               wake_q_add(wake_q, msr->r_tsk);
16364                 msr->r_msg = ERR_PTR(res);
16365         }
16366  }
16367 @@ -213,11 +207,13 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
16368  {
16369         struct msg_msg *msg, *t;
16370         struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
16371 +       WAKE_Q(wake_q);
16372  
16373 -       expunge_all(msq, -EIDRM);
16374 +       expunge_all(msq, -EIDRM, &wake_q);
16375         ss_wakeup(&msq->q_senders, 1);
16376         msg_rmid(ns, msq);
16377         ipc_unlock_object(&msq->q_perm);
16378 +       wake_up_q(&wake_q);
16379         rcu_read_unlock();
16380  
16381         list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
16382 @@ -342,6 +338,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16383         struct kern_ipc_perm *ipcp;
16384         struct msqid64_ds uninitialized_var(msqid64);
16385         struct msg_queue *msq;
16386 +       WAKE_Q(wake_q);
16387         int err;
16388  
16389         if (cmd == IPC_SET) {
16390 @@ -389,7 +386,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16391                 /* sleeping receivers might be excluded by
16392                  * stricter permissions.
16393                  */
16394 -               expunge_all(msq, -EAGAIN);
16395 +               expunge_all(msq, -EAGAIN, &wake_q);
16396                 /* sleeping senders might be able to send
16397                  * due to a larger queue size.
16398                  */
16399 @@ -402,6 +399,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16400  
16401  out_unlock0:
16402         ipc_unlock_object(&msq->q_perm);
16403 +       wake_up_q(&wake_q);
16404  out_unlock1:
16405         rcu_read_unlock();
16406  out_up:
16407 @@ -566,7 +564,8 @@ static int testmsg(struct msg_msg *msg, long type, int mode)
16408         return 0;
16409  }
16410  
16411 -static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
16412 +static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg,
16413 +                                struct wake_q_head *wake_q)
16414  {
16415         struct msg_receiver *msr, *t;
16416  
16417 @@ -577,27 +576,13 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
16418  
16419                         list_del(&msr->r_list);
16420                         if (msr->r_maxsize < msg->m_ts) {
16421 -                               /* initialize pipelined send ordering */
16422 -                               msr->r_msg = NULL;
16423 -                               wake_up_process(msr->r_tsk);
16424 -                               /* barrier (B) see barrier comment below */
16425 -                               smp_wmb();
16426 +                               wake_q_add(wake_q, msr->r_tsk);
16427                                 msr->r_msg = ERR_PTR(-E2BIG);
16428                         } else {
16429 -                               msr->r_msg = NULL;
16430                                 msq->q_lrpid = task_pid_vnr(msr->r_tsk);
16431                                 msq->q_rtime = get_seconds();
16432 -                               wake_up_process(msr->r_tsk);
16433 -                               /*
16434 -                                * Ensure that the wakeup is visible before
16435 -                                * setting r_msg, as the receiving can otherwise
16436 -                                * exit - once r_msg is set, the receiver can
16437 -                                * continue. See lockless receive part 1 and 2
16438 -                                * in do_msgrcv(). Barrier (B).
16439 -                                */
16440 -                               smp_wmb();
16441 +                               wake_q_add(wake_q, msr->r_tsk);
16442                                 msr->r_msg = msg;
16443 -
16444                                 return 1;
16445                         }
16446                 }
16447 @@ -613,6 +598,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16448         struct msg_msg *msg;
16449         int err;
16450         struct ipc_namespace *ns;
16451 +       WAKE_Q(wake_q);
16452  
16453         ns = current->nsproxy->ipc_ns;
16454  
16455 @@ -698,7 +684,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16456         msq->q_lspid = task_tgid_vnr(current);
16457         msq->q_stime = get_seconds();
16458  
16459 -       if (!pipelined_send(msq, msg)) {
16460 +       if (!pipelined_send(msq, msg, &wake_q)) {
16461                 /* no one is waiting for this message, enqueue it */
16462                 list_add_tail(&msg->m_list, &msq->q_messages);
16463                 msq->q_cbytes += msgsz;
16464 @@ -712,6 +698,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16465  
16466  out_unlock0:
16467         ipc_unlock_object(&msq->q_perm);
16468 +       wake_up_q(&wake_q);
16469  out_unlock1:
16470         rcu_read_unlock();
16471         if (msg != NULL)
16472 @@ -932,57 +919,25 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
16473                 rcu_read_lock();
16474  
16475                 /* Lockless receive, part 2:
16476 -                * Wait until pipelined_send or expunge_all are outside of
16477 -                * wake_up_process(). There is a race with exit(), see
16478 -                * ipc/mqueue.c for the details. The correct serialization
16479 -                * ensures that a receiver cannot continue without the wakeup
16480 -                * being visibible _before_ setting r_msg:
16481 +                * The work in pipelined_send() and expunge_all():
16482 +                * - Set pointer to message
16483 +                * - Queue the receiver task for later wakeup
16484 +                * - Wake up the process after the lock is dropped.
16485                  *
16486 -                * CPU 0                             CPU 1
16487 -                * <loop receiver>
16488 -                *   smp_rmb(); (A) <-- pair -.      <waker thread>
16489 -                *   <load ->r_msg>           |        msr->r_msg = NULL;
16490 -                *                            |        wake_up_process();
16491 -                * <continue>                 `------> smp_wmb(); (B)
16492 -                *                                     msr->r_msg = msg;
16493 -                *
16494 -                * Where (A) orders the message value read and where (B) orders
16495 -                * the write to the r_msg -- done in both pipelined_send and
16496 -                * expunge_all.
16497 +                * Should the process wake up before this wakeup (due to a
16498 +                * signal) it will either see the message and continue â€¦
16499                  */
16500 -               for (;;) {
16501 -                       /*
16502 -                        * Pairs with writer barrier in pipelined_send
16503 -                        * or expunge_all.
16504 -                        */
16505 -                       smp_rmb(); /* barrier (A) */
16506 -                       msg = (struct msg_msg *)msr_d.r_msg;
16507 -                       if (msg)
16508 -                               break;
16509  
16510 -                       /*
16511 -                        * The cpu_relax() call is a compiler barrier
16512 -                        * which forces everything in this loop to be
16513 -                        * re-loaded.
16514 -                        */
16515 -                       cpu_relax();
16516 -               }
16517 -
16518 -               /* Lockless receive, part 3:
16519 -                * If there is a message or an error then accept it without
16520 -                * locking.
16521 -                */
16522 +               msg = (struct msg_msg *)msr_d.r_msg;
16523                 if (msg != ERR_PTR(-EAGAIN))
16524                         goto out_unlock1;
16525  
16526 -               /* Lockless receive, part 3:
16527 -                * Acquire the queue spinlock.
16528 -                */
16529 +                /*
16530 +                 * â€¦ or see -EAGAIN, acquire the lock to check the message
16531 +                 * again.
16532 +                 */
16533                 ipc_lock_object(&msq->q_perm);
16534  
16535 -               /* Lockless receive, part 4:
16536 -                * Repeat test after acquiring the spinlock.
16537 -                */
16538                 msg = (struct msg_msg *)msr_d.r_msg;
16539                 if (msg != ERR_PTR(-EAGAIN))
16540                         goto out_unlock0;
16541 diff --git a/ipc/sem.c b/ipc/sem.c
16542 index 9862c3d1c26d..ef34d7376697 100644
16543 --- a/ipc/sem.c
16544 +++ b/ipc/sem.c
16545 @@ -708,6 +708,13 @@ undo:
16546  static void wake_up_sem_queue_prepare(struct list_head *pt,
16547                                 struct sem_queue *q, int error)
16548  {
16549 +#ifdef CONFIG_PREEMPT_RT_BASE
16550 +       struct task_struct *p = q->sleeper;
16551 +       get_task_struct(p);
16552 +       q->status = error;
16553 +       wake_up_process(p);
16554 +       put_task_struct(p);
16555 +#else
16556         if (list_empty(pt)) {
16557                 /*
16558                  * Hold preempt off so that we don't get preempted and have the
16559 @@ -719,6 +726,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
16560         q->pid = error;
16561  
16562         list_add_tail(&q->list, pt);
16563 +#endif
16564  }
16565  
16566  /**
16567 @@ -732,6 +740,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
16568   */
16569  static void wake_up_sem_queue_do(struct list_head *pt)
16570  {
16571 +#ifndef CONFIG_PREEMPT_RT_BASE
16572         struct sem_queue *q, *t;
16573         int did_something;
16574  
16575 @@ -744,6 +753,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
16576         }
16577         if (did_something)
16578                 preempt_enable();
16579 +#endif
16580  }
16581  
16582  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
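
On RT the batched, preemption-disabled wakeup is replaced above by an immediate wakeup done while holding a task reference. A hedged sketch of that generic pattern (the helper name and the status pointer are illustrative):

static void wake_sleeper_sketch(struct task_struct *sleeper,
                                int *status, int error)
{
        get_task_struct(sleeper);       /* keep the task alive across the wakeup */
        *status = error;                /* the sleeper may run and exit right after this */
        wake_up_process(sleeper);
        put_task_struct(sleeper);
}
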
16583 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
16584 index ebdb0043203a..b9e6aa7e5aa6 100644
16585 --- a/kernel/Kconfig.locks
16586 +++ b/kernel/Kconfig.locks
16587 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
16588  
16589  config MUTEX_SPIN_ON_OWNER
16590         def_bool y
16591 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
16592 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16593  
16594  config RWSEM_SPIN_ON_OWNER
16595         def_bool y
16596 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
16597 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16598  
16599  config LOCK_SPIN_ON_OWNER
16600         def_bool y
16601 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
16602 index 3f9c97419f02..11dbe26a8279 100644
16603 --- a/kernel/Kconfig.preempt
16604 +++ b/kernel/Kconfig.preempt
16605 @@ -1,3 +1,16 @@
16606 +config PREEMPT
16607 +       bool
16608 +       select PREEMPT_COUNT
16609 +
16610 +config PREEMPT_RT_BASE
16611 +       bool
16612 +       select PREEMPT
16613 +
16614 +config HAVE_PREEMPT_LAZY
16615 +       bool
16616 +
16617 +config PREEMPT_LAZY
16618 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
16619  
16620  choice
16621         prompt "Preemption Model"
16622 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
16623  
16624           Select this if you are building a kernel for a desktop system.
16625  
16626 -config PREEMPT
16627 +config PREEMPT__LL
16628         bool "Preemptible Kernel (Low-Latency Desktop)"
16629 -       select PREEMPT_COUNT
16630 +       select PREEMPT
16631         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
16632         help
16633           This option reduces the latency of the kernel by making
16634 @@ -52,6 +65,22 @@ config PREEMPT
16635           embedded system with latency requirements in the milliseconds
16636           range.
16637  
16638 +config PREEMPT_RTB
16639 +       bool "Preemptible Kernel (Basic RT)"
16640 +       select PREEMPT_RT_BASE
16641 +       help
16642 +         This option is basically the same as (Low-Latency Desktop) but
16643 +         enables changes which are preliminary for the full preemptible
16644 +         RT kernel.
16645 +
16646 +config PREEMPT_RT_FULL
16647 +       bool "Fully Preemptible Kernel (RT)"
16648 +       depends on IRQ_FORCED_THREADING
16649 +       select PREEMPT_RT_BASE
16650 +       select PREEMPT_RCU
16651 +       help
16652 +         Select this to make the kernel fully preemptible (real-time).
16653 +
16654  endchoice
16655  
16656  config PREEMPT_COUNT
16657 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
16658 index a3424f28aaf4..69434d231e21 100644
16659 --- a/kernel/cgroup.c
16660 +++ b/kernel/cgroup.c
16661 @@ -4737,10 +4737,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
16662         queue_work(cgroup_destroy_wq, &css->destroy_work);
16663  }
16664  
16665 -static void css_release_work_fn(struct work_struct *work)
16666 +static void css_release_work_fn(struct swork_event *sev)
16667  {
16668         struct cgroup_subsys_state *css =
16669 -               container_of(work, struct cgroup_subsys_state, destroy_work);
16670 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
16671         struct cgroup_subsys *ss = css->ss;
16672         struct cgroup *cgrp = css->cgroup;
16673  
16674 @@ -4779,8 +4779,8 @@ static void css_release(struct percpu_ref *ref)
16675         struct cgroup_subsys_state *css =
16676                 container_of(ref, struct cgroup_subsys_state, refcnt);
16677  
16678 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
16679 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
16680 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
16681 +       swork_queue(&css->destroy_swork);
16682  }
16683  
16684  static void init_and_link_css(struct cgroup_subsys_state *css,
16685 @@ -5397,6 +5397,7 @@ static int __init cgroup_wq_init(void)
16686          */
16687         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
16688         BUG_ON(!cgroup_destroy_wq);
16689 +       BUG_ON(swork_get());
16690  
16691         /*
16692          * Used to destroy pidlists and separate to serve as flush domain.
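
css_release() is moved above from the workqueue to the series' "simple work" (swork) context so the release path may use sleeping locks on RT. A minimal sketch of the same pattern for an arbitrary object; only INIT_SWORK(), swork_queue() and swork_get() are taken from the hunk, while the struct, callback and header name are assumptions:

#include <linux/swork.h>        /* assumed name of the header providing the swork API */

struct my_obj {
        struct swork_event release_swork;
};

static void my_obj_release_fn(struct swork_event *sev)
{
        struct my_obj *obj = container_of(sev, struct my_obj, release_swork);

        /* runs in a preemptible kthread, so sleeping locks are fine on RT */
        kfree(obj);
}

static void my_obj_release(struct my_obj *obj)
{
        INIT_SWORK(&obj->release_swork, my_obj_release_fn);
        swork_queue(&obj->release_swork);
}

As in cgroup_wq_init() above, swork_get() must succeed once during initialization before anything is queued.
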
16693 diff --git a/kernel/cpu.c b/kernel/cpu.c
16694 index 85ff5e26e23b..8edd3c716092 100644
16695 --- a/kernel/cpu.c
16696 +++ b/kernel/cpu.c
16697 @@ -75,8 +75,8 @@ static struct {
16698  #endif
16699  } cpu_hotplug = {
16700         .active_writer = NULL,
16701 -       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
16702         .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
16703 +       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
16704  #ifdef CONFIG_DEBUG_LOCK_ALLOC
16705         .dep_map = {.name = "cpu_hotplug.lock" },
16706  #endif
16707 @@ -89,6 +89,289 @@ static struct {
16708  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
16709  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
16710  
16711 +/**
16712 + * hotplug_pcp - per cpu hotplug descriptor
16713 + * @unplug:    set when pin_current_cpu() needs to sync tasks
16714 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
16715 + * @refcount:  counter of tasks in pinned sections
16716 + * @grab_lock: set when the tasks entering pinned sections should wait
16717 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
16718 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
16719 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
16720 + *
16721 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
16722 + * is used as a flag and still exists after @sync_tsk has exited and
16723 + * @sync_tsk set to NULL.
16724 + */
16725 +struct hotplug_pcp {
16726 +       struct task_struct *unplug;
16727 +       struct task_struct *sync_tsk;
16728 +       int refcount;
16729 +       int grab_lock;
16730 +       struct completion synced;
16731 +       struct completion unplug_wait;
16732 +#ifdef CONFIG_PREEMPT_RT_FULL
16733 +       /*
16734 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
16735 +        * the task, otherwise the mutex will cause the task to fail
16736 +        * to sleep when required. (Because it's called from migrate_disable())
16737 +        *
16738 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
16739 +        * state.
16740 +        */
16741 +       spinlock_t lock;
16742 +#else
16743 +       struct mutex mutex;
16744 +#endif
16745 +       int mutex_init;
16746 +};
16747 +
16748 +#ifdef CONFIG_PREEMPT_RT_FULL
16749 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
16750 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
16751 +#else
16752 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
16753 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
16754 +#endif
16755 +
16756 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
16757 +
16758 +/**
16759 + * pin_current_cpu - Prevent the current cpu from being unplugged
16760 + *
16761 + * Lightweight version of get_online_cpus() to prevent cpu from being
16762 + * unplugged when code runs in a migration disabled region.
16763 + *
16764 + * Must be called with preemption disabled (preempt_count = 1)!
16765 + */
16766 +void pin_current_cpu(void)
16767 +{
16768 +       struct hotplug_pcp *hp;
16769 +       int force = 0;
16770 +
16771 +retry:
16772 +       hp = this_cpu_ptr(&hotplug_pcp);
16773 +
16774 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
16775 +           hp->unplug == current) {
16776 +               hp->refcount++;
16777 +               return;
16778 +       }
16779 +       if (hp->grab_lock) {
16780 +               preempt_enable();
16781 +               hotplug_lock(hp);
16782 +               hotplug_unlock(hp);
16783 +       } else {
16784 +               preempt_enable();
16785 +               /*
16786 +                * Try to push this task off of this CPU.
16787 +                */
16788 +               if (!migrate_me()) {
16789 +                       preempt_disable();
16790 +                       hp = this_cpu_ptr(&hotplug_pcp);
16791 +                       if (!hp->grab_lock) {
16792 +                               /*
16793 +                                * Just let it continue; it's already pinned
16794 +                                * or about to sleep.
16795 +                                */
16796 +                               force = 1;
16797 +                               goto retry;
16798 +                       }
16799 +                       preempt_enable();
16800 +               }
16801 +       }
16802 +       preempt_disable();
16803 +       goto retry;
16804 +}
16805 +
16806 +/**
16807 + * unpin_current_cpu - Allow unplug of current cpu
16808 + *
16809 + * Must be called with preemption or interrupts disabled!
16810 + */
16811 +void unpin_current_cpu(void)
16812 +{
16813 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
16814 +
16815 +       WARN_ON(hp->refcount <= 0);
16816 +
16817 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
16818 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
16819 +               wake_up_process(hp->unplug);
16820 +}
16821 +
16822 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
16823 +{
16824 +       set_current_state(TASK_UNINTERRUPTIBLE);
16825 +       while (hp->refcount) {
16826 +               schedule_preempt_disabled();
16827 +               set_current_state(TASK_UNINTERRUPTIBLE);
16828 +       }
16829 +}
16830 +
16831 +static int sync_unplug_thread(void *data)
16832 +{
16833 +       struct hotplug_pcp *hp = data;
16834 +
16835 +       wait_for_completion(&hp->unplug_wait);
16836 +       preempt_disable();
16837 +       hp->unplug = current;
16838 +       wait_for_pinned_cpus(hp);
16839 +
16840 +       /*
16841 +        * This thread will synchronize the cpu_down() with threads
16842 +        * that have pinned the CPU. When the pinned CPU count reaches
16843 +        * zero, we inform the cpu_down code to continue to the next step.
16844 +        */
16845 +       set_current_state(TASK_UNINTERRUPTIBLE);
16846 +       preempt_enable();
16847 +       complete(&hp->synced);
16848 +
16849 +       /*
16850 +        * If all succeeds, the next step will need tasks to wait till
16851 +        * the CPU is offline before continuing. To do this, the grab_lock
16852 +        * is set and tasks going into pin_current_cpu() will block on the
16853 +        * mutex. But we still need to wait for those that are already in
16854 +        * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
16855 +        * will kick this thread out.
16856 +        */
16857 +       while (!hp->grab_lock && !kthread_should_stop()) {
16858 +               schedule();
16859 +               set_current_state(TASK_UNINTERRUPTIBLE);
16860 +       }
16861 +
16862 +       /* Make sure grab_lock is seen before we see a stale completion */
16863 +       smp_mb();
16864 +
16865 +       /*
16866 +        * Now just before cpu_down() enters stop machine, we need to make
16867 +        * sure all tasks that are in pinned CPU sections are out, and new
16868 +        * tasks will now grab the lock, keeping them from entering pinned
16869 +        * CPU sections.
16870 +        */
16871 +       if (!kthread_should_stop()) {
16872 +               preempt_disable();
16873 +               wait_for_pinned_cpus(hp);
16874 +               preempt_enable();
16875 +               complete(&hp->synced);
16876 +       }
16877 +
16878 +       set_current_state(TASK_UNINTERRUPTIBLE);
16879 +       while (!kthread_should_stop()) {
16880 +               schedule();
16881 +               set_current_state(TASK_UNINTERRUPTIBLE);
16882 +       }
16883 +       set_current_state(TASK_RUNNING);
16884 +
16885 +       /*
16886 +        * Force this thread off this CPU as it's going down and
16887 +        * we don't want any more work on this CPU.
16888 +        */
16889 +       current->flags &= ~PF_NO_SETAFFINITY;
16890 +       set_cpus_allowed_ptr(current, cpu_present_mask);
16891 +       migrate_me();
16892 +       return 0;
16893 +}
16894 +
16895 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
16896 +{
16897 +       wake_up_process(hp->sync_tsk);
16898 +       wait_for_completion(&hp->synced);
16899 +}
16900 +
16901 +static void __cpu_unplug_wait(unsigned int cpu)
16902 +{
16903 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16904 +
16905 +       complete(&hp->unplug_wait);
16906 +       wait_for_completion(&hp->synced);
16907 +}
16908 +
16909 +/*
16910 + * Start the sync_unplug_thread on the target cpu and wait for it to
16911 + * complete.
16912 + */
16913 +static int cpu_unplug_begin(unsigned int cpu)
16914 +{
16915 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16916 +       int err;
16917 +
16918 +       /* Protected by cpu_hotplug.lock */
16919 +       if (!hp->mutex_init) {
16920 +#ifdef CONFIG_PREEMPT_RT_FULL
16921 +               spin_lock_init(&hp->lock);
16922 +#else
16923 +               mutex_init(&hp->mutex);
16924 +#endif
16925 +               hp->mutex_init = 1;
16926 +       }
16927 +
16928 +       /* Inform the scheduler to migrate tasks off this CPU */
16929 +       tell_sched_cpu_down_begin(cpu);
16930 +
16931 +       init_completion(&hp->synced);
16932 +       init_completion(&hp->unplug_wait);
16933 +
16934 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
16935 +       if (IS_ERR(hp->sync_tsk)) {
16936 +               err = PTR_ERR(hp->sync_tsk);
16937 +               hp->sync_tsk = NULL;
16938 +               return err;
16939 +       }
16940 +       kthread_bind(hp->sync_tsk, cpu);
16941 +
16942 +       /*
16943 +        * Wait for tasks to get out of the pinned sections,
16944 +        * it's still OK if new tasks enter. Some CPU notifiers will
16945 +        * wait for tasks that are going to enter these sections and
16946 +        * we must not have them block.
16947 +        */
16948 +       wake_up_process(hp->sync_tsk);
16949 +       return 0;
16950 +}
16951 +
16952 +static void cpu_unplug_sync(unsigned int cpu)
16953 +{
16954 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16955 +
16956 +       init_completion(&hp->synced);
16957 +       /* The completion needs to be initialized before setting grab_lock */
16958 +       smp_wmb();
16959 +
16960 +       /* Grab the mutex before setting grab_lock */
16961 +       hotplug_lock(hp);
16962 +       hp->grab_lock = 1;
16963 +
16964 +       /*
16965 +        * The CPU notifiers have been completed.
16966 +        * Wait for tasks to get out of pinned CPU sections and have new
16967 +        * tasks block until the CPU is completely down.
16968 +        */
16969 +       __cpu_unplug_sync(hp);
16970 +
16971 +       /* All done with the sync thread */
16972 +       kthread_stop(hp->sync_tsk);
16973 +       hp->sync_tsk = NULL;
16974 +}
16975 +
16976 +static void cpu_unplug_done(unsigned int cpu)
16977 +{
16978 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
16979 +
16980 +       hp->unplug = NULL;
16981 +       /* Let all tasks know cpu unplug is finished before cleaning up */
16982 +       smp_wmb();
16983 +
16984 +       if (hp->sync_tsk)
16985 +               kthread_stop(hp->sync_tsk);
16986 +
16987 +       if (hp->grab_lock) {
16988 +               hotplug_unlock(hp);
16989 +               /* protected by cpu_hotplug.lock */
16990 +               hp->grab_lock = 0;
16991 +       }
16992 +       tell_sched_cpu_down_done(cpu);
16993 +}
16994  
16995  void get_online_cpus(void)
16996  {
16997 @@ -338,13 +621,15 @@ static int take_cpu_down(void *_param)
16998  /* Requires cpu_add_remove_lock to be held */
16999  static int _cpu_down(unsigned int cpu, int tasks_frozen)
17000  {
17001 -       int err, nr_calls = 0;
17002 +       int mycpu, err, nr_calls = 0;
17003         void *hcpu = (void *)(long)cpu;
17004         unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
17005         struct take_cpu_down_param tcd_param = {
17006                 .mod = mod,
17007                 .hcpu = hcpu,
17008         };
17009 +       cpumask_var_t cpumask;
17010 +       cpumask_var_t cpumask_org;
17011  
17012         if (num_online_cpus() == 1)
17013                 return -EBUSY;
17014 @@ -352,7 +637,34 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17015         if (!cpu_online(cpu))
17016                 return -EINVAL;
17017  
17018 +       /* Move the downtaker off the unplug cpu */
17019 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
17020 +               return -ENOMEM;
17021 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
17022 +               free_cpumask_var(cpumask);
17023 +               return -ENOMEM;
17024 +       }
17025 +
17026 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
17027 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
17028 +       set_cpus_allowed_ptr(current, cpumask);
17029 +       free_cpumask_var(cpumask);
17030 +       migrate_disable();
17031 +       mycpu = smp_processor_id();
17032 +       if (mycpu == cpu) {
17033 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
17034 +               migrate_enable();
17035 +               err = -EBUSY;
17036 +               goto restore_cpus;
17037 +       }
17038 +       migrate_enable();
17039 +
17040         cpu_hotplug_begin();
17041 +       err = cpu_unplug_begin(cpu);
17042 +       if (err) {
17043 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
17044 +               goto out_cancel;
17045 +       }
17046  
17047         err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
17048         if (err) {
17049 @@ -378,8 +690,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17050         else
17051                 synchronize_rcu();
17052  
17053 +       __cpu_unplug_wait(cpu);
17054         smpboot_park_threads(cpu);
17055  
17056 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
17057 +       cpu_unplug_sync(cpu);
17058 +
17059         /*
17060          * Prevent irq alloc/free while the dying cpu reorganizes the
17061          * interrupt affinities.
17062 @@ -424,9 +740,14 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17063         check_for_tasks(cpu);
17064  
17065  out_release:
17066 +       cpu_unplug_done(cpu);
17067 +out_cancel:
17068         cpu_hotplug_done();
17069         if (!err)
17070                 cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
17071 +restore_cpus:
17072 +       set_cpus_allowed_ptr(current, cpumask_org);
17073 +       free_cpumask_var(cpumask_org);
17074         return err;
17075  }
17076  
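
pin_current_cpu()/unpin_current_cpu() are consumed by migrate_disable()/migrate_enable() elsewhere in this series. A simplified sketch of that pairing; the per-task migrate_disable counter and the exact ordering are assumptions, and scheduler and lockdep details are omitted:

static inline void migrate_disable_sketch(void)
{
        preempt_disable();              /* pin_current_cpu() expects preempt_count == 1 */
        pin_current_cpu();              /* waits, or migrates us away, if this CPU is unplugging */
        current->migrate_disable++;     /* assumed per-task counter from this series */
        preempt_enable();
}

static inline void migrate_enable_sketch(void)
{
        preempt_disable();
        if (--current->migrate_disable == 0)
                unpin_current_cpu();    /* may wake the sync_unplug/%d thread */
        preempt_enable();
}
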
17077 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
17078 index fc1ef736253c..83c666537a7a 100644
17079 --- a/kernel/debug/kdb/kdb_io.c
17080 +++ b/kernel/debug/kdb/kdb_io.c
17081 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
17082         int linecount;
17083         int colcount;
17084         int logging, saved_loglevel = 0;
17085 -       int saved_trap_printk;
17086         int got_printf_lock = 0;
17087         int retlen = 0;
17088         int fnd, len;
17089 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
17090         unsigned long uninitialized_var(flags);
17091  
17092         preempt_disable();
17093 -       saved_trap_printk = kdb_trap_printk;
17094 -       kdb_trap_printk = 0;
17095  
17096         /* Serialize kdb_printf if multiple cpus try to write at once.
17097          * But if any cpu goes recursive in kdb, just print the output,
17098 @@ -855,7 +852,6 @@ kdb_print_out:
17099         } else {
17100                 __release(kdb_printf_lock);
17101         }
17102 -       kdb_trap_printk = saved_trap_printk;
17103         preempt_enable();
17104         return retlen;
17105  }
17106 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
17107         va_list ap;
17108         int r;
17109  
17110 +       kdb_trap_printk++;
17111         va_start(ap, fmt);
17112         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
17113         va_end(ap);
17114 +       kdb_trap_printk--;
17115  
17116         return r;
17117  }
17118 diff --git a/kernel/events/core.c b/kernel/events/core.c
17119 index bc6371b0e4fb..388de1dc27d9 100644
17120 --- a/kernel/events/core.c
17121 +++ b/kernel/events/core.c
17122 @@ -802,6 +802,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
17123         raw_spin_lock_init(&cpuctx->hrtimer_lock);
17124         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
17125         timer->function = perf_mux_hrtimer_handler;
17126 +       timer->irqsafe = 1;
17127  }
17128  
17129  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
17130 @@ -7240,6 +7241,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
17131  
17132         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17133         hwc->hrtimer.function = perf_swevent_hrtimer;
17134 +       hwc->hrtimer.irqsafe = 1;
17135  
17136         /*
17137          * Since hrtimers have a fixed rate, we can do a static freq->period
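
Both perf hunks above follow one pattern: a timer whose callback is safe to run from hard interrupt context sets the ->irqsafe field added by this series right after hrtimer_init(). A hedged sketch with an illustrative callback:

static enum hrtimer_restart my_irqsafe_timer_fn(struct hrtimer *t)
{
        /* runs in hard irq context even on RT: no sleeping locks in here */
        return HRTIMER_NORESTART;
}

static void my_irqsafe_timer_setup(struct hrtimer *t)
{
        hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        t->function = my_irqsafe_timer_fn;
        t->irqsafe = 1;         /* do not defer expiry to the softirq thread */
}
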
17138 diff --git a/kernel/exit.c b/kernel/exit.c
17139 index ffba5df4abd5..e199407f8831 100644
17140 --- a/kernel/exit.c
17141 +++ b/kernel/exit.c
17142 @@ -144,7 +144,7 @@ static void __exit_signal(struct task_struct *tsk)
17143          * Do this under ->siglock, we can race with another thread
17144          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
17145          */
17146 -       flush_sigqueue(&tsk->pending);
17147 +       flush_task_sigqueue(tsk);
17148         tsk->sighand = NULL;
17149         spin_unlock(&sighand->siglock);
17150  
17151 diff --git a/kernel/fork.c b/kernel/fork.c
17152 index 7161ebe67cbb..3b880312b385 100644
17153 --- a/kernel/fork.c
17154 +++ b/kernel/fork.c
17155 @@ -108,7 +108,7 @@ int max_threads;            /* tunable limit on nr_threads */
17156  
17157  DEFINE_PER_CPU(unsigned long, process_counts) = 0;
17158  
17159 -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
17160 +DEFINE_RWLOCK(tasklist_lock);  /* outer */
17161  
17162  #ifdef CONFIG_PROVE_RCU
17163  int lockdep_tasklist_lock_is_held(void)
17164 @@ -244,7 +244,9 @@ static inline void put_signal_struct(struct signal_struct *sig)
17165         if (atomic_dec_and_test(&sig->sigcnt))
17166                 free_signal_struct(sig);
17167  }
17168 -
17169 +#ifdef CONFIG_PREEMPT_RT_BASE
17170 +static
17171 +#endif
17172  void __put_task_struct(struct task_struct *tsk)
17173  {
17174         WARN_ON(!tsk->exit_state);
17175 @@ -261,7 +263,18 @@ void __put_task_struct(struct task_struct *tsk)
17176         if (!profile_handoff_task(tsk))
17177                 free_task(tsk);
17178  }
17179 +#ifndef CONFIG_PREEMPT_RT_BASE
17180  EXPORT_SYMBOL_GPL(__put_task_struct);
17181 +#else
17182 +void __put_task_struct_cb(struct rcu_head *rhp)
17183 +{
17184 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
17185 +
17186 +       __put_task_struct(tsk);
17187 +
17188 +}
17189 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
17190 +#endif
17191  
17192  void __init __weak arch_task_cache_init(void) { }
17193  
17194 @@ -689,6 +702,19 @@ void __mmdrop(struct mm_struct *mm)
17195  }
17196  EXPORT_SYMBOL_GPL(__mmdrop);
17197  
17198 +#ifdef CONFIG_PREEMPT_RT_BASE
17199 +/*
17200 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
17201 + * want another facility to make this work.
17202 + */
17203 +void __mmdrop_delayed(struct rcu_head *rhp)
17204 +{
17205 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
17206 +
17207 +       __mmdrop(mm);
17208 +}
17209 +#endif
17210 +
17211  /*
17212   * Decrement the use count and release all resources for an mm.
17213   */
17214 @@ -1239,6 +1265,9 @@ static void rt_mutex_init_task(struct task_struct *p)
17215   */
17216  static void posix_cpu_timers_init(struct task_struct *tsk)
17217  {
17218 +#ifdef CONFIG_PREEMPT_RT_BASE
17219 +       tsk->posix_timer_list = NULL;
17220 +#endif
17221         tsk->cputime_expires.prof_exp = 0;
17222         tsk->cputime_expires.virt_exp = 0;
17223         tsk->cputime_expires.sched_exp = 0;
17224 @@ -1364,15 +1393,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
17225         spin_lock_init(&p->alloc_lock);
17226  
17227         init_sigpending(&p->pending);
17228 +       p->sigqueue_cache = NULL;
17229  
17230         p->utime = p->stime = p->gtime = 0;
17231         p->utimescaled = p->stimescaled = 0;
17232         prev_cputime_init(&p->prev_cputime);
17233  
17234  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
17235 -       seqlock_init(&p->vtime_seqlock);
17236 +       seqcount_init(&p->vtime_seqcount);
17237         p->vtime_snap = 0;
17238 -       p->vtime_snap_whence = VTIME_SLEEPING;
17239 +       p->vtime_snap_whence = VTIME_INACTIVE;
17240  #endif
17241  
17242  #if defined(SPLIT_RSS_COUNTING)
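
__put_task_struct_cb() above only makes sense together with its header-side caller, which is not part of this hunk. A hedged sketch of what that caller is expected to look like; put_rcu and the RT_BASE split come from the hunk, the rest is assumed:

static inline void put_task_struct_sketch(struct task_struct *t)
{
        if (atomic_dec_and_test(&t->usage)) {
#ifdef CONFIG_PREEMPT_RT_BASE
                /* defer the free so it never runs from a non-preemptible context */
                call_rcu(&t->put_rcu, __put_task_struct_cb);
#else
                __put_task_struct(t);
#endif
        }
}
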
17243 diff --git a/kernel/futex.c b/kernel/futex.c
17244 index 9d8163afd87c..059623427b99 100644
17245 --- a/kernel/futex.c
17246 +++ b/kernel/futex.c
17247 @@ -815,7 +815,9 @@ void exit_pi_state_list(struct task_struct *curr)
17248                  * task still owns the PI-state:
17249                  */
17250                 if (head->next != next) {
17251 +                       raw_spin_unlock_irq(&curr->pi_lock);
17252                         spin_unlock(&hb->lock);
17253 +                       raw_spin_lock_irq(&curr->pi_lock);
17254                         continue;
17255                 }
17256  
17257 @@ -1210,6 +1212,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17258         struct futex_pi_state *pi_state = this->pi_state;
17259         u32 uninitialized_var(curval), newval;
17260         WAKE_Q(wake_q);
17261 +       WAKE_Q(wake_sleeper_q);
17262         bool deboost;
17263         int ret = 0;
17264  
17265 @@ -1223,7 +1226,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17266         if (pi_state->owner != current)
17267                 return -EINVAL;
17268  
17269 -       raw_spin_lock(&pi_state->pi_mutex.wait_lock);
17270 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
17271         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
17272  
17273         /*
17274 @@ -1259,24 +1262,25 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17275                         ret = -EINVAL;
17276         }
17277         if (ret) {
17278 -               raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
17279 +               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
17280                 return ret;
17281         }
17282  
17283 -       raw_spin_lock_irq(&pi_state->owner->pi_lock);
17284 +       raw_spin_lock(&pi_state->owner->pi_lock);
17285         WARN_ON(list_empty(&pi_state->list));
17286         list_del_init(&pi_state->list);
17287 -       raw_spin_unlock_irq(&pi_state->owner->pi_lock);
17288 +       raw_spin_unlock(&pi_state->owner->pi_lock);
17289  
17290 -       raw_spin_lock_irq(&new_owner->pi_lock);
17291 +       raw_spin_lock(&new_owner->pi_lock);
17292         WARN_ON(!list_empty(&pi_state->list));
17293         list_add(&pi_state->list, &new_owner->pi_state_list);
17294         pi_state->owner = new_owner;
17295 -       raw_spin_unlock_irq(&new_owner->pi_lock);
17296 +       raw_spin_unlock(&new_owner->pi_lock);
17297  
17298 -       raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
17299 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
17300  
17301 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
17302 +       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
17303 +                                       &wake_sleeper_q);
17304  
17305         /*
17306          * First unlock HB so the waiter does not spin on it once he got woken
17307 @@ -1284,8 +1288,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17308          * deboost first (and lose our higher priority), then the task might get
17309          * scheduled away before the wake up can take place.
17310          */
17311 -       spin_unlock(&hb->lock);
17312 +       deboost |= spin_unlock_no_deboost(&hb->lock);
17313         wake_up_q(&wake_q);
17314 +       wake_up_q_sleeper(&wake_sleeper_q);
17315         if (deboost)
17316                 rt_mutex_adjust_prio(current);
17317  
17318 @@ -1822,6 +1827,16 @@ retry_private:
17319                                 requeue_pi_wake_futex(this, &key2, hb2);
17320                                 drop_count++;
17321                                 continue;
17322 +                       } else if (ret == -EAGAIN) {
17323 +                               /*
17324 +                                * Waiter was woken by timeout or
17325 +                                * signal and has set pi_blocked_on to
17326 +                                * PI_WAKEUP_INPROGRESS before we
17327 +                                * tried to enqueue it on the rtmutex.
17328 +                                */
17329 +                               this->pi_state = NULL;
17330 +                               free_pi_state(pi_state);
17331 +                               continue;
17332                         } else if (ret) {
17333                                 /* -EDEADLK */
17334                                 this->pi_state = NULL;
17335 @@ -2139,11 +2154,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
17336                  * we returned due to timeout or signal without taking the
17337                  * rt_mutex. Too late.
17338                  */
17339 -               raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
17340 +               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
17341                 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
17342                 if (!owner)
17343                         owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
17344 -               raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
17345 +               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
17346                 ret = fixup_pi_state_owner(uaddr, q, owner);
17347                 goto out;
17348         }
17349 @@ -2691,7 +2706,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17350         struct hrtimer_sleeper timeout, *to = NULL;
17351         struct rt_mutex_waiter rt_waiter;
17352         struct rt_mutex *pi_mutex = NULL;
17353 -       struct futex_hash_bucket *hb;
17354 +       struct futex_hash_bucket *hb, *hb2;
17355         union futex_key key2 = FUTEX_KEY_INIT;
17356         struct futex_q q = futex_q_init;
17357         int res, ret;
17358 @@ -2716,10 +2731,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17359          * The waiter is allocated on our stack, manipulated by the requeue
17360          * code while we sleep on uaddr.
17361          */
17362 -       debug_rt_mutex_init_waiter(&rt_waiter);
17363 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
17364 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
17365 -       rt_waiter.task = NULL;
17366 +       rt_mutex_init_waiter(&rt_waiter, false);
17367  
17368         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
17369         if (unlikely(ret != 0))
17370 @@ -2750,20 +2762,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17371         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
17372         futex_wait_queue_me(hb, &q, to);
17373  
17374 -       spin_lock(&hb->lock);
17375 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17376 -       spin_unlock(&hb->lock);
17377 -       if (ret)
17378 -               goto out_put_keys;
17379 +       /*
17380 +        * On RT we must avoid races with requeue and trying to block
17381 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
17382 +        * serializing access to pi_blocked_on with pi_lock.
17383 +        */
17384 +       raw_spin_lock_irq(&current->pi_lock);
17385 +       if (current->pi_blocked_on) {
17386 +               /*
17387 +                * We have been requeued or are in the process of
17388 +                * being requeued.
17389 +                */
17390 +               raw_spin_unlock_irq(&current->pi_lock);
17391 +       } else {
17392 +               /*
17393 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
17394 +                * prevents a concurrent requeue from moving us to the
17395 +                * uaddr2 rtmutex. After that we can safely acquire
17396 +                * (and possibly block on) hb->lock.
17397 +                */
17398 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
17399 +               raw_spin_unlock_irq(&current->pi_lock);
17400 +
17401 +               spin_lock(&hb->lock);
17402 +
17403 +               /*
17404 +                * Clean up pi_blocked_on. We might leak it otherwise
17405 +                * when we succeeded with the hb->lock in the fast
17406 +                * path.
17407 +                */
17408 +               raw_spin_lock_irq(&current->pi_lock);
17409 +               current->pi_blocked_on = NULL;
17410 +               raw_spin_unlock_irq(&current->pi_lock);
17411 +
17412 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17413 +               spin_unlock(&hb->lock);
17414 +               if (ret)
17415 +                       goto out_put_keys;
17416 +       }
17417  
17418         /*
17419 -        * In order for us to be here, we know our q.key == key2, and since
17420 -        * we took the hb->lock above, we also know that futex_requeue() has
17421 -        * completed and we no longer have to concern ourselves with a wakeup
17422 -        * race with the atomic proxy lock acquisition by the requeue code. The
17423 -        * futex_requeue dropped our key1 reference and incremented our key2
17424 -        * reference count.
17425 +        * In order to be here, we have either been requeued, are in
17426 +        * the process of being requeued, or requeue successfully
17427 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
17428 +        * non-null above, we may be racing with a requeue.  Do not
17429 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
17430 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
17431 +        * reference and incremented our key2 reference count.
17432          */
17433 +       hb2 = hash_futex(&key2);
17434  
17435         /* Check if the requeue code acquired the second futex for us. */
17436         if (!q.rt_waiter) {
17437 @@ -2772,14 +2819,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17438                  * did a lock-steal - fix up the PI-state in that case.
17439                  */
17440                 if (q.pi_state && (q.pi_state->owner != current)) {
17441 -                       spin_lock(q.lock_ptr);
17442 +                       spin_lock(&hb2->lock);
17443 +                       BUG_ON(&hb2->lock != q.lock_ptr);
17444                         ret = fixup_pi_state_owner(uaddr2, &q, current);
17445                         /*
17446                          * Drop the reference to the pi state which
17447                          * the requeue_pi() code acquired for us.
17448                          */
17449                         free_pi_state(q.pi_state);
17450 -                       spin_unlock(q.lock_ptr);
17451 +                       spin_unlock(&hb2->lock);
17452                 }
17453         } else {
17454                 /*
17455 @@ -2792,7 +2840,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17456                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
17457                 debug_rt_mutex_free_waiter(&rt_waiter);
17458  
17459 -               spin_lock(q.lock_ptr);
17460 +               spin_lock(&hb2->lock);
17461 +               BUG_ON(&hb2->lock != q.lock_ptr);
17462                 /*
17463                  * Fixup the pi_state owner and possibly acquire the lock if we
17464                  * haven't already.
17465 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
17466 index 57bff7857e87..6c65c9252991 100644
17467 --- a/kernel/irq/handle.c
17468 +++ b/kernel/irq/handle.c
17469 @@ -134,6 +134,8 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
17470  
17471  irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
17472  {
17473 +       struct pt_regs *regs = get_irq_regs();
17474 +       u64 ip = regs ? instruction_pointer(regs) : 0;
17475         irqreturn_t retval = IRQ_NONE;
17476         unsigned int flags = 0, irq = desc->irq_data.irq;
17477         struct irqaction *action = desc->action;
17478 @@ -176,7 +178,11 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
17479                 action = action->next;
17480         }
17481  
17482 -       add_interrupt_randomness(irq, flags);
17483 +#ifdef CONFIG_PREEMPT_RT_FULL
17484 +       desc->random_ip = ip;
17485 +#else
17486 +       add_interrupt_randomness(irq, flags, ip);
17487 +#endif
17488  
17489         if (!noirqdebug)
17490                 note_interrupt(desc, retval);
17491 diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
17492 index 239e2ae2c947..0b73349a42d5 100644
17493 --- a/kernel/irq/irqdesc.c
17494 +++ b/kernel/irq/irqdesc.c
17495 @@ -24,10 +24,27 @@
17496  static struct lock_class_key irq_desc_lock_class;
17497  
17498  #if defined(CONFIG_SMP)
17499 +static int __init irq_affinity_setup(char *str)
17500 +{
17501 +       zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
17502 +       cpulist_parse(str, irq_default_affinity);
17503 +       /*
17504 +        * Set at least the boot cpu. We don't want to end up with
17505 +        * bug reports caused by random command line masks
17506 +        */
17507 +       cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
17508 +       return 1;
17509 +}
17510 +__setup("irqaffinity=", irq_affinity_setup);
17511 +
17512  static void __init init_irq_default_affinity(void)
17513  {
17514 -       alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
17515 -       cpumask_setall(irq_default_affinity);
17516 +#ifdef CONFIG_CPUMASK_OFFSTACK
17517 +       if (!irq_default_affinity)
17518 +               zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
17519 +#endif
17520 +       if (cpumask_empty(irq_default_affinity))
17521 +               cpumask_setall(irq_default_affinity);
17522  }
17523  #else
17524  static void __init init_irq_default_affinity(void)
17525 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
17526 index 6ead200370da..8e89554aa345 100644
17527 --- a/kernel/irq/manage.c
17528 +++ b/kernel/irq/manage.c
17529 @@ -22,6 +22,7 @@
17530  #include "internals.h"
17531  
17532  #ifdef CONFIG_IRQ_FORCED_THREADING
17533 +# ifndef CONFIG_PREEMPT_RT_BASE
17534  __read_mostly bool force_irqthreads;
17535  
17536  static int __init setup_forced_irqthreads(char *arg)
17537 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
17538         return 0;
17539  }
17540  early_param("threadirqs", setup_forced_irqthreads);
17541 +# endif
17542  #endif
17543  
17544  static void __synchronize_hardirq(struct irq_desc *desc)
17545 @@ -181,6 +183,62 @@ static inline void
17546  irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
17547  #endif
17548  
17549 +#ifdef CONFIG_PREEMPT_RT_FULL
17550 +static void _irq_affinity_notify(struct irq_affinity_notify *notify);
17551 +static struct task_struct *set_affinity_helper;
17552 +static LIST_HEAD(affinity_list);
17553 +static DEFINE_RAW_SPINLOCK(affinity_list_lock);
17554 +
17555 +static int set_affinity_thread(void *unused)
17556 +{
17557 +       while (1) {
17558 +               struct irq_affinity_notify *notify;
17559 +               int empty;
17560 +
17561 +               set_current_state(TASK_INTERRUPTIBLE);
17562 +
17563 +               raw_spin_lock_irq(&affinity_list_lock);
17564 +               empty = list_empty(&affinity_list);
17565 +               raw_spin_unlock_irq(&affinity_list_lock);
17566 +
17567 +               if (empty)
17568 +                       schedule();
17569 +               if (kthread_should_stop())
17570 +                       break;
17571 +               set_current_state(TASK_RUNNING);
17572 +try_next:
17573 +               notify = NULL;
17574 +
17575 +               raw_spin_lock_irq(&affinity_list_lock);
17576 +               if (!list_empty(&affinity_list)) {
17577 +                       notify = list_first_entry(&affinity_list,
17578 +                                       struct irq_affinity_notify, list);
17579 +                       list_del_init(&notify->list);
17580 +               }
17581 +               raw_spin_unlock_irq(&affinity_list_lock);
17582 +
17583 +               if (!notify)
17584 +                       continue;
17585 +               _irq_affinity_notify(notify);
17586 +               goto try_next;
17587 +       }
17588 +       return 0;
17589 +}
17590 +
17591 +static void init_helper_thread(void)
17592 +{
17593 +       if (set_affinity_helper)
17594 +               return;
17595 +       set_affinity_helper = kthread_run(set_affinity_thread, NULL,
17596 +                       "affinity-cb");
17597 +       WARN_ON(IS_ERR(set_affinity_helper));
17598 +}
17599 +#else
17600 +
17601 +static inline void init_helper_thread(void) { }
17602 +
17603 +#endif
17604 +
17605  int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
17606                         bool force)
17607  {
17608 @@ -220,7 +278,17 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
17609  
17610         if (desc->affinity_notify) {
17611                 kref_get(&desc->affinity_notify->kref);
17612 +
17613 +#ifdef CONFIG_PREEMPT_RT_FULL
17614 +               raw_spin_lock(&affinity_list_lock);
17615 +               if (list_empty(&desc->affinity_notify->list))
17616 +                       list_add_tail(&desc->affinity_notify->list,
17617 +                                       &affinity_list);
17618 +               raw_spin_unlock(&affinity_list_lock);
17619 +               wake_up_process(set_affinity_helper);
17620 +#else
17621                 schedule_work(&desc->affinity_notify->work);
17622 +#endif
17623         }
17624         irqd_set(data, IRQD_AFFINITY_SET);
17625  
17626 @@ -258,10 +326,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
17627  }
17628  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
17629  
17630 -static void irq_affinity_notify(struct work_struct *work)
17631 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
17632  {
17633 -       struct irq_affinity_notify *notify =
17634 -               container_of(work, struct irq_affinity_notify, work);
17635         struct irq_desc *desc = irq_to_desc(notify->irq);
17636         cpumask_var_t cpumask;
17637         unsigned long flags;
17638 @@ -283,6 +349,13 @@ out:
17639         kref_put(&notify->kref, notify->release);
17640  }
17641  
17642 +static void irq_affinity_notify(struct work_struct *work)
17643 +{
17644 +       struct irq_affinity_notify *notify =
17645 +               container_of(work, struct irq_affinity_notify, work);
17646 +       _irq_affinity_notify(notify);
17647 +}
17648 +
17649  /**
17650   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
17651   *     @irq:           Interrupt for which to enable/disable notification
17652 @@ -312,6 +385,8 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
17653                 notify->irq = irq;
17654                 kref_init(&notify->kref);
17655                 INIT_WORK(&notify->work, irq_affinity_notify);
17656 +               INIT_LIST_HEAD(&notify->list);
17657 +               init_helper_thread();
17658         }
17659  
17660         raw_spin_lock_irqsave(&desc->lock, flags);
17661 @@ -865,7 +940,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
17662         local_bh_disable();
17663         ret = action->thread_fn(action->irq, action->dev_id);
17664         irq_finalize_oneshot(desc, action);
17665 -       local_bh_enable();
17666 +       /*
17667 +        * Interrupts which have real time requirements can be set up
17668 +        * to avoid softirq processing in the thread handler. This is
17669 +        * safe as these interrupts do not raise soft interrupts.
17670 +        */
17671 +       if (irq_settings_no_softirq_call(desc))
17672 +               _local_bh_enable();
17673 +       else
17674 +               local_bh_enable();
17675         return ret;
17676  }
17677  
17678 @@ -962,6 +1045,12 @@ static int irq_thread(void *data)
17679                 if (action_ret == IRQ_WAKE_THREAD)
17680                         irq_wake_secondary(desc, action);
17681  
17682 +#ifdef CONFIG_PREEMPT_RT_FULL
17683 +               migrate_disable();
17684 +               add_interrupt_randomness(action->irq, 0,
17685 +                                desc->random_ip ^ (unsigned long) action);
17686 +               migrate_enable();
17687 +#endif
17688                 wake_threads_waitq(desc);
17689         }
17690  
17691 @@ -1315,6 +1404,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
17692                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
17693                 }
17694  
17695 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
17696 +                       irq_settings_set_no_softirq_call(desc);
17697 +
17698                 /* Set default affinity mask once everything is setup */
17699                 setup_affinity(desc, mask);
17700  
17701 @@ -1968,7 +2060,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
17702   *     This call sets the internal irqchip state of an interrupt,
17703   *     depending on the value of @which.
17704   *
17705 - *     This function should be called with preemption disabled if the
17706 + *     This function should be called with migration disabled if the
17707   *     interrupt controller has per-cpu registers.
17708   */
17709  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
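
A hedged sketch of how a latency-critical driver would use the IRQF_NO_SOFTIRQ_CALL flag wired up above; the irq number, handler and device name are illustrative:

static irqreturn_t my_rt_irq_thread_fn(int irq, void *dev_id)
{
        /* runs in the irq thread; must not raise softirqs itself */
        return IRQ_HANDLED;
}

static int my_rt_request_irq(unsigned int irq, void *dev)
{
        return request_threaded_irq(irq, NULL, my_rt_irq_thread_fn,
                                    IRQF_ONESHOT | IRQF_NO_SOFTIRQ_CALL,
                                    "my-rt-dev", dev);
}
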
17710 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
17711 index 320579d89091..2df2d4445b1e 100644
17712 --- a/kernel/irq/settings.h
17713 +++ b/kernel/irq/settings.h
17714 @@ -16,6 +16,7 @@ enum {
17715         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
17716         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
17717         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
17718 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
17719         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
17720  };
17721  
17722 @@ -30,6 +31,7 @@ enum {
17723  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
17724  #define IRQ_IS_POLLED          GOT_YOU_MORON
17725  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
17726 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
17727  #undef IRQF_MODIFY_MASK
17728  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
17729  
17730 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
17731         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
17732  }
17733  
17734 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
17735 +{
17736 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
17737 +}
17738 +
17739 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
17740 +{
17741 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
17742 +}
17743 +
17744  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
17745  {
17746         return desc->status_use_accessors & _IRQ_PER_CPU;
17747 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
17748 index 32144175458d..ed26f2554972 100644
17749 --- a/kernel/irq/spurious.c
17750 +++ b/kernel/irq/spurious.c
17751 @@ -444,6 +444,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
17752  
17753  static int __init irqfixup_setup(char *str)
17754  {
17755 +#ifdef CONFIG_PREEMPT_RT_BASE
17756 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17757 +       return 1;
17758 +#endif
17759         irqfixup = 1;
17760         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
17761         printk(KERN_WARNING "This may impact system performance.\n");
17762 @@ -456,6 +460,10 @@ module_param(irqfixup, int, 0644);
17763  
17764  static int __init irqpoll_setup(char *str)
17765  {
17766 +#ifdef CONFIG_PREEMPT_RT_BASE
17767 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17768 +       return 1;
17769 +#endif
17770         irqfixup = 2;
17771         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
17772                                 "enabled\n");
17773 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
17774 index bcf107ce0854..2899ba0d23d1 100644
17775 --- a/kernel/irq_work.c
17776 +++ b/kernel/irq_work.c
17777 @@ -17,6 +17,7 @@
17778  #include <linux/cpu.h>
17779  #include <linux/notifier.h>
17780  #include <linux/smp.h>
17781 +#include <linux/interrupt.h>
17782  #include <asm/processor.h>
17783  
17784  
17785 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
17786   */
17787  bool irq_work_queue_on(struct irq_work *work, int cpu)
17788  {
17789 +       struct llist_head *list;
17790 +
17791         /* All work should have been flushed before going offline */
17792         WARN_ON_ONCE(cpu_is_offline(cpu));
17793  
17794 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
17795         if (!irq_work_claim(work))
17796                 return false;
17797  
17798 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
17799 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
17800 +               list = &per_cpu(lazy_list, cpu);
17801 +       else
17802 +               list = &per_cpu(raised_list, cpu);
17803 +
17804 +       if (llist_add(&work->llnode, list))
17805                 arch_send_call_function_single_ipi(cpu);
17806  
17807         return true;
17808 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
17809  /* Enqueue the irq work @work on the current CPU */
17810  bool irq_work_queue(struct irq_work *work)
17811  {
17812 +       struct llist_head *list;
17813 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17814 +
17815         /* Only queue if not already pending */
17816         if (!irq_work_claim(work))
17817                 return false;
17818 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
17819         /* Queue the entry and raise the IPI if needed. */
17820         preempt_disable();
17821  
17822 -       /* If the work is "lazy", handle it from next tick if any */
17823 -       if (work->flags & IRQ_WORK_LAZY) {
17824 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
17825 -                   tick_nohz_tick_stopped())
17826 -                       arch_irq_work_raise();
17827 -       } else {
17828 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
17829 +       lazy_work = work->flags & IRQ_WORK_LAZY;
17830 +
17831 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
17832 +               list = this_cpu_ptr(&lazy_list);
17833 +       else
17834 +               list = this_cpu_ptr(&raised_list);
17835 +
17836 +       if (llist_add(&work->llnode, list)) {
17837 +               if (!lazy_work || tick_nohz_tick_stopped())
17838                         arch_irq_work_raise();
17839         }
17840  
17841 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
17842         raised = this_cpu_ptr(&raised_list);
17843         lazy = this_cpu_ptr(&lazy_list);
17844  
17845 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
17846 -               if (llist_empty(lazy))
17847 -                       return false;
17848 +       if (llist_empty(raised) && llist_empty(lazy))
17849 +               return false;
17850  
17851         /* All work should have been flushed before going offline */
17852         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
17853 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
17854         struct irq_work *work;
17855         struct llist_node *llnode;
17856  
17857 -       BUG_ON(!irqs_disabled());
17858 +       BUG_ON_NONRT(!irqs_disabled());
17859  
17860         if (llist_empty(list))
17861                 return;
17862 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
17863  void irq_work_run(void)
17864  {
17865         irq_work_run_list(this_cpu_ptr(&raised_list));
17866 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
17867 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
17868 +               /*
17869 +                * NOTE: we raise softirq via IPI for safety,
17870 +                * and execute in irq_work_tick() to move the
17871 +                * overhead from hard to soft irq context.
17872 +                */
17873 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
17874 +                       raise_softirq(TIMER_SOFTIRQ);
17875 +       } else
17876 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
17877  }
17878  EXPORT_SYMBOL_GPL(irq_work_run);
17879  
17880 @@ -179,8 +200,17 @@ void irq_work_tick(void)
17881  
17882         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
17883                 irq_work_run_list(raised);
17884 +
17885 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
17886 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
17887 +}
17888 +
17889 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
17890 +void irq_work_tick_soft(void)
17891 +{
17892         irq_work_run_list(this_cpu_ptr(&lazy_list));
17893  }
17894 +#endif
17895  
17896  /*
17897   * Synchronize against the irq_work @entry, ensures the entry is not
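
Work that must still execute from hard interrupt context on PREEMPT_RT_FULL is marked IRQ_WORK_HARD_IRQ, which the routing above checks. A hedged sketch; the callback and the static work item are illustrative:

static void my_hard_irq_work_fn(struct irq_work *work)
{
        /* runs with interrupts disabled even on RT; keep it short */
}

static struct irq_work my_hard_irq_work = {
        .flags  = IRQ_WORK_HARD_IRQ,
        .func   = my_hard_irq_work_fn,
};

static void kick_my_hard_irq_work(void)
{
        irq_work_queue(&my_hard_irq_work);      /* goes to raised_list and raises the IPI */
}
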
17898 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
17899 index e83b26464061..c0e08d1cf33e 100644
17900 --- a/kernel/ksysfs.c
17901 +++ b/kernel/ksysfs.c
17902 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
17903  
17904  #endif /* CONFIG_KEXEC_CORE */
17905  
17906 +#if defined(CONFIG_PREEMPT_RT_FULL)
17907 +static ssize_t  realtime_show(struct kobject *kobj,
17908 +                             struct kobj_attribute *attr, char *buf)
17909 +{
17910 +       return sprintf(buf, "%d\n", 1);
17911 +}
17912 +KERNEL_ATTR_RO(realtime);
17913 +#endif
17914 +
17915  /* whether file capabilities are enabled */
17916  static ssize_t fscaps_show(struct kobject *kobj,
17917                                   struct kobj_attribute *attr, char *buf)
17918 @@ -203,6 +212,9 @@ static struct attribute * kernel_attrs[] = {
17919         &vmcoreinfo_attr.attr,
17920  #endif
17921         &rcu_expedited_attr.attr,
17922 +#ifdef CONFIG_PREEMPT_RT_FULL
17923 +       &realtime_attr.attr,
17924 +#endif
17925         NULL
17926  };
17927  
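
The attribute registered above appears as /sys/kernel/realtime and prints 1 on a PREEMPT_RT_FULL kernel. A hedged userspace sketch for detecting it, with error handling kept minimal:

#include <stdio.h>

int kernel_is_preempt_rt(void)
{
        FILE *f = fopen("/sys/kernel/realtime", "r");
        int val = 0;

        if (!f)
                return 0;       /* attribute absent: not a PREEMPT_RT_FULL kernel */
        if (fscanf(f, "%d", &val) != 1)
                val = 0;
        fclose(f);
        return val == 1;
}
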
17928 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
17929 index 8e96f6cc2a4a..447b03082d88 100644
17930 --- a/kernel/locking/Makefile
17931 +++ b/kernel/locking/Makefile
17932 @@ -1,5 +1,5 @@
17933  
17934 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
17935 +obj-y += semaphore.o percpu-rwsem.o
17936  
17937  ifdef CONFIG_FUNCTION_TRACER
17938  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
17939 @@ -8,7 +8,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
17940  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
17941  endif
17942  
17943 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17944 +obj-y += mutex.o
17945  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
17946 +obj-y += rwsem.o
17947 +endif
17948  obj-$(CONFIG_LOCKDEP) += lockdep.o
17949  ifeq ($(CONFIG_PROC_FS),y)
17950  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
17951 @@ -22,7 +26,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
17952  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
17953  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
17954  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
17955 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17956  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
17957  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
17958 +endif
17959 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
17960  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
17961  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
17962 diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
17963 index 951cfcd10b4a..57e0ea72c28a 100644
17964 --- a/kernel/locking/lglock.c
17965 +++ b/kernel/locking/lglock.c
17966 @@ -4,6 +4,15 @@
17967  #include <linux/cpu.h>
17968  #include <linux/string.h>
17969  
17970 +#ifndef CONFIG_PREEMPT_RT_FULL
17971 +# define lg_lock_ptr           arch_spinlock_t
17972 +# define lg_do_lock(l)         arch_spin_lock(l)
17973 +# define lg_do_unlock(l)       arch_spin_unlock(l)
17974 +#else
17975 +# define lg_lock_ptr           struct rt_mutex
17976 +# define lg_do_lock(l)         __rt_spin_lock__no_mg(l)
17977 +# define lg_do_unlock(l)       __rt_spin_unlock(l)
17978 +#endif
17979  /*
17980   * Note there is no uninit, so lglocks cannot be defined in
17981   * modules (but it's fine to use them from there)
17982 @@ -12,51 +21,60 @@
17983  
17984  void lg_lock_init(struct lglock *lg, char *name)
17985  {
17986 +#ifdef CONFIG_PREEMPT_RT_FULL
17987 +       int i;
17988 +
17989 +       for_each_possible_cpu(i) {
17990 +               struct rt_mutex *lock = per_cpu_ptr(lg->lock, i);
17991 +
17992 +               rt_mutex_init(lock);
17993 +       }
17994 +#endif
17995         LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
17996  }
17997  EXPORT_SYMBOL(lg_lock_init);
17998  
17999  void lg_local_lock(struct lglock *lg)
18000  {
18001 -       arch_spinlock_t *lock;
18002 +       lg_lock_ptr *lock;
18003  
18004 -       preempt_disable();
18005 +       migrate_disable();
18006         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18007         lock = this_cpu_ptr(lg->lock);
18008 -       arch_spin_lock(lock);
18009 +       lg_do_lock(lock);
18010  }
18011  EXPORT_SYMBOL(lg_local_lock);
18012  
18013  void lg_local_unlock(struct lglock *lg)
18014  {
18015 -       arch_spinlock_t *lock;
18016 +       lg_lock_ptr *lock;
18017  
18018         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18019         lock = this_cpu_ptr(lg->lock);
18020 -       arch_spin_unlock(lock);
18021 -       preempt_enable();
18022 +       lg_do_unlock(lock);
18023 +       migrate_enable();
18024  }
18025  EXPORT_SYMBOL(lg_local_unlock);
18026  
18027  void lg_local_lock_cpu(struct lglock *lg, int cpu)
18028  {
18029 -       arch_spinlock_t *lock;
18030 +       lg_lock_ptr *lock;
18031  
18032 -       preempt_disable();
18033 +       preempt_disable_nort();
18034         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18035         lock = per_cpu_ptr(lg->lock, cpu);
18036 -       arch_spin_lock(lock);
18037 +       lg_do_lock(lock);
18038  }
18039  EXPORT_SYMBOL(lg_local_lock_cpu);
18040  
18041  void lg_local_unlock_cpu(struct lglock *lg, int cpu)
18042  {
18043 -       arch_spinlock_t *lock;
18044 +       lg_lock_ptr *lock;
18045  
18046         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18047         lock = per_cpu_ptr(lg->lock, cpu);
18048 -       arch_spin_unlock(lock);
18049 -       preempt_enable();
18050 +       lg_do_unlock(lock);
18051 +       preempt_enable_nort();
18052  }
18053  EXPORT_SYMBOL(lg_local_unlock_cpu);
18054  
18055 @@ -68,30 +86,30 @@ void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
18056         if (cpu2 < cpu1)
18057                 swap(cpu1, cpu2);
18058  
18059 -       preempt_disable();
18060 +       preempt_disable_nort();
18061         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18062 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
18063 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
18064 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu1));
18065 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu2));
18066  }
18067  
18068  void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
18069  {
18070         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18071 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
18072 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
18073 -       preempt_enable();
18074 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu1));
18075 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu2));
18076 +       preempt_enable_nort();
18077  }
18078  
18079  void lg_global_lock(struct lglock *lg)
18080  {
18081         int i;
18082  
18083 -       preempt_disable();
18084 +       preempt_disable_nort();
18085         lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18086         for_each_possible_cpu(i) {
18087 -               arch_spinlock_t *lock;
18088 +               lg_lock_ptr *lock;
18089                 lock = per_cpu_ptr(lg->lock, i);
18090 -               arch_spin_lock(lock);
18091 +               lg_do_lock(lock);
18092         }
18093  }
18094  EXPORT_SYMBOL(lg_global_lock);
18095 @@ -102,10 +120,35 @@ void lg_global_unlock(struct lglock *lg)
18096  
18097         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18098         for_each_possible_cpu(i) {
18099 -               arch_spinlock_t *lock;
18100 +               lg_lock_ptr *lock;
18101                 lock = per_cpu_ptr(lg->lock, i);
18102 -               arch_spin_unlock(lock);
18103 +               lg_do_unlock(lock);
18104         }
18105 -       preempt_enable();
18106 +       preempt_enable_nort();
18107  }
18108  EXPORT_SYMBOL(lg_global_unlock);
18109 +
18110 +#ifdef CONFIG_PREEMPT_RT_FULL
18111 +/*
18112 + * HACK: If you use this, you get to keep the pieces.
18113 + * Used in queue_stop_cpus_work() when stop machinery
18114 + * is called from an inactive CPU, so we can't schedule.
18115 + */
18116 +# define lg_do_trylock_relax(l)                        \
18117 +       do {                                    \
18118 +               while (!__rt_spin_trylock(l))   \
18119 +                       cpu_relax();            \
18120 +       } while (0)
18121 +
18122 +void lg_global_trylock_relax(struct lglock *lg)
18123 +{
18124 +       int i;
18125 +
18126 +       lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18127 +       for_each_possible_cpu(i) {
18128 +               lg_lock_ptr *lock;
18129 +               lock = per_cpu_ptr(lg->lock, i);
18130 +               lg_do_trylock_relax(lock);
18131 +       }
18132 +}
18133 +#endif
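With the defines above, lglock callers stay unchanged; only the backing primitive differs: an arch_spinlock_t with preemption disabled on !RT, an rt_mutex with only migration disabled on PREEMPT_RT_FULL. A minimal sketch of the usual call pattern, assuming the 4.4 lglock API (DEFINE_STATIC_LGLOCK, lg_local_lock, lg_global_lock); the lock name and callers are hypothetical:

#include <linux/lglock.h>

DEFINE_STATIC_LGLOCK(example_lglock);

static void example_init(void)
{
        lg_lock_init(&example_lglock, "example_lglock");
}

static void update_this_cpu(void)
{
        lg_local_lock(&example_lglock);         /* common per-CPU fast path */
        /* ... modify this CPU's share of the data ... */
        lg_local_unlock(&example_lglock);
}

static void update_all_cpus(void)
{
        lg_global_lock(&example_lglock);        /* rare, takes every CPU's lock */
        /* ... global update ... */
        lg_global_unlock(&example_lglock);
}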
18134 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
18135 index 60ace56618f6..e98ee958a353 100644
18136 --- a/kernel/locking/lockdep.c
18137 +++ b/kernel/locking/lockdep.c
18138 @@ -3525,6 +3525,7 @@ static void check_flags(unsigned long flags)
18139                 }
18140         }
18141  
18142 +#ifndef CONFIG_PREEMPT_RT_FULL
18143         /*
18144          * We dont accurately track softirq state in e.g.
18145          * hardirq contexts (such as on 4KSTACKS), so only
18146 @@ -3539,6 +3540,7 @@ static void check_flags(unsigned long flags)
18147                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
18148                 }
18149         }
18150 +#endif
18151  
18152         if (!debug_locks)
18153                 print_irqtrace_events(current);
18154 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
18155 index 8ef1919d63b2..291fc19e28e0 100644
18156 --- a/kernel/locking/locktorture.c
18157 +++ b/kernel/locking/locktorture.c
18158 @@ -26,7 +26,6 @@
18159  #include <linux/kthread.h>
18160  #include <linux/sched/rt.h>
18161  #include <linux/spinlock.h>
18162 -#include <linux/rwlock.h>
18163  #include <linux/mutex.h>
18164  #include <linux/rwsem.h>
18165  #include <linux/smp.h>
18166 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
18167 new file mode 100644
18168 index 000000000000..d4ab61c1848b
18169 --- /dev/null
18170 +++ b/kernel/locking/rt.c
18171 @@ -0,0 +1,474 @@
18172 +/*
18173 + * kernel/locking/rt.c
18174 + *
18175 + * Real-Time Preemption Support
18176 + *
18177 + * started by Ingo Molnar:
18178 + *
18179 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
18180 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18181 + *
18182 + * historic credit for proving that Linux spinlocks can be implemented via
18183 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
18184 + * and others) who prototyped it on 2.4 and did lots of comparative
18185 + * research and analysis; TimeSys, for proving that you can implement a
18186 + * fully preemptible kernel via the use of IRQ threading and mutexes;
18187 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
18188 + * right one; and to MontaVista, who ported pmutexes to 2.6.
18189 + *
18190 + * This code is a from-scratch implementation and is not based on pmutexes,
18191 + * but the idea of converting spinlocks to mutexes is used here too.
18192 + *
18193 + * lock debugging, locking tree, deadlock detection:
18194 + *
18195 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
18196 + *  Released under the General Public License (GPL).
18197 + *
18198 + * Includes portions of the generic R/W semaphore implementation from:
18199 + *
18200 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
18201 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
18202 + *  - Derived also from comments by Linus
18203 + *
18204 + * Pending ownership of locks and ownership stealing:
18205 + *
18206 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
18207 + *
18208 + *   (also by Steven Rostedt)
18209 + *    - Converted single pi_lock to individual task locks.
18210 + *
18211 + * By Esben Nielsen:
18212 + *    Doing priority inheritance with help of the scheduler.
18213 + *
18214 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18215 + *  - major rework based on Esben Nielsen's initial patch
18216 + *  - replaced thread_info references by task_struct refs
18217 + *  - removed task->pending_owner dependency
18218 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
18219 + *    in the scheduler return path as discussed with Steven Rostedt
18220 + *
18221 + *  Copyright (C) 2006, Kihon Technologies Inc.
18222 + *    Steven Rostedt <rostedt@goodmis.org>
18223 + *  - debugged and patched Thomas Gleixner's rework.
18224 + *  - added back the cmpxchg to the rework.
18225 + *  - turned atomic require back on for SMP.
18226 + */
18227 +
18228 +#include <linux/spinlock.h>
18229 +#include <linux/rtmutex.h>
18230 +#include <linux/sched.h>
18231 +#include <linux/delay.h>
18232 +#include <linux/module.h>
18233 +#include <linux/kallsyms.h>
18234 +#include <linux/syscalls.h>
18235 +#include <linux/interrupt.h>
18236 +#include <linux/plist.h>
18237 +#include <linux/fs.h>
18238 +#include <linux/futex.h>
18239 +#include <linux/hrtimer.h>
18240 +
18241 +#include "rtmutex_common.h"
18242 +
18243 +/*
18244 + * struct mutex functions
18245 + */
18246 +void __mutex_do_init(struct mutex *mutex, const char *name,
18247 +                    struct lock_class_key *key)
18248 +{
18249 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18250 +       /*
18251 +        * Make sure we are not reinitializing a held lock:
18252 +        */
18253 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
18254 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
18255 +#endif
18256 +       mutex->lock.save_state = 0;
18257 +}
18258 +EXPORT_SYMBOL(__mutex_do_init);
18259 +
18260 +void __lockfunc _mutex_lock(struct mutex *lock)
18261 +{
18262 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18263 +       rt_mutex_lock(&lock->lock);
18264 +}
18265 +EXPORT_SYMBOL(_mutex_lock);
18266 +
18267 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
18268 +{
18269 +       int ret;
18270 +
18271 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18272 +       ret = rt_mutex_lock_interruptible(&lock->lock);
18273 +       if (ret)
18274 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18275 +       return ret;
18276 +}
18277 +EXPORT_SYMBOL(_mutex_lock_interruptible);
18278 +
18279 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
18280 +{
18281 +       int ret;
18282 +
18283 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18284 +       ret = rt_mutex_lock_killable(&lock->lock);
18285 +       if (ret)
18286 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18287 +       return ret;
18288 +}
18289 +EXPORT_SYMBOL(_mutex_lock_killable);
18290 +
18291 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18292 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
18293 +{
18294 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18295 +       rt_mutex_lock(&lock->lock);
18296 +}
18297 +EXPORT_SYMBOL(_mutex_lock_nested);
18298 +
18299 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
18300 +{
18301 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
18302 +       rt_mutex_lock(&lock->lock);
18303 +}
18304 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
18305 +
18306 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
18307 +{
18308 +       int ret;
18309 +
18310 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18311 +       ret = rt_mutex_lock_interruptible(&lock->lock);
18312 +       if (ret)
18313 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18314 +       return ret;
18315 +}
18316 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
18317 +
18318 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
18319 +{
18320 +       int ret;
18321 +
18322 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
18323 +       ret = rt_mutex_lock_killable(&lock->lock);
18324 +       if (ret)
18325 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18326 +       return ret;
18327 +}
18328 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
18329 +#endif
18330 +
18331 +int __lockfunc _mutex_trylock(struct mutex *lock)
18332 +{
18333 +       int ret = rt_mutex_trylock(&lock->lock);
18334 +
18335 +       if (ret)
18336 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18337 +
18338 +       return ret;
18339 +}
18340 +EXPORT_SYMBOL(_mutex_trylock);
18341 +
18342 +void __lockfunc _mutex_unlock(struct mutex *lock)
18343 +{
18344 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
18345 +       rt_mutex_unlock(&lock->lock);
18346 +}
18347 +EXPORT_SYMBOL(_mutex_unlock);
18348 +
18349 +/*
18350 + * rwlock_t functions
18351 + */
18352 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
18353 +{
18354 +       int ret;
18355 +
18356 +       migrate_disable();
18357 +       ret = rt_mutex_trylock(&rwlock->lock);
18358 +       if (ret)
18359 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
18360 +       else
18361 +               migrate_enable();
18362 +
18363 +       return ret;
18364 +}
18365 +EXPORT_SYMBOL(rt_write_trylock);
18366 +
18367 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
18368 +{
18369 +       int ret;
18370 +
18371 +       *flags = 0;
18372 +       ret = rt_write_trylock(rwlock);
18373 +       return ret;
18374 +}
18375 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
18376 +
18377 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
18378 +{
18379 +       struct rt_mutex *lock = &rwlock->lock;
18380 +       int ret = 1;
18381 +
18382 +       /*
18383 +        * recursive read locks succeed when current owns the lock,
18384 +        * but not when read_depth == 0 which means that the lock is
18385 +        * write locked.
18386 +        */
18387 +       if (rt_mutex_owner(lock) != current) {
18388 +               migrate_disable();
18389 +               ret = rt_mutex_trylock(lock);
18390 +               if (ret)
18391 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
18392 +               else
18393 +                       migrate_enable();
18394 +
18395 +       } else if (!rwlock->read_depth) {
18396 +               ret = 0;
18397 +       }
18398 +
18399 +       if (ret)
18400 +               rwlock->read_depth++;
18401 +
18402 +       return ret;
18403 +}
18404 +EXPORT_SYMBOL(rt_read_trylock);
18405 +
18406 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
18407 +{
18408 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
18409 +       __rt_spin_lock(&rwlock->lock);
18410 +}
18411 +EXPORT_SYMBOL(rt_write_lock);
18412 +
18413 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
18414 +{
18415 +       struct rt_mutex *lock = &rwlock->lock;
18416 +
18417 +
18418 +       /*
18419 +        * recursive read locks succeed when current owns the lock
18420 +        */
18421 +       if (rt_mutex_owner(lock) != current) {
18422 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
18423 +               __rt_spin_lock(lock);
18424 +       }
18425 +       rwlock->read_depth++;
18426 +}
18427 +
18428 +EXPORT_SYMBOL(rt_read_lock);
18429 +
18430 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
18431 +{
18432 +       /* NOTE: we always pass in '1' for nested, for simplicity */
18433 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
18434 +       __rt_spin_unlock(&rwlock->lock);
18435 +       migrate_enable();
18436 +}
18437 +EXPORT_SYMBOL(rt_write_unlock);
18438 +
18439 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
18440 +{
18441 +       /* Release the lock only when read_depth is down to 0 */
18442 +       if (--rwlock->read_depth == 0) {
18443 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
18444 +               __rt_spin_unlock(&rwlock->lock);
18445 +               migrate_enable();
18446 +       }
18447 +}
18448 +EXPORT_SYMBOL(rt_read_unlock);
18449 +
18450 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
18451 +{
18452 +       rt_write_lock(rwlock);
18453 +
18454 +       return 0;
18455 +}
18456 +EXPORT_SYMBOL(rt_write_lock_irqsave);
18457 +
18458 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
18459 +{
18460 +       rt_read_lock(rwlock);
18461 +
18462 +       return 0;
18463 +}
18464 +EXPORT_SYMBOL(rt_read_lock_irqsave);
18465 +
18466 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
18467 +{
18468 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18469 +       /*
18470 +        * Make sure we are not reinitializing a held lock:
18471 +        */
18472 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
18473 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
18474 +#endif
18475 +       rwlock->lock.save_state = 1;
18476 +       rwlock->read_depth = 0;
18477 +}
18478 +EXPORT_SYMBOL(__rt_rwlock_init);
18479 +
18480 +/*
18481 + * rw_semaphores
18482 + */
18483 +
18484 +void  rt_up_write(struct rw_semaphore *rwsem)
18485 +{
18486 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
18487 +       rt_mutex_unlock(&rwsem->lock);
18488 +}
18489 +EXPORT_SYMBOL(rt_up_write);
18490 +
18491 +void __rt_up_read(struct rw_semaphore *rwsem)
18492 +{
18493 +       if (--rwsem->read_depth == 0)
18494 +               rt_mutex_unlock(&rwsem->lock);
18495 +}
18496 +
18497 +void  rt_up_read(struct rw_semaphore *rwsem)
18498 +{
18499 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
18500 +       __rt_up_read(rwsem);
18501 +}
18502 +EXPORT_SYMBOL(rt_up_read);
18503 +
18504 +/*
18505 + * downgrade a write lock into a read lock
18506 + * - just wake up any readers at the front of the queue
18507 + */
18508 +void  rt_downgrade_write(struct rw_semaphore *rwsem)
18509 +{
18510 +       BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
18511 +       rwsem->read_depth = 1;
18512 +}
18513 +EXPORT_SYMBOL(rt_downgrade_write);
18514 +
18515 +int  rt_down_write_trylock(struct rw_semaphore *rwsem)
18516 +{
18517 +       int ret = rt_mutex_trylock(&rwsem->lock);
18518 +
18519 +       if (ret)
18520 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
18521 +       return ret;
18522 +}
18523 +EXPORT_SYMBOL(rt_down_write_trylock);
18524 +
18525 +void  rt_down_write(struct rw_semaphore *rwsem)
18526 +{
18527 +       rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
18528 +       rt_mutex_lock(&rwsem->lock);
18529 +}
18530 +EXPORT_SYMBOL(rt_down_write);
18531 +
18532 +void  rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
18533 +{
18534 +       rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
18535 +       rt_mutex_lock(&rwsem->lock);
18536 +}
18537 +EXPORT_SYMBOL(rt_down_write_nested);
18538 +
18539 +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
18540 +                              struct lockdep_map *nest)
18541 +{
18542 +       rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
18543 +       rt_mutex_lock(&rwsem->lock);
18544 +}
18545 +EXPORT_SYMBOL(rt_down_write_nested_lock);
18546 +
18547 +int rt__down_read_trylock(struct rw_semaphore *rwsem)
18548 +{
18549 +       struct rt_mutex *lock = &rwsem->lock;
18550 +       int ret = 1;
18551 +
18552 +       /*
18553 +        * recursive read locks succeed when current owns the rwsem,
18554 +        * but not when read_depth == 0 which means that the rwsem is
18555 +        * write locked.
18556 +        */
18557 +       if (rt_mutex_owner(lock) != current)
18558 +               ret = rt_mutex_trylock(&rwsem->lock);
18559 +       else if (!rwsem->read_depth)
18560 +               ret = 0;
18561 +
18562 +       if (ret)
18563 +               rwsem->read_depth++;
18564 +       return ret;
18565 +
18566 +}
18567 +
18568 +int  rt_down_read_trylock(struct rw_semaphore *rwsem)
18569 +{
18570 +       int ret;
18571 +
18572 +       ret = rt__down_read_trylock(rwsem);
18573 +       if (ret)
18574 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
18575 +
18576 +       return ret;
18577 +}
18578 +EXPORT_SYMBOL(rt_down_read_trylock);
18579 +
18580 +void rt__down_read(struct rw_semaphore *rwsem)
18581 +{
18582 +       struct rt_mutex *lock = &rwsem->lock;
18583 +
18584 +       if (rt_mutex_owner(lock) != current)
18585 +               rt_mutex_lock(&rwsem->lock);
18586 +       rwsem->read_depth++;
18587 +}
18588 +EXPORT_SYMBOL(rt__down_read);
18589 +
18590 +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
18591 +{
18592 +       rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
18593 +       rt__down_read(rwsem);
18594 +}
18595 +
18596 +void  rt_down_read(struct rw_semaphore *rwsem)
18597 +{
18598 +       __rt_down_read(rwsem, 0);
18599 +}
18600 +EXPORT_SYMBOL(rt_down_read);
18601 +
18602 +void  rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
18603 +{
18604 +       __rt_down_read(rwsem, subclass);
18605 +}
18606 +EXPORT_SYMBOL(rt_down_read_nested);
18607 +
18608 +void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
18609 +                             struct lock_class_key *key)
18610 +{
18611 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18612 +       /*
18613 +        * Make sure we are not reinitializing a held lock:
18614 +        */
18615 +       debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
18616 +       lockdep_init_map(&rwsem->dep_map, name, key, 0);
18617 +#endif
18618 +       rwsem->read_depth = 0;
18619 +       rwsem->lock.save_state = 0;
18620 +}
18621 +EXPORT_SYMBOL(__rt_rwsem_init);
18622 +
18623 +/**
18624 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
18625 + * @cnt: the atomic which we are to dec
18626 + * @lock: the mutex to return holding if we dec to 0
18627 + *
18628 + * return true and hold lock if we dec to 0, return false otherwise
18629 + */
18630 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
18631 +{
18632 +       /* dec if we can't possibly hit 0 */
18633 +       if (atomic_add_unless(cnt, -1, 1))
18634 +               return 0;
18635 +       /* we might hit 0, so take the lock */
18636 +       mutex_lock(lock);
18637 +       if (!atomic_dec_and_test(cnt)) {
18638 +               /* when we actually did the dec, we didn't hit 0 */
18639 +               mutex_unlock(lock);
18640 +               return 0;
18641 +       }
18642 +       /* we hit 0, and we hold the lock */
18643 +       return 1;
18644 +}
18645 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
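atomic_dec_and_mutex_lock() above is typically used for "free on last put" teardown, where the mutex only needs to be taken when the count might actually reach zero. A minimal sketch of that pattern; the object, its fields and obj_put() are hypothetical:

#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct obj {
        atomic_t        refcnt;
        struct mutex    lock;           /* guards teardown of obj state */
};

static void obj_put(struct obj *o)
{
        /* Returns 1 only when the count dropped to 0, with o->lock held. */
        if (atomic_dec_and_mutex_lock(&o->refcnt, &o->lock)) {
                /* ... release resources guarded by o->lock ... */
                mutex_unlock(&o->lock);
                kfree(o);
        }
}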
18646 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
18647 index 8251e75dd9c0..6759a798c927 100644
18648 --- a/kernel/locking/rtmutex.c
18649 +++ b/kernel/locking/rtmutex.c
18650 @@ -7,6 +7,11 @@
18651   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18652   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
18653   *  Copyright (C) 2006 Esben Nielsen
18654 + *  Adaptive Spinlocks:
18655 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
18656 + *                                  and Peter Morreale,
18657 + * Adaptive Spinlocks simplification:
18658 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
18659   *
18660   *  See Documentation/locking/rt-mutex-design.txt for details.
18661   */
18662 @@ -16,6 +21,7 @@
18663  #include <linux/sched/rt.h>
18664  #include <linux/sched/deadline.h>
18665  #include <linux/timer.h>
18666 +#include <linux/ww_mutex.h>
18667  
18668  #include "rtmutex_common.h"
18669  
18670 @@ -69,6 +75,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
18671                 clear_rt_mutex_waiters(lock);
18672  }
18673  
18674 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
18675 +{
18676 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
18677 +               waiter != PI_REQUEUE_INPROGRESS;
18678 +}
18679 +
18680  /*
18681   * We can speed up the acquire/release, if there's no debugging state to be
18682   * set up.
18683 @@ -99,13 +111,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
18684   * 2) Drop lock->wait_lock
18685   * 3) Try to unlock the lock with cmpxchg
18686   */
18687 -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
18688 +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
18689 +                                       unsigned long flags)
18690         __releases(lock->wait_lock)
18691  {
18692         struct task_struct *owner = rt_mutex_owner(lock);
18693  
18694         clear_rt_mutex_waiters(lock);
18695 -       raw_spin_unlock(&lock->wait_lock);
18696 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18697         /*
18698          * If a new waiter comes in between the unlock and the cmpxchg
18699          * we have two situations:
18700 @@ -147,11 +160,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
18701  /*
18702   * Simple slow path only version: lock->owner is protected by lock->wait_lock.
18703   */
18704 -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
18705 +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
18706 +                                       unsigned long flags)
18707         __releases(lock->wait_lock)
18708  {
18709         lock->owner = NULL;
18710 -       raw_spin_unlock(&lock->wait_lock);
18711 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18712         return true;
18713  }
18714  #endif
18715 @@ -348,6 +362,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
18716         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
18717  }
18718  
18719 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
18720 +{
18721 +       if (waiter->savestate)
18722 +               wake_up_lock_sleeper(waiter->task);
18723 +       else
18724 +               wake_up_process(waiter->task);
18725 +}
18726 +
18727  /*
18728   * Max number of times we'll walk the boosting chain:
18729   */
18730 @@ -355,7 +377,8 @@ int max_lock_depth = 1024;
18731  
18732  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
18733  {
18734 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
18735 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
18736 +               p->pi_blocked_on->lock : NULL;
18737  }
18738  
18739  /*
18740 @@ -433,7 +456,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18741         int ret = 0, depth = 0;
18742         struct rt_mutex *lock;
18743         bool detect_deadlock;
18744 -       unsigned long flags;
18745         bool requeue = true;
18746  
18747         detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
18748 @@ -476,7 +498,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18749         /*
18750          * [1] Task cannot go away as we did a get_task() before !
18751          */
18752 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
18753 +       raw_spin_lock_irq(&task->pi_lock);
18754  
18755         /*
18756          * [2] Get the waiter on which @task is blocked on.
18757 @@ -492,7 +514,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18758          * reached or the state of the chain has changed while we
18759          * dropped the locks.
18760          */
18761 -       if (!waiter)
18762 +       if (!rt_mutex_real_waiter(waiter))
18763                 goto out_unlock_pi;
18764  
18765         /*
18766 @@ -560,7 +582,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18767          * operations.
18768          */
18769         if (!raw_spin_trylock(&lock->wait_lock)) {
18770 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18771 +               raw_spin_unlock_irq(&task->pi_lock);
18772                 cpu_relax();
18773                 goto retry;
18774         }
18775 @@ -591,7 +613,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18776                 /*
18777                  * No requeue[7] here. Just release @task [8]
18778                  */
18779 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18780 +               raw_spin_unlock(&task->pi_lock);
18781                 put_task_struct(task);
18782  
18783                 /*
18784 @@ -599,14 +621,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18785                  * If there is no owner of the lock, end of chain.
18786                  */
18787                 if (!rt_mutex_owner(lock)) {
18788 -                       raw_spin_unlock(&lock->wait_lock);
18789 +                       raw_spin_unlock_irq(&lock->wait_lock);
18790                         return 0;
18791                 }
18792  
18793                 /* [10] Grab the next task, i.e. owner of @lock */
18794                 task = rt_mutex_owner(lock);
18795                 get_task_struct(task);
18796 -               raw_spin_lock_irqsave(&task->pi_lock, flags);
18797 +               raw_spin_lock(&task->pi_lock);
18798  
18799                 /*
18800                  * No requeue [11] here. We just do deadlock detection.
18801 @@ -621,8 +643,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18802                 top_waiter = rt_mutex_top_waiter(lock);
18803  
18804                 /* [13] Drop locks */
18805 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18806 -               raw_spin_unlock(&lock->wait_lock);
18807 +               raw_spin_unlock(&task->pi_lock);
18808 +               raw_spin_unlock_irq(&lock->wait_lock);
18809  
18810                 /* If owner is not blocked, end of chain. */
18811                 if (!next_lock)
18812 @@ -643,7 +665,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18813         rt_mutex_enqueue(lock, waiter);
18814  
18815         /* [8] Release the task */
18816 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18817 +       raw_spin_unlock(&task->pi_lock);
18818         put_task_struct(task);
18819  
18820         /*
18821 @@ -654,21 +676,24 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18822          * follow here. This is the end of the chain we are walking.
18823          */
18824         if (!rt_mutex_owner(lock)) {
18825 +               struct rt_mutex_waiter *lock_top_waiter;
18826 +
18827                 /*
18828                  * If the requeue [7] above changed the top waiter,
18829                  * then we need to wake the new top waiter up to try
18830                  * to get the lock.
18831                  */
18832 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
18833 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
18834 -               raw_spin_unlock(&lock->wait_lock);
18835 +               lock_top_waiter = rt_mutex_top_waiter(lock);
18836 +               if (prerequeue_top_waiter != lock_top_waiter)
18837 +                       rt_mutex_wake_waiter(lock_top_waiter);
18838 +               raw_spin_unlock_irq(&lock->wait_lock);
18839                 return 0;
18840         }
18841  
18842         /* [10] Grab the next task, i.e. the owner of @lock */
18843         task = rt_mutex_owner(lock);
18844         get_task_struct(task);
18845 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
18846 +       raw_spin_lock(&task->pi_lock);
18847  
18848         /* [11] requeue the pi waiters if necessary */
18849         if (waiter == rt_mutex_top_waiter(lock)) {
18850 @@ -722,8 +747,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18851         top_waiter = rt_mutex_top_waiter(lock);
18852  
18853         /* [13] Drop the locks */
18854 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18855 -       raw_spin_unlock(&lock->wait_lock);
18856 +       raw_spin_unlock(&task->pi_lock);
18857 +       raw_spin_unlock_irq(&lock->wait_lock);
18858  
18859         /*
18860          * Make the actual exit decisions [12], based on the stored
18861 @@ -746,28 +771,46 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18862         goto again;
18863  
18864   out_unlock_pi:
18865 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18866 +       raw_spin_unlock_irq(&task->pi_lock);
18867   out_put_task:
18868         put_task_struct(task);
18869  
18870         return ret;
18871  }
18872  
18873 +
18874 +#define STEAL_NORMAL  0
18875 +#define STEAL_LATERAL 1
18876 +
18877 +/*
18878 + * Note that RT tasks are excluded from lateral-steals to prevent the
18879 + * introduction of an unbounded latency.
18880 + */
18881 +static inline int lock_is_stealable(struct task_struct *task,
18882 +                                   struct task_struct *pendowner, int mode)
18883 +{
18884 +    if (mode == STEAL_NORMAL || rt_task(task)) {
18885 +           if (task->prio >= pendowner->prio)
18886 +                   return 0;
18887 +    } else if (task->prio > pendowner->prio)
18888 +           return 0;
18889 +    return 1;
18890 +}
18891 +
18892  /*
18893   * Try to take an rt-mutex
18894   *
18895 - * Must be called with lock->wait_lock held.
18896 + * Must be called with lock->wait_lock held and interrupts disabled
18897   *
18898   * @lock:   The lock to be acquired.
18899   * @task:   The task which wants to acquire the lock
18900   * @waiter: The waiter that is queued to the lock's wait tree if the
18901   *         callsite called task_blocked_on_lock(), otherwise NULL
18902   */
18903 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18904 -                               struct rt_mutex_waiter *waiter)
18905 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
18906 +                                 struct task_struct *task,
18907 +                                 struct rt_mutex_waiter *waiter, int mode)
18908  {
18909 -       unsigned long flags;
18910 -
18911         /*
18912          * Before testing whether we can acquire @lock, we set the
18913          * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
18914 @@ -803,8 +846,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18915                  * If waiter is not the highest priority waiter of
18916                  * @lock, give up.
18917                  */
18918 -               if (waiter != rt_mutex_top_waiter(lock))
18919 +               if (waiter != rt_mutex_top_waiter(lock)) {
18920 +                       /* XXX lock_is_stealable() ? */
18921                         return 0;
18922 +               }
18923  
18924                 /*
18925                  * We can acquire the lock. Remove the waiter from the
18926 @@ -822,14 +867,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18927                  * not need to be dequeued.
18928                  */
18929                 if (rt_mutex_has_waiters(lock)) {
18930 -                       /*
18931 -                        * If @task->prio is greater than or equal to
18932 -                        * the top waiter priority (kernel view),
18933 -                        * @task lost.
18934 -                        */
18935 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
18936 -                               return 0;
18937 +                       struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
18938  
18939 +                       if (task != pown && !lock_is_stealable(task, pown, mode))
18940 +                               return 0;
18941                         /*
18942                          * The current top waiter stays enqueued. We
18943                          * don't have to change anything in the lock
18944 @@ -852,7 +893,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18945          * case, but conditionals are more expensive than a redundant
18946          * store.
18947          */
18948 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
18949 +       raw_spin_lock(&task->pi_lock);
18950         task->pi_blocked_on = NULL;
18951         /*
18952          * Finish the lock acquisition. @task is the new owner. If
18953 @@ -861,7 +902,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18954          */
18955         if (rt_mutex_has_waiters(lock))
18956                 rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
18957 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18958 +       raw_spin_unlock(&task->pi_lock);
18959  
18960  takeit:
18961         /* We got the lock. */
18962 @@ -878,12 +919,444 @@ takeit:
18963         return 1;
18964  }
18965  
18966 +#ifdef CONFIG_PREEMPT_RT_FULL
18967 +/*
18968 + * preemptible spin_lock functions:
18969 + */
18970 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
18971 +                                        void  (*slowfn)(struct rt_mutex *lock,
18972 +                                                        bool mg_off),
18973 +                                        bool do_mig_dis)
18974 +{
18975 +       might_sleep_no_state_check();
18976 +
18977 +       if (do_mig_dis)
18978 +               migrate_disable();
18979 +
18980 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18981 +               rt_mutex_deadlock_account_lock(lock, current);
18982 +       else
18983 +               slowfn(lock, do_mig_dis);
18984 +}
18985 +
18986 +static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
18987 +                                          int  (*slowfn)(struct rt_mutex *lock))
18988 +{
18989 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
18990 +               rt_mutex_deadlock_account_unlock(current);
18991 +               return 0;
18992 +       }
18993 +       return slowfn(lock);
18994 +}
18995 +#ifdef CONFIG_SMP
18996 +/*
18997 + * Note that owner is a speculative pointer and dereferencing relies
18998 + * on rcu_read_lock() and the check against the lock owner.
18999 + */
19000 +static int adaptive_wait(struct rt_mutex *lock,
19001 +                        struct task_struct *owner)
19002 +{
19003 +       int res = 0;
19004 +
19005 +       rcu_read_lock();
19006 +       for (;;) {
19007 +               if (owner != rt_mutex_owner(lock))
19008 +                       break;
19009 +               /*
19010 +                * Ensure that owner->on_cpu is dereferenced _after_
19011 +                * checking the above to be valid.
19012 +                */
19013 +               barrier();
19014 +               if (!owner->on_cpu) {
19015 +                       res = 1;
19016 +                       break;
19017 +               }
19018 +               cpu_relax();
19019 +       }
19020 +       rcu_read_unlock();
19021 +       return res;
19022 +}
19023 +#else
19024 +static int adaptive_wait(struct rt_mutex *lock,
19025 +                        struct task_struct *orig_owner)
19026 +{
19027 +       return 1;
19028 +}
19029 +#endif
19030 +
19031 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19032 +                                  struct rt_mutex_waiter *waiter,
19033 +                                  struct task_struct *task,
19034 +                                  enum rtmutex_chainwalk chwalk);
19035 +/*
19036 + * Slow path lock function spin_lock style: this variant is very
19037 + * careful not to miss any non-lock wakeups.
19038 + *
19039 + * We store the current state under p->pi_lock in p->saved_state and
19040 + * the try_to_wake_up() code handles this accordingly.
19041 + */
19042 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
19043 +                                                   bool mg_off)
19044 +{
19045 +       struct task_struct *lock_owner, *self = current;
19046 +       struct rt_mutex_waiter waiter, *top_waiter;
19047 +       unsigned long flags;
19048 +       int ret;
19049 +
19050 +       rt_mutex_init_waiter(&waiter, true);
19051 +
19052 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19053 +
19054 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
19055 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19056 +               return;
19057 +       }
19058 +
19059 +       BUG_ON(rt_mutex_owner(lock) == self);
19060 +
19061 +       /*
19062 +        * We save whatever state the task is in and we'll restore it
19063 +        * after acquiring the lock taking real wakeups into account
19064 +        * as well. We are serialized via pi_lock against wakeups. See
19065 +        * try_to_wake_up().
19066 +        */
19067 +       raw_spin_lock(&self->pi_lock);
19068 +       self->saved_state = self->state;
19069 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19070 +       raw_spin_unlock(&self->pi_lock);
19071 +
19072 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
19073 +       BUG_ON(ret);
19074 +
19075 +       for (;;) {
19076 +               /* Try to acquire the lock again. */
19077 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
19078 +                       break;
19079 +
19080 +               top_waiter = rt_mutex_top_waiter(lock);
19081 +               lock_owner = rt_mutex_owner(lock);
19082 +
19083 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19084 +
19085 +               debug_rt_mutex_print_deadlock(&waiter);
19086 +
19087 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
19088 +                       if (mg_off)
19089 +                               migrate_enable();
19090 +                       schedule();
19091 +                       if (mg_off)
19092 +                               migrate_disable();
19093 +               }
19094 +
19095 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
19096 +
19097 +               raw_spin_lock(&self->pi_lock);
19098 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19099 +               raw_spin_unlock(&self->pi_lock);
19100 +       }
19101 +
19102 +       /*
19103 +        * Restore the task state to current->saved_state. We set it
19104 +        * to the original state above and the try_to_wake_up() code
19105 +        * has possibly updated it when a real (non-rtmutex) wakeup
19106 +        * happened while we were blocked. Clear saved_state so
19107 +        * try_to_wake_up() does not get confused.
19108 +        */
19109 +       raw_spin_lock(&self->pi_lock);
19110 +       __set_current_state_no_track(self->saved_state);
19111 +       self->saved_state = TASK_RUNNING;
19112 +       raw_spin_unlock(&self->pi_lock);
19113 +
19114 +       /*
19115 +        * try_to_take_rt_mutex() sets the waiter bit
19116 +        * unconditionally. We might have to fix that up:
19117 +        */
19118 +       fixup_rt_mutex_waiters(lock);
19119 +
19120 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
19121 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
19122 +
19123 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19124 +
19125 +       debug_rt_mutex_free_waiter(&waiter);
19126 +}
19127 +
19128 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
19129 +                                   struct wake_q_head *wake_sleeper_q,
19130 +                                   struct rt_mutex *lock);
19131 +/*
19132 + * Slow path to release a rt_mutex spin_lock style
19133 + */
19134 +static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
19135 +{
19136 +       unsigned long flags;
19137 +       WAKE_Q(wake_q);
19138 +       WAKE_Q(wake_sleeper_q);
19139 +
19140 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19141 +
19142 +       debug_rt_mutex_unlock(lock);
19143 +
19144 +       rt_mutex_deadlock_account_unlock(current);
19145 +
19146 +       if (!rt_mutex_has_waiters(lock)) {
19147 +               lock->owner = NULL;
19148 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19149 +               return 0;
19150 +       }
19151 +
19152 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
19153 +
19154 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19155 +       wake_up_q(&wake_q);
19156 +       wake_up_q_sleeper(&wake_sleeper_q);
19157 +
19158 +       /* Undo pi boosting when necessary */
19159 +       rt_mutex_adjust_prio(current);
19160 +       return 0;
19161 +}
19162 +
19163 +static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
19164 +{
19165 +       unsigned long flags;
19166 +       WAKE_Q(wake_q);
19167 +       WAKE_Q(wake_sleeper_q);
19168 +
19169 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19170 +
19171 +       debug_rt_mutex_unlock(lock);
19172 +
19173 +       rt_mutex_deadlock_account_unlock(current);
19174 +
19175 +       if (!rt_mutex_has_waiters(lock)) {
19176 +               lock->owner = NULL;
19177 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19178 +               return 0;
19179 +       }
19180 +
19181 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
19182 +
19183 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19184 +       wake_up_q(&wake_q);
19185 +       wake_up_q_sleeper(&wake_sleeper_q);
19186 +       return 1;
19187 +}
19188 +
19189 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
19190 +{
19191 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
19192 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19193 +}
19194 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
19195 +
19196 +void __lockfunc rt_spin_lock(spinlock_t *lock)
19197 +{
19198 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
19199 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19200 +}
19201 +EXPORT_SYMBOL(rt_spin_lock);
19202 +
19203 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
19204 +{
19205 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
19206 +}
19207 +EXPORT_SYMBOL(__rt_spin_lock);
19208 +
19209 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
19210 +{
19211 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
19212 +}
19213 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
19214 +
19215 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19216 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
19217 +{
19218 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
19219 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
19220 +}
19221 +EXPORT_SYMBOL(rt_spin_lock_nested);
19222 +#endif
19223 +
19224 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
19225 +{
19226 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19227 +       spin_release(&lock->dep_map, 1, _RET_IP_);
19228 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
19229 +}
19230 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
19231 +
19232 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
19233 +{
19234 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19235 +       spin_release(&lock->dep_map, 1, _RET_IP_);
19236 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
19237 +       migrate_enable();
19238 +}
19239 +EXPORT_SYMBOL(rt_spin_unlock);
19240 +
19241 +int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
19242 +{
19243 +       int ret;
19244 +
19245 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19246 +       spin_release(&lock->dep_map, 1, _RET_IP_);
19247 +       ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
19248 +       migrate_enable();
19249 +       return ret;
19250 +}
19251 +
19252 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
19253 +{
19254 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
19255 +}
19256 +EXPORT_SYMBOL(__rt_spin_unlock);
19257 +
19258 +/*
19259 + * Wait for the lock to get unlocked: instead of polling for an unlock
19260 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
19261 + * schedule if there's contention:
19262 + */
19263 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
19264 +{
19265 +       spin_lock(lock);
19266 +       spin_unlock(lock);
19267 +}
19268 +EXPORT_SYMBOL(rt_spin_unlock_wait);
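/*
 * A minimal, illustrative sketch of the caller pattern spin_unlock_wait()
 * (rt_spin_unlock_wait() on RT) is meant for: wait until the current
 * holder has left the critical section before tearing an object down.
 * The structure and victim_teardown() are hypothetical; a real user also
 * needs its own scheme (flags, refcounts, RCU) to keep new lockers out.
 * On PREEMPT_RT_FULL the wait sleeps on the underlying rt_mutex instead
 * of spinning.
 */
#include <linux/spinlock.h>
#include <linux/slab.h>

struct victim {
        spinlock_t lock;
        /* ... data protected by lock ... */
};

static void victim_teardown(struct victim *v)
{
        /* Let a concurrent critical section finish before freeing. */
        spin_unlock_wait(&v->lock);
        kfree(v);
}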
19269 +
19270 +int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
19271 +{
19272 +       return rt_mutex_trylock(lock);
19273 +}
19274 +
19275 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
19276 +{
19277 +       int ret;
19278 +
19279 +       ret = rt_mutex_trylock(&lock->lock);
19280 +       if (ret)
19281 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19282 +       return ret;
19283 +}
19284 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
19285 +
19286 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
19287 +{
19288 +       int ret;
19289 +
19290 +       migrate_disable();
19291 +       ret = rt_mutex_trylock(&lock->lock);
19292 +       if (ret)
19293 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19294 +       else
19295 +               migrate_enable();
19296 +       return ret;
19297 +}
19298 +EXPORT_SYMBOL(rt_spin_trylock);
19299 +
19300 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
19301 +{
19302 +       int ret;
19303 +
19304 +       local_bh_disable();
19305 +       ret = rt_mutex_trylock(&lock->lock);
19306 +       if (ret) {
19307 +               migrate_disable();
19308 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19309 +       } else
19310 +               local_bh_enable();
19311 +       return ret;
19312 +}
19313 +EXPORT_SYMBOL(rt_spin_trylock_bh);
19314 +
19315 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
19316 +{
19317 +       int ret;
19318 +
19319 +       *flags = 0;
19320 +       ret = rt_mutex_trylock(&lock->lock);
19321 +       if (ret) {
19322 +               migrate_disable();
19323 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19324 +       }
19325 +       return ret;
19326 +}
19327 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
19328 +
19329 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
19330 +{
19331 +       /* Subtract 1 from counter unless that drops it to 0 (i.e. it was 1) */
19332 +       if (atomic_add_unless(atomic, -1, 1))
19333 +               return 0;
19334 +       rt_spin_lock(lock);
19335 +       if (atomic_dec_and_test(atomic))
19336 +               return 1;
19337 +       rt_spin_unlock(lock);
19338 +       return 0;
19339 +}
19340 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
19341 +
19342 +void
19343 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
19344 +{
19345 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19346 +       /*
19347 +        * Make sure we are not reinitializing a held lock:
19348 +        */
19349 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
19350 +       lockdep_init_map(&lock->dep_map, name, key, 0);
19351 +#endif
19352 +}
19353 +EXPORT_SYMBOL(__rt_spin_lock_init);
19354 +
19355 +#endif /* PREEMPT_RT_FULL */
19356 +
19357 +#ifdef CONFIG_PREEMPT_RT_FULL
19358 +static inline int __sched
19359 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
19360 +{
19361 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
19362 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
19363 +
19364 +       if (!hold_ctx)
19365 +               return 0;
19366 +
19367 +       if (unlikely(ctx == hold_ctx))
19368 +               return -EALREADY;
19369 +
19370 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
19371 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
19372 +#ifdef CONFIG_DEBUG_MUTEXES
19373 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
19374 +               ctx->contending_lock = ww;
19375 +#endif
19376 +               return -EDEADLK;
19377 +       }
19378 +
19379 +       return 0;
19380 +}
19381 +#else
19382 +static inline int __sched
19383 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
19384 +{
19385 +       BUG();
19386 +       return 0;
19387 +}
19388 +
19389 +#endif
19390 +
19391 +static inline int
19392 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19393 +                    struct rt_mutex_waiter *waiter)
19394 +{
19395 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
19396 +}
19397 +
19398  /*
19399   * Task blocks on lock.
19400   *
19401   * Prepare waiter and propagate pi chain
19402   *
19403 - * This must be called with lock->wait_lock held.
19404 + * This must be called with lock->wait_lock held and interrupts disabled
19405   */
19406  static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19407                                    struct rt_mutex_waiter *waiter,
19408 @@ -894,7 +1367,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19409         struct rt_mutex_waiter *top_waiter = waiter;
19410         struct rt_mutex *next_lock;
19411         int chain_walk = 0, res;
19412 -       unsigned long flags;
19413  
19414         /*
19415          * Early deadlock detection. We really don't want the task to
19416 @@ -908,7 +1380,24 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19417         if (owner == task)
19418                 return -EDEADLK;
19419  
19420 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
19421 +       raw_spin_lock(&task->pi_lock);
19422 +
19423 +       /*
19424 +        * In the case of futex requeue PI, this will be a proxy
19425 +        * lock. The task will wake unaware that it is enqueueed on
19426 +        * this lock. Avoid blocking on two locks and corrupting
19427 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
19428 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
19429 +        * before requeue (due to a signal or timeout). Do not enqueue
19430 +        * the task if PI_WAKEUP_INPROGRESS is set.
19431 +        */
19432 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
19433 +               raw_spin_unlock(&task->pi_lock);
19434 +               return -EAGAIN;
19435 +       }
19436 +
19437 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
19438 +
19439         __rt_mutex_adjust_prio(task);
19440         waiter->task = task;
19441         waiter->lock = lock;
19442 @@ -921,18 +1410,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19443  
19444         task->pi_blocked_on = waiter;
19445  
19446 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19447 +       raw_spin_unlock(&task->pi_lock);
19448  
19449         if (!owner)
19450                 return 0;
19451  
19452 -       raw_spin_lock_irqsave(&owner->pi_lock, flags);
19453 +       raw_spin_lock(&owner->pi_lock);
19454         if (waiter == rt_mutex_top_waiter(lock)) {
19455                 rt_mutex_dequeue_pi(owner, top_waiter);
19456                 rt_mutex_enqueue_pi(owner, waiter);
19457  
19458                 __rt_mutex_adjust_prio(owner);
19459 -               if (owner->pi_blocked_on)
19460 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
19461                         chain_walk = 1;
19462         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
19463                 chain_walk = 1;
19464 @@ -941,7 +1430,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19465         /* Store the lock on which owner is blocked or NULL */
19466         next_lock = task_blocked_on_lock(owner);
19467  
19468 -       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
19469 +       raw_spin_unlock(&owner->pi_lock);
19470         /*
19471          * Even if full deadlock detection is on, if the owner is not
19472          * blocked itself, we can avoid finding this out in the chain
19473 @@ -957,12 +1446,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19474          */
19475         get_task_struct(owner);
19476  
19477 -       raw_spin_unlock(&lock->wait_lock);
19478 +       raw_spin_unlock_irq(&lock->wait_lock);
19479  
19480         res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
19481                                          next_lock, waiter, task);
19482  
19483 -       raw_spin_lock(&lock->wait_lock);
19484 +       raw_spin_lock_irq(&lock->wait_lock);
19485  
19486         return res;
19487  }
19488 @@ -971,15 +1460,15 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19489   * Remove the top waiter from the current tasks pi waiter tree and
19490   * queue it up.
19491   *
19492 - * Called with lock->wait_lock held.
19493 + * Called with lock->wait_lock held and interrupts disabled.
19494   */
19495  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
19496 +                                   struct wake_q_head *wake_sleeper_q,
19497                                     struct rt_mutex *lock)
19498  {
19499         struct rt_mutex_waiter *waiter;
19500 -       unsigned long flags;
19501  
19502 -       raw_spin_lock_irqsave(&current->pi_lock, flags);
19503 +       raw_spin_lock(&current->pi_lock);
19504  
19505         waiter = rt_mutex_top_waiter(lock);
19506  
19507 @@ -1001,15 +1490,18 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
19508          */
19509         lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
19510  
19511 -       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
19512 +       raw_spin_unlock(&current->pi_lock);
19513  
19514 -       wake_q_add(wake_q, waiter->task);
19515 +       if (waiter->savestate)
19516 +               wake_q_add(wake_sleeper_q, waiter->task);
19517 +       else
19518 +               wake_q_add(wake_q, waiter->task);
19519  }
19520  
19521  /*
19522   * Remove a waiter from a lock and give up
19523   *
19524 - * Must be called with lock->wait_lock held and
19525 + * Must be called with lock->wait_lock held and interrupts disabled. It must
19526   * have just failed to try_to_take_rt_mutex().
19527   */
19528  static void remove_waiter(struct rt_mutex *lock,
19529 @@ -1017,13 +1509,12 @@ static void remove_waiter(struct rt_mutex *lock,
19530  {
19531         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
19532         struct task_struct *owner = rt_mutex_owner(lock);
19533 -       struct rt_mutex *next_lock;
19534 -       unsigned long flags;
19535 +       struct rt_mutex *next_lock = NULL;
19536  
19537 -       raw_spin_lock_irqsave(&current->pi_lock, flags);
19538 +       raw_spin_lock(&current->pi_lock);
19539         rt_mutex_dequeue(lock, waiter);
19540         current->pi_blocked_on = NULL;
19541 -       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
19542 +       raw_spin_unlock(&current->pi_lock);
19543  
19544         /*
19545          * Only update priority if the waiter was the highest priority
19546 @@ -1032,7 +1523,7 @@ static void remove_waiter(struct rt_mutex *lock,
19547         if (!owner || !is_top_waiter)
19548                 return;
19549  
19550 -       raw_spin_lock_irqsave(&owner->pi_lock, flags);
19551 +       raw_spin_lock(&owner->pi_lock);
19552  
19553         rt_mutex_dequeue_pi(owner, waiter);
19554  
19555 @@ -1042,9 +1533,10 @@ static void remove_waiter(struct rt_mutex *lock,
19556         __rt_mutex_adjust_prio(owner);
19557  
19558         /* Store the lock on which owner is blocked or NULL */
19559 -       next_lock = task_blocked_on_lock(owner);
19560 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
19561 +               next_lock = task_blocked_on_lock(owner);
19562  
19563 -       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
19564 +       raw_spin_unlock(&owner->pi_lock);
19565  
19566         /*
19567          * Don't walk the chain, if the owner task is not blocked
19568 @@ -1056,12 +1548,12 @@ static void remove_waiter(struct rt_mutex *lock,
19569         /* gets dropped in rt_mutex_adjust_prio_chain()! */
19570         get_task_struct(owner);
19571  
19572 -       raw_spin_unlock(&lock->wait_lock);
19573 +       raw_spin_unlock_irq(&lock->wait_lock);
19574  
19575         rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
19576                                    next_lock, NULL, current);
19577  
19578 -       raw_spin_lock(&lock->wait_lock);
19579 +       raw_spin_lock_irq(&lock->wait_lock);
19580  }
19581  
19582  /*
19583 @@ -1078,17 +1570,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
19584         raw_spin_lock_irqsave(&task->pi_lock, flags);
19585  
19586         waiter = task->pi_blocked_on;
19587 -       if (!waiter || (waiter->prio == task->prio &&
19588 +       if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
19589                         !dl_prio(task->prio))) {
19590                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19591                 return;
19592         }
19593         next_lock = waiter->lock;
19594 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19595  
19596         /* gets dropped in rt_mutex_adjust_prio_chain()! */
19597         get_task_struct(task);
19598  
19599 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19600         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
19601                                    next_lock, NULL, task);
19602  }
19603 @@ -1097,16 +1589,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
19604   * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
19605   * @lock:               the rt_mutex to take
19606   * @state:              the state the task should block in (TASK_INTERRUPTIBLE
19607 - *                      or TASK_UNINTERRUPTIBLE)
19608 + *                      or TASK_UNINTERRUPTIBLE)
19609   * @timeout:            the pre-initialized and started timer, or NULL for none
19610   * @waiter:             the pre-initialized rt_mutex_waiter
19611   *
19612 - * lock->wait_lock must be held by the caller.
19613 + * Must be called with lock->wait_lock held and interrupts disabled
19614   */
19615  static int __sched
19616  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
19617                     struct hrtimer_sleeper *timeout,
19618 -                   struct rt_mutex_waiter *waiter)
19619 +                   struct rt_mutex_waiter *waiter,
19620 +                   struct ww_acquire_ctx *ww_ctx)
19621  {
19622         int ret = 0;
19623  
19624 @@ -1129,13 +1622,19 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
19625                                 break;
19626                 }
19627  
19628 -               raw_spin_unlock(&lock->wait_lock);
19629 +               if (ww_ctx && ww_ctx->acquired > 0) {
19630 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
19631 +                       if (ret)
19632 +                               break;
19633 +               }
19634 +
19635 +               raw_spin_unlock_irq(&lock->wait_lock);
19636  
19637                 debug_rt_mutex_print_deadlock(waiter);
19638  
19639                 schedule();
19640  
19641 -               raw_spin_lock(&lock->wait_lock);
19642 +               raw_spin_lock_irq(&lock->wait_lock);
19643                 set_current_state(state);
19644         }
19645  
19646 @@ -1163,26 +1662,112 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
19647         }
19648  }
19649  
19650 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
19651 +                                                  struct ww_acquire_ctx *ww_ctx)
19652 +{
19653 +#ifdef CONFIG_DEBUG_MUTEXES
19654 +       /*
19655 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
19656 +        * but released with a normal mutex_unlock in this call.
19657 +        *
19658 +        * This should never happen, always use ww_mutex_unlock.
19659 +        */
19660 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
19661 +
19662 +       /*
19663 +        * Not quite done after calling ww_acquire_done() ?
19664 +        */
19665 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
19666 +
19667 +       if (ww_ctx->contending_lock) {
19668 +               /*
19669 +                * After -EDEADLK you tried to
19670 +                * acquire a different ww_mutex? Bad!
19671 +                */
19672 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
19673 +
19674 +               /*
19675 +                * You called ww_mutex_lock after receiving -EDEADLK,
19676 +                * but 'forgot' to unlock everything else first?
19677 +                */
19678 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
19679 +               ww_ctx->contending_lock = NULL;
19680 +       }
19681 +
19682 +       /*
19683 +        * Naughty, using a different class will lead to undefined behavior!
19684 +        */
19685 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
19686 +#endif
19687 +       ww_ctx->acquired++;
19688 +}
19689 +
19690 +#ifdef CONFIG_PREEMPT_RT_FULL
19691 +static void ww_mutex_account_lock(struct rt_mutex *lock,
19692 +                                 struct ww_acquire_ctx *ww_ctx)
19693 +{
19694 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
19695 +       struct rt_mutex_waiter *waiter, *n;
19696 +
19697 +       /*
19698 +        * This branch gets optimized out for the common case,
19699 +        * and is only important for ww_mutex_lock.
19700 +        */
19701 +       ww_mutex_lock_acquired(ww, ww_ctx);
19702 +       ww->ctx = ww_ctx;
19703 +
19704 +       /*
19705 +        * Give any possible sleeping processes the chance to wake up,
19706 +        * so they can recheck if they have to back off.
19707 +        */
19708 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
19709 +                                            tree_entry) {
19710 +               /* XXX debug rt mutex waiter wakeup */
19711 +
19712 +               BUG_ON(waiter->lock != lock);
19713 +               rt_mutex_wake_waiter(waiter);
19714 +       }
19715 +}
19716 +
19717 +#else
19718 +
19719 +static void ww_mutex_account_lock(struct rt_mutex *lock,
19720 +                                 struct ww_acquire_ctx *ww_ctx)
19721 +{
19722 +       BUG();
19723 +}
19724 +#endif
19725 +
19726  /*
19727   * Slow path lock function:
19728   */
19729  static int __sched
19730  rt_mutex_slowlock(struct rt_mutex *lock, int state,
19731                   struct hrtimer_sleeper *timeout,
19732 -                 enum rtmutex_chainwalk chwalk)
19733 +                 enum rtmutex_chainwalk chwalk,
19734 +                 struct ww_acquire_ctx *ww_ctx)
19735  {
19736         struct rt_mutex_waiter waiter;
19737 +       unsigned long flags;
19738         int ret = 0;
19739  
19740 -       debug_rt_mutex_init_waiter(&waiter);
19741 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
19742 -       RB_CLEAR_NODE(&waiter.tree_entry);
19743 +       rt_mutex_init_waiter(&waiter, false);
19744  
19745 -       raw_spin_lock(&lock->wait_lock);
19746 +       /*
19747 +        * Technically we could use raw_spin_[un]lock_irq() here, but this can
19748 +        * be called in early boot if the cmpxchg() fast path is disabled
19749 +        * (debug, no architecture support). In this case we will acquire the
19750 +        * rtmutex with lock->wait_lock held. But we cannot unconditionally
19751 +        * enable interrupts in that early boot case. So we need to use the
19752 +        * irqsave/restore variants.
19753 +        */
19754 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19755  
19756         /* Try to acquire the lock again: */
19757         if (try_to_take_rt_mutex(lock, current, NULL)) {
19758 -               raw_spin_unlock(&lock->wait_lock);
19759 +               if (ww_ctx)
19760 +                       ww_mutex_account_lock(lock, ww_ctx);
19761 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19762                 return 0;
19763         }
19764  
19765 @@ -1196,13 +1781,23 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
19766  
19767         if (likely(!ret))
19768                 /* sleep on the mutex */
19769 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
19770 +               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
19771 +                                         ww_ctx);
19772 +       else if (ww_ctx) {
19773 +               /* ww_mutex received EDEADLK, let it become EALREADY */
19774 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
19775 +               BUG_ON(!ret);
19776 +       }
19777  
19778         if (unlikely(ret)) {
19779                 __set_current_state(TASK_RUNNING);
19780                 if (rt_mutex_has_waiters(lock))
19781                         remove_waiter(lock, &waiter);
19782 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
19783 +               /* ww_mutex wants to report EDEADLK/EALREADY, let it */
19784 +               if (!ww_ctx)
19785 +                       rt_mutex_handle_deadlock(ret, chwalk, &waiter);
19786 +       } else if (ww_ctx) {
19787 +               ww_mutex_account_lock(lock, ww_ctx);
19788         }
19789  
19790         /*
19791 @@ -1211,7 +1806,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
19792          */
19793         fixup_rt_mutex_waiters(lock);
19794  
19795 -       raw_spin_unlock(&lock->wait_lock);
19796 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19797  
19798         /* Remove pending timer: */
19799         if (unlikely(timeout))
19800 @@ -1227,6 +1822,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
19801   */
19802  static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19803  {
19804 +       unsigned long flags;
19805         int ret;
19806  
19807         /*
19808 @@ -1238,10 +1834,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19809                 return 0;
19810  
19811         /*
19812 -        * The mutex has currently no owner. Lock the wait lock and
19813 -        * try to acquire the lock.
19814 +        * The mutex has currently no owner. Lock the wait lock and try to
19815 +        * acquire the lock. We use irqsave here to support early boot calls.
19816          */
19817 -       raw_spin_lock(&lock->wait_lock);
19818 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19819  
19820         ret = try_to_take_rt_mutex(lock, current, NULL);
19821  
19822 @@ -1251,7 +1847,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19823          */
19824         fixup_rt_mutex_waiters(lock);
19825  
19826 -       raw_spin_unlock(&lock->wait_lock);
19827 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19828  
19829         return ret;
19830  }
19831 @@ -1261,9 +1857,13 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19832   * Return whether the current task needs to undo a potential priority boosting.
19833   */
19834  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19835 -                                       struct wake_q_head *wake_q)
19836 +                                       struct wake_q_head *wake_q,
19837 +                                       struct wake_q_head *wake_sleeper_q)
19838  {
19839 -       raw_spin_lock(&lock->wait_lock);
19840 +       unsigned long flags;
19841 +
19842 +       /* irqsave required to support early boot calls */
19843 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19844  
19845         debug_rt_mutex_unlock(lock);
19846  
19847 @@ -1302,10 +1902,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19848          */
19849         while (!rt_mutex_has_waiters(lock)) {
19850                 /* Drops lock->wait_lock ! */
19851 -               if (unlock_rt_mutex_safe(lock) == true)
19852 +               if (unlock_rt_mutex_safe(lock, flags) == true)
19853                         return false;
19854                 /* Relock the rtmutex and try again */
19855 -               raw_spin_lock(&lock->wait_lock);
19856 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
19857         }
19858  
19859         /*
19860 @@ -1314,9 +1914,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19861          *
19862          * Queue the next waiter for wakeup once we release the wait_lock.
19863          */
19864 -       mark_wakeup_next_waiter(wake_q, lock);
19865 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
19866  
19867 -       raw_spin_unlock(&lock->wait_lock);
19868 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19869  
19870         /* check PI boosting */
19871         return true;
19872 @@ -1330,31 +1930,36 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19873   */
19874  static inline int
19875  rt_mutex_fastlock(struct rt_mutex *lock, int state,
19876 +                 struct ww_acquire_ctx *ww_ctx,
19877                   int (*slowfn)(struct rt_mutex *lock, int state,
19878                                 struct hrtimer_sleeper *timeout,
19879 -                               enum rtmutex_chainwalk chwalk))
19880 +                               enum rtmutex_chainwalk chwalk,
19881 +                               struct ww_acquire_ctx *ww_ctx))
19882  {
19883         if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
19884                 rt_mutex_deadlock_account_lock(lock, current);
19885                 return 0;
19886         } else
19887 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
19888 +               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
19889 +                             ww_ctx);
19890  }
19891  
19892  static inline int
19893  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
19894                         struct hrtimer_sleeper *timeout,
19895                         enum rtmutex_chainwalk chwalk,
19896 +                       struct ww_acquire_ctx *ww_ctx,
19897                         int (*slowfn)(struct rt_mutex *lock, int state,
19898                                       struct hrtimer_sleeper *timeout,
19899 -                                     enum rtmutex_chainwalk chwalk))
19900 +                                     enum rtmutex_chainwalk chwalk,
19901 +                                     struct ww_acquire_ctx *ww_ctx))
19902  {
19903         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
19904             likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
19905                 rt_mutex_deadlock_account_lock(lock, current);
19906                 return 0;
19907         } else
19908 -               return slowfn(lock, state, timeout, chwalk);
19909 +               return slowfn(lock, state, timeout, chwalk, ww_ctx);
19910  }
19911  
19912  static inline int
19913 @@ -1371,17 +1976,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
19914  static inline void
19915  rt_mutex_fastunlock(struct rt_mutex *lock,
19916                     bool (*slowfn)(struct rt_mutex *lock,
19917 -                                  struct wake_q_head *wqh))
19918 +                                  struct wake_q_head *wqh,
19919 +                                  struct wake_q_head *wq_sleeper))
19920  {
19921         WAKE_Q(wake_q);
19922 +       WAKE_Q(wake_sleeper_q);
19923  
19924         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
19925                 rt_mutex_deadlock_account_unlock(current);
19926  
19927         } else {
19928 -               bool deboost = slowfn(lock, &wake_q);
19929 +               bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
19930  
19931                 wake_up_q(&wake_q);
19932 +               wake_up_q_sleeper(&wake_sleeper_q);
19933  
19934                 /* Undo pi boosting if necessary: */
19935                 if (deboost)
19936 @@ -1398,7 +2006,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
19937  {
19938         might_sleep();
19939  
19940 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
19941 +       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
19942  }
19943  EXPORT_SYMBOL_GPL(rt_mutex_lock);
19944  
19945 @@ -1415,7 +2023,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
19946  {
19947         might_sleep();
19948  
19949 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
19950 +       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
19951  }
19952  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
19953  
19954 @@ -1428,11 +2036,30 @@ int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
19955         might_sleep();
19956  
19957         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
19958 -                                      RT_MUTEX_FULL_CHAINWALK,
19959 +                                      RT_MUTEX_FULL_CHAINWALK, NULL,
19960                                        rt_mutex_slowlock);
19961  }
19962  
19963  /**
19964 + * rt_mutex_lock_killable - lock a rt_mutex killable
19965 + *
19966 + * @lock:              the rt_mutex to be locked
19967 + * @detect_deadlock:   deadlock detection on/off
19968 + *
19969 + * Returns:
19970 + *  0          on success
19971 + * -EINTR      when interrupted by a signal
19972 + * -EDEADLK    when the lock would deadlock (when deadlock detection is on)
19973 + */
19974 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
19975 +{
19976 +       might_sleep();
19977 +
19978 +       return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
19979 +}
19980 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
19981 +
19982 +/**
19983   * rt_mutex_timed_lock - lock a rt_mutex interruptible
19984   *                     the timeout structure is provided
19985   *                     by the caller
19986 @@ -1452,6 +2079,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
19987  
19988         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
19989                                        RT_MUTEX_MIN_CHAINWALK,
19990 +                                      NULL,
19991                                        rt_mutex_slowlock);
19992  }
19993  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
19994 @@ -1469,7 +2097,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
19995   */
19996  int __sched rt_mutex_trylock(struct rt_mutex *lock)
19997  {
19998 +#ifdef CONFIG_PREEMPT_RT_FULL
19999 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
20000 +#else
20001         if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
20002 +#endif
20003                 return 0;
20004  
20005         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
20006 @@ -1495,13 +2127,14 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
20007   * required or not.
20008   */
20009  bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
20010 -                                  struct wake_q_head *wqh)
20011 +                                  struct wake_q_head *wqh,
20012 +                                  struct wake_q_head *wq_sleeper)
20013  {
20014         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
20015                 rt_mutex_deadlock_account_unlock(current);
20016                 return false;
20017         }
20018 -       return rt_mutex_slowunlock(lock, wqh);
20019 +       return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
20020  }
20021  
20022  /**
20023 @@ -1534,13 +2167,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
20024  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
20025  {
20026         lock->owner = NULL;
20027 -       raw_spin_lock_init(&lock->wait_lock);
20028         lock->waiters = RB_ROOT;
20029         lock->waiters_leftmost = NULL;
20030  
20031         debug_rt_mutex_init(lock, name);
20032  }
20033 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
20034 +EXPORT_SYMBOL(__rt_mutex_init);
20035  
20036  /**
20037   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
20038 @@ -1555,7 +2187,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
20039  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
20040                                 struct task_struct *proxy_owner)
20041  {
20042 -       __rt_mutex_init(lock, NULL);
20043 +       rt_mutex_init(lock);
20044         debug_rt_mutex_proxy_lock(lock, proxy_owner);
20045         rt_mutex_set_owner(lock, proxy_owner);
20046         rt_mutex_deadlock_account_lock(lock, proxy_owner);
20047 @@ -1596,13 +2228,42 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
20048  {
20049         int ret;
20050  
20051 -       raw_spin_lock(&lock->wait_lock);
20052 +       raw_spin_lock_irq(&lock->wait_lock);
20053  
20054         if (try_to_take_rt_mutex(lock, task, NULL)) {
20055 -               raw_spin_unlock(&lock->wait_lock);
20056 +               raw_spin_unlock_irq(&lock->wait_lock);
20057                 return 1;
20058         }
20059  
20060 +#ifdef CONFIG_PREEMPT_RT_FULL
20061 +       /*
20062 +        * In PREEMPT_RT there's an added race.
20063 +        * If the task that we are about to requeue times out, it
20064 +        * can set PI_WAKEUP_INPROGRESS. This tells the requeue
20065 +        * to skip this task. But right after the task sets
20066 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
20067 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
20068 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
20069 +        * lock that it blocks on. We *must not* place this task
20070 +        * on this proxy lock in that case.
20071 +        *
20072 +        * To prevent this race, we first take the task's pi_lock
20073 +        * and check if it has updated its pi_blocked_on. If it has,
20074 +        * we assume that it woke up and we return -EAGAIN.
20075 +        * Otherwise, we set the task's pi_blocked_on to
20076 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
20077 +        * it will know that we are in the process of requeuing it.
20078 +        */
20079 +       raw_spin_lock(&task->pi_lock);
20080 +       if (task->pi_blocked_on) {
20081 +               raw_spin_unlock(&task->pi_lock);
20082 +               raw_spin_unlock_irq(&lock->wait_lock);
20083 +               return -EAGAIN;
20084 +       }
20085 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
20086 +       raw_spin_unlock(&task->pi_lock);
20087 +#endif
20088 +
20089         /* We enforce deadlock detection for futexes */
20090         ret = task_blocks_on_rt_mutex(lock, waiter, task,
20091                                       RT_MUTEX_FULL_CHAINWALK);
20092 @@ -1617,10 +2278,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
20093                 ret = 0;
20094         }
20095  
20096 -       if (unlikely(ret))
20097 +       if (ret && rt_mutex_has_waiters(lock))
20098                 remove_waiter(lock, waiter);
20099  
20100 -       raw_spin_unlock(&lock->wait_lock);
20101 +       raw_spin_unlock_irq(&lock->wait_lock);
20102  
20103         debug_rt_mutex_print_deadlock(waiter);
20104  
20105 @@ -1668,12 +2329,12 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20106  {
20107         int ret;
20108  
20109 -       raw_spin_lock(&lock->wait_lock);
20110 +       raw_spin_lock_irq(&lock->wait_lock);
20111  
20112         set_current_state(TASK_INTERRUPTIBLE);
20113  
20114         /* sleep on the mutex */
20115 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
20116 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
20117  
20118         if (unlikely(ret))
20119                 remove_waiter(lock, waiter);
20120 @@ -1684,7 +2345,93 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20121          */
20122         fixup_rt_mutex_waiters(lock);
20123  
20124 -       raw_spin_unlock(&lock->wait_lock);
20125 +       raw_spin_unlock_irq(&lock->wait_lock);
20126  
20127         return ret;
20128  }
20129 +
20130 +static inline int
20131 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
20132 +{
20133 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
20134 +       unsigned tmp;
20135 +
20136 +       if (ctx->deadlock_inject_countdown-- == 0) {
20137 +               tmp = ctx->deadlock_inject_interval;
20138 +               if (tmp > UINT_MAX/4)
20139 +                       tmp = UINT_MAX;
20140 +               else
20141 +                       tmp = tmp*2 + tmp + tmp/2;
20142 +
20143 +               ctx->deadlock_inject_interval = tmp;
20144 +               ctx->deadlock_inject_countdown = tmp;
20145 +               ctx->contending_lock = lock;
20146 +
20147 +               ww_mutex_unlock(lock);
20148 +
20149 +               return -EDEADLK;
20150 +       }
20151 +#endif
20152 +
20153 +       return 0;
20154 +}
20155 +
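A quick standalone check of how the deadlock-injection interval above grows (next_interval() and the loop bound are illustrative, not part of the patch): each trigger scales the interval by roughly 3.5x, saturating at UINT_MAX once it would exceed UINT_MAX/4.

#include <stdio.h>
#include <limits.h>

static unsigned int next_interval(unsigned int tmp)
{
        if (tmp > UINT_MAX / 4)
                return UINT_MAX;
        return tmp * 2 + tmp + tmp / 2;         /* ~3.5x, same arithmetic as above */
}

int main(void)
{
        unsigned int interval = 1;

        for (int i = 0; i < 8; i++) {
                printf("injection #%d fires after %u acquisitions\n", i, interval);
                interval = next_interval(interval);
        }
        return 0;
}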
20156 +#ifdef CONFIG_PREEMPT_RT_FULL
20157 +int __sched
20158 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
20159 +{
20160 +       int ret;
20161 +
20162 +       might_sleep();
20163 +
20164 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
20165 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
20166 +       if (ret)
20167 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
20168 +       else if (!ret && ww_ctx->acquired > 1)
20169 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
20170 +
20171 +       return ret;
20172 +}
20173 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
20174 +
20175 +int __sched
20176 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
20177 +{
20178 +       int ret;
20179 +
20180 +       might_sleep();
20181 +
20182 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
20183 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
20184 +       if (ret)
20185 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
20186 +       else if (!ret && ww_ctx->acquired > 1)
20187 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
20188 +
20189 +       return ret;
20190 +}
20191 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
20192 +
20193 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
20194 +{
20195 +       int nest = !!lock->ctx;
20196 +
20197 +       /*
20198 +        * The unlocking fastpath is the 0->1 transition from 'locked'
20199 +        * into 'unlocked' state:
20200 +        */
20201 +       if (nest) {
20202 +#ifdef CONFIG_DEBUG_MUTEXES
20203 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
20204 +#endif
20205 +               if (lock->ctx->acquired > 0)
20206 +                       lock->ctx->acquired--;
20207 +               lock->ctx = NULL;
20208 +       }
20209 +
20210 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
20211 +       rt_mutex_unlock(&lock->base.lock);
20212 +}
20213 +EXPORT_SYMBOL(ww_mutex_unlock);
20214 +#endif
20215 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
20216 index 4f5f83c7d2d3..289f062f26cd 100644
20217 --- a/kernel/locking/rtmutex_common.h
20218 +++ b/kernel/locking/rtmutex_common.h
20219 @@ -27,6 +27,7 @@ struct rt_mutex_waiter {
20220         struct rb_node          pi_tree_entry;
20221         struct task_struct      *task;
20222         struct rt_mutex         *lock;
20223 +       bool                    savestate;
20224  #ifdef CONFIG_DEBUG_RT_MUTEXES
20225         unsigned long           ip;
20226         struct pid              *deadlock_task_pid;
20227 @@ -97,6 +98,9 @@ enum rtmutex_chainwalk {
20228  /*
20229   * PI-futex support (proxy locking functions, etc.):
20230   */
20231 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
20232 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
20233 +
20234  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
20235  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
20236                                        struct task_struct *proxy_owner);
20237 @@ -110,7 +114,8 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20238                                       struct rt_mutex_waiter *waiter);
20239  extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
20240  extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
20241 -                                 struct wake_q_head *wqh);
20242 +                                 struct wake_q_head *wqh,
20243 +                                 struct wake_q_head *wq_sleeper);
20244  extern void rt_mutex_adjust_prio(struct task_struct *task);
20245  
20246  #ifdef CONFIG_DEBUG_RT_MUTEXES
20247 @@ -119,4 +124,14 @@ extern void rt_mutex_adjust_prio(struct task_struct *task);
20248  # include "rtmutex.h"
20249  #endif
20250  
20251 +static inline void
20252 +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
20253 +{
20254 +       debug_rt_mutex_init_waiter(waiter);
20255 +       waiter->task = NULL;
20256 +       waiter->savestate = savestate;
20257 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
20258 +       RB_CLEAR_NODE(&waiter->tree_entry);
20259 +}
20260 +
20261  #endif
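A userspace sketch of the sentinel-pointer convention behind PI_WAKEUP_INPROGRESS and PI_REQUEUE_INPROGRESS above: small integers cast to waiter pointers mark "wakeup/requeue in flight" states in pi_blocked_on, and a predicate in the style of rt_mutex_real_waiter() filters them out (its exact body here is an assumption for illustration).

#include <stdio.h>

struct rt_mutex_waiter;                         /* opaque in this sketch */

#define PI_WAKEUP_INPROGRESS    ((struct rt_mutex_waiter *) 1)
#define PI_REQUEUE_INPROGRESS   ((struct rt_mutex_waiter *) 2)

/* Only a pointer outside the sentinel values refers to a waiter that is
 * really enqueued on a lock. */
static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
{
        return waiter != NULL && waiter != PI_WAKEUP_INPROGRESS &&
               waiter != PI_REQUEUE_INPROGRESS;
}

int main(void)
{
        struct rt_mutex_waiter *real = (struct rt_mutex_waiter *) 0x1000;

        printf("%d %d %d %d\n",
               rt_mutex_real_waiter(NULL),                  /* 0: not blocked at all */
               rt_mutex_real_waiter(PI_WAKEUP_INPROGRESS),  /* 0: wakeup in flight */
               rt_mutex_real_waiter(PI_REQUEUE_INPROGRESS), /* 0: requeue in flight */
               rt_mutex_real_waiter(real));                 /* 1: genuine waiter */
        return 0;
}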
20262 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
20263 index db3ccb1dd614..909779647bd1 100644
20264 --- a/kernel/locking/spinlock.c
20265 +++ b/kernel/locking/spinlock.c
20266 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
20267   *         __[spin|read|write]_lock_bh()
20268   */
20269  BUILD_LOCK_OPS(spin, raw_spinlock);
20270 +
20271 +#ifndef CONFIG_PREEMPT_RT_FULL
20272  BUILD_LOCK_OPS(read, rwlock);
20273  BUILD_LOCK_OPS(write, rwlock);
20274 +#endif
20275  
20276  #endif
20277  
20278 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
20279  EXPORT_SYMBOL(_raw_spin_unlock_bh);
20280  #endif
20281  
20282 +#ifndef CONFIG_PREEMPT_RT_FULL
20283 +
20284  #ifndef CONFIG_INLINE_READ_TRYLOCK
20285  int __lockfunc _raw_read_trylock(rwlock_t *lock)
20286  {
20287 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
20288  EXPORT_SYMBOL(_raw_write_unlock_bh);
20289  #endif
20290  
20291 +#endif /* !PREEMPT_RT_FULL */
20292 +
20293  #ifdef CONFIG_DEBUG_LOCK_ALLOC
20294  
20295  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
20296 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
20297 index 0374a596cffa..94970338d518 100644
20298 --- a/kernel/locking/spinlock_debug.c
20299 +++ b/kernel/locking/spinlock_debug.c
20300 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
20301  
20302  EXPORT_SYMBOL(__raw_spin_lock_init);
20303  
20304 +#ifndef CONFIG_PREEMPT_RT_FULL
20305  void __rwlock_init(rwlock_t *lock, const char *name,
20306                    struct lock_class_key *key)
20307  {
20308 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
20309  }
20310  
20311  EXPORT_SYMBOL(__rwlock_init);
20312 +#endif
20313  
20314  static void spin_dump(raw_spinlock_t *lock, const char *msg)
20315  {
20316 @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
20317         arch_spin_unlock(&lock->raw_lock);
20318  }
20319  
20320 +#ifndef CONFIG_PREEMPT_RT_FULL
20321  static void rwlock_bug(rwlock_t *lock, const char *msg)
20322  {
20323         if (!debug_locks_off())
20324 @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock)
20325         debug_write_unlock(lock);
20326         arch_write_unlock(&lock->raw_lock);
20327  }
20328 +
20329 +#endif
20330 diff --git a/kernel/panic.c b/kernel/panic.c
20331 index 41e2b54f36b5..3535f802953a 100644
20332 --- a/kernel/panic.c
20333 +++ b/kernel/panic.c
20334 @@ -61,6 +61,37 @@ void __weak panic_smp_self_stop(void)
20335                 cpu_relax();
20336  }
20337  
20338 +/*
20339 + * Stop ourselves in NMI context if another CPU has already panicked. Arch code
20340 + * may override this to prepare for crash dumping, e.g. save regs info.
20341 + */
20342 +void __weak nmi_panic_self_stop(struct pt_regs *regs)
20343 +{
20344 +       panic_smp_self_stop();
20345 +}
20346 +
20347 +atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
20348 +
20349 +/*
20350 + * A variant of panic() called from NMI context. We return if we've already
20351 + * panicked on this CPU. If another CPU already panicked, loop in
20352 + * nmi_panic_self_stop() which can provide architecture dependent code such
20353 + * as saving register state for crash dump.
20354 + */
20355 +void nmi_panic(struct pt_regs *regs, const char *msg)
20356 +{
20357 +       int old_cpu, cpu;
20358 +
20359 +       cpu = raw_smp_processor_id();
20360 +       old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
20361 +
20362 +       if (old_cpu == PANIC_CPU_INVALID)
20363 +               panic("%s", msg);
20364 +       else if (old_cpu != cpu)
20365 +               nmi_panic_self_stop(regs);
20366 +}
20367 +EXPORT_SYMBOL(nmi_panic);
20368 +
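A self-contained C11 model of the panic_cpu election that nmi_panic() and panic() rely on above (stdatomic stands in for the kernel's atomic_t; try_become_panicker() and the CPU numbers are illustrative): the first CPU to swing panic_cpu away from PANIC_CPU_INVALID proceeds, a re-entry on the same CPU also proceeds, and every other CPU backs off.

#include <stdatomic.h>
#include <stdio.h>

#define PANIC_CPU_INVALID       -1

static atomic_int panic_cpu = PANIC_CPU_INVALID;

/* Returns 1 if this CPU may carry out the panic, 0 if another CPU already
 * owns it (such a caller would spin in nmi_panic_self_stop() or
 * panic_smp_self_stop() in the patch above). */
static int try_become_panicker(int cpu)
{
        int expected = PANIC_CPU_INVALID;

        if (atomic_compare_exchange_strong(&panic_cpu, &expected, cpu))
                return 1;               /* first CPU to get here */
        return expected == cpu;         /* nested panic on the owning CPU */
}

int main(void)
{
        printf("cpu0: %d\n", try_become_panicker(0));   /* 1: wins the election */
        printf("cpu1: %d\n", try_become_panicker(1));   /* 0: must stop itself */
        printf("cpu0: %d\n", try_become_panicker(0));   /* 1: same CPU may continue */
        return 0;
}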
20369  /**
20370   *     panic - halt the system
20371   *     @fmt: The text string to print
20372 @@ -71,17 +102,17 @@ void __weak panic_smp_self_stop(void)
20373   */
20374  void panic(const char *fmt, ...)
20375  {
20376 -       static DEFINE_SPINLOCK(panic_lock);
20377         static char buf[1024];
20378         va_list args;
20379         long i, i_next = 0;
20380         int state = 0;
20381 +       int old_cpu, this_cpu;
20382  
20383         /*
20384          * Disable local interrupts. This will prevent panic_smp_self_stop
20385          * from deadlocking the first cpu that invokes the panic, since
20386          * there is nothing to prevent an interrupt handler (that runs
20387 -        * after the panic_lock is acquired) from invoking panic again.
20388 +        * after setting panic_cpu) from invoking panic() again.
20389          */
20390         local_irq_disable();
20391  
20392 @@ -94,8 +125,16 @@ void panic(const char *fmt, ...)
20393          * multiple parallel invocations of panic, all other CPUs either
20394          * stop themself or will wait until they are stopped by the 1st CPU
20395          * with smp_send_stop().
20396 +        *
20397 +        * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
20398 +        * comes here, so go ahead.
20399 +        * `old_cpu == this_cpu' means we came from nmi_panic() which sets
20400 +        * panic_cpu to this CPU.  In this case, this is also the 1st CPU.
20401          */
20402 -       if (!spin_trylock(&panic_lock))
20403 +       this_cpu = raw_smp_processor_id();
20404 +       old_cpu  = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
20405 +
20406 +       if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
20407                 panic_smp_self_stop();
20408  
20409         console_verbose();
20410 @@ -400,9 +439,11 @@ static u64 oops_id;
20411  
20412  static int init_oops_id(void)
20413  {
20414 +#ifndef CONFIG_PREEMPT_RT_FULL
20415         if (!oops_id)
20416                 get_random_bytes(&oops_id, sizeof(oops_id));
20417         else
20418 +#endif
20419                 oops_id++;
20420  
20421         return 0;
20422 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
20423 index 3124cebaec31..c1b981521dd0 100644
20424 --- a/kernel/power/hibernate.c
20425 +++ b/kernel/power/hibernate.c
20426 @@ -285,6 +285,8 @@ static int create_image(int platform_mode)
20427  
20428         local_irq_disable();
20429  
20430 +       system_state = SYSTEM_SUSPEND;
20431 +
20432         error = syscore_suspend();
20433         if (error) {
20434                 printk(KERN_ERR "PM: Some system devices failed to power down, "
20435 @@ -314,6 +316,7 @@ static int create_image(int platform_mode)
20436         syscore_resume();
20437  
20438   Enable_irqs:
20439 +       system_state = SYSTEM_RUNNING;
20440         local_irq_enable();
20441  
20442   Enable_cpus:
20443 @@ -438,6 +441,7 @@ static int resume_target_kernel(bool platform_mode)
20444                 goto Enable_cpus;
20445  
20446         local_irq_disable();
20447 +       system_state = SYSTEM_SUSPEND;
20448  
20449         error = syscore_suspend();
20450         if (error)
20451 @@ -471,6 +475,7 @@ static int resume_target_kernel(bool platform_mode)
20452         syscore_resume();
20453  
20454   Enable_irqs:
20455 +       system_state = SYSTEM_RUNNING;
20456         local_irq_enable();
20457  
20458   Enable_cpus:
20459 @@ -556,6 +561,7 @@ int hibernation_platform_enter(void)
20460                 goto Enable_cpus;
20461  
20462         local_irq_disable();
20463 +       system_state = SYSTEM_SUSPEND;
20464         syscore_suspend();
20465         if (pm_wakeup_pending()) {
20466                 error = -EAGAIN;
20467 @@ -568,6 +574,7 @@ int hibernation_platform_enter(void)
20468  
20469   Power_up:
20470         syscore_resume();
20471 +       system_state = SYSTEM_RUNNING;
20472         local_irq_enable();
20473  
20474   Enable_cpus:
20475 @@ -642,6 +649,10 @@ static void power_down(void)
20476                 cpu_relax();
20477  }
20478  
20479 +#ifndef CONFIG_SUSPEND
20480 +bool pm_in_action;
20481 +#endif
20482 +
20483  /**
20484   * hibernate - Carry out system hibernation, including saving the image.
20485   */
20486 @@ -654,6 +665,8 @@ int hibernate(void)
20487                 return -EPERM;
20488         }
20489  
20490 +       pm_in_action = true;
20491 +
20492         lock_system_sleep();
20493         /* The snapshot device should not be opened while we're running */
20494         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
20495 @@ -719,6 +732,7 @@ int hibernate(void)
20496         atomic_inc(&snapshot_device_available);
20497   Unlock:
20498         unlock_system_sleep();
20499 +       pm_in_action = false;
20500         return error;
20501  }
20502  
20503 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
20504 index f9fe133c13e2..393bc342c586 100644
20505 --- a/kernel/power/suspend.c
20506 +++ b/kernel/power/suspend.c
20507 @@ -359,6 +359,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20508         arch_suspend_disable_irqs();
20509         BUG_ON(!irqs_disabled());
20510  
20511 +       system_state = SYSTEM_SUSPEND;
20512 +
20513         error = syscore_suspend();
20514         if (!error) {
20515                 *wakeup = pm_wakeup_pending();
20516 @@ -375,6 +377,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20517                 syscore_resume();
20518         }
20519  
20520 +       system_state = SYSTEM_RUNNING;
20521 +
20522         arch_suspend_enable_irqs();
20523         BUG_ON(irqs_disabled());
20524  
20525 @@ -518,6 +522,8 @@ static int enter_state(suspend_state_t state)
20526         return error;
20527  }
20528  
20529 +bool pm_in_action;
20530 +
20531  /**
20532   * pm_suspend - Externally visible function for suspending the system.
20533   * @state: System sleep state to enter.
20534 @@ -532,6 +538,8 @@ int pm_suspend(suspend_state_t state)
20535         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
20536                 return -EINVAL;
20537  
20538 +       pm_in_action = true;
20539 +
20540         error = enter_state(state);
20541         if (error) {
20542                 suspend_stats.fail++;
20543 @@ -539,6 +547,7 @@ int pm_suspend(suspend_state_t state)
20544         } else {
20545                 suspend_stats.success++;
20546         }
20547 +       pm_in_action = false;
20548         return error;
20549  }
20550  EXPORT_SYMBOL(pm_suspend);
20551 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
20552 index c048e34b177f..c747bdfa199e 100644
20553 --- a/kernel/printk/printk.c
20554 +++ b/kernel/printk/printk.c
20555 @@ -241,6 +241,65 @@ struct printk_log {
20556   */
20557  static DEFINE_RAW_SPINLOCK(logbuf_lock);
20558  
20559 +#ifdef CONFIG_EARLY_PRINTK
20560 +struct console *early_console;
20561 +
20562 +static void early_vprintk(const char *fmt, va_list ap)
20563 +{
20564 +       if (early_console) {
20565 +               char buf[512];
20566 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
20567 +
20568 +               early_console->write(early_console, buf, n);
20569 +       }
20570 +}
20571 +
20572 +asmlinkage void early_printk(const char *fmt, ...)
20573 +{
20574 +       va_list ap;
20575 +
20576 +       va_start(ap, fmt);
20577 +       early_vprintk(fmt, ap);
20578 +       va_end(ap);
20579 +}
20580 +
20581 +/*
20582 + * This is independent of any log levels - a global
20583 + * kill switch that turns off all of printk.
20584 + *
20585 + * Used by the NMI watchdog if early-printk is enabled.
20586 + */
20587 +static bool __read_mostly printk_killswitch;
20588 +
20589 +static int __init force_early_printk_setup(char *str)
20590 +{
20591 +       printk_killswitch = true;
20592 +       return 0;
20593 +}
20594 +early_param("force_early_printk", force_early_printk_setup);
20595 +
20596 +void printk_kill(void)
20597 +{
20598 +       printk_killswitch = true;
20599 +}
20600 +
20601 +#ifdef CONFIG_PRINTK
20602 +static int forced_early_printk(const char *fmt, va_list ap)
20603 +{
20604 +       if (!printk_killswitch)
20605 +               return 0;
20606 +       early_vprintk(fmt, ap);
20607 +       return 1;
20608 +}
20609 +#endif
20610 +
20611 +#else
20612 +static inline int forced_early_printk(const char *fmt, va_list ap)
20613 +{
20614 +       return 0;
20615 +}
20616 +#endif
20617 +
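A standalone sketch of the early_vprintk() path added above, with stderr standing in for early_console->write() and the truncation clamp mimicking vscnprintf() (all userspace names here are stand-ins, not the kernel API):

#include <stdarg.h>
#include <stdio.h>

static void early_console_write(const char *buf, int n)
{
        fwrite(buf, 1, (size_t)n, stderr);      /* stand-in for con->write() */
}

static void early_vprintk(const char *fmt, va_list ap)
{
        char buf[512];
        int n = vsnprintf(buf, sizeof(buf), fmt, ap);

        if (n > (int)sizeof(buf) - 1)
                n = (int)sizeof(buf) - 1;       /* vscnprintf-style clamp */
        if (n > 0)
                early_console_write(buf, n);
}

static void early_printk(const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        early_vprintk(fmt, ap);
        va_end(ap);
}

int main(void)
{
        early_printk("hwlat: %d usec spike\n", 42);
        return 0;
}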
20618  #ifdef CONFIG_PRINTK
20619  DECLARE_WAIT_QUEUE_HEAD(log_wait);
20620  /* the next printk record to read by syslog(READ) or /proc/kmsg */
20621 @@ -1203,6 +1262,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20622  {
20623         char *text;
20624         int len = 0;
20625 +       int attempts = 0;
20626  
20627         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
20628         if (!text)
20629 @@ -1214,7 +1274,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20630                 u64 seq;
20631                 u32 idx;
20632                 enum log_flags prev;
20633 -
20634 +               int num_msg;
20635 +try_again:
20636 +               attempts++;
20637 +               if (attempts > 10) {
20638 +                       len = -EBUSY;
20639 +                       goto out;
20640 +               }
20641 +               num_msg = 0;
20642                 if (clear_seq < log_first_seq) {
20643                         /* messages are gone, move to first available one */
20644                         clear_seq = log_first_seq;
20645 @@ -1235,6 +1302,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20646                         prev = msg->flags;
20647                         idx = log_next(idx);
20648                         seq++;
20649 +                       num_msg++;
20650 +                       if (num_msg > 5) {
20651 +                               num_msg = 0;
20652 +                               raw_spin_unlock_irq(&logbuf_lock);
20653 +                               raw_spin_lock_irq(&logbuf_lock);
20654 +                               if (clear_seq < log_first_seq)
20655 +                                       goto try_again;
20656 +                       }
20657                 }
20658  
20659                 /* move first record forward until length fits into the buffer */
20660 @@ -1248,6 +1323,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20661                         prev = msg->flags;
20662                         idx = log_next(idx);
20663                         seq++;
20664 +                       num_msg++;
20665 +                       if (num_msg > 5) {
20666 +                               num_msg = 0;
20667 +                               raw_spin_unlock_irq(&logbuf_lock);
20668 +                               raw_spin_lock_irq(&logbuf_lock);
20669 +                               if (clear_seq < log_first_seq)
20670 +                                       goto try_again;
20671 +                       }
20672                 }
20673  
20674                 /* last message fitting into this dump */
20675 @@ -1288,6 +1371,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20676                 clear_seq = log_next_seq;
20677                 clear_idx = log_next_idx;
20678         }
20679 +out:
20680         raw_spin_unlock_irq(&logbuf_lock);
20681  
20682         kfree(text);
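A minimal pthreads sketch of the lock-break pattern syslog_print_all() gains above: drop and retake the buffer lock every few records so the lock-hold time stays bounded, restart if the writer overtook the reader, and give up after 10 attempts (the mutex, first_seq bookkeeping, and record loop are stand-ins, not the kernel's data structures):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long first_seq;         /* oldest record still in the buffer */

static int copy_all_records(unsigned long start_seq, unsigned long end_seq)
{
        int attempts = 0;

again:
        if (++attempts > 10)
                return -1;                              /* -EBUSY in the patch above */

        pthread_mutex_lock(&buf_lock);
        for (unsigned long seq = start_seq, n = 0; seq < end_seq; seq++) {
                /* ... format one record here ... */
                if (++n == 5) {                         /* lock break every 5 records */
                        n = 0;
                        pthread_mutex_unlock(&buf_lock);
                        pthread_mutex_lock(&buf_lock);
                        if (first_seq > start_seq) {    /* writer moved past us */
                                pthread_mutex_unlock(&buf_lock);
                                goto again;
                        }
                }
        }
        pthread_mutex_unlock(&buf_lock);
        return 0;
}

int main(void)
{
        printf("%d\n", copy_all_records(0, 20));
        return 0;
}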
20683 @@ -1443,6 +1527,12 @@ static void call_console_drivers(int level,
20684         if (!console_drivers)
20685                 return;
20686  
20687 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20688 +               if (in_irq() || in_nmi())
20689 +                       return;
20690 +       }
20691 +
20692 +       migrate_disable();
20693         for_each_console(con) {
20694                 if (exclusive_console && con != exclusive_console)
20695                         continue;
20696 @@ -1458,6 +1548,7 @@ static void call_console_drivers(int level,
20697                 else
20698                         con->write(con, text, len);
20699         }
20700 +       migrate_enable();
20701  }
20702  
20703  /*
20704 @@ -1518,6 +1609,15 @@ static inline int can_use_console(unsigned int cpu)
20705  static int console_trylock_for_printk(void)
20706  {
20707         unsigned int cpu = smp_processor_id();
20708 +#ifdef CONFIG_PREEMPT_RT_FULL
20709 +       int lock = !early_boot_irqs_disabled && (preempt_count() == 0) &&
20710 +               !irqs_disabled();
20711 +#else
20712 +       int lock = 1;
20713 +#endif
20714 +
20715 +       if (!lock)
20716 +               return 0;
20717  
20718         if (!console_trylock())
20719                 return 0;
20720 @@ -1672,6 +1772,13 @@ asmlinkage int vprintk_emit(int facility, int level,
20721         /* cpu currently holding logbuf_lock in this function */
20722         static unsigned int logbuf_cpu = UINT_MAX;
20723  
20724 +       /*
20725 +        * Fall back to early_printk if a debugging subsystem has
20726 +        * killed printk output
20727 +        */
20728 +       if (unlikely(forced_early_printk(fmt, args)))
20729 +               return 1;
20730 +
20731         if (level == LOGLEVEL_SCHED) {
20732                 level = LOGLEVEL_DEFAULT;
20733                 in_sched = true;
20734 @@ -1813,8 +1920,7 @@ asmlinkage int vprintk_emit(int facility, int level,
20735                  * console_sem which would prevent anyone from printing to
20736                  * console
20737                  */
20738 -               preempt_disable();
20739 -
20740 +               migrate_disable();
20741                 /*
20742                  * Try to acquire and then immediately release the console
20743                  * semaphore.  The release will print out buffers and wake up
20744 @@ -1822,7 +1928,7 @@ asmlinkage int vprintk_emit(int facility, int level,
20745                  */
20746                 if (console_trylock_for_printk())
20747                         console_unlock();
20748 -               preempt_enable();
20749 +               migrate_enable();
20750                 lockdep_on();
20751         }
20752  
20753 @@ -1961,26 +2067,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
20754  
20755  #endif /* CONFIG_PRINTK */
20756  
20757 -#ifdef CONFIG_EARLY_PRINTK
20758 -struct console *early_console;
20759 -
20760 -asmlinkage __visible void early_printk(const char *fmt, ...)
20761 -{
20762 -       va_list ap;
20763 -       char buf[512];
20764 -       int n;
20765 -
20766 -       if (!early_console)
20767 -               return;
20768 -
20769 -       va_start(ap, fmt);
20770 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
20771 -       va_end(ap);
20772 -
20773 -       early_console->write(early_console, buf, n);
20774 -}
20775 -#endif
20776 -
20777  static int __add_preferred_console(char *name, int idx, char *options,
20778                                    char *brl_options)
20779  {
20780 @@ -2202,11 +2288,16 @@ static void console_cont_flush(char *text, size_t size)
20781                 goto out;
20782  
20783         len = cont_print_text(text, size);
20784 +#ifdef CONFIG_PREEMPT_RT_FULL
20785 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
20786 +       call_console_drivers(cont.level, NULL, 0, text, len);
20787 +#else
20788         raw_spin_unlock(&logbuf_lock);
20789         stop_critical_timings();
20790         call_console_drivers(cont.level, NULL, 0, text, len);
20791         start_critical_timings();
20792         local_irq_restore(flags);
20793 +#endif
20794         return;
20795  out:
20796         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
20797 @@ -2316,13 +2407,17 @@ skip:
20798                 console_idx = log_next(console_idx);
20799                 console_seq++;
20800                 console_prev = msg->flags;
20801 +#ifdef CONFIG_PREEMPT_RT_FULL
20802 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
20803 +               call_console_drivers(level, ext_text, ext_len, text, len);
20804 +#else
20805                 raw_spin_unlock(&logbuf_lock);
20806  
20807                 stop_critical_timings();        /* don't trace print latency */
20808                 call_console_drivers(level, ext_text, ext_len, text, len);
20809                 start_critical_timings();
20810                 local_irq_restore(flags);
20811 -
20812 +#endif
20813                 if (do_cond_resched)
20814                         cond_resched();
20815         }
20816 @@ -2374,6 +2469,11 @@ void console_unblank(void)
20817  {
20818         struct console *c;
20819  
20820 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20821 +               if (in_irq() || in_nmi())
20822 +                       return;
20823 +       }
20824 +
20825         /*
20826          * console_unblank can no longer be called in interrupt context unless
20827          * oops_in_progress is set to 1..
20828 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
20829 index 3189e51db7e8..1004af706be7 100644
20830 --- a/kernel/ptrace.c
20831 +++ b/kernel/ptrace.c
20832 @@ -129,7 +129,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
20833  
20834         spin_lock_irq(&task->sighand->siglock);
20835         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
20836 -               task->state = __TASK_TRACED;
20837 +               unsigned long flags;
20838 +
20839 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
20840 +               if (task->state & __TASK_TRACED)
20841 +                       task->state = __TASK_TRACED;
20842 +               else
20843 +                       task->saved_state = __TASK_TRACED;
20844 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20845                 ret = true;
20846         }
20847         spin_unlock_irq(&task->sighand->siglock);
20848 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
20849 index d89328e260df..5bb3364a6284 100644
20850 --- a/kernel/rcu/rcutorture.c
20851 +++ b/kernel/rcu/rcutorture.c
20852 @@ -390,6 +390,7 @@ static struct rcu_torture_ops rcu_ops = {
20853         .name           = "rcu"
20854  };
20855  
20856 +#ifndef CONFIG_PREEMPT_RT_FULL
20857  /*
20858   * Definitions for rcu_bh torture testing.
20859   */
20860 @@ -429,6 +430,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
20861         .name           = "rcu_bh"
20862  };
20863  
20864 +#else
20865 +static struct rcu_torture_ops rcu_bh_ops = {
20866 +       .ttype          = INVALID_RCU_FLAVOR,
20867 +};
20868 +#endif
20869 +
20870  /*
20871   * Don't even think about trying any of these in real life!!!
20872   * The names include "busted", and they really mean it!
20873 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
20874 index f07343b54fe5..d862a203fce0 100644
20875 --- a/kernel/rcu/tree.c
20876 +++ b/kernel/rcu/tree.c
20877 @@ -56,6 +56,11 @@
20878  #include <linux/random.h>
20879  #include <linux/trace_events.h>
20880  #include <linux/suspend.h>
20881 +#include <linux/delay.h>
20882 +#include <linux/gfp.h>
20883 +#include <linux/oom.h>
20884 +#include <linux/smpboot.h>
20885 +#include "../time/tick-internal.h"
20886  
20887  #include "tree.h"
20888  #include "rcu.h"
20889 @@ -266,6 +271,19 @@ void rcu_sched_qs(void)
20890         }
20891  }
20892  
20893 +#ifdef CONFIG_PREEMPT_RT_FULL
20894 +static void rcu_preempt_qs(void);
20895 +
20896 +void rcu_bh_qs(void)
20897 +{
20898 +       unsigned long flags;
20899 +
20900 +       /* rcu_preempt_qs() must be called with irqs disabled, hence save/restore. */
20901 +       local_irq_save(flags);
20902 +       rcu_preempt_qs();
20903 +       local_irq_restore(flags);
20904 +}
20905 +#else
20906  void rcu_bh_qs(void)
20907  {
20908         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
20909 @@ -275,6 +293,7 @@ void rcu_bh_qs(void)
20910                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
20911         }
20912  }
20913 +#endif
20914  
20915  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
20916  
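
With CONFIG_PREEMPT_RT_FULL the BH flavor is folded into preemptible RCU: rcu_bh_qs() above simply reports an RCU-preempt quiescent state. Conceptually a BH read side then looks like the sketch below; the example_* wrappers are invented, and the real mapping lives in the rcupdate.h changes of this patch, not in this file:

static inline void example_rcu_read_lock_bh_rt(void)
{
        local_bh_disable();     /* still serializes against softirq work */
        rcu_read_lock();        /* actual protection comes from RCU-preempt */
}

static inline void example_rcu_read_unlock_bh_rt(void)
{
        rcu_read_unlock();
        local_bh_enable();
}
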
20917 @@ -435,11 +454,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
20918  /*
20919   * Return the number of RCU BH batches started thus far for debug & stats.
20920   */
20921 +#ifndef CONFIG_PREEMPT_RT_FULL
20922  unsigned long rcu_batches_started_bh(void)
20923  {
20924         return rcu_bh_state.gpnum;
20925  }
20926  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
20927 +#endif
20928  
20929  /*
20930   * Return the number of RCU batches completed thus far for debug & stats.
20931 @@ -459,6 +480,7 @@ unsigned long rcu_batches_completed_sched(void)
20932  }
20933  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
20934  
20935 +#ifndef CONFIG_PREEMPT_RT_FULL
20936  /*
20937   * Return the number of RCU BH batches completed thus far for debug & stats.
20938   */
20939 @@ -486,6 +508,13 @@ void rcu_bh_force_quiescent_state(void)
20940  }
20941  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
20942  
20943 +#else
20944 +void rcu_force_quiescent_state(void)
20945 +{
20946 +}
20947 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
20948 +#endif
20949 +
20950  /*
20951   * Force a quiescent state for RCU-sched.
20952   */
20953 @@ -536,9 +565,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
20954         case RCU_FLAVOR:
20955                 rsp = rcu_state_p;
20956                 break;
20957 +#ifndef CONFIG_PREEMPT_RT_FULL
20958         case RCU_BH_FLAVOR:
20959                 rsp = &rcu_bh_state;
20960                 break;
20961 +#endif
20962         case RCU_SCHED_FLAVOR:
20963                 rsp = &rcu_sched_state;
20964                 break;
20965 @@ -1590,7 +1621,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
20966         int needmore;
20967         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
20968  
20969 -       rcu_nocb_gp_cleanup(rsp, rnp);
20970         rnp->need_future_gp[c & 0x1] = 0;
20971         needmore = rnp->need_future_gp[(c + 1) & 0x1];
20972         trace_rcu_future_gp(rnp, rdp, c,
20973 @@ -1611,7 +1641,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
20974             !READ_ONCE(rsp->gp_flags) ||
20975             !rsp->gp_kthread)
20976                 return;
20977 -       wake_up(&rsp->gp_wq);
20978 +       swake_up(&rsp->gp_wq);
20979  }
20980  
20981  /*
20982 @@ -1991,6 +2021,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
20983         int nocb = 0;
20984         struct rcu_data *rdp;
20985         struct rcu_node *rnp = rcu_get_root(rsp);
20986 +       struct swait_queue_head *sq;
20987  
20988         WRITE_ONCE(rsp->gp_activity, jiffies);
20989         raw_spin_lock_irq(&rnp->lock);
20990 @@ -2029,7 +2060,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
20991                         needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
20992                 /* smp_mb() provided by prior unlock-lock pair. */
20993                 nocb += rcu_future_gp_cleanup(rsp, rnp);
20994 +               sq = rcu_nocb_gp_get(rnp);
20995                 raw_spin_unlock_irq(&rnp->lock);
20996 +               rcu_nocb_gp_cleanup(sq);
20997                 cond_resched_rcu_qs();
20998                 WRITE_ONCE(rsp->gp_activity, jiffies);
20999                 rcu_gp_slow(rsp, gp_cleanup_delay);
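
rcu_nocb_gp_cleanup() used to be called with the rcu_node lock held; it now receives the swait queue that was looked up under the lock and performs the wakeup after the lock is dropped, so waking nocb kthreads can no longer lengthen the lock hold time. The same shape in isolation (example_cleanup_node() is a sketch, not the tree.c code):

static void example_cleanup_node(struct rcu_node *rnp)
{
        struct swait_queue_head *sq;

        raw_spin_lock_irq(&rnp->lock);
        /* ... per-node grace-period bookkeeping under the lock ... */
        sq = rcu_nocb_gp_get(rnp);      /* just returns &rnp->nocb_gp_wq[...] */
        raw_spin_unlock_irq(&rnp->lock);

        rcu_nocb_gp_cleanup(sq);        /* swake_up_all() outside the lock */
}
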
21000 @@ -2076,7 +2109,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
21001                                                READ_ONCE(rsp->gpnum),
21002                                                TPS("reqwait"));
21003                         rsp->gp_state = RCU_GP_WAIT_GPS;
21004 -                       wait_event_interruptible(rsp->gp_wq,
21005 +                       swait_event_interruptible(rsp->gp_wq,
21006                                                  READ_ONCE(rsp->gp_flags) &
21007                                                  RCU_GP_FLAG_INIT);
21008                         rsp->gp_state = RCU_GP_DONE_GPS;
21009 @@ -2106,7 +2139,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
21010                                                READ_ONCE(rsp->gpnum),
21011                                                TPS("fqswait"));
21012                         rsp->gp_state = RCU_GP_WAIT_FQS;
21013 -                       ret = wait_event_interruptible_timeout(rsp->gp_wq,
21014 +                       ret = swait_event_interruptible_timeout(rsp->gp_wq,
21015                                         rcu_gp_fqs_check_wake(rsp, &gf), j);
21016                         rsp->gp_state = RCU_GP_DOING_FQS;
21017                         /* Locking provides needed memory barriers. */
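
These hunks switch the grace-period kthread from wait_queue_head_t to the simple-wait (swait) variant, whose head is protected by a raw spinlock so that swake_up() remains usable from hard-irq and other truly atomic contexts on RT. A minimal usage sketch of the swait API as used by this patch; all example_* names are invented:

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(example_wq);
static bool example_cond;

static int example_waiter(void *unused)
{
        /* sleeps until example_cond is observed true */
        return swait_event_interruptible(example_wq, READ_ONCE(example_cond));
}

static void example_waker(void)
{
        WRITE_ONCE(example_cond, true);
        swake_up(&example_wq);          /* one waiter; swake_up_all() for all */
}
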
21018 @@ -2230,7 +2263,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
21019         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
21020         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
21021         raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
21022 -       rcu_gp_kthread_wake(rsp);
21023 +       swake_up(&rsp->gp_wq);  /* Memory barrier implied by swake_up() path. */
21024  }
21025  
21026  /*
21027 @@ -2891,7 +2924,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
21028         }
21029         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
21030         raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
21031 -       rcu_gp_kthread_wake(rsp);
21032 +       swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
21033  }
21034  
21035  /*
21036 @@ -2934,18 +2967,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
21037  /*
21038   * Do RCU core processing for the current CPU.
21039   */
21040 -static void rcu_process_callbacks(struct softirq_action *unused)
21041 +static void rcu_process_callbacks(void)
21042  {
21043         struct rcu_state *rsp;
21044  
21045         if (cpu_is_offline(smp_processor_id()))
21046                 return;
21047 -       trace_rcu_utilization(TPS("Start RCU core"));
21048         for_each_rcu_flavor(rsp)
21049                 __rcu_process_callbacks(rsp);
21050 -       trace_rcu_utilization(TPS("End RCU core"));
21051  }
21052  
21053 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21054  /*
21055   * Schedule RCU callback invocation.  If the specified type of RCU
21056   * does not support RCU priority boosting, just do a direct call,
21057 @@ -2957,18 +2989,105 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
21058  {
21059         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
21060                 return;
21061 -       if (likely(!rsp->boost)) {
21062 -               rcu_do_batch(rsp, rdp);
21063 -               return;
21064 -       }
21065 -       invoke_rcu_callbacks_kthread();
21066 +       rcu_do_batch(rsp, rdp);
21067  }
21068  
21069 +static void rcu_wake_cond(struct task_struct *t, int status)
21070 +{
21071 +       /*
21072 +        * If the thread is yielding, only wake it when this
21073 +        * is invoked from idle
21074 +        */
21075 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
21076 +               wake_up_process(t);
21077 +}
21078 +
21079 +/*
21080 + * Wake up this CPU's rcuc kthread to do RCU core processing.
21081 + */
21082  static void invoke_rcu_core(void)
21083  {
21084 -       if (cpu_online(smp_processor_id()))
21085 -               raise_softirq(RCU_SOFTIRQ);
21086 +       unsigned long flags;
21087 +       struct task_struct *t;
21088 +
21089 +       if (!cpu_online(smp_processor_id()))
21090 +               return;
21091 +       local_irq_save(flags);
21092 +       __this_cpu_write(rcu_cpu_has_work, 1);
21093 +       t = __this_cpu_read(rcu_cpu_kthread_task);
21094 +       if (t != NULL && current != t)
21095 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
21096 +       local_irq_restore(flags);
21097 +}
21098 +
21099 +static void rcu_cpu_kthread_park(unsigned int cpu)
21100 +{
21101 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21102 +}
21103 +
21104 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
21105 +{
21106 +       return __this_cpu_read(rcu_cpu_has_work);
21107 +}
21108 +
21109 +/*
21110 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
21111 + * RCU softirq, which this patch stops registering: RCU core processing
21112 + * now always runs from these per-CPU kthreads.
21113 + */
21114 +static void rcu_cpu_kthread(unsigned int cpu)
21115 +{
21116 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21117 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21118 +       int spincnt;
21119 +
21120 +       for (spincnt = 0; spincnt < 10; spincnt++) {
21121 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21122 +               local_bh_disable();
21123 +               *statusp = RCU_KTHREAD_RUNNING;
21124 +               this_cpu_inc(rcu_cpu_kthread_loops);
21125 +               local_irq_disable();
21126 +               work = *workp;
21127 +               *workp = 0;
21128 +               local_irq_enable();
21129 +               if (work)
21130 +                       rcu_process_callbacks();
21131 +               local_bh_enable();
21132 +               if (*workp == 0) {
21133 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21134 +                       *statusp = RCU_KTHREAD_WAITING;
21135 +                       return;
21136 +               }
21137 +       }
21138 +       *statusp = RCU_KTHREAD_YIELDING;
21139 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21140 +       schedule_timeout_interruptible(2);
21141 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21142 +       *statusp = RCU_KTHREAD_WAITING;
21143 +}
21144 +
21145 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21146 +       .store                  = &rcu_cpu_kthread_task,
21147 +       .thread_should_run      = rcu_cpu_kthread_should_run,
21148 +       .thread_fn              = rcu_cpu_kthread,
21149 +       .thread_comm            = "rcuc/%u",
21150 +       .setup                  = rcu_cpu_kthread_setup,
21151 +       .park                   = rcu_cpu_kthread_park,
21152 +};
21153 +
21154 +/*
21155 + * Spawn per-CPU RCU core processing kthreads.
21156 + */
21157 +static int __init rcu_spawn_core_kthreads(void)
21158 +{
21159 +       int cpu;
21160 +
21161 +       for_each_possible_cpu(cpu)
21162 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
21163 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21164 +       return 0;
21165  }
21166 +early_initcall(rcu_spawn_core_kthreads);
21167  
21168  /*
21169   * Handle any core-RCU processing required by a call_rcu() invocation.
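
The block above moves RCU core processing out of RCU_SOFTIRQ and into per-CPU "rcuc/%u" kthreads driven by the smpboot framework: invoke_rcu_core() sets rcu_cpu_has_work and wakes the thread, and ->thread_fn() is only entered after ->thread_should_run() reports pending work. For reference, the general shape of an smpboot per-CPU thread, with all example_* names invented:

#include <linux/smpboot.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned int, example_work);
static DEFINE_PER_CPU(struct task_struct *, example_task);

static int example_should_run(unsigned int cpu)
{
        return __this_cpu_read(example_work);
}

static void example_thread_fn(unsigned int cpu)
{
        __this_cpu_write(example_work, 0);
        /* ... process this CPU's pending work ... */
}

static struct smp_hotplug_thread example_threads = {
        .store                  = &example_task,
        .thread_should_run      = example_should_run,
        .thread_fn              = example_thread_fn,
        .thread_comm            = "example/%u",
};

static int __init example_threads_init(void)
{
        return smpboot_register_percpu_thread(&example_threads);
}
early_initcall(example_threads_init);
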
21170 @@ -3114,6 +3233,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
21171  }
21172  EXPORT_SYMBOL_GPL(call_rcu_sched);
21173  
21174 +#ifndef CONFIG_PREEMPT_RT_FULL
21175  /*
21176   * Queue an RCU callback for invocation after a quicker grace period.
21177   */
21178 @@ -3122,6 +3242,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
21179         __call_rcu(head, func, &rcu_bh_state, -1, 0);
21180  }
21181  EXPORT_SYMBOL_GPL(call_rcu_bh);
21182 +#endif
21183  
21184  /*
21185   * Queue an RCU callback for lazy invocation after a grace period.
21186 @@ -3213,6 +3334,7 @@ void synchronize_sched(void)
21187  }
21188  EXPORT_SYMBOL_GPL(synchronize_sched);
21189  
21190 +#ifndef CONFIG_PREEMPT_RT_FULL
21191  /**
21192   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
21193   *
21194 @@ -3239,6 +3361,7 @@ void synchronize_rcu_bh(void)
21195                 wait_rcu_gp(call_rcu_bh);
21196  }
21197  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
21198 +#endif
21199  
21200  /**
21201   * get_state_synchronize_rcu - Snapshot current RCU state
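
With CONFIG_PREEMPT_RT_FULL the BH-specific update primitives (call_rcu_bh(), synchronize_rcu_bh(), and below rcu_barrier_bh()) are compiled out of this file; the header side of this patch is expected to map them onto the preemptible flavor so existing callers keep building. A typical caller, unaffected either way; struct example_entry and the example_* helpers are invented:

#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>

struct example_entry {
        struct list_head        node;
        struct rcu_head         rcu;
};

static void example_entry_free(struct rcu_head *head)
{
        kfree(container_of(head, struct example_entry, rcu));
}

static void example_entry_del(struct example_entry *e)
{
        list_del_rcu(&e->node);
        call_rcu_bh(&e->rcu, example_entry_free);
}
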
21202 @@ -3524,7 +3647,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
21203                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
21204                         if (wake) {
21205                                 smp_mb(); /* EGP done before wake_up(). */
21206 -                               wake_up(&rsp->expedited_wq);
21207 +                               swake_up(&rsp->expedited_wq);
21208                         }
21209                         break;
21210                 }
21211 @@ -3781,7 +3904,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
21212         jiffies_start = jiffies;
21213  
21214         for (;;) {
21215 -               ret = wait_event_interruptible_timeout(
21216 +               ret = swait_event_timeout(
21217                                 rsp->expedited_wq,
21218                                 sync_rcu_preempt_exp_done(rnp_root),
21219                                 jiffies_stall);
21220 @@ -3789,7 +3912,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
21221                         return;
21222                 if (ret < 0) {
21223                         /* Hit a signal, disable CPU stall warnings. */
21224 -                       wait_event(rsp->expedited_wq,
21225 +                       swait_event(rsp->expedited_wq,
21226                                    sync_rcu_preempt_exp_done(rnp_root));
21227                         return;
21228                 }
21229 @@ -4101,6 +4224,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
21230         mutex_unlock(&rsp->barrier_mutex);
21231  }
21232  
21233 +#ifndef CONFIG_PREEMPT_RT_FULL
21234  /**
21235   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
21236   */
21237 @@ -4109,6 +4233,7 @@ void rcu_barrier_bh(void)
21238         _rcu_barrier(&rcu_bh_state);
21239  }
21240  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
21241 +#endif
21242  
21243  /**
21244   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
21245 @@ -4455,8 +4580,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
21246                 }
21247         }
21248  
21249 -       init_waitqueue_head(&rsp->gp_wq);
21250 -       init_waitqueue_head(&rsp->expedited_wq);
21251 +       init_swait_queue_head(&rsp->gp_wq);
21252 +       init_swait_queue_head(&rsp->expedited_wq);
21253         rnp = rsp->level[rcu_num_lvls - 1];
21254         for_each_possible_cpu(i) {
21255                 while (i > rnp->grphi)
21256 @@ -4576,12 +4701,13 @@ void __init rcu_init(void)
21257  
21258         rcu_bootup_announce();
21259         rcu_init_geometry();
21260 +#ifndef CONFIG_PREEMPT_RT_FULL
21261         rcu_init_one(&rcu_bh_state, &rcu_bh_data);
21262 +#endif
21263         rcu_init_one(&rcu_sched_state, &rcu_sched_data);
21264         if (dump_tree)
21265                 rcu_dump_rcu_node_tree(&rcu_sched_state);
21266         __rcu_init_preempt();
21267 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
21268  
21269         /*
21270          * We don't need protection against CPU-hotplug here because
21271 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
21272 index 9fb4e238d4dc..c75834d8de24 100644
21273 --- a/kernel/rcu/tree.h
21274 +++ b/kernel/rcu/tree.h
21275 @@ -27,6 +27,7 @@
21276  #include <linux/threads.h>
21277  #include <linux/cpumask.h>
21278  #include <linux/seqlock.h>
21279 +#include <linux/swait.h>
21280  #include <linux/stop_machine.h>
21281  
21282  /*
21283 @@ -241,7 +242,7 @@ struct rcu_node {
21284                                 /* Refused to boost: not sure why, though. */
21285                                 /*  This can happen due to race conditions. */
21286  #ifdef CONFIG_RCU_NOCB_CPU
21287 -       wait_queue_head_t nocb_gp_wq[2];
21288 +       struct swait_queue_head nocb_gp_wq[2];
21289                                 /* Place for rcu_nocb_kthread() to wait GP. */
21290  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
21291         int need_future_gp[2];
21292 @@ -393,7 +394,7 @@ struct rcu_data {
21293         atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
21294         struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
21295         struct rcu_head **nocb_follower_tail;
21296 -       wait_queue_head_t nocb_wq;      /* For nocb kthreads to sleep on. */
21297 +       struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
21298         struct task_struct *nocb_kthread;
21299         int nocb_defer_wakeup;          /* Defer wakeup of nocb_kthread. */
21300  
21301 @@ -472,7 +473,7 @@ struct rcu_state {
21302         unsigned long gpnum;                    /* Current gp number. */
21303         unsigned long completed;                /* # of last completed gp. */
21304         struct task_struct *gp_kthread;         /* Task for grace periods. */
21305 -       wait_queue_head_t gp_wq;                /* Where GP task waits. */
21306 +       struct swait_queue_head gp_wq;          /* Where GP task waits. */
21307         short gp_flags;                         /* Commands for GP task. */
21308         short gp_state;                         /* GP kthread sleep state. */
21309  
21310 @@ -504,7 +505,7 @@ struct rcu_state {
21311         atomic_long_t expedited_workdone3;      /* # done by others #3. */
21312         atomic_long_t expedited_normal;         /* # fallbacks to normal. */
21313         atomic_t expedited_need_qs;             /* # CPUs left to check in. */
21314 -       wait_queue_head_t expedited_wq;         /* Wait for check-ins. */
21315 +       struct swait_queue_head expedited_wq;   /* Wait for check-ins. */
21316         int ncpus_snap;                         /* # CPUs seen last time. */
21317  
21318         unsigned long jiffies_force_qs;         /* Time at which to invoke */
21319 @@ -556,18 +557,18 @@ extern struct list_head rcu_struct_flavors;
21320   */
21321  extern struct rcu_state rcu_sched_state;
21322  
21323 +#ifndef CONFIG_PREEMPT_RT_FULL
21324  extern struct rcu_state rcu_bh_state;
21325 +#endif
21326  
21327  #ifdef CONFIG_PREEMPT_RCU
21328  extern struct rcu_state rcu_preempt_state;
21329  #endif /* #ifdef CONFIG_PREEMPT_RCU */
21330  
21331 -#ifdef CONFIG_RCU_BOOST
21332  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21333  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
21334  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21335  DECLARE_PER_CPU(char, rcu_cpu_has_work);
21336 -#endif /* #ifdef CONFIG_RCU_BOOST */
21337  
21338  #ifndef RCU_TREE_NONCORE
21339  
21340 @@ -587,10 +588,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
21341  static void __init __rcu_init_preempt(void);
21342  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
21343  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
21344 -static void invoke_rcu_callbacks_kthread(void);
21345  static bool rcu_is_callbacks_kthread(void);
21346 +static void rcu_cpu_kthread_setup(unsigned int cpu);
21347  #ifdef CONFIG_RCU_BOOST
21348 -static void rcu_preempt_do_callbacks(void);
21349  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21350                                                  struct rcu_node *rnp);
21351  #endif /* #ifdef CONFIG_RCU_BOOST */
21352 @@ -607,7 +607,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
21353  static void increment_cpu_stall_ticks(void);
21354  static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
21355  static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
21356 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
21357 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
21358 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
21359  static void rcu_init_one_nocb(struct rcu_node *rnp);
21360  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
21361                             bool lazy, unsigned long flags);
21362 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
21363 index 630c19772630..8e119cf647ba 100644
21364 --- a/kernel/rcu/tree_plugin.h
21365 +++ b/kernel/rcu/tree_plugin.h
21366 @@ -24,25 +24,10 @@
21367   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21368   */
21369  
21370 -#include <linux/delay.h>
21371 -#include <linux/gfp.h>
21372 -#include <linux/oom.h>
21373 -#include <linux/smpboot.h>
21374 -#include "../time/tick-internal.h"
21375 -
21376  #ifdef CONFIG_RCU_BOOST
21377  
21378  #include "../locking/rtmutex_common.h"
21379  
21380 -/*
21381 - * Control variables for per-CPU and per-rcu_node kthreads.  These
21382 - * handle all flavors of RCU.
21383 - */
21384 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21385 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21386 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21387 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
21388 -
21389  #else /* #ifdef CONFIG_RCU_BOOST */
21390  
21391  /*
21392 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
21393  
21394  #endif /* #else #ifdef CONFIG_RCU_BOOST */
21395  
21396 +/*
21397 + * Control variables for per-CPU and per-rcu_node kthreads.  These
21398 + * handle all flavors of RCU.
21399 + */
21400 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21401 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21402 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
21403 +
21404  #ifdef CONFIG_RCU_NOCB_CPU
21405  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
21406  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
21407 @@ -432,7 +425,7 @@ void rcu_read_unlock_special(struct task_struct *t)
21408         }
21409  
21410         /* Hardware IRQ handlers cannot block, complain if they get here. */
21411 -       if (in_irq() || in_serving_softirq()) {
21412 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
21413                 lockdep_rcu_suspicious(__FILE__, __LINE__,
21414                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
21415                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
21416 @@ -645,15 +638,6 @@ static void rcu_preempt_check_callbacks(void)
21417                 t->rcu_read_unlock_special.b.need_qs = true;
21418  }
21419  
21420 -#ifdef CONFIG_RCU_BOOST
21421 -
21422 -static void rcu_preempt_do_callbacks(void)
21423 -{
21424 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
21425 -}
21426 -
21427 -#endif /* #ifdef CONFIG_RCU_BOOST */
21428 -
21429  /*
21430   * Queue a preemptible-RCU callback for invocation after a grace period.
21431   */
21432 @@ -930,6 +914,19 @@ void exit_rcu(void)
21433  
21434  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
21435  
21436 +/*
21437 + * If boosting, set rcuc kthreads to realtime priority.
21438 + */
21439 +static void rcu_cpu_kthread_setup(unsigned int cpu)
21440 +{
21441 +#ifdef CONFIG_RCU_BOOST
21442 +       struct sched_param sp;
21443 +
21444 +       sp.sched_priority = kthread_prio;
21445 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21446 +#endif /* #ifdef CONFIG_RCU_BOOST */
21447 +}
21448 +
21449  #ifdef CONFIG_RCU_BOOST
21450  
21451  #include "../locking/rtmutex_common.h"
21452 @@ -961,16 +958,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
21453  
21454  #endif /* #else #ifdef CONFIG_RCU_TRACE */
21455  
21456 -static void rcu_wake_cond(struct task_struct *t, int status)
21457 -{
21458 -       /*
21459 -        * If the thread is yielding, only wake it when this
21460 -        * is invoked from idle
21461 -        */
21462 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
21463 -               wake_up_process(t);
21464 -}
21465 -
21466  /*
21467   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
21468   * or ->boost_tasks, advancing the pointer to the next task in the
21469 @@ -1115,23 +1102,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
21470  }
21471  
21472  /*
21473 - * Wake up the per-CPU kthread to invoke RCU callbacks.
21474 - */
21475 -static void invoke_rcu_callbacks_kthread(void)
21476 -{
21477 -       unsigned long flags;
21478 -
21479 -       local_irq_save(flags);
21480 -       __this_cpu_write(rcu_cpu_has_work, 1);
21481 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
21482 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
21483 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
21484 -                             __this_cpu_read(rcu_cpu_kthread_status));
21485 -       }
21486 -       local_irq_restore(flags);
21487 -}
21488 -
21489 -/*
21490   * Is the current CPU running the RCU-callbacks kthread?
21491   * Caller must have preemption disabled.
21492   */
21493 @@ -1186,67 +1156,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21494         return 0;
21495  }
21496  
21497 -static void rcu_kthread_do_work(void)
21498 -{
21499 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
21500 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
21501 -       rcu_preempt_do_callbacks();
21502 -}
21503 -
21504 -static void rcu_cpu_kthread_setup(unsigned int cpu)
21505 -{
21506 -       struct sched_param sp;
21507 -
21508 -       sp.sched_priority = kthread_prio;
21509 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21510 -}
21511 -
21512 -static void rcu_cpu_kthread_park(unsigned int cpu)
21513 -{
21514 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21515 -}
21516 -
21517 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
21518 -{
21519 -       return __this_cpu_read(rcu_cpu_has_work);
21520 -}
21521 -
21522 -/*
21523 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
21524 - * RCU softirq used in flavors and configurations of RCU that do not
21525 - * support RCU priority boosting.
21526 - */
21527 -static void rcu_cpu_kthread(unsigned int cpu)
21528 -{
21529 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21530 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21531 -       int spincnt;
21532 -
21533 -       for (spincnt = 0; spincnt < 10; spincnt++) {
21534 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21535 -               local_bh_disable();
21536 -               *statusp = RCU_KTHREAD_RUNNING;
21537 -               this_cpu_inc(rcu_cpu_kthread_loops);
21538 -               local_irq_disable();
21539 -               work = *workp;
21540 -               *workp = 0;
21541 -               local_irq_enable();
21542 -               if (work)
21543 -                       rcu_kthread_do_work();
21544 -               local_bh_enable();
21545 -               if (*workp == 0) {
21546 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21547 -                       *statusp = RCU_KTHREAD_WAITING;
21548 -                       return;
21549 -               }
21550 -       }
21551 -       *statusp = RCU_KTHREAD_YIELDING;
21552 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21553 -       schedule_timeout_interruptible(2);
21554 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21555 -       *statusp = RCU_KTHREAD_WAITING;
21556 -}
21557 -
21558  /*
21559   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
21560   * served by the rcu_node in question.  The CPU hotplug lock is still
21561 @@ -1276,26 +1185,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
21562         free_cpumask_var(cm);
21563  }
21564  
21565 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21566 -       .store                  = &rcu_cpu_kthread_task,
21567 -       .thread_should_run      = rcu_cpu_kthread_should_run,
21568 -       .thread_fn              = rcu_cpu_kthread,
21569 -       .thread_comm            = "rcuc/%u",
21570 -       .setup                  = rcu_cpu_kthread_setup,
21571 -       .park                   = rcu_cpu_kthread_park,
21572 -};
21573 -
21574  /*
21575   * Spawn boost kthreads -- called as soon as the scheduler is running.
21576   */
21577  static void __init rcu_spawn_boost_kthreads(void)
21578  {
21579         struct rcu_node *rnp;
21580 -       int cpu;
21581 -
21582 -       for_each_possible_cpu(cpu)
21583 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
21584 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21585         rcu_for_each_leaf_node(rcu_state_p, rnp)
21586                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
21587  }
21588 @@ -1318,11 +1213,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
21589         raw_spin_unlock_irqrestore(&rnp->lock, flags);
21590  }
21591  
21592 -static void invoke_rcu_callbacks_kthread(void)
21593 -{
21594 -       WARN_ON_ONCE(1);
21595 -}
21596 -
21597  static bool rcu_is_callbacks_kthread(void)
21598  {
21599         return false;
21600 @@ -1346,7 +1236,7 @@ static void rcu_prepare_kthreads(int cpu)
21601  
21602  #endif /* #else #ifdef CONFIG_RCU_BOOST */
21603  
21604 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
21605 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
21606  
21607  /*
21608   * Check to see if any future RCU-related work will need to be done
21609 @@ -1363,7 +1253,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
21610         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
21611                ? 0 : rcu_cpu_has_callbacks(NULL);
21612  }
21613 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
21614  
21615 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
21616  /*
21617   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
21618   * after it.
21619 @@ -1459,6 +1351,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
21620         return cbs_ready;
21621  }
21622  
21623 +#ifndef CONFIG_PREEMPT_RT_FULL
21624 +
21625  /*
21626   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
21627   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
21628 @@ -1504,6 +1398,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
21629         *nextevt = basemono + dj * TICK_NSEC;
21630         return 0;
21631  }
21632 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
21633  
21634  /*
21635   * Prepare a CPU for idle from an RCU perspective.  The first major task
21636 @@ -1822,9 +1717,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
21637   * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
21638   * grace period.
21639   */
21640 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
21641 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
21642  {
21643 -       wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
21644 +       swake_up_all(sq);
21645  }
21646  
21647  /*
21648 @@ -1840,10 +1735,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
21649         rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
21650  }
21651  
21652 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
21653 +{
21654 +       return &rnp->nocb_gp_wq[rnp->completed & 0x1];
21655 +}
21656 +
21657  static void rcu_init_one_nocb(struct rcu_node *rnp)
21658  {
21659 -       init_waitqueue_head(&rnp->nocb_gp_wq[0]);
21660 -       init_waitqueue_head(&rnp->nocb_gp_wq[1]);
21661 +       init_swait_queue_head(&rnp->nocb_gp_wq[0]);
21662 +       init_swait_queue_head(&rnp->nocb_gp_wq[1]);
21663  }
21664  
21665  #ifndef CONFIG_RCU_NOCB_CPU_ALL
21666 @@ -1868,7 +1768,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
21667         if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
21668                 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
21669                 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
21670 -               wake_up(&rdp_leader->nocb_wq);
21671 +               swake_up(&rdp_leader->nocb_wq);
21672         }
21673  }
21674  
21675 @@ -2081,7 +1981,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
21676          */
21677         trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
21678         for (;;) {
21679 -               wait_event_interruptible(
21680 +               swait_event_interruptible(
21681                         rnp->nocb_gp_wq[c & 0x1],
21682                         (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
21683                 if (likely(d))
21684 @@ -2109,7 +2009,7 @@ wait_again:
21685         /* Wait for callbacks to appear. */
21686         if (!rcu_nocb_poll) {
21687                 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
21688 -               wait_event_interruptible(my_rdp->nocb_wq,
21689 +               swait_event_interruptible(my_rdp->nocb_wq,
21690                                 !READ_ONCE(my_rdp->nocb_leader_sleep));
21691                 /* Memory barrier handled by smp_mb() calls below and repoll. */
21692         } else if (firsttime) {
21693 @@ -2184,7 +2084,7 @@ wait_again:
21694                          * List was empty, wake up the follower.
21695                          * Memory barriers supplied by atomic_long_add().
21696                          */
21697 -                       wake_up(&rdp->nocb_wq);
21698 +                       swake_up(&rdp->nocb_wq);
21699                 }
21700         }
21701  
21702 @@ -2205,7 +2105,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
21703                 if (!rcu_nocb_poll) {
21704                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
21705                                             "FollowerSleep");
21706 -                       wait_event_interruptible(rdp->nocb_wq,
21707 +                       swait_event_interruptible(rdp->nocb_wq,
21708                                                  READ_ONCE(rdp->nocb_follower_head));
21709                 } else if (firsttime) {
21710                         /* Don't drown trace log with "Poll"! */
21711 @@ -2364,7 +2264,7 @@ void __init rcu_init_nohz(void)
21712  static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
21713  {
21714         rdp->nocb_tail = &rdp->nocb_head;
21715 -       init_waitqueue_head(&rdp->nocb_wq);
21716 +       init_swait_queue_head(&rdp->nocb_wq);
21717         rdp->nocb_follower_tail = &rdp->nocb_follower_head;
21718  }
21719  
21720 @@ -2514,7 +2414,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
21721         return false;
21722  }
21723  
21724 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
21725 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
21726  {
21727  }
21728  
21729 @@ -2522,6 +2422,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
21730  {
21731  }
21732  
21733 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
21734 +{
21735 +       return NULL;
21736 +}
21737 +
21738  static void rcu_init_one_nocb(struct rcu_node *rnp)
21739  {
21740  }
21741 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
21742 index 5f748c5a40f0..9a3904603ff6 100644
21743 --- a/kernel/rcu/update.c
21744 +++ b/kernel/rcu/update.c
21745 @@ -276,6 +276,7 @@ int rcu_read_lock_held(void)
21746  }
21747  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
21748  
21749 +#ifndef CONFIG_PREEMPT_RT_FULL
21750  /**
21751   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
21752   *
21753 @@ -302,6 +303,7 @@ int rcu_read_lock_bh_held(void)
21754         return in_softirq() || irqs_disabled();
21755  }
21756  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
21757 +#endif
21758  
21759  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
21760  
21761 diff --git a/kernel/relay.c b/kernel/relay.c
21762 index 0b4570cfacae..60684be39f22 100644
21763 --- a/kernel/relay.c
21764 +++ b/kernel/relay.c
21765 @@ -336,6 +336,10 @@ static void wakeup_readers(unsigned long data)
21766  {
21767         struct rchan_buf *buf = (struct rchan_buf *)data;
21768         wake_up_interruptible(&buf->read_wait);
21769 +       /*
21770 +        * Stupid polling for now:
21771 +        */
21772 +       mod_timer(&buf->timer, jiffies + 1);
21773  }
21774  
21775  /**
21776 @@ -353,6 +357,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
21777                 init_waitqueue_head(&buf->read_wait);
21778                 kref_init(&buf->kref);
21779                 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
21780 +               mod_timer(&buf->timer, jiffies + 1);
21781         } else
21782                 del_timer_sync(&buf->timer);
21783  
21784 @@ -736,15 +741,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
21785                 else
21786                         buf->early_bytes += buf->chan->subbuf_size -
21787                                             buf->padding[old_subbuf];
21788 -               smp_mb();
21789 -               if (waitqueue_active(&buf->read_wait))
21790 -                       /*
21791 -                        * Calling wake_up_interruptible() from here
21792 -                        * will deadlock if we happen to be logging
21793 -                        * from the scheduler (trying to re-grab
21794 -                        * rq->lock), so defer it.
21795 -                        */
21796 -                       mod_timer(&buf->timer, jiffies + 1);
21797         }
21798  
21799         old = buf->data;
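
Waking relay readers directly from relay_switch_subbuf() can deadlock when the write happens from scheduler context (the wakeup needs rq->lock again), so the hunks above drop that path and instead poll: wakeup_readers() re-arms its own timer every jiffy. The pattern in isolation, with the hypothetical struct example_buf standing in for rchan_buf:

struct example_buf {
        wait_queue_head_t       read_wait;
        struct timer_list       timer;
};

static void example_poll_readers(unsigned long data)
{
        struct example_buf *buf = (struct example_buf *)data;

        wake_up_interruptible(&buf->read_wait);
        mod_timer(&buf->timer, jiffies + 1);    /* re-arm: poll again next jiffy */
}

/* At buffer creation:
 *      setup_timer(&buf->timer, example_poll_readers, (unsigned long)buf);
 *      mod_timer(&buf->timer, jiffies + 1);
 */
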
21800 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
21801 index 67687973ce80..01b9994b367a 100644
21802 --- a/kernel/sched/Makefile
21803 +++ b/kernel/sched/Makefile
21804 @@ -13,7 +13,7 @@ endif
21805  
21806  obj-y += core.o loadavg.o clock.o cputime.o
21807  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
21808 -obj-y += wait.o completion.o idle.o
21809 +obj-y += wait.o swait.o swork.o completion.o idle.o
21810  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
21811  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
21812  obj-$(CONFIG_SCHEDSTATS) += stats.o
21813 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
21814 index 8d0f35debf35..b62cf6400fe0 100644
21815 --- a/kernel/sched/completion.c
21816 +++ b/kernel/sched/completion.c
21817 @@ -30,10 +30,10 @@ void complete(struct completion *x)
21818  {
21819         unsigned long flags;
21820  
21821 -       spin_lock_irqsave(&x->wait.lock, flags);
21822 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21823         x->done++;
21824 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
21825 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21826 +       swake_up_locked(&x->wait);
21827 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21828  }
21829  EXPORT_SYMBOL(complete);
21830  
21831 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
21832  {
21833         unsigned long flags;
21834  
21835 -       spin_lock_irqsave(&x->wait.lock, flags);
21836 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21837         x->done += UINT_MAX/2;
21838 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
21839 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21840 +       swake_up_all_locked(&x->wait);
21841 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21842  }
21843  EXPORT_SYMBOL(complete_all);
21844  
21845 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
21846                    long (*action)(long), long timeout, int state)
21847  {
21848         if (!x->done) {
21849 -               DECLARE_WAITQUEUE(wait, current);
21850 +               DECLARE_SWAITQUEUE(wait);
21851  
21852 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
21853 +               __prepare_to_swait(&x->wait, &wait);
21854                 do {
21855                         if (signal_pending_state(state, current)) {
21856                                 timeout = -ERESTARTSYS;
21857                                 break;
21858                         }
21859                         __set_current_state(state);
21860 -                       spin_unlock_irq(&x->wait.lock);
21861 +                       raw_spin_unlock_irq(&x->wait.lock);
21862                         timeout = action(timeout);
21863 -                       spin_lock_irq(&x->wait.lock);
21864 +                       raw_spin_lock_irq(&x->wait.lock);
21865                 } while (!x->done && timeout);
21866 -               __remove_wait_queue(&x->wait, &wait);
21867 +               __finish_swait(&x->wait, &wait);
21868                 if (!x->done)
21869                         return timeout;
21870         }
21871 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
21872  {
21873         might_sleep();
21874  
21875 -       spin_lock_irq(&x->wait.lock);
21876 +       raw_spin_lock_irq(&x->wait.lock);
21877         timeout = do_wait_for_common(x, action, timeout, state);
21878 -       spin_unlock_irq(&x->wait.lock);
21879 +       raw_spin_unlock_irq(&x->wait.lock);
21880         return timeout;
21881  }
21882  
21883 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
21884         if (!READ_ONCE(x->done))
21885                 return 0;
21886  
21887 -       spin_lock_irqsave(&x->wait.lock, flags);
21888 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
21889         if (!x->done)
21890                 ret = 0;
21891         else
21892                 x->done--;
21893 -       spin_unlock_irqrestore(&x->wait.lock, flags);
21894 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
21895         return ret;
21896  }
21897  EXPORT_SYMBOL(try_wait_for_completion);
21898 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
21899          * after it's acquired the lock.
21900          */
21901         smp_rmb();
21902 -       spin_unlock_wait(&x->wait.lock);
21903 +       raw_spin_unlock_wait(&x->wait.lock);
21904         return true;
21905  }
21906  EXPORT_SYMBOL(completion_done);
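
The completion rework keeps the public API intact but moves the internal waiter list to swait on a raw spinlock, so complete() stays callable from hard-irq and otherwise atomic contexts on RT. A standard caller, shown only to illustrate that users are untouched; the example_* names are invented:

#include <linux/completion.h>

static DECLARE_COMPLETION(example_done);

static int example_worker(void *unused)
{
        /* ... produce the result ... */
        complete(&example_done);
        return 0;
}

static void example_wait_for_result(void)
{
        wait_for_completion(&example_done);
}
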
21907 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
21908 index 20253dbc8610..e9b8d518202e 100644
21909 --- a/kernel/sched/core.c
21910 +++ b/kernel/sched/core.c
21911 @@ -260,7 +260,11 @@ late_initcall(sched_init_debug);
21912   * Number of tasks to iterate in a single balance run.
21913   * Limited because this is done with IRQs disabled.
21914   */
21915 +#ifndef CONFIG_PREEMPT_RT_FULL
21916  const_debug unsigned int sysctl_sched_nr_migrate = 32;
21917 +#else
21918 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
21919 +#endif
21920  
21921  /*
21922   * period over which we average the RT time consumption, measured
21923 @@ -438,6 +442,7 @@ static void init_rq_hrtick(struct rq *rq)
21924  
21925         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
21926         rq->hrtick_timer.function = hrtick;
21927 +       rq->hrtick_timer.irqsafe = 1;
21928  }
21929  #else  /* CONFIG_SCHED_HRTICK */
21930  static inline void hrtick_clear(struct rq *rq)
21931 @@ -542,7 +547,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
21932         head->lastp = &node->next;
21933  }
21934  
21935 -void wake_up_q(struct wake_q_head *head)
21936 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
21937  {
21938         struct wake_q_node *node = head->first;
21939  
21940 @@ -559,7 +564,10 @@ void wake_up_q(struct wake_q_head *head)
21941                  * wake_up_process() implies a wmb() to pair with the queueing
21942                  * in wake_q_add() so as not to miss wakeups.
21943                  */
21944 -               wake_up_process(task);
21945 +               if (sleeper)
21946 +                       wake_up_lock_sleeper(task);
21947 +               else
21948 +                       wake_up_process(task);
21949                 put_task_struct(task);
21950         }
21951  }
21952 @@ -595,6 +603,38 @@ void resched_curr(struct rq *rq)
21953                 trace_sched_wake_idle_without_ipi(cpu);
21954  }
21955  
21956 +#ifdef CONFIG_PREEMPT_LAZY
21957 +void resched_curr_lazy(struct rq *rq)
21958 +{
21959 +       struct task_struct *curr = rq->curr;
21960 +       int cpu;
21961 +
21962 +       if (!sched_feat(PREEMPT_LAZY)) {
21963 +               resched_curr(rq);
21964 +               return;
21965 +       }
21966 +
21967 +       lockdep_assert_held(&rq->lock);
21968 +
21969 +       if (test_tsk_need_resched(curr))
21970 +               return;
21971 +
21972 +       if (test_tsk_need_resched_lazy(curr))
21973 +               return;
21974 +
21975 +       set_tsk_need_resched_lazy(curr);
21976 +
21977 +       cpu = cpu_of(rq);
21978 +       if (cpu == smp_processor_id())
21979 +               return;
21980 +
21981 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
21982 +       smp_mb();
21983 +       if (!tsk_is_polling(curr))
21984 +               smp_send_reschedule(cpu);
21985 +}
21986 +#endif
21987 +
21988  void resched_cpu(int cpu)
21989  {
21990         struct rq *rq = cpu_rq(cpu);
21991 @@ -618,11 +658,14 @@ void resched_cpu(int cpu)
21992   */
21993  int get_nohz_timer_target(void)
21994  {
21995 -       int i, cpu = smp_processor_id();
21996 +       int i, cpu;
21997         struct sched_domain *sd;
21998  
21999 +       preempt_disable_rt();
22000 +       cpu = smp_processor_id();
22001 +
22002         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
22003 -               return cpu;
22004 +               goto preempt_en_rt;
22005  
22006         rcu_read_lock();
22007         for_each_domain(cpu, sd) {
22008 @@ -641,6 +684,8 @@ int get_nohz_timer_target(void)
22009                 cpu = housekeeping_any_cpu();
22010  unlock:
22011         rcu_read_unlock();
22012 +preempt_en_rt:
22013 +       preempt_enable_rt();
22014         return cpu;
22015  }
22016  /*
22017 @@ -1174,6 +1219,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22018  
22019         lockdep_assert_held(&p->pi_lock);
22020  
22021 +       if (__migrate_disabled(p)) {
22022 +               cpumask_copy(&p->cpus_allowed, new_mask);
22023 +               return;
22024 +       }
22025 +
22026         queued = task_on_rq_queued(p);
22027         running = task_current(rq, p);
22028  
22029 @@ -1196,6 +1246,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22030                 enqueue_task(rq, p, ENQUEUE_RESTORE);
22031  }
22032  
22033 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
22034 +static DEFINE_MUTEX(sched_down_mutex);
22035 +static cpumask_t sched_down_cpumask;
22036 +
22037 +void tell_sched_cpu_down_begin(int cpu)
22038 +{
22039 +       mutex_lock(&sched_down_mutex);
22040 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
22041 +       mutex_unlock(&sched_down_mutex);
22042 +}
22043 +
22044 +void tell_sched_cpu_down_done(int cpu)
22045 +{
22046 +       mutex_lock(&sched_down_mutex);
22047 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
22048 +       mutex_unlock(&sched_down_mutex);
22049 +}
22050 +
22051 +/**
22052 + * migrate_me - try to move the current task off this cpu
22053 + *
22054 + * Used by the pin_current_cpu() code to try to get tasks
22055 + * to move off the current CPU as it is going down.
22056 + * It will only move the task if the task isn't pinned to
22057 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
22058 + * and the task is in a RUNNING state; otherwise moving the
22059 + * task would wake it up (change its state to running) when
22060 + * the task did not expect it.
22061 + *
22062 + * Returns 1 if it succeeded in moving the current task
22063 + *         0 otherwise.
22064 + */
22065 +int migrate_me(void)
22066 +{
22067 +       struct task_struct *p = current;
22068 +       struct migration_arg arg;
22069 +       struct cpumask *cpumask;
22070 +       struct cpumask *mask;
22071 +       unsigned long flags;
22072 +       unsigned int dest_cpu;
22073 +       struct rq *rq;
22074 +
22075 +       /*
22076 +        * We cannot migrate tasks bound to a CPU or tasks that are
22077 +        * not running; moving such a task would wake it up.
22078 +        */
22079 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
22080 +               return 0;
22081 +
22082 +       mutex_lock(&sched_down_mutex);
22083 +       rq = task_rq_lock(p, &flags);
22084 +
22085 +       cpumask = this_cpu_ptr(&sched_cpumasks);
22086 +       mask = &p->cpus_allowed;
22087 +
22088 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
22089 +
22090 +       if (!cpumask_weight(cpumask)) {
22091 +               /* It's only on this CPU? */
22092 +               task_rq_unlock(rq, p, &flags);
22093 +               mutex_unlock(&sched_down_mutex);
22094 +               return 0;
22095 +       }
22096 +
22097 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
22098 +
22099 +       arg.task = p;
22100 +       arg.dest_cpu = dest_cpu;
22101 +
22102 +       task_rq_unlock(rq, p, &flags);
22103 +
22104 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
22105 +       tlb_migrate_finish(p->mm);
22106 +       mutex_unlock(&sched_down_mutex);
22107 +
22108 +       return 1;
22109 +}
22110 +
22111  /*
22112   * Change a given task's CPU affinity. Migrate the thread to a
22113   * proper CPU and schedule it away if the CPU it's executing on
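
migrate_me() exists for the RT hotplug path: a task that notices it is running on a CPU being taken down tries to move itself away rather than stall the teardown. A rough sketch of the intended call site; the real caller, pin_current_cpu(), is in the hotplug part of this patch and is not shown in these hunks, and example_leave_dying_cpu() is invented:

static void example_leave_dying_cpu(int dying_cpu)
{
        if (raw_smp_processor_id() != dying_cpu)
                return;

        if (!migrate_me())
                pr_debug("%s is pinned and stays on CPU %d\n",
                         current->comm, dying_cpu);
}
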
22114 @@ -1235,7 +1363,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
22115         do_set_cpus_allowed(p, new_mask);
22116  
22117         /* Can the task run on the task's current CPU? If so, we're done */
22118 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
22119 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
22120                 goto out;
22121  
22122         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
22123 @@ -1411,6 +1539,18 @@ out:
22124         return ret;
22125  }
22126  
22127 +static bool check_task_state(struct task_struct *p, long match_state)
22128 +{
22129 +       bool match = false;
22130 +
22131 +       raw_spin_lock_irq(&p->pi_lock);
22132 +       if (p->state == match_state || p->saved_state == match_state)
22133 +               match = true;
22134 +       raw_spin_unlock_irq(&p->pi_lock);
22135 +
22136 +       return match;
22137 +}
22138 +
22139  /*
22140   * wait_task_inactive - wait for a thread to unschedule.
22141   *
22142 @@ -1455,7 +1595,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22143                  * is actually now running somewhere else!
22144                  */
22145                 while (task_running(rq, p)) {
22146 -                       if (match_state && unlikely(p->state != match_state))
22147 +                       if (match_state && !check_task_state(p, match_state))
22148                                 return 0;
22149                         cpu_relax();
22150                 }
22151 @@ -1470,7 +1610,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22152                 running = task_running(rq, p);
22153                 queued = task_on_rq_queued(p);
22154                 ncsw = 0;
22155 -               if (!match_state || p->state == match_state)
22156 +               if (!match_state || p->state == match_state ||
22157 +                   p->saved_state == match_state)
22158                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
22159                 task_rq_unlock(rq, p, &flags);
22160  
22161 @@ -1627,7 +1768,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22162  {
22163         lockdep_assert_held(&p->pi_lock);
22164  
22165 -       if (p->nr_cpus_allowed > 1)
22166 +       if (tsk_nr_cpus_allowed(p) > 1)
22167                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22168  
22169         /*
22170 @@ -1707,10 +1848,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
22171  {
22172         activate_task(rq, p, en_flags);
22173         p->on_rq = TASK_ON_RQ_QUEUED;
22174 -
22175 -       /* if a worker is waking up, notify workqueue */
22176 -       if (p->flags & PF_WQ_WORKER)
22177 -               wq_worker_waking_up(p, cpu_of(rq));
22178  }
22179  
22180  /*
22181 @@ -1937,8 +2074,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
22182          */
22183         smp_mb__before_spinlock();
22184         raw_spin_lock_irqsave(&p->pi_lock, flags);
22185 -       if (!(p->state & state))
22186 +       if (!(p->state & state)) {
22187 +               /*
22188 +                * The task might be running due to a spinlock sleeper
22189 +                * wakeup. Check the saved state and set it to running
22190 +                * if the wakeup condition is true.
22191 +                */
22192 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
22193 +                       if (p->saved_state & state) {
22194 +                               p->saved_state = TASK_RUNNING;
22195 +                               success = 1;
22196 +                       }
22197 +               }
22198                 goto out;
22199 +       }
22200 +
22201 +       /*
22202 +        * If this is a regular wakeup, then we can unconditionally
22203 +        * clear the saved state of a "lock sleeper".
22204 +        */
22205 +       if (!(wake_flags & WF_LOCK_SLEEPER))
22206 +               p->saved_state = TASK_RUNNING;
22207  
22208         trace_sched_waking(p);
22209  
22210 @@ -2030,52 +2186,6 @@ out:
22211  }
22212  
22213  /**
22214 - * try_to_wake_up_local - try to wake up a local task with rq lock held
22215 - * @p: the thread to be awakened
22216 - *
22217 - * Put @p on the run-queue if it's not already there. The caller must
22218 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
22219 - * the current task.
22220 - */
22221 -static void try_to_wake_up_local(struct task_struct *p)
22222 -{
22223 -       struct rq *rq = task_rq(p);
22224 -
22225 -       if (WARN_ON_ONCE(rq != this_rq()) ||
22226 -           WARN_ON_ONCE(p == current))
22227 -               return;
22228 -
22229 -       lockdep_assert_held(&rq->lock);
22230 -
22231 -       if (!raw_spin_trylock(&p->pi_lock)) {
22232 -               /*
22233 -                * This is OK, because current is on_cpu, which avoids it being
22234 -                * picked for load-balance and preemption/IRQs are still
22235 -                * disabled avoiding further scheduler activity on it and we've
22236 -                * not yet picked a replacement task.
22237 -                */
22238 -               lockdep_unpin_lock(&rq->lock);
22239 -               raw_spin_unlock(&rq->lock);
22240 -               raw_spin_lock(&p->pi_lock);
22241 -               raw_spin_lock(&rq->lock);
22242 -               lockdep_pin_lock(&rq->lock);
22243 -       }
22244 -
22245 -       if (!(p->state & TASK_NORMAL))
22246 -               goto out;
22247 -
22248 -       trace_sched_waking(p);
22249 -
22250 -       if (!task_on_rq_queued(p))
22251 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
22252 -
22253 -       ttwu_do_wakeup(rq, p, 0);
22254 -       ttwu_stat(p, smp_processor_id(), 0);
22255 -out:
22256 -       raw_spin_unlock(&p->pi_lock);
22257 -}
22258 -
22259 -/**
22260   * wake_up_process - Wake up a specific process
22261   * @p: The process to be woken up.
22262   *
22263 @@ -2093,6 +2203,18 @@ int wake_up_process(struct task_struct *p)
22264  }
22265  EXPORT_SYMBOL(wake_up_process);
22266  
22267 +/**
22268 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
22269 + * @p: The process to be woken up.
22270 + *
22271 + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
22272 + * the nature of the wakeup.
22273 + */
22274 +int wake_up_lock_sleeper(struct task_struct *p)
22275 +{
22276 +       return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
22277 +}
22278 +
22279  int wake_up_state(struct task_struct *p, unsigned int state)
22280  {
22281         return try_to_wake_up(p, state, 0);
22282 @@ -2279,6 +2401,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
22283         p->on_cpu = 0;
22284  #endif
22285         init_task_preempt_count(p);
22286 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
22287 +       task_thread_info(p)->preempt_lazy_count = 0;
22288 +#endif
22289  #ifdef CONFIG_SMP
22290         plist_node_init(&p->pushable_tasks, MAX_PRIO);
22291         RB_CLEAR_NODE(&p->pushable_dl_tasks);
22292 @@ -2603,8 +2728,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
22293         finish_arch_post_lock_switch();
22294  
22295         fire_sched_in_preempt_notifiers(current);
22296 +       /*
22297 +        * We use mmdrop_delayed() here so we don't have to do the
22298 +        * full __mmdrop() when we are the last user.
22299 +        */
22300         if (mm)
22301 -               mmdrop(mm);
22302 +               mmdrop_delayed(mm);
22303         if (unlikely(prev_state == TASK_DEAD)) {
22304                 if (prev->sched_class->task_dead)
22305                         prev->sched_class->task_dead(prev);
22306 @@ -2935,16 +3064,6 @@ u64 scheduler_tick_max_deferment(void)
22307  }
22308  #endif
22309  
22310 -notrace unsigned long get_parent_ip(unsigned long addr)
22311 -{
22312 -       if (in_lock_functions(addr)) {
22313 -               addr = CALLER_ADDR2;
22314 -               if (in_lock_functions(addr))
22315 -                       addr = CALLER_ADDR3;
22316 -       }
22317 -       return addr;
22318 -}
22319 -
22320  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
22321                                 defined(CONFIG_PREEMPT_TRACER))
22322  
22323 @@ -2966,7 +3085,7 @@ void preempt_count_add(int val)
22324                                 PREEMPT_MASK - 10);
22325  #endif
22326         if (preempt_count() == val) {
22327 -               unsigned long ip = get_parent_ip(CALLER_ADDR1);
22328 +               unsigned long ip = get_lock_parent_ip();
22329  #ifdef CONFIG_DEBUG_PREEMPT
22330                 current->preempt_disable_ip = ip;
22331  #endif
22332 @@ -2993,7 +3112,7 @@ void preempt_count_sub(int val)
22333  #endif
22334  
22335         if (preempt_count() == val)
22336 -               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
22337 +               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
22338         __preempt_count_sub(val);
22339  }
22340  EXPORT_SYMBOL(preempt_count_sub);
22341 @@ -3048,6 +3167,77 @@ static inline void schedule_debug(struct task_struct *prev)
22342         schedstat_inc(this_rq(), sched_count);
22343  }
22344  
22345 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
22346 +
22347 +void migrate_disable(void)
22348 +{
22349 +       struct task_struct *p = current;
22350 +
22351 +       if (in_atomic() || irqs_disabled()) {
22352 +#ifdef CONFIG_SCHED_DEBUG
22353 +               p->migrate_disable_atomic++;
22354 +#endif
22355 +               return;
22356 +       }
22357 +
22358 +#ifdef CONFIG_SCHED_DEBUG
22359 +       if (unlikely(p->migrate_disable_atomic)) {
22360 +               tracing_off();
22361 +               WARN_ON_ONCE(1);
22362 +       }
22363 +#endif
22364 +
22365 +       if (p->migrate_disable) {
22366 +               p->migrate_disable++;
22367 +               return;
22368 +       }
22369 +
22370 +       preempt_disable();
22371 +       preempt_lazy_disable();
22372 +       pin_current_cpu();
22373 +       p->migrate_disable = 1;
22374 +       preempt_enable();
22375 +}
22376 +EXPORT_SYMBOL(migrate_disable);
22377 +
22378 +void migrate_enable(void)
22379 +{
22380 +       struct task_struct *p = current;
22381 +
22382 +       if (in_atomic() || irqs_disabled()) {
22383 +#ifdef CONFIG_SCHED_DEBUG
22384 +               p->migrate_disable_atomic--;
22385 +#endif
22386 +               return;
22387 +       }
22388 +
22389 +#ifdef CONFIG_SCHED_DEBUG
22390 +       if (unlikely(p->migrate_disable_atomic)) {
22391 +               tracing_off();
22392 +               WARN_ON_ONCE(1);
22393 +       }
22394 +#endif
22395 +       WARN_ON_ONCE(p->migrate_disable <= 0);
22396 +
22397 +       if (p->migrate_disable > 1) {
22398 +               p->migrate_disable--;
22399 +               return;
22400 +       }
22401 +
22402 +       preempt_disable();
22403 +       /*
22404 +        * Clearing migrate_disable causes tsk_cpus_allowed to
22405 +        * show the task's original cpu affinity.
22406 +        */
22407 +       p->migrate_disable = 0;
22408 +
22409 +       unpin_current_cpu();
22410 +       preempt_enable();
22411 +       preempt_lazy_enable();
22412 +}
22413 +EXPORT_SYMBOL(migrate_enable);
22414 +#endif
22415 +
22416  /*
22417   * Pick up the highest-prio task:
22418   */
22419 @@ -3172,19 +3362,6 @@ static void __sched notrace __schedule(bool preempt)
22420                 } else {
22421                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
22422                         prev->on_rq = 0;
22423 -
22424 -                       /*
22425 -                        * If a worker went to sleep, notify and ask workqueue
22426 -                        * whether it wants to wake up a task to maintain
22427 -                        * concurrency.
22428 -                        */
22429 -                       if (prev->flags & PF_WQ_WORKER) {
22430 -                               struct task_struct *to_wakeup;
22431 -
22432 -                               to_wakeup = wq_worker_sleeping(prev, cpu);
22433 -                               if (to_wakeup)
22434 -                                       try_to_wake_up_local(to_wakeup);
22435 -                       }
22436                 }
22437                 switch_count = &prev->nvcsw;
22438         }
22439 @@ -3194,6 +3371,7 @@ static void __sched notrace __schedule(bool preempt)
22440  
22441         next = pick_next_task(rq, prev);
22442         clear_tsk_need_resched(prev);
22443 +       clear_tsk_need_resched_lazy(prev);
22444         clear_preempt_need_resched();
22445         rq->clock_skip_update = 0;
22446  
22447 @@ -3215,9 +3393,20 @@ static void __sched notrace __schedule(bool preempt)
22448  
22449  static inline void sched_submit_work(struct task_struct *tsk)
22450  {
22451 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
22452 +       if (!tsk->state)
22453                 return;
22454         /*
22455 +        * If a worker went to sleep, notify and ask workqueue whether
22456 +        * it wants to wake up a task to maintain concurrency.
22457 +        */
22458 +       if (tsk->flags & PF_WQ_WORKER)
22459 +               wq_worker_sleeping(tsk);
22460 +
22461 +
22462 +       if (tsk_is_pi_blocked(tsk))
22463 +               return;
22464 +
22465 +       /*
22466          * If we are going to sleep and we have plugged IO queued,
22467          * make sure to submit it to avoid deadlocks.
22468          */
22469 @@ -3225,6 +3414,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
22470                 blk_schedule_flush_plug(tsk);
22471  }
22472  
22473 +static void sched_update_worker(struct task_struct *tsk)
22474 +{
22475 +       if (tsk->flags & PF_WQ_WORKER)
22476 +               wq_worker_running(tsk);
22477 +}
22478 +
22479  asmlinkage __visible void __sched schedule(void)
22480  {
22481         struct task_struct *tsk = current;
22482 @@ -3235,6 +3430,7 @@ asmlinkage __visible void __sched schedule(void)
22483                 __schedule(false);
22484                 sched_preempt_enable_no_resched();
22485         } while (need_resched());
22486 +       sched_update_worker(tsk);
22487  }
22488  EXPORT_SYMBOL(schedule);
22489  
22490 @@ -3283,6 +3479,30 @@ static void __sched notrace preempt_schedule_common(void)
22491         } while (need_resched());
22492  }
22493  
22494 +#ifdef CONFIG_PREEMPT_LAZY
22495 +/*
22496 + * If TIF_NEED_RESCHED is set, then we allow being scheduled away, since it is
22497 + * set by an RT task. Otherwise we try to avoid being scheduled out as long
22498 + * as the preempt_lazy_count counter is > 0.
22499 + */
22500 +static __always_inline int preemptible_lazy(void)
22501 +{
22502 +       if (test_thread_flag(TIF_NEED_RESCHED))
22503 +               return 1;
22504 +       if (current_thread_info()->preempt_lazy_count)
22505 +               return 0;
22506 +       return 1;
22507 +}
22508 +
22509 +#else
22510 +
22511 +static inline int preemptible_lazy(void)
22512 +{
22513 +       return 1;
22514 +}
22515 +
22516 +#endif
22517 +
22518  #ifdef CONFIG_PREEMPT
22519  /*
22520   * this is the entry point to schedule() from in-kernel preemption
22521 @@ -3297,6 +3517,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
22522          */
22523         if (likely(!preemptible()))
22524                 return;
22525 +       if (!preemptible_lazy())
22526 +               return;
22527  
22528         preempt_schedule_common();
22529  }
22530 @@ -3323,6 +3545,8 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
22531  
22532         if (likely(!preemptible()))
22533                 return;
22534 +       if (!preemptible_lazy())
22535 +               return;
22536  
22537         do {
22538                 preempt_disable_notrace();
22539 @@ -3332,7 +3556,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
22540                  * an infinite recursion.
22541                  */
22542                 prev_ctx = exception_enter();
22543 +               /*
22544 +                * The add/subtract must not be traced by the function
22545 +                * tracer. But we still want to account for the
22546 +                * preempt off latency tracer. Since the _notrace versions
22547 +                * of add/subtract skip the accounting for the latency tracer,
22548 +                * we must force it manually.
22549 +                */
22550 +               start_critical_timings();
22551                 __schedule(true);
22552 +               stop_critical_timings();
22553                 exception_exit(prev_ctx);
22554  
22555                 preempt_enable_no_resched_notrace();
22556 @@ -4676,6 +4909,7 @@ int __cond_resched_lock(spinlock_t *lock)
22557  }
22558  EXPORT_SYMBOL(__cond_resched_lock);
22559  
22560 +#ifndef CONFIG_PREEMPT_RT_FULL
22561  int __sched __cond_resched_softirq(void)
22562  {
22563         BUG_ON(!in_softirq());
22564 @@ -4689,6 +4923,7 @@ int __sched __cond_resched_softirq(void)
22565         return 0;
22566  }
22567  EXPORT_SYMBOL(__cond_resched_softirq);
22568 +#endif
22569  
22570  /**
22571   * yield - yield the current processor to other threads.
22572 @@ -5055,7 +5290,9 @@ void init_idle(struct task_struct *idle, int cpu)
22573  
22574         /* Set the preempt count _outside_ the spinlocks! */
22575         init_idle_preempt_count(idle, cpu);
22576 -
22577 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
22578 +       task_thread_info(idle)->preempt_lazy_count = 0;
22579 +#endif
22580         /*
22581          * The idle tasks have their own, simple scheduling class:
22582          */
22583 @@ -5196,6 +5433,8 @@ void sched_setnuma(struct task_struct *p, int nid)
22584  #endif /* CONFIG_NUMA_BALANCING */
22585  
22586  #ifdef CONFIG_HOTPLUG_CPU
22587 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
22588 +
22589  /*
22590   * Ensures that the idle task is using init_mm right before its cpu goes
22591   * offline.
22592 @@ -5210,7 +5449,11 @@ void idle_task_exit(void)
22593                 switch_mm(mm, &init_mm, current);
22594                 finish_arch_post_lock_switch();
22595         }
22596 -       mmdrop(mm);
22597 +       /*
22598 +        * Defer the cleanup to a still-alive CPU. On RT we can neither
22599 +        * call mmdrop() nor mmdrop_delayed() from here.
22600 +        */
22601 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
22602  }
22603  
22604  /*
22605 @@ -5583,6 +5826,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
22606  
22607         case CPU_DEAD:
22608                 calc_load_migrate(rq);
22609 +               if (per_cpu(idle_last_mm, cpu)) {
22610 +                       mmdrop(per_cpu(idle_last_mm, cpu));
22611 +                       per_cpu(idle_last_mm, cpu) = NULL;
22612 +               }
22613                 break;
22614  #endif
22615         }
22616 @@ -7566,7 +7813,7 @@ void __init sched_init(void)
22617  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
22618  static inline int preempt_count_equals(int preempt_offset)
22619  {
22620 -       int nested = preempt_count() + rcu_preempt_depth();
22621 +       int nested = preempt_count() + sched_rcu_preempt_depth();
22622  
22623         return (nested == preempt_offset);
22624  }
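
A minimal usage sketch (not part of the patch) of the migrate_disable()/migrate_enable() pair added above: the calls nest through the per-task migrate_disable counter, so only the outermost disable pins the current CPU and only the matching outermost enable unpins it. The surrounding function below is hypothetical; the two scheduler calls are the ones defined in the hunk.

        /* Illustrative caller; not part of kernel/sched/core.c. */
        static void example_pinned_section(void)
        {
                migrate_disable();      /* outermost call: pins the current CPU */
                migrate_disable();      /* nested call: only bumps the counter */

                /* ... work that must not migrate to another CPU ... */

                migrate_enable();       /* counter drops back to 1, still pinned */
                migrate_enable();       /* outermost enable: CPU is unpinned */
        }
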
22625 diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
22626 index 5a75b08cfd85..5be58820465c 100644
22627 --- a/kernel/sched/cpudeadline.c
22628 +++ b/kernel/sched/cpudeadline.c
22629 @@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
22630         const struct sched_dl_entity *dl_se = &p->dl;
22631  
22632         if (later_mask &&
22633 -           cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
22634 +           cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) {
22635                 best_cpu = cpumask_any(later_mask);
22636                 goto out;
22637 -       } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
22638 +       } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) &&
22639                         dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
22640                 best_cpu = cpudl_maximum(cp);
22641                 if (later_mask)
22642 diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
22643 index 981fcd7dc394..11e9705bf937 100644
22644 --- a/kernel/sched/cpupri.c
22645 +++ b/kernel/sched/cpupri.c
22646 @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
22647                 if (skip)
22648                         continue;
22649  
22650 -               if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
22651 +               if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
22652                         continue;
22653  
22654                 if (lowest_mask) {
22655 -                       cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
22656 +                       cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
22657  
22658                         /*
22659                          * We have to ensure that we have at least one bit
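
The conversions above (and the similar ones in deadline.c and rt.c further down) replace direct accesses to p->cpus_allowed / p->nr_cpus_allowed with the tsk_cpus_allowed()/tsk_nr_cpus_allowed() helpers. Their definitions live in include/linux/sched.h elsewhere in this patch; roughly, they are expected to collapse the affinity to the current CPU while migration is disabled. The sketch below only illustrates that intent: the sketch_ names and the __migrate_disabled() test are assumptions, not the actual code.

        /* Sketch of the intended semantics, for illustration only. */
        static inline const struct cpumask *sketch_tsk_cpus_allowed(struct task_struct *p)
        {
                if (__migrate_disabled(p))
                        return cpumask_of(task_cpu(p)); /* pinned: this CPU only */
                return &p->cpus_allowed;
        }

        static inline int sketch_tsk_nr_cpus_allowed(struct task_struct *p)
        {
                return __migrate_disabled(p) ? 1 : p->nr_cpus_allowed;
        }
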
22660 diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
22661 index a1aecbedf5b1..558b98af241d 100644
22662 --- a/kernel/sched/cputime.c
22663 +++ b/kernel/sched/cputime.c
22664 @@ -685,7 +685,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
22665  {
22666         unsigned long long delta = vtime_delta(tsk);
22667  
22668 -       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
22669 +       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
22670         tsk->vtime_snap += delta;
22671  
22672         /* CHECKME: always safe to convert nsecs to cputime? */
22673 @@ -701,37 +701,37 @@ static void __vtime_account_system(struct task_struct *tsk)
22674  
22675  void vtime_account_system(struct task_struct *tsk)
22676  {
22677 -       write_seqlock(&tsk->vtime_seqlock);
22678 +       write_seqcount_begin(&tsk->vtime_seqcount);
22679         __vtime_account_system(tsk);
22680 -       write_sequnlock(&tsk->vtime_seqlock);
22681 +       write_seqcount_end(&tsk->vtime_seqcount);
22682  }
22683  
22684  void vtime_gen_account_irq_exit(struct task_struct *tsk)
22685  {
22686 -       write_seqlock(&tsk->vtime_seqlock);
22687 +       write_seqcount_begin(&tsk->vtime_seqcount);
22688         __vtime_account_system(tsk);
22689         if (context_tracking_in_user())
22690                 tsk->vtime_snap_whence = VTIME_USER;
22691 -       write_sequnlock(&tsk->vtime_seqlock);
22692 +       write_seqcount_end(&tsk->vtime_seqcount);
22693  }
22694  
22695  void vtime_account_user(struct task_struct *tsk)
22696  {
22697         cputime_t delta_cpu;
22698  
22699 -       write_seqlock(&tsk->vtime_seqlock);
22700 +       write_seqcount_begin(&tsk->vtime_seqcount);
22701         delta_cpu = get_vtime_delta(tsk);
22702         tsk->vtime_snap_whence = VTIME_SYS;
22703         account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
22704 -       write_sequnlock(&tsk->vtime_seqlock);
22705 +       write_seqcount_end(&tsk->vtime_seqcount);
22706  }
22707  
22708  void vtime_user_enter(struct task_struct *tsk)
22709  {
22710 -       write_seqlock(&tsk->vtime_seqlock);
22711 +       write_seqcount_begin(&tsk->vtime_seqcount);
22712         __vtime_account_system(tsk);
22713         tsk->vtime_snap_whence = VTIME_USER;
22714 -       write_sequnlock(&tsk->vtime_seqlock);
22715 +       write_seqcount_end(&tsk->vtime_seqcount);
22716  }
22717  
22718  void vtime_guest_enter(struct task_struct *tsk)
22719 @@ -743,19 +743,19 @@ void vtime_guest_enter(struct task_struct *tsk)
22720          * synchronization against the reader (task_gtime())
22721          * that can thus safely catch up with a tickless delta.
22722          */
22723 -       write_seqlock(&tsk->vtime_seqlock);
22724 +       write_seqcount_begin(&tsk->vtime_seqcount);
22725         __vtime_account_system(tsk);
22726         current->flags |= PF_VCPU;
22727 -       write_sequnlock(&tsk->vtime_seqlock);
22728 +       write_seqcount_end(&tsk->vtime_seqcount);
22729  }
22730  EXPORT_SYMBOL_GPL(vtime_guest_enter);
22731  
22732  void vtime_guest_exit(struct task_struct *tsk)
22733  {
22734 -       write_seqlock(&tsk->vtime_seqlock);
22735 +       write_seqcount_begin(&tsk->vtime_seqcount);
22736         __vtime_account_system(tsk);
22737         current->flags &= ~PF_VCPU;
22738 -       write_sequnlock(&tsk->vtime_seqlock);
22739 +       write_seqcount_end(&tsk->vtime_seqcount);
22740  }
22741  EXPORT_SYMBOL_GPL(vtime_guest_exit);
22742  
22743 @@ -768,24 +768,26 @@ void vtime_account_idle(struct task_struct *tsk)
22744  
22745  void arch_vtime_task_switch(struct task_struct *prev)
22746  {
22747 -       write_seqlock(&prev->vtime_seqlock);
22748 -       prev->vtime_snap_whence = VTIME_SLEEPING;
22749 -       write_sequnlock(&prev->vtime_seqlock);
22750 +       write_seqcount_begin(&prev->vtime_seqcount);
22751 +       prev->vtime_snap_whence = VTIME_INACTIVE;
22752 +       write_seqcount_end(&prev->vtime_seqcount);
22753  
22754 -       write_seqlock(&current->vtime_seqlock);
22755 +       write_seqcount_begin(&current->vtime_seqcount);
22756         current->vtime_snap_whence = VTIME_SYS;
22757         current->vtime_snap = sched_clock_cpu(smp_processor_id());
22758 -       write_sequnlock(&current->vtime_seqlock);
22759 +       write_seqcount_end(&current->vtime_seqcount);
22760  }
22761  
22762  void vtime_init_idle(struct task_struct *t, int cpu)
22763  {
22764         unsigned long flags;
22765  
22766 -       write_seqlock_irqsave(&t->vtime_seqlock, flags);
22767 +       local_irq_save(flags);
22768 +       write_seqcount_begin(&t->vtime_seqcount);
22769         t->vtime_snap_whence = VTIME_SYS;
22770         t->vtime_snap = sched_clock_cpu(cpu);
22771 -       write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
22772 +       write_seqcount_end(&t->vtime_seqcount);
22773 +       local_irq_restore(flags);
22774  }
22775  
22776  cputime_t task_gtime(struct task_struct *t)
22777 @@ -797,13 +799,13 @@ cputime_t task_gtime(struct task_struct *t)
22778                 return t->gtime;
22779  
22780         do {
22781 -               seq = read_seqbegin(&t->vtime_seqlock);
22782 +               seq = read_seqcount_begin(&t->vtime_seqcount);
22783  
22784                 gtime = t->gtime;
22785                 if (t->flags & PF_VCPU)
22786                         gtime += vtime_delta(t);
22787  
22788 -       } while (read_seqretry(&t->vtime_seqlock, seq));
22789 +       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
22790  
22791         return gtime;
22792  }
22793 @@ -826,7 +828,7 @@ fetch_task_cputime(struct task_struct *t,
22794                 *udelta = 0;
22795                 *sdelta = 0;
22796  
22797 -               seq = read_seqbegin(&t->vtime_seqlock);
22798 +               seq = read_seqcount_begin(&t->vtime_seqcount);
22799  
22800                 if (u_dst)
22801                         *u_dst = *u_src;
22802 @@ -834,7 +836,7 @@ fetch_task_cputime(struct task_struct *t,
22803                         *s_dst = *s_src;
22804  
22805                 /* Task is sleeping, nothing to add */
22806 -               if (t->vtime_snap_whence == VTIME_SLEEPING ||
22807 +               if (t->vtime_snap_whence == VTIME_INACTIVE ||
22808                     is_idle_task(t))
22809                         continue;
22810  
22811 @@ -850,7 +852,7 @@ fetch_task_cputime(struct task_struct *t,
22812                         if (t->vtime_snap_whence == VTIME_SYS)
22813                                 *sdelta = delta;
22814                 }
22815 -       } while (read_seqretry(&t->vtime_seqlock, seq));
22816 +       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
22817  }
22818  
22819  
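
With vtime_seqlock converted to a plain seqcount above, writers must provide their own interrupt protection while readers stay lockless and simply retry. A condensed sketch of both sides, using only calls that already appear in the hunks (the sketch_ wrapper functions themselves are illustrative):

        /* Writer side: disable interrupts ourselves, then bump the seqcount. */
        static void sketch_vtime_write(struct task_struct *t, int cpu)
        {
                unsigned long flags;

                local_irq_save(flags);
                write_seqcount_begin(&t->vtime_seqcount);
                t->vtime_snap_whence = VTIME_SYS;
                t->vtime_snap = sched_clock_cpu(cpu);
                write_seqcount_end(&t->vtime_seqcount);
                local_irq_restore(flags);
        }

        /* Reader side: lockless, retried if a writer interleaved. */
        static cputime_t sketch_vtime_read_gtime(struct task_struct *t)
        {
                unsigned int seq;
                cputime_t gtime;

                do {
                        seq = read_seqcount_begin(&t->vtime_seqcount);
                        gtime = t->gtime;
                        if (t->flags & PF_VCPU)
                                gtime += vtime_delta(t);
                } while (read_seqcount_retry(&t->vtime_seqcount, seq));

                return gtime;
        }
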
22820 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
22821 index 8b0a15e285f9..7a72e69fcf65 100644
22822 --- a/kernel/sched/deadline.c
22823 +++ b/kernel/sched/deadline.c
22824 @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
22825  {
22826         struct task_struct *p = dl_task_of(dl_se);
22827  
22828 -       if (p->nr_cpus_allowed > 1)
22829 +       if (tsk_nr_cpus_allowed(p) > 1)
22830                 dl_rq->dl_nr_migratory++;
22831  
22832         update_dl_migration(dl_rq);
22833 @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
22834  {
22835         struct task_struct *p = dl_task_of(dl_se);
22836  
22837 -       if (p->nr_cpus_allowed > 1)
22838 +       if (tsk_nr_cpus_allowed(p) > 1)
22839                 dl_rq->dl_nr_migratory--;
22840  
22841         update_dl_migration(dl_rq);
22842 @@ -697,6 +697,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
22843  
22844         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22845         timer->function = dl_task_timer;
22846 +       timer->irqsafe = 1;
22847  }
22848  
22849  static
22850 @@ -989,7 +990,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
22851  
22852         enqueue_dl_entity(&p->dl, pi_se, flags);
22853  
22854 -       if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
22855 +       if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
22856                 enqueue_pushable_dl_task(rq, p);
22857  }
22858  
22859 @@ -1067,9 +1068,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
22860          * try to make it stay here, it might be important.
22861          */
22862         if (unlikely(dl_task(curr)) &&
22863 -           (curr->nr_cpus_allowed < 2 ||
22864 +           (tsk_nr_cpus_allowed(curr) < 2 ||
22865              !dl_entity_preempt(&p->dl, &curr->dl)) &&
22866 -           (p->nr_cpus_allowed > 1)) {
22867 +           (tsk_nr_cpus_allowed(p) > 1)) {
22868                 int target = find_later_rq(p);
22869  
22870                 if (target != -1 &&
22871 @@ -1090,7 +1091,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
22872          * Current can't be migrated, useless to reschedule,
22873          * let's hope p can move out.
22874          */
22875 -       if (rq->curr->nr_cpus_allowed == 1 ||
22876 +       if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
22877             cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
22878                 return;
22879  
22880 @@ -1098,7 +1099,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
22881          * p is migratable, so let's not schedule it and
22882          * see if it is pushed or pulled somewhere else.
22883          */
22884 -       if (p->nr_cpus_allowed != 1 &&
22885 +       if (tsk_nr_cpus_allowed(p) != 1 &&
22886             cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
22887                 return;
22888  
22889 @@ -1212,7 +1213,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
22890  {
22891         update_curr_dl(rq);
22892  
22893 -       if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
22894 +       if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
22895                 enqueue_pushable_dl_task(rq, p);
22896  }
22897  
22898 @@ -1335,7 +1336,7 @@ static int find_later_rq(struct task_struct *task)
22899         if (unlikely(!later_mask))
22900                 return -1;
22901  
22902 -       if (task->nr_cpus_allowed == 1)
22903 +       if (tsk_nr_cpus_allowed(task) == 1)
22904                 return -1;
22905  
22906         /*
22907 @@ -1441,7 +1442,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
22908                 if (double_lock_balance(rq, later_rq)) {
22909                         if (unlikely(task_rq(task) != rq ||
22910                                      !cpumask_test_cpu(later_rq->cpu,
22911 -                                                      &task->cpus_allowed) ||
22912 +                                                      tsk_cpus_allowed(task)) ||
22913                                      task_running(rq, task) ||
22914                                      !task_on_rq_queued(task))) {
22915                                 double_unlock_balance(rq, later_rq);
22916 @@ -1480,7 +1481,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
22917  
22918         BUG_ON(rq->cpu != task_cpu(p));
22919         BUG_ON(task_current(rq, p));
22920 -       BUG_ON(p->nr_cpus_allowed <= 1);
22921 +       BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
22922  
22923         BUG_ON(!task_on_rq_queued(p));
22924         BUG_ON(!dl_task(p));
22925 @@ -1519,7 +1520,7 @@ retry:
22926          */
22927         if (dl_task(rq->curr) &&
22928             dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
22929 -           rq->curr->nr_cpus_allowed > 1) {
22930 +           tsk_nr_cpus_allowed(rq->curr) > 1) {
22931                 resched_curr(rq);
22932                 return 0;
22933         }
22934 @@ -1666,9 +1667,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
22935  {
22936         if (!task_running(rq, p) &&
22937             !test_tsk_need_resched(rq->curr) &&
22938 -           p->nr_cpus_allowed > 1 &&
22939 +           tsk_nr_cpus_allowed(p) > 1 &&
22940             dl_task(rq->curr) &&
22941 -           (rq->curr->nr_cpus_allowed < 2 ||
22942 +           (tsk_nr_cpus_allowed(rq->curr) < 2 ||
22943              !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
22944                 push_dl_tasks(rq);
22945         }
22946 @@ -1769,7 +1770,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
22947  {
22948         if (task_on_rq_queued(p) && rq->curr != p) {
22949  #ifdef CONFIG_SMP
22950 -               if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
22951 +               if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
22952                         queue_push_tasks(rq);
22953  #else
22954                 if (dl_task(rq->curr))
22955 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
22956 index 641511771ae6..a2d69b883623 100644
22957 --- a/kernel/sched/debug.c
22958 +++ b/kernel/sched/debug.c
22959 @@ -251,6 +251,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
22960         P(rt_throttled);
22961         PN(rt_time);
22962         PN(rt_runtime);
22963 +#ifdef CONFIG_SMP
22964 +       P(rt_nr_migratory);
22965 +#endif
22966  
22967  #undef PN
22968  #undef P
22969 @@ -635,6 +638,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
22970  #endif
22971         P(policy);
22972         P(prio);
22973 +#ifdef CONFIG_PREEMPT_RT_FULL
22974 +       P(migrate_disable);
22975 +#endif
22976 +       P(nr_cpus_allowed);
22977  #undef PN
22978  #undef __PN
22979  #undef P
22980 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
22981 index 8f258f437ac2..cf0a1adba6c6 100644
22982 --- a/kernel/sched/fair.c
22983 +++ b/kernel/sched/fair.c
22984 @@ -3166,7 +3166,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
22985         ideal_runtime = sched_slice(cfs_rq, curr);
22986         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
22987         if (delta_exec > ideal_runtime) {
22988 -               resched_curr(rq_of(cfs_rq));
22989 +               resched_curr_lazy(rq_of(cfs_rq));
22990                 /*
22991                  * The current task ran long enough, ensure it doesn't get
22992                  * re-elected due to buddy favours.
22993 @@ -3190,7 +3190,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
22994                 return;
22995  
22996         if (delta > ideal_runtime)
22997 -               resched_curr(rq_of(cfs_rq));
22998 +               resched_curr_lazy(rq_of(cfs_rq));
22999  }
23000  
23001  static void
23002 @@ -3330,7 +3330,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
23003          * validating it and just reschedule.
23004          */
23005         if (queued) {
23006 -               resched_curr(rq_of(cfs_rq));
23007 +               resched_curr_lazy(rq_of(cfs_rq));
23008                 return;
23009         }
23010         /*
23011 @@ -3512,7 +3512,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
23012          * hierarchy can be throttled
23013          */
23014         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
23015 -               resched_curr(rq_of(cfs_rq));
23016 +               resched_curr_lazy(rq_of(cfs_rq));
23017  }
23018  
23019  static __always_inline
23020 @@ -4124,7 +4124,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
23021  
23022                 if (delta < 0) {
23023                         if (rq->curr == p)
23024 -                               resched_curr(rq);
23025 +                               resched_curr_lazy(rq);
23026                         return;
23027                 }
23028                 hrtick_start(rq, delta);
23029 @@ -5213,7 +5213,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
23030         return;
23031  
23032  preempt:
23033 -       resched_curr(rq);
23034 +       resched_curr_lazy(rq);
23035         /*
23036          * Only set the backward buddy when the current task is still
23037          * on the rq. This can happen when a wakeup gets interleaved
23038 @@ -7964,7 +7964,7 @@ static void task_fork_fair(struct task_struct *p)
23039                  * 'current' within the tree based on its new key value.
23040                  */
23041                 swap(curr->vruntime, se->vruntime);
23042 -               resched_curr(rq);
23043 +               resched_curr_lazy(rq);
23044         }
23045  
23046         se->vruntime -= cfs_rq->min_vruntime;
23047 @@ -7989,7 +7989,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
23048          */
23049         if (rq->curr == p) {
23050                 if (p->prio > oldprio)
23051 -                       resched_curr(rq);
23052 +                       resched_curr_lazy(rq);
23053         } else
23054                 check_preempt_curr(rq, p, 0);
23055  }
23056 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
23057 index 69631fa46c2f..6d28fcd08872 100644
23058 --- a/kernel/sched/features.h
23059 +++ b/kernel/sched/features.h
23060 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
23061   */
23062  SCHED_FEAT(NONTASK_CAPACITY, true)
23063  
23064 +#ifdef CONFIG_PREEMPT_RT_FULL
23065 +SCHED_FEAT(TTWU_QUEUE, false)
23066 +# ifdef CONFIG_PREEMPT_LAZY
23067 +SCHED_FEAT(PREEMPT_LAZY, true)
23068 +# endif
23069 +#else
23070 +
23071  /*
23072   * Queue remote wakeups on the target CPU and process them
23073   * using the scheduler IPI. Reduces rq->lock contention/bounces.
23074   */
23075  SCHED_FEAT(TTWU_QUEUE, true)
23076 +#endif
23077  
23078  #ifdef HAVE_RT_PUSH_IPI
23079  /*
23080 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
23081 index 8ec86abe0ea1..8cf360d309ec 100644
23082 --- a/kernel/sched/rt.c
23083 +++ b/kernel/sched/rt.c
23084 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
23085  
23086         hrtimer_init(&rt_b->rt_period_timer,
23087                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23088 +       rt_b->rt_period_timer.irqsafe = 1;
23089         rt_b->rt_period_timer.function = sched_rt_period_timer;
23090  }
23091  
23092 @@ -93,6 +94,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
23093         rt_rq->push_cpu = nr_cpu_ids;
23094         raw_spin_lock_init(&rt_rq->push_lock);
23095         init_irq_work(&rt_rq->push_work, push_irq_work_func);
23096 +       rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
23097  #endif
23098  #endif /* CONFIG_SMP */
23099         /* We start is dequeued state, because no RT tasks are queued */
23100 @@ -326,7 +328,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
23101         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
23102  
23103         rt_rq->rt_nr_total++;
23104 -       if (p->nr_cpus_allowed > 1)
23105 +       if (tsk_nr_cpus_allowed(p) > 1)
23106                 rt_rq->rt_nr_migratory++;
23107  
23108         update_rt_migration(rt_rq);
23109 @@ -343,7 +345,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
23110         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
23111  
23112         rt_rq->rt_nr_total--;
23113 -       if (p->nr_cpus_allowed > 1)
23114 +       if (tsk_nr_cpus_allowed(p) > 1)
23115                 rt_rq->rt_nr_migratory--;
23116  
23117         update_rt_migration(rt_rq);
23118 @@ -1262,7 +1264,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
23119  
23120         enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
23121  
23122 -       if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
23123 +       if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
23124                 enqueue_pushable_task(rq, p);
23125  }
23126  
23127 @@ -1351,7 +1353,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
23128          * will have to sort it out.
23129          */
23130         if (curr && unlikely(rt_task(curr)) &&
23131 -           (curr->nr_cpus_allowed < 2 ||
23132 +           (tsk_nr_cpus_allowed(curr) < 2 ||
23133              curr->prio <= p->prio)) {
23134                 int target = find_lowest_rq(p);
23135  
23136 @@ -1375,7 +1377,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
23137          * Current can't be migrated, useless to reschedule,
23138          * let's hope p can move out.
23139          */
23140 -       if (rq->curr->nr_cpus_allowed == 1 ||
23141 +       if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
23142             !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
23143                 return;
23144  
23145 @@ -1383,7 +1385,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
23146          * p is migratable, so let's not schedule it and
23147          * see if it is pushed or pulled somewhere else.
23148          */
23149 -       if (p->nr_cpus_allowed != 1
23150 +       if (tsk_nr_cpus_allowed(p) != 1
23151             && cpupri_find(&rq->rd->cpupri, p, NULL))
23152                 return;
23153  
23154 @@ -1517,7 +1519,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
23155          * The previous task needs to be made eligible for pushing
23156          * if it is still active
23157          */
23158 -       if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
23159 +       if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
23160                 enqueue_pushable_task(rq, p);
23161  }
23162  
23163 @@ -1567,7 +1569,7 @@ static int find_lowest_rq(struct task_struct *task)
23164         if (unlikely(!lowest_mask))
23165                 return -1;
23166  
23167 -       if (task->nr_cpus_allowed == 1)
23168 +       if (tsk_nr_cpus_allowed(task) == 1)
23169                 return -1; /* No other targets possible */
23170  
23171         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
23172 @@ -1699,7 +1701,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
23173  
23174         BUG_ON(rq->cpu != task_cpu(p));
23175         BUG_ON(task_current(rq, p));
23176 -       BUG_ON(p->nr_cpus_allowed <= 1);
23177 +       BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
23178  
23179         BUG_ON(!task_on_rq_queued(p));
23180         BUG_ON(!rt_task(p));
23181 @@ -2059,9 +2061,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
23182  {
23183         if (!task_running(rq, p) &&
23184             !test_tsk_need_resched(rq->curr) &&
23185 -           p->nr_cpus_allowed > 1 &&
23186 +           tsk_nr_cpus_allowed(p) > 1 &&
23187             (dl_task(rq->curr) || rt_task(rq->curr)) &&
23188 -           (rq->curr->nr_cpus_allowed < 2 ||
23189 +           (tsk_nr_cpus_allowed(rq->curr) < 2 ||
23190              rq->curr->prio <= p->prio))
23191                 push_rt_tasks(rq);
23192  }
23193 @@ -2134,7 +2136,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
23194          */
23195         if (task_on_rq_queued(p) && rq->curr != p) {
23196  #ifdef CONFIG_SMP
23197 -               if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
23198 +               if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
23199                         queue_push_tasks(rq);
23200  #else
23201                 if (p->prio < rq->curr->prio)
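
Two small RT annotations recur in the scheduler hunks above: hrtimers whose callbacks must keep running from hard interrupt context on RT are marked irqsafe, and irq_work items that must not be deferred to the irq_work thread get IRQ_WORK_HARD_IRQ. A sketch under those assumptions, with hypothetical timer/work objects and callbacks:

        static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
        {
                return HRTIMER_NORESTART;
        }

        static void my_work_fn(struct irq_work *w)
        {
        }

        static struct hrtimer my_timer;
        static struct irq_work my_work;

        static void sketch_rt_annotations(void)
        {
                hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                my_timer.function = my_timer_fn;
                my_timer.irqsafe = 1;                   /* RT: callback stays in hard irq context */

                init_irq_work(&my_work, my_work_fn);
                my_work.flags |= IRQ_WORK_HARD_IRQ;     /* RT: do not push to the irq_work thread */
        }
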
23202 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
23203 index 0517abd7dd73..a8a9b156ea15 100644
23204 --- a/kernel/sched/sched.h
23205 +++ b/kernel/sched/sched.h
23206 @@ -1100,6 +1100,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
23207  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
23208  #define WF_FORK                0x02            /* child wakeup after fork */
23209  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
23210 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
23211  
23212  /*
23213   * To aid in avoiding the subversion of "niceness" due to uneven distribution
23214 @@ -1299,6 +1300,15 @@ extern void init_sched_fair_class(void);
23215  extern void resched_curr(struct rq *rq);
23216  extern void resched_cpu(int cpu);
23217  
23218 +#ifdef CONFIG_PREEMPT_LAZY
23219 +extern void resched_curr_lazy(struct rq *rq);
23220 +#else
23221 +static inline void resched_curr_lazy(struct rq *rq)
23222 +{
23223 +       resched_curr(rq);
23224 +}
23225 +#endif
23226 +
23227  extern struct rt_bandwidth def_rt_bandwidth;
23228  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
23229  
23230 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
23231 new file mode 100644
23232 index 000000000000..205fe36868f9
23233 --- /dev/null
23234 +++ b/kernel/sched/swait.c
23235 @@ -0,0 +1,143 @@
23236 +#include <linux/sched.h>
23237 +#include <linux/swait.h>
23238 +#include <linux/suspend.h>
23239 +
23240 +void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
23241 +                            struct lock_class_key *key)
23242 +{
23243 +       raw_spin_lock_init(&q->lock);
23244 +       lockdep_set_class_and_name(&q->lock, key, name);
23245 +       INIT_LIST_HEAD(&q->task_list);
23246 +}
23247 +EXPORT_SYMBOL(__init_swait_queue_head);
23248 +
23249 +/*
23250 + * The thing about the wake_up_state() return value; I think we can ignore it.
23251 + *
23252 + * If for some reason it would return 0, that means the previously waiting
23253 + * task is already running, so it will observe condition true (or has already).
23254 + */
23255 +void swake_up_locked(struct swait_queue_head *q)
23256 +{
23257 +       struct swait_queue *curr;
23258 +
23259 +       if (list_empty(&q->task_list))
23260 +               return;
23261 +
23262 +       curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
23263 +       wake_up_process(curr->task);
23264 +       list_del_init(&curr->task_list);
23265 +}
23266 +EXPORT_SYMBOL(swake_up_locked);
23267 +
23268 +void swake_up_all_locked(struct swait_queue_head *q)
23269 +{
23270 +       struct swait_queue *curr;
23271 +       int wakes = 0;
23272 +
23273 +       while (!list_empty(&q->task_list)) {
23274 +
23275 +               curr = list_first_entry(&q->task_list, typeof(*curr),
23276 +                                       task_list);
23277 +               wake_up_process(curr->task);
23278 +               list_del_init(&curr->task_list);
23279 +               wakes++;
23280 +       }
23281 +       if (pm_in_action)
23282 +               return;
23283 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
23284 +}
23285 +EXPORT_SYMBOL(swake_up_all_locked);
23286 +
23287 +void swake_up(struct swait_queue_head *q)
23288 +{
23289 +       unsigned long flags;
23290 +
23291 +       if (!swait_active(q))
23292 +               return;
23293 +
23294 +       raw_spin_lock_irqsave(&q->lock, flags);
23295 +       swake_up_locked(q);
23296 +       raw_spin_unlock_irqrestore(&q->lock, flags);
23297 +}
23298 +EXPORT_SYMBOL(swake_up);
23299 +
23300 +/*
23301 + * Does not allow usage from IRQ-disabled context, since we must be able to
23302 + * release IRQs to guarantee bounded hold time.
23303 + */
23304 +void swake_up_all(struct swait_queue_head *q)
23305 +{
23306 +       struct swait_queue *curr;
23307 +       LIST_HEAD(tmp);
23308 +
23309 +       if (!swait_active(q))
23310 +               return;
23311 +
23312 +       raw_spin_lock_irq(&q->lock);
23313 +       list_splice_init(&q->task_list, &tmp);
23314 +       while (!list_empty(&tmp)) {
23315 +               curr = list_first_entry(&tmp, typeof(*curr), task_list);
23316 +
23317 +               wake_up_state(curr->task, TASK_NORMAL);
23318 +               list_del_init(&curr->task_list);
23319 +
23320 +               if (list_empty(&tmp))
23321 +                       break;
23322 +
23323 +               raw_spin_unlock_irq(&q->lock);
23324 +               raw_spin_lock_irq(&q->lock);
23325 +       }
23326 +       raw_spin_unlock_irq(&q->lock);
23327 +}
23328 +EXPORT_SYMBOL(swake_up_all);
23329 +
23330 +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
23331 +{
23332 +       wait->task = current;
23333 +       if (list_empty(&wait->task_list))
23334 +               list_add(&wait->task_list, &q->task_list);
23335 +}
23336 +
23337 +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
23338 +{
23339 +       unsigned long flags;
23340 +
23341 +       raw_spin_lock_irqsave(&q->lock, flags);
23342 +       __prepare_to_swait(q, wait);
23343 +       set_current_state(state);
23344 +       raw_spin_unlock_irqrestore(&q->lock, flags);
23345 +}
23346 +EXPORT_SYMBOL(prepare_to_swait);
23347 +
23348 +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
23349 +{
23350 +       if (signal_pending_state(state, current))
23351 +               return -ERESTARTSYS;
23352 +
23353 +       prepare_to_swait(q, wait, state);
23354 +
23355 +       return 0;
23356 +}
23357 +EXPORT_SYMBOL(prepare_to_swait_event);
23358 +
23359 +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
23360 +{
23361 +       __set_current_state(TASK_RUNNING);
23362 +       if (!list_empty(&wait->task_list))
23363 +               list_del_init(&wait->task_list);
23364 +}
23365 +
23366 +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
23367 +{
23368 +       unsigned long flags;
23369 +
23370 +       __set_current_state(TASK_RUNNING);
23371 +
23372 +       if (!list_empty_careful(&wait->task_list)) {
23373 +               raw_spin_lock_irqsave(&q->lock, flags);
23374 +               list_del_init(&wait->task_list);
23375 +               raw_spin_unlock_irqrestore(&q->lock, flags);
23376 +       }
23377 +}
23378 +EXPORT_SYMBOL(finish_swait);
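
A hypothetical wait/wake pair built on the simple waitqueue primitives added above. Only prepare_to_swait(), finish_swait() and swake_up() are taken from this file; the queue head, the condition flag and the assumption that the head was set up with the swait.h init helper are illustrative.

        static struct swait_queue_head my_swait_q;      /* assumed initialized via init_swait_queue_head() */
        static bool my_condition;

        static void sketch_wait_for_condition(void)
        {
                struct swait_queue wait;

                INIT_LIST_HEAD(&wait.task_list);
                for (;;) {
                        prepare_to_swait(&my_swait_q, &wait, TASK_UNINTERRUPTIBLE);
                        if (my_condition)
                                break;
                        schedule();
                }
                finish_swait(&my_swait_q, &wait);
        }

        static void sketch_signal_condition(void)
        {
                my_condition = true;
                swake_up(&my_swait_q);          /* wakes at most one waiter */
        }
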
23379 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
23380 new file mode 100644
23381 index 000000000000..1950f40ca725
23382 --- /dev/null
23383 +++ b/kernel/sched/swork.c
23384 @@ -0,0 +1,173 @@
23385 +/*
23386 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner <daniel.wagner@bmw-carit.de>
23387 + *
23388 + * Provides a PREEMPT_RT_FULL-safe framework for enqueuing callbacks from
23389 + * irq context. The callbacks are executed in kthread context.
23390 + */
23391 +
23392 +#include <linux/swait.h>
23393 +#include <linux/swork.h>
23394 +#include <linux/kthread.h>
23395 +#include <linux/slab.h>
23396 +#include <linux/spinlock.h>
23397 +#include <linux/export.h>
23398 +
23399 +#define SWORK_EVENT_PENDING     (1 << 0)
23400 +
23401 +static DEFINE_MUTEX(worker_mutex);
23402 +static struct sworker *glob_worker;
23403 +
23404 +struct sworker {
23405 +       struct list_head events;
23406 +       struct swait_queue_head wq;
23407 +
23408 +       raw_spinlock_t lock;
23409 +
23410 +       struct task_struct *task;
23411 +       int refs;
23412 +};
23413 +
23414 +static bool swork_readable(struct sworker *worker)
23415 +{
23416 +       bool r;
23417 +
23418 +       if (kthread_should_stop())
23419 +               return true;
23420 +
23421 +       raw_spin_lock_irq(&worker->lock);
23422 +       r = !list_empty(&worker->events);
23423 +       raw_spin_unlock_irq(&worker->lock);
23424 +
23425 +       return r;
23426 +}
23427 +
23428 +static int swork_kthread(void *arg)
23429 +{
23430 +       struct sworker *worker = arg;
23431 +
23432 +       for (;;) {
23433 +               swait_event_interruptible(worker->wq,
23434 +                                       swork_readable(worker));
23435 +               if (kthread_should_stop())
23436 +                       break;
23437 +
23438 +               raw_spin_lock_irq(&worker->lock);
23439 +               while (!list_empty(&worker->events)) {
23440 +                       struct swork_event *sev;
23441 +
23442 +                       sev = list_first_entry(&worker->events,
23443 +                                       struct swork_event, item);
23444 +                       list_del(&sev->item);
23445 +                       raw_spin_unlock_irq(&worker->lock);
23446 +
23447 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
23448 +                                                        &sev->flags));
23449 +                       sev->func(sev);
23450 +                       raw_spin_lock_irq(&worker->lock);
23451 +               }
23452 +               raw_spin_unlock_irq(&worker->lock);
23453 +       }
23454 +       return 0;
23455 +}
23456 +
23457 +static struct sworker *swork_create(void)
23458 +{
23459 +       struct sworker *worker;
23460 +
23461 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
23462 +       if (!worker)
23463 +               return ERR_PTR(-ENOMEM);
23464 +
23465 +       INIT_LIST_HEAD(&worker->events);
23466 +       raw_spin_lock_init(&worker->lock);
23467 +       init_swait_queue_head(&worker->wq);
23468 +
23469 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
23470 +       if (IS_ERR(worker->task)) {
23471 +               kfree(worker);
23472 +               return ERR_PTR(-ENOMEM);
23473 +       }
23474 +
23475 +       return worker;
23476 +}
23477 +
23478 +static void swork_destroy(struct sworker *worker)
23479 +{
23480 +       kthread_stop(worker->task);
23481 +
23482 +       WARN_ON(!list_empty(&worker->events));
23483 +       kfree(worker);
23484 +}
23485 +
23486 +/**
23487 + * swork_queue - queue swork
23488 + *
23489 + * Returns %false if @work was already on a queue, %true otherwise.
23490 + *
23491 + * The work is queued and processed on a random CPU
23492 + */
23493 +bool swork_queue(struct swork_event *sev)
23494 +{
23495 +       unsigned long flags;
23496 +
23497 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
23498 +               return false;
23499 +
23500 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
23501 +       list_add_tail(&sev->item, &glob_worker->events);
23502 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
23503 +
23504 +       swake_up(&glob_worker->wq);
23505 +       return true;
23506 +}
23507 +EXPORT_SYMBOL_GPL(swork_queue);
23508 +
23509 +/**
23510 + * swork_get - get an instance of the sworker
23511 + *
23512 + * Returns a negative error code if the initialization of the worker
23513 + * failed, %0 otherwise.
23514 + *
23515 + */
23516 +int swork_get(void)
23517 +{
23518 +       struct sworker *worker;
23519 +
23520 +       mutex_lock(&worker_mutex);
23521 +       if (!glob_worker) {
23522 +               worker = swork_create();
23523 +               if (IS_ERR(worker)) {
23524 +                       mutex_unlock(&worker_mutex);
23525 +                       return -ENOMEM;
23526 +               }
23527 +
23528 +               glob_worker = worker;
23529 +       }
23530 +
23531 +       glob_worker->refs++;
23532 +       mutex_unlock(&worker_mutex);
23533 +
23534 +       return 0;
23535 +}
23536 +EXPORT_SYMBOL_GPL(swork_get);
23537 +
23538 +/**
23539 + * swork_put - puts an instance of the sworker
23540 + *
23541 + * Will destroy the sworker thread. This function must not be called until all
23542 + * queued events have been completed.
23543 + */
23544 +void swork_put(void)
23545 +{
23546 +       mutex_lock(&worker_mutex);
23547 +
23548 +       glob_worker->refs--;
23549 +       if (glob_worker->refs > 0)
23550 +               goto out;
23551 +
23552 +       swork_destroy(glob_worker);
23553 +       glob_worker = NULL;
23554 +out:
23555 +       mutex_unlock(&worker_mutex);
23556 +}
23557 +EXPORT_SYMBOL_GPL(swork_put);
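
A hypothetical user of the swork framework added above: a driver takes a reference with swork_get(), queues its event from hard interrupt context with swork_queue(), and the callback then runs preemptibly in the "kswork" kthread. Only the swork_* calls and the .func field are taken from this file; the driver-style names and the interrupt handler are made up.

        static void my_deferred_handler(struct swork_event *sev)
        {
                /* runs in the kswork kthread, fully preemptible */
        }

        static struct swork_event my_event = { .func = my_deferred_handler };

        static int my_driver_init(void)
        {
                return swork_get();             /* creates/refcounts the worker thread */
        }

        static irqreturn_t my_irq_handler(int irq, void *dev_id)
        {
                swork_queue(&my_event);         /* safe from hard irq context */
                return IRQ_HANDLED;
        }

        static void my_driver_exit(void)
        {
                swork_put();                    /* only after all queued events completed */
        }
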
23558 diff --git a/kernel/signal.c b/kernel/signal.c
23559 index f3f1f7a972fd..bc2c990f3f63 100644
23560 --- a/kernel/signal.c
23561 +++ b/kernel/signal.c
23562 @@ -14,6 +14,7 @@
23563  #include <linux/export.h>
23564  #include <linux/init.h>
23565  #include <linux/sched.h>
23566 +#include <linux/sched/rt.h>
23567  #include <linux/fs.h>
23568  #include <linux/tty.h>
23569  #include <linux/binfmts.h>
23570 @@ -352,13 +353,30 @@ static bool task_participate_group_stop(struct task_struct *task)
23571         return false;
23572  }
23573  
23574 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
23575 +{
23576 +       struct sigqueue *q = t->sigqueue_cache;
23577 +
23578 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
23579 +               return NULL;
23580 +       return q;
23581 +}
23582 +
23583 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
23584 +{
23585 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
23586 +               return 0;
23587 +       return 1;
23588 +}
23589 +
23590  /*
23591   * allocate a new signal queue record
23592   * - this may be called without locks if and only if t == current, otherwise an
23593   *   appropriate lock must be held to stop the target task from exiting
23594   */
23595  static struct sigqueue *
23596 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
23597 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
23598 +                   int override_rlimit, int fromslab)
23599  {
23600         struct sigqueue *q = NULL;
23601         struct user_struct *user;
23602 @@ -375,7 +393,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
23603         if (override_rlimit ||
23604             atomic_read(&user->sigpending) <=
23605                         task_rlimit(t, RLIMIT_SIGPENDING)) {
23606 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
23607 +               if (!fromslab)
23608 +                       q = get_task_cache(t);
23609 +               if (!q)
23610 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
23611         } else {
23612                 print_dropped_signal(sig);
23613         }
23614 @@ -392,6 +413,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
23615         return q;
23616  }
23617  
23618 +static struct sigqueue *
23619 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
23620 +                int override_rlimit)
23621 +{
23622 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
23623 +}
23624 +
23625  static void __sigqueue_free(struct sigqueue *q)
23626  {
23627         if (q->flags & SIGQUEUE_PREALLOC)
23628 @@ -401,6 +429,21 @@ static void __sigqueue_free(struct sigqueue *q)
23629         kmem_cache_free(sigqueue_cachep, q);
23630  }
23631  
23632 +static void sigqueue_free_current(struct sigqueue *q)
23633 +{
23634 +       struct user_struct *up;
23635 +
23636 +       if (q->flags & SIGQUEUE_PREALLOC)
23637 +               return;
23638 +
23639 +       up = q->user;
23640 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
23641 +               atomic_dec(&up->sigpending);
23642 +               free_uid(up);
23643 +       } else
23644 +               __sigqueue_free(q);
23645 +}
23646 +
23647  void flush_sigqueue(struct sigpending *queue)
23648  {
23649         struct sigqueue *q;
23650 @@ -414,6 +457,21 @@ void flush_sigqueue(struct sigpending *queue)
23651  }
23652  
23653  /*
23654 + * Called from __exit_signal. Flush tsk->pending and
23655 + * tsk->sigqueue_cache
23656 + */
23657 +void flush_task_sigqueue(struct task_struct *tsk)
23658 +{
23659 +       struct sigqueue *q;
23660 +
23661 +       flush_sigqueue(&tsk->pending);
23662 +
23663 +       q = get_task_cache(tsk);
23664 +       if (q)
23665 +               kmem_cache_free(sigqueue_cachep, q);
23666 +}
23667 +
23668 +/*
23669   * Flush all pending signals for this kthread.
23670   */
23671  void flush_signals(struct task_struct *t)
23672 @@ -525,7 +583,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
23673  still_pending:
23674                 list_del_init(&first->list);
23675                 copy_siginfo(info, &first->info);
23676 -               __sigqueue_free(first);
23677 +               sigqueue_free_current(first);
23678         } else {
23679                 /*
23680                  * Ok, it wasn't in the queue.  This must be
23681 @@ -560,6 +618,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
23682  {
23683         int signr;
23684  
23685 +       WARN_ON_ONCE(tsk != current);
23686 +
23687         /* We only dequeue private signals from ourselves, we don't let
23688          * signalfd steal them
23689          */
23690 @@ -1156,8 +1216,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
23691   * We don't want to have recursive SIGSEGV's etc, for example,
23692   * that is why we also clear SIGNAL_UNKILLABLE.
23693   */
23694 -int
23695 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23696 +static int
23697 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23698  {
23699         unsigned long int flags;
23700         int ret, blocked, ignored;
23701 @@ -1182,6 +1242,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23702         return ret;
23703  }
23704  
23705 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23706 +{
23707 +/*
23708 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
23709 + * since it can not enable preemption, and the signal code's spin_locks
23710 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
23711 + * send the signal on exit of the trap.
23712 + */
23713 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
23714 +       if (in_atomic()) {
23715 +               if (WARN_ON_ONCE(t != current))
23716 +                       return 0;
23717 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
23718 +                       return 0;
23719 +
23720 +               if (is_si_special(info)) {
23721 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
23722 +                       t->forced_info.si_signo = sig;
23723 +                       t->forced_info.si_errno = 0;
23724 +                       t->forced_info.si_code = SI_KERNEL;
23725 +                       t->forced_info.si_pid = 0;
23726 +                       t->forced_info.si_uid = 0;
23727 +               } else {
23728 +                       t->forced_info = *info;
23729 +               }
23730 +
23731 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
23732 +               return 0;
23733 +       }
23734 +#endif
23735 +       return do_force_sig_info(sig, info, t);
23736 +}
23737 +
23738  /*
23739   * Nuke all other threads in the group.
23740   */
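
The force_sig_info() split above is easier to follow outside the kernel: when ARCH_RT_DELAYS_SIGNAL_SEND is set and the trap path cannot take the (now sleeping) signal locks, it records the request in the task and sets TIF_NOTIFY_RESUME so the real work happens on the way out of the trap. Below is a minimal userspace analogue of that record-now, deliver-on-exit pattern; request_signal(), notify_resume() and struct deferred are invented for the illustration and are not kernel APIs.

/* Illustrative sketch only: defer an action from a context that must
 * not perform it, and run it later at a well-defined exit point. */
#include <stdbool.h>
#include <stdio.h>

struct deferred {
        bool pending;
        int  signo;
};

static struct deferred forced_info;

/* Called from a context that cannot do the real work right now. */
static void request_signal(int signo)
{
        forced_info.signo = signo;
        forced_info.pending = true;     /* plays the role of TIF_NOTIFY_RESUME */
}

/* Called on the "exit of the trap" path, where the work is allowed. */
static void notify_resume(void)
{
        if (!forced_info.pending)
                return;
        forced_info.pending = false;
        printf("delivering deferred signal %d\n", forced_info.signo);
}

int main(void)
{
        request_signal(11);             /* e.g. a fault noticed in atomic context */
        /* ... unwind back toward user space ... */
        notify_resume();
        return 0;
}
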
23741 @@ -1216,12 +1309,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
23742                  * Disable interrupts early to avoid deadlocks.
23743                  * See rcu_read_unlock() comment header for details.
23744                  */
23745 -               local_irq_save(*flags);
23746 +               local_irq_save_nort(*flags);
23747                 rcu_read_lock();
23748                 sighand = rcu_dereference(tsk->sighand);
23749                 if (unlikely(sighand == NULL)) {
23750                         rcu_read_unlock();
23751 -                       local_irq_restore(*flags);
23752 +                       local_irq_restore_nort(*flags);
23753                         break;
23754                 }
23755                 /*
23756 @@ -1242,7 +1335,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
23757                 }
23758                 spin_unlock(&sighand->siglock);
23759                 rcu_read_unlock();
23760 -               local_irq_restore(*flags);
23761 +               local_irq_restore_nort(*flags);
23762         }
23763  
23764         return sighand;
23765 @@ -1485,7 +1578,8 @@ EXPORT_SYMBOL(kill_pid);
23766   */
23767  struct sigqueue *sigqueue_alloc(void)
23768  {
23769 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
23770 +       /* Preallocated sigqueue objects always come from the slabcache! */
23771 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
23772  
23773         if (q)
23774                 q->flags |= SIGQUEUE_PREALLOC;
23775 @@ -1846,15 +1940,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
23776                 if (gstop_done && ptrace_reparented(current))
23777                         do_notify_parent_cldstop(current, false, why);
23778  
23779 -               /*
23780 -                * Don't want to allow preemption here, because
23781 -                * sys_ptrace() needs this task to be inactive.
23782 -                *
23783 -                * XXX: implement read_unlock_no_resched().
23784 -                */
23785 -               preempt_disable();
23786                 read_unlock(&tasklist_lock);
23787 -               preempt_enable_no_resched();
23788                 freezable_schedule();
23789         } else {
23790                 /*
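
The sigqueue_free_current()/__sigqueue_do_alloc() pair above gives RT tasks a one-slot per-task cache (tsk->sigqueue_cache, filled and drained by the put_task_cache()/get_task_cache() helpers added earlier in this patch) so the signal fast path can skip the slab allocator. A minimal userspace analogue of such a one-slot cache, with invented names and plain malloc()/free() standing in for the slab cache, might look like this:

/* Illustrative sketch only: a one-element per-thread cache in front of
 * a general-purpose allocator. */
#include <stdlib.h>

struct item { int payload; };

static _Thread_local struct item *one_slot_cache;

static struct item *cache_alloc(void)
{
        struct item *it = one_slot_cache;

        if (it) {                       /* fast path: reuse the cached object */
                one_slot_cache = NULL;
                return it;
        }
        return malloc(sizeof(*it));     /* slow path: real allocator */
}

static void cache_free(struct item *it)
{
        if (!one_slot_cache) {          /* keep one object around for reuse */
                one_slot_cache = it;
                return;
        }
        free(it);                       /* cache already full: really free it */
}

int main(void)
{
        struct item *a = cache_alloc(); /* from malloc() */

        cache_free(a);                  /* parked in the per-thread slot */
        a = cache_alloc();              /* served from the slot, no malloc() */
        free(a);
        return 0;
}
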
23791 diff --git a/kernel/softirq.c b/kernel/softirq.c
23792 index 479e4436f787..cb9c1d5dee10 100644
23793 --- a/kernel/softirq.c
23794 +++ b/kernel/softirq.c
23795 @@ -21,10 +21,12 @@
23796  #include <linux/freezer.h>
23797  #include <linux/kthread.h>
23798  #include <linux/rcupdate.h>
23799 +#include <linux/delay.h>
23800  #include <linux/ftrace.h>
23801  #include <linux/smp.h>
23802  #include <linux/smpboot.h>
23803  #include <linux/tick.h>
23804 +#include <linux/locallock.h>
23805  #include <linux/irq.h>
23806  
23807  #define CREATE_TRACE_POINTS
23808 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
23809  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
23810  
23811  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
23812 +#ifdef CONFIG_PREEMPT_RT_FULL
23813 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
23814 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
23815 +#endif
23816  
23817  const char * const softirq_to_name[NR_SOFTIRQS] = {
23818         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
23819         "TASKLET", "SCHED", "HRTIMER", "RCU"
23820  };
23821  
23822 +#ifdef CONFIG_NO_HZ_COMMON
23823 +# ifdef CONFIG_PREEMPT_RT_FULL
23824 +
23825 +struct softirq_runner {
23826 +       struct task_struct *runner[NR_SOFTIRQS];
23827 +};
23828 +
23829 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
23830 +
23831 +static inline void softirq_set_runner(unsigned int sirq)
23832 +{
23833 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23834 +
23835 +       sr->runner[sirq] = current;
23836 +}
23837 +
23838 +static inline void softirq_clr_runner(unsigned int sirq)
23839 +{
23840 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23841 +
23842 +       sr->runner[sirq] = NULL;
23843 +}
23844 +
23845 +/*
23846 + * On preempt-rt a softirq running context might be blocked on a
23847 + * lock. There might be no other runnable task on this CPU because the
23848 + * lock owner runs on some other CPU. So we have to go into idle with
23849 + * the pending bit set. Therefore we need to check this, otherwise we
23850 + * warn about false positives, which confuses users and defeats the
23851 + * whole purpose of this test.
23852 + *
23853 + * This code is called with interrupts disabled.
23854 + */
23855 +void softirq_check_pending_idle(void)
23856 +{
23857 +       static int rate_limit;
23858 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
23859 +       u32 warnpending;
23860 +       int i;
23861 +
23862 +       if (rate_limit >= 10)
23863 +               return;
23864 +
23865 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
23866 +       for (i = 0; i < NR_SOFTIRQS; i++) {
23867 +               struct task_struct *tsk = sr->runner[i];
23868 +
23869 +               /*
23870 +                * The wakeup code in rtmutex.c wakes up the task
23871 +                * _before_ it sets pi_blocked_on to NULL under
23872 +                * tsk->pi_lock. So we need to check for both: state
23873 +                * and pi_blocked_on.
23874 +                */
23875 +               if (tsk) {
23876 +                       raw_spin_lock(&tsk->pi_lock);
23877 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
23878 +                               /* Clear all bits pending in that task */
23879 +                               warnpending &= ~(tsk->softirqs_raised);
23880 +                               warnpending &= ~(1 << i);
23881 +                       }
23882 +                       raw_spin_unlock(&tsk->pi_lock);
23883 +               }
23884 +       }
23885 +
23886 +       if (warnpending) {
23887 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23888 +                      warnpending);
23889 +               rate_limit++;
23890 +       }
23891 +}
23892 +# else
23893 +/*
23894 + * On !PREEMPT_RT we just printk rate limited:
23895 + */
23896 +void softirq_check_pending_idle(void)
23897 +{
23898 +       static int rate_limit;
23899 +
23900 +       if (rate_limit < 10 &&
23901 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
23902 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
23903 +                      local_softirq_pending());
23904 +               rate_limit++;
23905 +       }
23906 +}
23907 +# endif
23908 +
23909 +#else /* !CONFIG_NO_HZ_COMMON */
23910 +static inline void softirq_set_runner(unsigned int sirq) { }
23911 +static inline void softirq_clr_runner(unsigned int sirq) { }
23912 +#endif
23913 +
23914  /*
23915   * we cannot loop indefinitely here to avoid userspace starvation,
23916   * but we also don't want to introduce a worst case 1/HZ latency
23917 @@ -77,6 +175,79 @@ static void wakeup_softirqd(void)
23918                 wake_up_process(tsk);
23919  }
23920  
23921 +#ifdef CONFIG_PREEMPT_RT_FULL
23922 +static void wakeup_timer_softirqd(void)
23923 +{
23924 +       /* Interrupts are disabled: no need to stop preemption */
23925 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
23926 +
23927 +       if (tsk && tsk->state != TASK_RUNNING)
23928 +               wake_up_process(tsk);
23929 +}
23930 +#endif
23931 +
23932 +static void handle_softirq(unsigned int vec_nr)
23933 +{
23934 +       struct softirq_action *h = softirq_vec + vec_nr;
23935 +       int prev_count;
23936 +
23937 +       prev_count = preempt_count();
23938 +
23939 +       kstat_incr_softirqs_this_cpu(vec_nr);
23940 +
23941 +       trace_softirq_entry(vec_nr);
23942 +       h->action(h);
23943 +       trace_softirq_exit(vec_nr);
23944 +       if (unlikely(prev_count != preempt_count())) {
23945 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
23946 +                      vec_nr, softirq_to_name[vec_nr], h->action,
23947 +                      prev_count, preempt_count());
23948 +               preempt_count_set(prev_count);
23949 +       }
23950 +}
23951 +
23952 +#ifndef CONFIG_PREEMPT_RT_FULL
23953 +static inline int ksoftirqd_softirq_pending(void)
23954 +{
23955 +       return local_softirq_pending();
23956 +}
23957 +
23958 +static void handle_pending_softirqs(u32 pending)
23959 +{
23960 +       struct softirq_action *h = softirq_vec;
23961 +       int softirq_bit;
23962 +
23963 +       local_irq_enable();
23964 +
23965 +       h = softirq_vec;
23966 +
23967 +       while ((softirq_bit = ffs(pending))) {
23968 +               unsigned int vec_nr;
23969 +
23970 +               h += softirq_bit - 1;
23971 +               vec_nr = h - softirq_vec;
23972 +               handle_softirq(vec_nr);
23973 +
23974 +               h++;
23975 +               pending >>= softirq_bit;
23976 +       }
23977 +
23978 +       rcu_bh_qs();
23979 +       local_irq_disable();
23980 +}
23981 +
23982 +static void run_ksoftirqd(unsigned int cpu)
23983 +{
23984 +       local_irq_disable();
23985 +       if (ksoftirqd_softirq_pending()) {
23986 +               __do_softirq();
23987 +               local_irq_enable();
23988 +               cond_resched_rcu_qs();
23989 +               return;
23990 +       }
23991 +       local_irq_enable();
23992 +}
23993 +
23994  /*
23995   * preempt_count and SOFTIRQ_OFFSET usage:
23996   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
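
handle_pending_softirqs() above keeps the mainline dispatch pattern: walk the pending bitmask with ffs(), service the corresponding vector, then shift the consumed bits away. A small userspace sketch of the same bit-walking loop (the handler and the mask value are made up for the example):

/* Illustrative sketch only: dispatch over the set bits of a mask the
 * way __do_softirq()/handle_pending_softirqs() do. */
#include <stdio.h>
#include <strings.h>                    /* ffs() */

static void handler(unsigned int nr)
{
        printf("servicing vector %u\n", nr);
}

int main(void)
{
        unsigned int pending = (1u << 1) | (1u << 4) | (1u << 7);
        unsigned int base = 0;
        int bit;

        while ((bit = ffs(pending))) {
                unsigned int nr = base + bit - 1;

                handler(nr);            /* prints 1, 4, 7 in turn */
                base += bit;            /* account for the bits consumed */
                pending >>= bit;        /* drop everything up to this bit */
        }
        return 0;
}
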
23997 @@ -116,9 +287,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
23998  
23999         if (preempt_count() == cnt) {
24000  #ifdef CONFIG_DEBUG_PREEMPT
24001 -               current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
24002 +               current->preempt_disable_ip = get_lock_parent_ip();
24003  #endif
24004 -               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
24005 +               trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
24006         }
24007  }
24008  EXPORT_SYMBOL(__local_bh_disable_ip);
24009 @@ -232,10 +403,8 @@ asmlinkage __visible void __do_softirq(void)
24010         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
24011         unsigned long old_flags = current->flags;
24012         int max_restart = MAX_SOFTIRQ_RESTART;
24013 -       struct softirq_action *h;
24014         bool in_hardirq;
24015         __u32 pending;
24016 -       int softirq_bit;
24017  
24018         /*
24019          * Mask out PF_MEMALLOC s current task context is borrowed for the
24020 @@ -254,36 +423,7 @@ restart:
24021         /* Reset the pending bitmask before enabling irqs */
24022         set_softirq_pending(0);
24023  
24024 -       local_irq_enable();
24025 -
24026 -       h = softirq_vec;
24027 -
24028 -       while ((softirq_bit = ffs(pending))) {
24029 -               unsigned int vec_nr;
24030 -               int prev_count;
24031 -
24032 -               h += softirq_bit - 1;
24033 -
24034 -               vec_nr = h - softirq_vec;
24035 -               prev_count = preempt_count();
24036 -
24037 -               kstat_incr_softirqs_this_cpu(vec_nr);
24038 -
24039 -               trace_softirq_entry(vec_nr);
24040 -               h->action(h);
24041 -               trace_softirq_exit(vec_nr);
24042 -               if (unlikely(prev_count != preempt_count())) {
24043 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
24044 -                              vec_nr, softirq_to_name[vec_nr], h->action,
24045 -                              prev_count, preempt_count());
24046 -                       preempt_count_set(prev_count);
24047 -               }
24048 -               h++;
24049 -               pending >>= softirq_bit;
24050 -       }
24051 -
24052 -       rcu_bh_qs();
24053 -       local_irq_disable();
24054 +       handle_pending_softirqs(pending);
24055  
24056         pending = local_softirq_pending();
24057         if (pending) {
24058 @@ -320,6 +460,310 @@ asmlinkage __visible void do_softirq(void)
24059  }
24060  
24061  /*
24062 + * This function must run with irqs disabled!
24063 + */
24064 +void raise_softirq_irqoff(unsigned int nr)
24065 +{
24066 +       __raise_softirq_irqoff(nr);
24067 +
24068 +       /*
24069 +        * If we're in an interrupt or softirq, we're done
24070 +        * (this also catches softirq-disabled code). We will
24071 +        * actually run the softirq once we return from
24072 +        * the irq or softirq.
24073 +        *
24074 +        * Otherwise we wake up ksoftirqd to make sure we
24075 +        * schedule the softirq soon.
24076 +        */
24077 +       if (!in_interrupt())
24078 +               wakeup_softirqd();
24079 +}
24080 +
24081 +void __raise_softirq_irqoff(unsigned int nr)
24082 +{
24083 +       trace_softirq_raise(nr);
24084 +       or_softirq_pending(1UL << nr);
24085 +}
24086 +
24087 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
24088 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
24089 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
24090 +
24091 +#else /* !PREEMPT_RT_FULL */
24092 +
24093 +/*
24094 + * On RT we serialize softirq execution with a cpu local lock per softirq
24095 + */
24096 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
24097 +
24098 +void __init softirq_early_init(void)
24099 +{
24100 +       int i;
24101 +
24102 +       for (i = 0; i < NR_SOFTIRQS; i++)
24103 +               local_irq_lock_init(local_softirq_locks[i]);
24104 +}
24105 +
24106 +static void lock_softirq(int which)
24107 +{
24108 +       local_lock(local_softirq_locks[which]);
24109 +}
24110 +
24111 +static void unlock_softirq(int which)
24112 +{
24113 +       local_unlock(local_softirq_locks[which]);
24114 +}
24115 +
24116 +static void do_single_softirq(int which)
24117 +{
24118 +       unsigned long old_flags = current->flags;
24119 +
24120 +       current->flags &= ~PF_MEMALLOC;
24121 +       vtime_account_irq_enter(current);
24122 +       current->flags |= PF_IN_SOFTIRQ;
24123 +       lockdep_softirq_enter();
24124 +       local_irq_enable();
24125 +       handle_softirq(which);
24126 +       local_irq_disable();
24127 +       lockdep_softirq_exit();
24128 +       current->flags &= ~PF_IN_SOFTIRQ;
24129 +       vtime_account_irq_enter(current);
24130 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
24131 +}
24132 +
24133 +/*
24134 + * Called with interrupts disabled. Process softirqs which were raised
24135 + * in current context (or on behalf of ksoftirqd).
24136 + */
24137 +static void do_current_softirqs(void)
24138 +{
24139 +       while (current->softirqs_raised) {
24140 +               int i = __ffs(current->softirqs_raised);
24141 +               unsigned int pending, mask = (1U << i);
24142 +
24143 +               current->softirqs_raised &= ~mask;
24144 +               local_irq_enable();
24145 +
24146 +               /*
24147 +                * If the lock is contended, we boost the owner to
24148 +                * process the softirq or leave the critical section
24149 +                * now.
24150 +                */
24151 +               lock_softirq(i);
24152 +               local_irq_disable();
24153 +               softirq_set_runner(i);
24154 +               /*
24155 +                * Check with the local_softirq_pending() bits,
24156 +                * whether we need to process this still or if someone
24157 +                * else took care of it.
24158 +                */
24159 +               pending = local_softirq_pending();
24160 +               if (pending & mask) {
24161 +                       set_softirq_pending(pending & ~mask);
24162 +                       do_single_softirq(i);
24163 +               }
24164 +               softirq_clr_runner(i);
24165 +               WARN_ON(current->softirq_nestcnt != 1);
24166 +               local_irq_enable();
24167 +               unlock_softirq(i);
24168 +               local_irq_disable();
24169 +       }
24170 +}
24171 +
24172 +void __local_bh_disable(void)
24173 +{
24174 +       if (++current->softirq_nestcnt == 1)
24175 +               migrate_disable();
24176 +}
24177 +EXPORT_SYMBOL(__local_bh_disable);
24178 +
24179 +void __local_bh_enable(void)
24180 +{
24181 +       if (WARN_ON(current->softirq_nestcnt == 0))
24182 +               return;
24183 +
24184 +       local_irq_disable();
24185 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
24186 +               do_current_softirqs();
24187 +       local_irq_enable();
24188 +
24189 +       if (--current->softirq_nestcnt == 0)
24190 +               migrate_enable();
24191 +}
24192 +EXPORT_SYMBOL(__local_bh_enable);
24193 +
24194 +void _local_bh_enable(void)
24195 +{
24196 +       if (WARN_ON(current->softirq_nestcnt == 0))
24197 +               return;
24198 +       if (--current->softirq_nestcnt == 0)
24199 +               migrate_enable();
24200 +}
24201 +EXPORT_SYMBOL(_local_bh_enable);
24202 +
24203 +int in_serving_softirq(void)
24204 +{
24205 +       return current->flags & PF_IN_SOFTIRQ;
24206 +}
24207 +EXPORT_SYMBOL(in_serving_softirq);
24208 +
24209 +/* Called with preemption disabled */
24210 +static void run_ksoftirqd(unsigned int cpu)
24211 +{
24212 +       local_irq_disable();
24213 +       current->softirq_nestcnt++;
24214 +
24215 +       do_current_softirqs();
24216 +       current->softirq_nestcnt--;
24217 +       local_irq_enable();
24218 +       cond_resched_rcu_qs();
24219 +}
24220 +
24221 +/*
24222 + * Called from netif_rx_ni(). Preemption enabled, but migration
24223 + * disabled. So the cpu can't go away under us.
24224 + */
24225 +void thread_do_softirq(void)
24226 +{
24227 +       if (!in_serving_softirq() && current->softirqs_raised) {
24228 +               current->softirq_nestcnt++;
24229 +               do_current_softirqs();
24230 +               current->softirq_nestcnt--;
24231 +       }
24232 +}
24233 +
24234 +static void do_raise_softirq_irqoff(unsigned int nr)
24235 +{
24236 +       unsigned int mask;
24237 +
24238 +       mask = 1UL << nr;
24239 +
24240 +       trace_softirq_raise(nr);
24241 +       or_softirq_pending(mask);
24242 +
24243 +       /*
24244 +        * If we are not in a hard interrupt and inside a bh disabled
24245 +        * region, we simply raise the flag on current. local_bh_enable()
24246 +        * will make sure that the softirq is executed. Otherwise we
24247 +        * delegate it to ksoftirqd.
24248 +        */
24249 +       if (!in_irq() && current->softirq_nestcnt)
24250 +               current->softirqs_raised |= mask;
24251 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
24252 +               return;
24253 +
24254 +       if (mask & TIMER_SOFTIRQS)
24255 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24256 +       else
24257 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24258 +}
24259 +
24260 +static void wakeup_proper_softirq(unsigned int nr)
24261 +{
24262 +       if ((1UL << nr) & TIMER_SOFTIRQS)
24263 +               wakeup_timer_softirqd();
24264 +       else
24265 +               wakeup_softirqd();
24266 +}
24267 +
24268 +
24269 +void __raise_softirq_irqoff(unsigned int nr)
24270 +{
24271 +       do_raise_softirq_irqoff(nr);
24272 +       if (!in_irq() && !current->softirq_nestcnt)
24273 +               wakeup_proper_softirq(nr);
24274 +}
24275 +
24276 +/*
24277 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
24278 + */
24279 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
24280 +{
24281 +       unsigned int mask;
24282 +
24283 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
24284 +                        !__this_cpu_read(ktimer_softirqd)))
24285 +               return;
24286 +       mask = 1UL << nr;
24287 +
24288 +       trace_softirq_raise(nr);
24289 +       or_softirq_pending(mask);
24290 +       if (mask & TIMER_SOFTIRQS)
24291 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24292 +       else
24293 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24294 +       wakeup_proper_softirq(nr);
24295 +}
24296 +
24297 +/*
24298 + * This function must run with irqs disabled!
24299 + */
24300 +void raise_softirq_irqoff(unsigned int nr)
24301 +{
24302 +       do_raise_softirq_irqoff(nr);
24303 +
24304 +       /*
24305 +        * If we're in a hard interrupt, we let the irq return code deal
24306 +        * with the wakeup of ksoftirqd.
24307 +        */
24308 +       if (in_irq())
24309 +               return;
24310 +       /*
24311 +        * If we are in thread context but outside of a bh disabled
24312 +        * region, we need to wake ksoftirqd as well.
24313 +        *
24314 +        * CHECKME: Some of the places which do that could be wrapped
24315 +        * into local_bh_disable/enable pairs. Though it's unclear
24316 +        * whether this is worth the effort. To find those places just
24317 +        * raise a WARN() if the condition is met.
24318 +        */
24319 +       if (!current->softirq_nestcnt)
24320 +               wakeup_proper_softirq(nr);
24321 +}
24322 +
24323 +static inline int ksoftirqd_softirq_pending(void)
24324 +{
24325 +       return current->softirqs_raised;
24326 +}
24327 +
24328 +static inline void local_bh_disable_nort(void) { }
24329 +static inline void _local_bh_enable_nort(void) { }
24330 +
24331 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
24332 +{
24333 +       /* Take over all but timer pending softirqs when starting */
24334 +       local_irq_disable();
24335 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
24336 +       local_irq_enable();
24337 +}
24338 +
24339 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
24340 +{
24341 +       struct sched_param param = { .sched_priority = 1 };
24342 +
24343 +       sched_setscheduler(current, SCHED_FIFO, &param);
24344 +
24345 +       /* Take over timer pending softirqs when starting */
24346 +       local_irq_disable();
24347 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
24348 +       local_irq_enable();
24349 +}
24350 +
24351 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
24352 +                                                   bool online)
24353 +{
24354 +       struct sched_param param = { .sched_priority = 0 };
24355 +
24356 +       sched_setscheduler(current, SCHED_NORMAL, &param);
24357 +}
24358 +
24359 +static int ktimer_softirqd_should_run(unsigned int cpu)
24360 +{
24361 +       return current->softirqs_raised;
24362 +}
24363 +
24364 +#endif /* PREEMPT_RT_FULL */
24365 +/*
24366   * Enter an interrupt context.
24367   */
24368  void irq_enter(void)
24369 @@ -330,9 +774,9 @@ void irq_enter(void)
24370                  * Prevent raise_softirq from needlessly waking up ksoftirqd
24371                  * here, as softirq will be serviced on return from interrupt.
24372                  */
24373 -               local_bh_disable();
24374 +               local_bh_disable_nort();
24375                 tick_irq_enter();
24376 -               _local_bh_enable();
24377 +               _local_bh_enable_nort();
24378         }
24379  
24380         __irq_enter();
24381 @@ -340,6 +784,7 @@ void irq_enter(void)
24382  
24383  static inline void invoke_softirq(void)
24384  {
24385 +#ifndef CONFIG_PREEMPT_RT_FULL
24386         if (!force_irqthreads) {
24387  #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
24388                 /*
24389 @@ -359,6 +804,18 @@ static inline void invoke_softirq(void)
24390         } else {
24391                 wakeup_softirqd();
24392         }
24393 +#else /* PREEMPT_RT_FULL */
24394 +       unsigned long flags;
24395 +
24396 +       local_irq_save(flags);
24397 +       if (__this_cpu_read(ksoftirqd) &&
24398 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
24399 +               wakeup_softirqd();
24400 +       if (__this_cpu_read(ktimer_softirqd) &&
24401 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
24402 +               wakeup_timer_softirqd();
24403 +       local_irq_restore(flags);
24404 +#endif
24405  }
24406  
24407  static inline void tick_irq_exit(void)
24408 @@ -395,26 +852,6 @@ void irq_exit(void)
24409         trace_hardirq_exit(); /* must be last! */
24410  }
24411  
24412 -/*
24413 - * This function must run with irqs disabled!
24414 - */
24415 -inline void raise_softirq_irqoff(unsigned int nr)
24416 -{
24417 -       __raise_softirq_irqoff(nr);
24418 -
24419 -       /*
24420 -        * If we're in an interrupt or softirq, we're done
24421 -        * (this also catches softirq-disabled code). We will
24422 -        * actually run the softirq once we return from
24423 -        * the irq or softirq.
24424 -        *
24425 -        * Otherwise we wake up ksoftirqd to make sure we
24426 -        * schedule the softirq soon.
24427 -        */
24428 -       if (!in_interrupt())
24429 -               wakeup_softirqd();
24430 -}
24431 -
24432  void raise_softirq(unsigned int nr)
24433  {
24434         unsigned long flags;
24435 @@ -424,12 +861,6 @@ void raise_softirq(unsigned int nr)
24436         local_irq_restore(flags);
24437  }
24438  
24439 -void __raise_softirq_irqoff(unsigned int nr)
24440 -{
24441 -       trace_softirq_raise(nr);
24442 -       or_softirq_pending(1UL << nr);
24443 -}
24444 -
24445  void open_softirq(int nr, void (*action)(struct softirq_action *))
24446  {
24447         softirq_vec[nr].action = action;
24448 @@ -446,15 +877,45 @@ struct tasklet_head {
24449  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
24450  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
24451  
24452 +static inline void
24453 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
24454 +{
24455 +       if (tasklet_trylock(t)) {
24456 +again:
24457 +               /* We may have been preempted before tasklet_trylock
24458 +                * and __tasklet_action may have already run.
24459 +                * So double-check the sched bit while the tasklet
24460 +                * is locked before adding it to the list.
24461 +                */
24462 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
24463 +                       t->next = NULL;
24464 +                       *head->tail = t;
24465 +                       head->tail = &(t->next);
24466 +                       raise_softirq_irqoff(nr);
24467 +                       tasklet_unlock(t);
24468 +               } else {
24469 +                       /* This is subtle. If we hit the corner case above,
24470 +                        * it is possible that we get preempted right here,
24471 +                        * and another task has successfully called
24472 +                        * tasklet_schedule(), then this function, and
24473 +                        * failed on the trylock. Thus we must be sure,
24474 +                        * before releasing the tasklet lock, that the
24475 +                        * SCHED_BIT is clear. Otherwise the tasklet
24476 +                        * may get its SCHED_BIT set, but not be added to
24477 +                        * the list.
24478 +                        */
24479 +                       if (!tasklet_tryunlock(t))
24480 +                               goto again;
24481 +               }
24482 +       }
24483 +}
24484 +
24485  void __tasklet_schedule(struct tasklet_struct *t)
24486  {
24487         unsigned long flags;
24488  
24489         local_irq_save(flags);
24490 -       t->next = NULL;
24491 -       *__this_cpu_read(tasklet_vec.tail) = t;
24492 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
24493 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
24494 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
24495         local_irq_restore(flags);
24496  }
24497  EXPORT_SYMBOL(__tasklet_schedule);
24498 @@ -464,10 +925,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
24499         unsigned long flags;
24500  
24501         local_irq_save(flags);
24502 -       t->next = NULL;
24503 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
24504 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
24505 -       raise_softirq_irqoff(HI_SOFTIRQ);
24506 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
24507         local_irq_restore(flags);
24508  }
24509  EXPORT_SYMBOL(__tasklet_hi_schedule);
24510 @@ -476,82 +934,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
24511  {
24512         BUG_ON(!irqs_disabled());
24513  
24514 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
24515 -       __this_cpu_write(tasklet_hi_vec.head, t);
24516 -       __raise_softirq_irqoff(HI_SOFTIRQ);
24517 +       __tasklet_hi_schedule(t);
24518  }
24519  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
24520  
24521 -static void tasklet_action(struct softirq_action *a)
24522 +void  tasklet_enable(struct tasklet_struct *t)
24523  {
24524 -       struct tasklet_struct *list;
24525 +       if (!atomic_dec_and_test(&t->count))
24526 +               return;
24527 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
24528 +               tasklet_schedule(t);
24529 +}
24530 +EXPORT_SYMBOL(tasklet_enable);
24531  
24532 -       local_irq_disable();
24533 -       list = __this_cpu_read(tasklet_vec.head);
24534 -       __this_cpu_write(tasklet_vec.head, NULL);
24535 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24536 -       local_irq_enable();
24537 +static void __tasklet_action(struct softirq_action *a,
24538 +                            struct tasklet_struct *list)
24539 +{
24540 +       int loops = 1000000;
24541  
24542         while (list) {
24543                 struct tasklet_struct *t = list;
24544  
24545                 list = list->next;
24546  
24547 -               if (tasklet_trylock(t)) {
24548 -                       if (!atomic_read(&t->count)) {
24549 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24550 -                                                       &t->state))
24551 -                                       BUG();
24552 -                               t->func(t->data);
24553 -                               tasklet_unlock(t);
24554 -                               continue;
24555 -                       }
24556 -                       tasklet_unlock(t);
24557 +               /*
24558 +                * Should always succeed - after a tasklet got on the
24559 +                * list (after getting the SCHED bit set from 0 to 1),
24560 +                * nothing but the tasklet softirq it got queued to can
24561 +                * lock it:
24562 +                */
24563 +               if (!tasklet_trylock(t)) {
24564 +                       WARN_ON(1);
24565 +                       continue;
24566                 }
24567  
24568 -               local_irq_disable();
24569                 t->next = NULL;
24570 -               *__this_cpu_read(tasklet_vec.tail) = t;
24571 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
24572 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
24573 -               local_irq_enable();
24574 +
24575 +               /*
24576 +                * If we cannot handle the tasklet because it's disabled,
24577 +                * mark it as pending. tasklet_enable() will later
24578 +                * re-schedule the tasklet.
24579 +                */
24580 +               if (unlikely(atomic_read(&t->count))) {
24581 +out_disabled:
24582 +                       /* implicit unlock: */
24583 +                       wmb();
24584 +                       t->state = TASKLET_STATEF_PENDING;
24585 +                       continue;
24586 +               }
24587 +
24588 +               /*
24589 +                * From this point on the tasklet might be rescheduled
24590 +                * on another CPU, but it can only be added to another
24591 +                * CPU's tasklet list if we unlock the tasklet (which we
24592 +                * don't do yet).
24593 +                */
24594 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24595 +                       WARN_ON(1);
24596 +
24597 +again:
24598 +               t->func(t->data);
24599 +
24600 +               /*
24601 +                * Try to unlock the tasklet. We must use cmpxchg, because
24602 +                * another CPU might have scheduled or disabled the tasklet.
24603 +                * We only allow the STATE_RUN -> 0 transition here.
24604 +                */
24605 +               while (!tasklet_tryunlock(t)) {
24606 +                       /*
24607 +                        * If it got disabled meanwhile, bail out:
24608 +                        */
24609 +                       if (atomic_read(&t->count))
24610 +                               goto out_disabled;
24611 +                       /*
24612 +                        * If it got scheduled meanwhile, re-execute
24613 +                        * the tasklet function:
24614 +                        */
24615 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24616 +                               goto again;
24617 +                       if (!--loops) {
24618 +                               printk("hm, tasklet state: %08lx\n", t->state);
24619 +                               WARN_ON(1);
24620 +                               tasklet_unlock(t);
24621 +                               break;
24622 +                       }
24623 +               }
24624         }
24625  }
24626  
24627 +static void tasklet_action(struct softirq_action *a)
24628 +{
24629 +       struct tasklet_struct *list;
24630 +
24631 +       local_irq_disable();
24632 +
24633 +       list = __this_cpu_read(tasklet_vec.head);
24634 +       __this_cpu_write(tasklet_vec.head, NULL);
24635 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24636 +
24637 +       local_irq_enable();
24638 +
24639 +       __tasklet_action(a, list);
24640 +}
24641 +
24642  static void tasklet_hi_action(struct softirq_action *a)
24643  {
24644         struct tasklet_struct *list;
24645  
24646         local_irq_disable();
24647 +
24648         list = __this_cpu_read(tasklet_hi_vec.head);
24649         __this_cpu_write(tasklet_hi_vec.head, NULL);
24650         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
24651 -       local_irq_enable();
24652  
24653 -       while (list) {
24654 -               struct tasklet_struct *t = list;
24655 -
24656 -               list = list->next;
24657 -
24658 -               if (tasklet_trylock(t)) {
24659 -                       if (!atomic_read(&t->count)) {
24660 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24661 -                                                       &t->state))
24662 -                                       BUG();
24663 -                               t->func(t->data);
24664 -                               tasklet_unlock(t);
24665 -                               continue;
24666 -                       }
24667 -                       tasklet_unlock(t);
24668 -               }
24669 +       local_irq_enable();
24670  
24671 -               local_irq_disable();
24672 -               t->next = NULL;
24673 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
24674 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24675 -               __raise_softirq_irqoff(HI_SOFTIRQ);
24676 -               local_irq_enable();
24677 -       }
24678 +       __tasklet_action(a, list);
24679  }
24680  
24681  void tasklet_init(struct tasklet_struct *t,
24682 @@ -572,7 +1070,7 @@ void tasklet_kill(struct tasklet_struct *t)
24683  
24684         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
24685                 do {
24686 -                       yield();
24687 +                       msleep(1);
24688                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
24689         }
24690         tasklet_unlock_wait(t);
24691 @@ -646,25 +1144,26 @@ void __init softirq_init(void)
24692         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
24693  }
24694  
24695 -static int ksoftirqd_should_run(unsigned int cpu)
24696 -{
24697 -       return local_softirq_pending();
24698 -}
24699 -
24700 -static void run_ksoftirqd(unsigned int cpu)
24701 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
24702 +void tasklet_unlock_wait(struct tasklet_struct *t)
24703  {
24704 -       local_irq_disable();
24705 -       if (local_softirq_pending()) {
24706 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
24707                 /*
24708 -                * We can safely run softirq on inline stack, as we are not deep
24709 -                * in the task stack here.
24710 +                * Hack for now to avoid this busy-loop:
24711                  */
24712 -               __do_softirq();
24713 -               local_irq_enable();
24714 -               cond_resched_rcu_qs();
24715 -               return;
24716 +#ifdef CONFIG_PREEMPT_RT_FULL
24717 +               msleep(1);
24718 +#else
24719 +               barrier();
24720 +#endif
24721         }
24722 -       local_irq_enable();
24723 +}
24724 +EXPORT_SYMBOL(tasklet_unlock_wait);
24725 +#endif
24726 +
24727 +static int ksoftirqd_should_run(unsigned int cpu)
24728 +{
24729 +       return ksoftirqd_softirq_pending();
24730  }
24731  
24732  #ifdef CONFIG_HOTPLUG_CPU
24733 @@ -746,16 +1245,31 @@ static struct notifier_block cpu_nfb = {
24734  
24735  static struct smp_hotplug_thread softirq_threads = {
24736         .store                  = &ksoftirqd,
24737 +       .setup                  = ksoftirqd_set_sched_params,
24738         .thread_should_run      = ksoftirqd_should_run,
24739         .thread_fn              = run_ksoftirqd,
24740         .thread_comm            = "ksoftirqd/%u",
24741  };
24742  
24743 +#ifdef CONFIG_PREEMPT_RT_FULL
24744 +static struct smp_hotplug_thread softirq_timer_threads = {
24745 +       .store                  = &ktimer_softirqd,
24746 +       .setup                  = ktimer_softirqd_set_sched_params,
24747 +       .cleanup                = ktimer_softirqd_clr_sched_params,
24748 +       .thread_should_run      = ktimer_softirqd_should_run,
24749 +       .thread_fn              = run_ksoftirqd,
24750 +       .thread_comm            = "ktimersoftd/%u",
24751 +};
24752 +#endif
24753 +
24754  static __init int spawn_ksoftirqd(void)
24755  {
24756         register_cpu_notifier(&cpu_nfb);
24757  
24758         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
24759 +#ifdef CONFIG_PREEMPT_RT_FULL
24760 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
24761 +#endif
24762  
24763         return 0;
24764  }
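
On PREEMPT_RT_FULL the changes above split softirq service between two per-CPU threads: TIMER and HRTIMER work is routed to ktimersoftd, everything else to ksoftirqd (see do_raise_softirq_irqoff() and the second smp_hotplug_thread descriptor). The sketch below shows only that bitmask routing in plain userspace C; the wakeups, per-CPU handling and locking are omitted and the variable names are invented:

/* Illustrative sketch only: route raised vectors to one of two service
 * threads based on a timer mask. */
#include <stdio.h>

enum { HI, TIMER, NET_TX, NET_RX, BLOCK, BLOCK_IOPOLL,
       TASKLET, SCHED, HRTIMER, RCU, NR_SOFTIRQS };

#define TIMER_MASK ((1u << TIMER) | (1u << HRTIMER))

static unsigned int ksoftirqd_raised;
static unsigned int ktimersoftd_raised;

static void route_softirq(unsigned int nr)
{
        unsigned int mask = 1u << nr;

        if (mask & TIMER_MASK)
                ktimersoftd_raised |= mask;     /* timer thread's work */
        else
                ksoftirqd_raised |= mask;       /* generic softirq thread */
}

int main(void)
{
        route_softirq(TIMER);
        route_softirq(NET_RX);
        route_softirq(HRTIMER);
        printf("ksoftirqd:   %#x\n", ksoftirqd_raised);
        printf("ktimersoftd: %#x\n", ktimersoftd_raised);
        return 0;
}
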
24765 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
24766 index a3bbaee77c58..f84d3b45cda7 100644
24767 --- a/kernel/stop_machine.c
24768 +++ b/kernel/stop_machine.c
24769 @@ -37,7 +37,7 @@ struct cpu_stop_done {
24770  struct cpu_stopper {
24771         struct task_struct      *thread;
24772  
24773 -       spinlock_t              lock;
24774 +       raw_spinlock_t          lock;
24775         bool                    enabled;        /* is this stopper enabled? */
24776         struct list_head        works;          /* list of pending works */
24777  
24778 @@ -86,12 +86,12 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
24779         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
24780         unsigned long flags;
24781  
24782 -       spin_lock_irqsave(&stopper->lock, flags);
24783 +       raw_spin_lock_irqsave(&stopper->lock, flags);
24784         if (stopper->enabled)
24785                 __cpu_stop_queue_work(stopper, work);
24786         else
24787                 cpu_stop_signal_done(work->done, false);
24788 -       spin_unlock_irqrestore(&stopper->lock, flags);
24789 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
24790  }
24791  
24792  /**
24793 @@ -224,8 +224,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
24794         int err;
24795  
24796         lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
24797 -       spin_lock_irq(&stopper1->lock);
24798 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
24799 +       raw_spin_lock_irq(&stopper1->lock);
24800 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
24801  
24802         err = -ENOENT;
24803         if (!stopper1->enabled || !stopper2->enabled)
24804 @@ -235,8 +235,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
24805         __cpu_stop_queue_work(stopper1, work1);
24806         __cpu_stop_queue_work(stopper2, work2);
24807  unlock:
24808 -       spin_unlock(&stopper2->lock);
24809 -       spin_unlock_irq(&stopper1->lock);
24810 +       raw_spin_unlock(&stopper2->lock);
24811 +       raw_spin_unlock_irq(&stopper1->lock);
24812         lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
24813  
24814         return err;
24815 @@ -258,7 +258,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
24816         struct cpu_stop_work work1, work2;
24817         struct multi_stop_data msdata;
24818  
24819 -       preempt_disable();
24820 +       preempt_disable_nort();
24821         msdata = (struct multi_stop_data){
24822                 .fn = fn,
24823                 .data = arg,
24824 @@ -278,11 +278,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
24825         if (cpu1 > cpu2)
24826                 swap(cpu1, cpu2);
24827         if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
24828 -               preempt_enable();
24829 +               preempt_enable_nort();
24830                 return -ENOENT;
24831         }
24832  
24833 -       preempt_enable();
24834 +       preempt_enable_nort();
24835  
24836         wait_for_completion(&done.completion);
24837  
24838 @@ -315,17 +315,20 @@ static DEFINE_MUTEX(stop_cpus_mutex);
24839  
24840  static void queue_stop_cpus_work(const struct cpumask *cpumask,
24841                                  cpu_stop_fn_t fn, void *arg,
24842 -                                struct cpu_stop_done *done)
24843 +                                struct cpu_stop_done *done, bool inactive)
24844  {
24845         struct cpu_stop_work *work;
24846         unsigned int cpu;
24847  
24848         /*
24849 -        * Disable preemption while queueing to avoid getting
24850 -        * preempted by a stopper which might wait for other stoppers
24851 -        * to enter @fn which can lead to deadlock.
24852 +        * Make sure that all work is queued on all cpus before
24853 +        * any of the cpus can execute it.
24854          */
24855 -       lg_global_lock(&stop_cpus_lock);
24856 +       if (!inactive)
24857 +               lg_global_lock(&stop_cpus_lock);
24858 +       else
24859 +               lg_global_trylock_relax(&stop_cpus_lock);
24860 +
24861         for_each_cpu(cpu, cpumask) {
24862                 work = &per_cpu(cpu_stopper.stop_work, cpu);
24863                 work->fn = fn;
24864 @@ -342,7 +345,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
24865         struct cpu_stop_done done;
24866  
24867         cpu_stop_init_done(&done, cpumask_weight(cpumask));
24868 -       queue_stop_cpus_work(cpumask, fn, arg, &done);
24869 +       queue_stop_cpus_work(cpumask, fn, arg, &done, false);
24870         wait_for_completion(&done.completion);
24871         return done.executed ? done.ret : -ENOENT;
24872  }
24873 @@ -422,9 +425,9 @@ static int cpu_stop_should_run(unsigned int cpu)
24874         unsigned long flags;
24875         int run;
24876  
24877 -       spin_lock_irqsave(&stopper->lock, flags);
24878 +       raw_spin_lock_irqsave(&stopper->lock, flags);
24879         run = !list_empty(&stopper->works);
24880 -       spin_unlock_irqrestore(&stopper->lock, flags);
24881 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
24882         return run;
24883  }
24884  
24885 @@ -436,13 +439,13 @@ static void cpu_stopper_thread(unsigned int cpu)
24886  
24887  repeat:
24888         work = NULL;
24889 -       spin_lock_irq(&stopper->lock);
24890 +       raw_spin_lock_irq(&stopper->lock);
24891         if (!list_empty(&stopper->works)) {
24892                 work = list_first_entry(&stopper->works,
24893                                         struct cpu_stop_work, list);
24894                 list_del_init(&work->list);
24895         }
24896 -       spin_unlock_irq(&stopper->lock);
24897 +       raw_spin_unlock_irq(&stopper->lock);
24898  
24899         if (work) {
24900                 cpu_stop_fn_t fn = work->fn;
24901 @@ -450,6 +453,16 @@ repeat:
24902                 struct cpu_stop_done *done = work->done;
24903                 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
24904  
24905 +               /*
24906 +                * Wait until the stopper has finished scheduling on all
24907 +                * cpus
24908 +                */
24909 +               lg_global_lock(&stop_cpus_lock);
24910 +               /*
24911 +                * Let other cpu threads continue as well
24912 +                */
24913 +               lg_global_unlock(&stop_cpus_lock);
24914 +
24915                 /* cpu stop callbacks are not allowed to sleep */
24916                 preempt_disable();
24917  
24918 @@ -520,10 +533,12 @@ static int __init cpu_stop_init(void)
24919         for_each_possible_cpu(cpu) {
24920                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
24921  
24922 -               spin_lock_init(&stopper->lock);
24923 +               raw_spin_lock_init(&stopper->lock);
24924                 INIT_LIST_HEAD(&stopper->works);
24925         }
24926  
24927 +       lg_lock_init(&stop_cpus_lock, "stop_cpus_lock");
24928 +
24929         BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
24930         stop_machine_unpark(raw_smp_processor_id());
24931         stop_machine_initialized = true;
24932 @@ -620,7 +635,7 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
24933         set_state(&msdata, MULTI_STOP_PREPARE);
24934         cpu_stop_init_done(&done, num_active_cpus());
24935         queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
24936 -                            &done);
24937 +                            &done, true);
24938         ret = multi_cpu_stop(&msdata);
24939  
24940         /* Busy wait for completion. */
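
The queue_stop_cpus_work() change above relies on a rendezvous: the queueing side holds stop_cpus_lock while it posts work for every CPU, and each stopper thread takes and immediately drops the same lock before executing, so no stopper can start until all work has been queued. A rough userspace analogue with a pthread mutex standing in for the global lock (illustrative only, not the kernel code):

/* Illustrative sketch only: workers pass through a global lock before
 * running, so none of them starts before all work is posted. */
#include <pthread.h>
#include <stdio.h>

#define NWORKERS 4

static pthread_mutex_t global_lock = PTHREAD_MUTEX_INITIALIZER;
static int work_posted[NWORKERS];

static void *worker(void *arg)
{
        int id = *(int *)arg;

        /* Wait until the queueing side has posted work everywhere. */
        pthread_mutex_lock(&global_lock);
        pthread_mutex_unlock(&global_lock);

        printf("worker %d runs, work=%d\n", id, work_posted[id]);
        return NULL;
}

int main(void)
{
        pthread_t tid[NWORKERS];
        int ids[NWORKERS];
        int i;

        pthread_mutex_lock(&global_lock);       /* hold back all workers */
        for (i = 0; i < NWORKERS; i++) {
                ids[i] = i;
                pthread_create(&tid[i], NULL, worker, &ids[i]);
        }
        for (i = 0; i < NWORKERS; i++)
                work_posted[i] = 1;             /* "queue" the work */
        pthread_mutex_unlock(&global_lock);     /* now let them all go */

        for (i = 0; i < NWORKERS; i++)
                pthread_join(tid[i], NULL);
        return 0;
}
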
24941 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
24942 index 17f7bcff1e02..ba3d60144838 100644
24943 --- a/kernel/time/hrtimer.c
24944 +++ b/kernel/time/hrtimer.c
24945 @@ -48,11 +48,13 @@
24946  #include <linux/sched/rt.h>
24947  #include <linux/sched/deadline.h>
24948  #include <linux/timer.h>
24949 +#include <linux/kthread.h>
24950  #include <linux/freezer.h>
24951  
24952  #include <asm/uaccess.h>
24953  
24954  #include <trace/events/timer.h>
24955 +#include <trace/events/hist.h>
24956  
24957  #include "tick-internal.h"
24958  
24959 @@ -717,6 +719,44 @@ static void clock_was_set_work(struct work_struct *work)
24960  
24961  static DECLARE_WORK(hrtimer_work, clock_was_set_work);
24962  
24963 +#ifdef CONFIG_PREEMPT_RT_FULL
24964 +/*
24965 + * RT can not call schedule_work from real interrupt context.
24966 + * Need to make a thread to do the real work.
24967 + */
24968 +static struct task_struct *clock_set_delay_thread;
24969 +static bool do_clock_set_delay;
24970 +
24971 +static int run_clock_set_delay(void *ignore)
24972 +{
24973 +       while (!kthread_should_stop()) {
24974 +               set_current_state(TASK_INTERRUPTIBLE);
24975 +               if (do_clock_set_delay) {
24976 +                       do_clock_set_delay = false;
24977 +                       schedule_work(&hrtimer_work);
24978 +               }
24979 +               schedule();
24980 +       }
24981 +       __set_current_state(TASK_RUNNING);
24982 +       return 0;
24983 +}
24984 +
24985 +void clock_was_set_delayed(void)
24986 +{
24987 +       do_clock_set_delay = true;
24988 +       /* Make visible before waking up process */
24989 +       smp_wmb();
24990 +       wake_up_process(clock_set_delay_thread);
24991 +}
24992 +
24993 +static __init int create_clock_set_delay_thread(void)
24994 +{
24995 +       clock_set_delay_thread = kthread_run(run_clock_set_delay, NULL, "kclksetdelayd");
24996 +       BUG_ON(!clock_set_delay_thread);
24997 +       return 0;
24998 +}
24999 +early_initcall(create_clock_set_delay_thread);
25000 +#else /* PREEMPT_RT_FULL */
25001  /*
25002   * Called from timekeeping and resume code to reprogramm the hrtimer
25003   * interrupt device on all cpus.
25004 @@ -725,6 +765,7 @@ void clock_was_set_delayed(void)
25005  {
25006         schedule_work(&hrtimer_work);
25007  }
25008 +#endif
25009  
25010  #else
25011  
25012 @@ -734,11 +775,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
25013  static inline void hrtimer_switch_to_hres(void) { }
25014  static inline void
25015  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
25016 -static inline int hrtimer_reprogram(struct hrtimer *timer,
25017 -                                   struct hrtimer_clock_base *base)
25018 -{
25019 -       return 0;
25020 -}
25021 +static inline void hrtimer_reprogram(struct hrtimer *timer,
25022 +                                    struct hrtimer_clock_base *base) { }
25023  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
25024  static inline void retrigger_next_event(void *arg) { }
25025  
25026 @@ -870,6 +908,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
25027  }
25028  EXPORT_SYMBOL_GPL(hrtimer_forward);
25029  
25030 +#ifdef CONFIG_PREEMPT_RT_BASE
25031 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
25032 +
25033 +/**
25034 + * hrtimer_wait_for_timer - Wait for a running timer
25035 + *
25036 + * @timer:     timer to wait for
25037 + *
25038 + * If the timer's callback function is currently executing, the
25039 + * function waits on the waitqueue of the timer base. The
25040 + * waitqueue is woken up after the timer callback function has
25041 + * finished execution.
25042 + */
25043 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
25044 +{
25045 +       struct hrtimer_clock_base *base = timer->base;
25046 +
25047 +       if (base && base->cpu_base && !timer->irqsafe)
25048 +               wait_event(base->cpu_base->wait,
25049 +                               !(hrtimer_callback_running(timer)));
25050 +}
25051 +
25052 +#else
25053 +# define wake_up_timer_waiters(b)      do { } while (0)
25054 +#endif
25055 +
25056  /*
25057   * enqueue_hrtimer - internal function to (re)start a timer
25058   *
25059 @@ -911,6 +975,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
25060         if (!(state & HRTIMER_STATE_ENQUEUED))
25061                 return;
25062  
25063 +       if (unlikely(!list_empty(&timer->cb_entry))) {
25064 +               list_del_init(&timer->cb_entry);
25065 +               return;
25066 +       }
25067 +
25068         if (!timerqueue_del(&base->active, &timer->node))
25069                 cpu_base->active_bases &= ~(1 << base->index);
25070  
25071 @@ -1006,7 +1075,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25072         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
25073  
25074         timer_stats_hrtimer_set_start_info(timer);
25075 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
25076 +       {
25077 +               ktime_t now = new_base->get_time();
25078  
25079 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
25080 +                       timer->praecox = now;
25081 +               else
25082 +                       timer->praecox = ktime_set(0, 0);
25083 +       }
25084 +#endif
25085         leftmost = enqueue_hrtimer(timer, new_base);
25086         if (!leftmost)
25087                 goto unlock;
25088 @@ -1078,7 +1156,7 @@ int hrtimer_cancel(struct hrtimer *timer)
25089  
25090                 if (ret >= 0)
25091                         return ret;
25092 -               cpu_relax();
25093 +               hrtimer_wait_for_timer(timer);
25094         }
25095  }
25096  EXPORT_SYMBOL_GPL(hrtimer_cancel);
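
hrtimer_cancel() above now calls hrtimer_wait_for_timer() instead of spinning on cpu_relax(): the canceling task sleeps on the cpu_base waitqueue until wake_up_timer_waiters() reports that the callback has finished. A rough userspace analogue with a pthread condition variable in place of the waitqueue (names invented, illustrative only):

/* Illustrative sketch only: block until a running callback completes
 * instead of busy-waiting for it. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int callback_running;

static void *callback_thread(void *arg)
{
        (void)arg;

        pthread_mutex_lock(&lock);
        callback_running = 1;
        pthread_mutex_unlock(&lock);

        usleep(100000);                 /* pretend the callback runs a while */

        pthread_mutex_lock(&lock);
        callback_running = 0;
        pthread_cond_broadcast(&done);  /* wake_up_timer_waiters() analogue */
        pthread_mutex_unlock(&lock);
        return NULL;
}

static void wait_for_callback(void)
{
        pthread_mutex_lock(&lock);
        while (callback_running)        /* wait_event() analogue */
                pthread_cond_wait(&done, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, callback_thread, NULL);
        usleep(10000);                  /* give the callback a chance to start */
        wait_for_callback();
        printf("callback finished, safe to cancel\n");
        pthread_join(tid, NULL);
        return 0;
}
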
25097 @@ -1142,6 +1220,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25098  
25099         base = hrtimer_clockid_to_base(clock_id);
25100         timer->base = &cpu_base->clock_base[base];
25101 +       INIT_LIST_HEAD(&timer->cb_entry);
25102         timerqueue_init(&timer->node);
25103  
25104  #ifdef CONFIG_TIMER_STATS
25105 @@ -1182,6 +1261,7 @@ bool hrtimer_active(const struct hrtimer *timer)
25106                 seq = raw_read_seqcount_begin(&cpu_base->seq);
25107  
25108                 if (timer->state != HRTIMER_STATE_INACTIVE ||
25109 +                   cpu_base->running_soft == timer ||
25110                     cpu_base->running == timer)
25111                         return true;
25112  
25113 @@ -1280,10 +1360,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25114         cpu_base->running = NULL;
25115  }
25116  
25117 +#ifdef CONFIG_PREEMPT_RT_BASE
25118 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
25119 +                                struct hrtimer_clock_base *base)
25120 +{
25121 +       int leftmost;
25122 +
25123 +       if (restart != HRTIMER_NORESTART &&
25124 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
25125 +
25126 +               leftmost = enqueue_hrtimer(timer, base);
25127 +               if (!leftmost)
25128 +                       return;
25129 +#ifdef CONFIG_HIGH_RES_TIMERS
25130 +               if (!hrtimer_is_hres_active(timer)) {
25131 +                       /*
25132 +                        * Kick to reschedule the next tick to handle the new timer
25133 +                        * on dynticks target.
25134 +                        */
25135 +                       if (base->cpu_base->nohz_active)
25136 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
25137 +               } else {
25138 +
25139 +                       hrtimer_reprogram(timer, base);
25140 +               }
25141 +#endif
25142 +       }
25143 +}
25144 +
25145 +/*
25146 + * The changes in mainline which removed the callback modes from
25147 + * hrtimer are not yet working with -rt. The non wakeup_process()
25148 + * based callbacks which involve sleeping locks need to be treated
25149 + * separately.
25150 + */
25151 +static void hrtimer_rt_run_pending(void)
25152 +{
25153 +       enum hrtimer_restart (*fn)(struct hrtimer *);
25154 +       struct hrtimer_cpu_base *cpu_base;
25155 +       struct hrtimer_clock_base *base;
25156 +       struct hrtimer *timer;
25157 +       int index, restart;
25158 +
25159 +       local_irq_disable();
25160 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
25161 +
25162 +       raw_spin_lock(&cpu_base->lock);
25163 +
25164 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
25165 +               base = &cpu_base->clock_base[index];
25166 +
25167 +               while (!list_empty(&base->expired)) {
25168 +                       timer = list_first_entry(&base->expired,
25169 +                                                struct hrtimer, cb_entry);
25170 +
25171 +                       /*
25172 +                        * Same as the __run_hrtimer() function above,
25173 +                        * except that here we run with interrupts enabled.
25174 +                        */
25175 +                       debug_deactivate(timer);
25176 +                       cpu_base->running_soft = timer;
25177 +                       raw_write_seqcount_barrier(&cpu_base->seq);
25178 +
25179 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
25180 +                       timer_stats_account_hrtimer(timer);
25181 +                       fn = timer->function;
25182 +
25183 +                       raw_spin_unlock_irq(&cpu_base->lock);
25184 +                       restart = fn(timer);
25185 +                       raw_spin_lock_irq(&cpu_base->lock);
25186 +
25187 +                       hrtimer_rt_reprogram(restart, timer, base);
25188 +                       raw_write_seqcount_barrier(&cpu_base->seq);
25189 +
25190 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
25191 +                       cpu_base->running_soft = NULL;
25192 +               }
25193 +       }
25194 +
25195 +       raw_spin_unlock_irq(&cpu_base->lock);
25196 +
25197 +       wake_up_timer_waiters(cpu_base);
25198 +}
25199 +
25200 +static int hrtimer_rt_defer(struct hrtimer *timer)
25201 +{
25202 +       if (timer->irqsafe)
25203 +               return 0;
25204 +
25205 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
25206 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
25207 +       return 1;
25208 +}
25209 +
25210 +#else
25211 +
25212 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
25213 +
25214 +#endif
25215 +
25216 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
25217 +
25218  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25219  {
25220         struct hrtimer_clock_base *base = cpu_base->clock_base;
25221         unsigned int active = cpu_base->active_bases;
25222 +       int raise = 0;
25223  
25224         for (; active; base++, active >>= 1) {
25225                 struct timerqueue_node *node;
25226 @@ -1299,6 +1481,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25227  
25228                         timer = container_of(node, struct hrtimer, node);
25229  
25230 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
25231 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
25232 +                               timer->praecox : hrtimer_get_expires(timer),
25233 +                               basenow)),
25234 +                           current,
25235 +                           timer->function == hrtimer_wakeup ?
25236 +                           container_of(timer, struct hrtimer_sleeper,
25237 +                               timer)->task : NULL);
25238 +
25239                         /*
25240                          * The immediate goal for using the softexpires is
25241                          * minimizing wakeups, not running timers at the
25242 @@ -1314,9 +1505,14 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25243                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
25244                                 break;
25245  
25246 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
25247 +                       if (!hrtimer_rt_defer(timer))
25248 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
25249 +                       else
25250 +                               raise = 1;
25251                 }
25252         }
25253 +       if (raise)
25254 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
25255  }
25256  
25257  #ifdef CONFIG_HIGH_RES_TIMERS
25258 @@ -1479,16 +1675,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
25259  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
25260  {
25261         sl->timer.function = hrtimer_wakeup;
25262 +       sl->timer.irqsafe = 1;
25263         sl->task = task;
25264  }
25265  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
25266  
25267 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
25268 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
25269 +                               unsigned long state)
25270  {
25271         hrtimer_init_sleeper(t, current);
25272  
25273         do {
25274 -               set_current_state(TASK_INTERRUPTIBLE);
25275 +               set_current_state(state);
25276                 hrtimer_start_expires(&t->timer, mode);
25277  
25278                 if (likely(t->task))
25279 @@ -1530,7 +1728,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
25280                                 HRTIMER_MODE_ABS);
25281         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
25282  
25283 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
25284 +       /* cpu_chill() does not care about restart state. */
25285 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
25286                 goto out;
25287  
25288         rmtp = restart->nanosleep.rmtp;
25289 @@ -1547,8 +1746,10 @@ out:
25290         return ret;
25291  }
25292  
25293 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25294 -                      const enum hrtimer_mode mode, const clockid_t clockid)
25295 +static long
25296 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25297 +                   const enum hrtimer_mode mode, const clockid_t clockid,
25298 +                   unsigned long state)
25299  {
25300         struct restart_block *restart;
25301         struct hrtimer_sleeper t;
25302 @@ -1561,7 +1762,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25303  
25304         hrtimer_init_on_stack(&t.timer, clockid, mode);
25305         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
25306 -       if (do_nanosleep(&t, mode))
25307 +       if (do_nanosleep(&t, mode, state))
25308                 goto out;
25309  
25310         /* Absolute timers do not update the rmtp value and restart: */
25311 @@ -1588,6 +1789,12 @@ out:
25312         return ret;
25313  }
25314  
25315 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25316 +                      const enum hrtimer_mode mode, const clockid_t clockid)
25317 +{
25318 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
25319 +}
25320 +
25321  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
25322                 struct timespec __user *, rmtp)
25323  {
25324 @@ -1602,6 +1809,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
25325         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
25326  }
25327  
25328 +#ifdef CONFIG_PREEMPT_RT_FULL
25329 +/*
25330 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
25331 + */
25332 +void cpu_chill(void)
25333 +{
25334 +       struct timespec tu = {
25335 +               .tv_nsec = NSEC_PER_MSEC,
25336 +       };
25337 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
25338 +
25339 +       current->flags |= PF_NOFREEZE;
25340 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
25341 +                           TASK_UNINTERRUPTIBLE);
25342 +       if (!freeze_flag)
25343 +               current->flags &= ~PF_NOFREEZE;
25344 +}
25345 +EXPORT_SYMBOL(cpu_chill);
25346 +#endif
25347 +
25348  /*
25349   * Functions related to boot-time initialization:
25350   */
25351 @@ -1613,10 +1840,14 @@ static void init_hrtimers_cpu(int cpu)
25352         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
25353                 cpu_base->clock_base[i].cpu_base = cpu_base;
25354                 timerqueue_init_head(&cpu_base->clock_base[i].active);
25355 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
25356         }
25357  
25358         cpu_base->cpu = cpu;
25359         hrtimer_init_hres(cpu_base);
25360 +#ifdef CONFIG_PREEMPT_RT_BASE
25361 +       init_waitqueue_head(&cpu_base->wait);
25362 +#endif
25363  }
25364  
25365  #ifdef CONFIG_HOTPLUG_CPU
25366 @@ -1714,11 +1945,21 @@ static struct notifier_block hrtimers_nb = {
25367         .notifier_call = hrtimer_cpu_notify,
25368  };
25369  
25370 +#ifdef CONFIG_PREEMPT_RT_BASE
25371 +static void run_hrtimer_softirq(struct softirq_action *h)
25372 +{
25373 +       hrtimer_rt_run_pending();
25374 +}
25375 +#endif
25376 +
25377  void __init hrtimers_init(void)
25378  {
25379         hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
25380                           (void *)(long)smp_processor_id());
25381         register_cpu_notifier(&hrtimers_nb);
25382 +#ifdef CONFIG_PREEMPT_RT_BASE
25383 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
25384 +#endif
25385  }
25386  
25387  /**
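
The cpu_chill() helper added above is meant to replace cpu_relax()-style busy retry loops on PREEMPT_RT_FULL, where spinning against a preempted lock holder can livelock; instead the caller sleeps for 1 ms and lets the holder run. A minimal sketch of that retry shape follows (illustration only, not part of the patch; kernel headers and the cpu_chill() declaration added elsewhere by this patch are assumed to be in scope):

/* Retry until the caller-supplied predicate succeeds, chilling on RT. */
static void example_wait_for(bool (*try_grab)(void))
{
	while (!try_grab()) {
#ifdef CONFIG_PREEMPT_RT_FULL
		cpu_chill();	/* hrtimer-based 1 ms sleep defined above */
#else
		cpu_relax();
#endif
	}
}
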
25388 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
25389 index 1d5c7204ddc9..184de6751180 100644
25390 --- a/kernel/time/itimer.c
25391 +++ b/kernel/time/itimer.c
25392 @@ -213,6 +213,7 @@ again:
25393                 /* We are sharing ->siglock with it_real_fn() */
25394                 if (hrtimer_try_to_cancel(timer) < 0) {
25395                         spin_unlock_irq(&tsk->sighand->siglock);
25396 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
25397                         goto again;
25398                 }
25399                 expires = timeval_to_ktime(value->it_value);
25400 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
25401 index 347fecf86a3f..2ede47408a3e 100644
25402 --- a/kernel/time/jiffies.c
25403 +++ b/kernel/time/jiffies.c
25404 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
25405         .max_cycles     = 10,
25406  };
25407  
25408 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
25409 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
25410 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
25411  
25412  #if (BITS_PER_LONG < 64)
25413  u64 get_jiffies_64(void)
25414 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
25415         u64 ret;
25416  
25417         do {
25418 -               seq = read_seqbegin(&jiffies_lock);
25419 +               seq = read_seqcount_begin(&jiffies_seq);
25420                 ret = jiffies_64;
25421 -       } while (read_seqretry(&jiffies_lock, seq));
25422 +       } while (read_seqcount_retry(&jiffies_seq, seq));
25423         return ret;
25424  }
25425  EXPORT_SYMBOL(get_jiffies_64);
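
The hunk above splits the old jiffies_lock seqlock into a raw spinlock that serializes writers and a bare seqcount for lockless readers, so the write side never sleeps on RT. The tick-common.c, tick-sched.c and timekeeping.c hunks below repeat the same pattern; a self-contained sketch of it with a hypothetical counter (illustration only):

#include <linux/seqlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);
static seqcount_t example_seq = SEQCNT_ZERO(example_seq);
static u64 example_value;

/* Writer: serialize on the raw lock, bump the sequence around the update. */
static void example_write(u64 v)
{
	raw_spin_lock(&example_lock);
	write_seqcount_begin(&example_seq);
	example_value = v;
	write_seqcount_end(&example_seq);
	raw_spin_unlock(&example_lock);
}

/* Reader: lockless, retry until a stable sequence is observed. */
static u64 example_read(void)
{
	unsigned int seq;
	u64 v;

	do {
		seq = read_seqcount_begin(&example_seq);
		v = example_value;
	} while (read_seqcount_retry(&example_seq, seq));
	return v;
}
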
25426 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
25427 index ab861771e37f..0f6868fd2de6 100644
25428 --- a/kernel/time/ntp.c
25429 +++ b/kernel/time/ntp.c
25430 @@ -10,6 +10,7 @@
25431  #include <linux/workqueue.h>
25432  #include <linux/hrtimer.h>
25433  #include <linux/jiffies.h>
25434 +#include <linux/kthread.h>
25435  #include <linux/math64.h>
25436  #include <linux/timex.h>
25437  #include <linux/time.h>
25438 @@ -562,10 +563,52 @@ static void sync_cmos_clock(struct work_struct *work)
25439                            &sync_cmos_work, timespec64_to_jiffies(&next));
25440  }
25441  
25442 +#ifdef CONFIG_PREEMPT_RT_FULL
25443 +/*
25444 + * On RT we cannot call schedule_delayed_work() from hard interrupt
25445 + * context, so a dedicated thread has to do the real work.
25446 + */
25447 +static struct task_struct *cmos_delay_thread;
25448 +static bool do_cmos_delay;
25449 +
25450 +static int run_cmos_delay(void *ignore)
25451 +{
25452 +       while (!kthread_should_stop()) {
25453 +               set_current_state(TASK_INTERRUPTIBLE);
25454 +               if (do_cmos_delay) {
25455 +                       do_cmos_delay = false;
25456 +                       queue_delayed_work(system_power_efficient_wq,
25457 +                                          &sync_cmos_work, 0);
25458 +               }
25459 +               schedule();
25460 +       }
25461 +       __set_current_state(TASK_RUNNING);
25462 +       return 0;
25463 +}
25464 +
25465 +void ntp_notify_cmos_timer(void)
25466 +{
25467 +       do_cmos_delay = true;
25468 +       /* Make visible before waking up process */
25469 +       smp_wmb();
25470 +       wake_up_process(cmos_delay_thread);
25471 +}
25472 +
25473 +static __init int create_cmos_delay_thread(void)
25474 +{
25475 +       cmos_delay_thread = kthread_run(run_cmos_delay, NULL, "kcmosdelayd");
25476 +       BUG_ON(!cmos_delay_thread);
25477 +       return 0;
25478 +}
25479 +early_initcall(create_cmos_delay_thread);
25480 +
25481 +#else
25482 +
25483  void ntp_notify_cmos_timer(void)
25484  {
25485         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
25486  }
25487 +#endif /* CONFIG_PREEMPT_RT_FULL */
25488  
25489  #else
25490  void ntp_notify_cmos_timer(void) { }
25491 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
25492 index 80016b329d94..b7342b6e6a5a 100644
25493 --- a/kernel/time/posix-cpu-timers.c
25494 +++ b/kernel/time/posix-cpu-timers.c
25495 @@ -3,6 +3,7 @@
25496   */
25497  
25498  #include <linux/sched.h>
25499 +#include <linux/sched/rt.h>
25500  #include <linux/posix-timers.h>
25501  #include <linux/errno.h>
25502  #include <linux/math64.h>
25503 @@ -650,7 +651,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
25504         /*
25505          * Disarm any old timer after extracting its expiry time.
25506          */
25507 -       WARN_ON_ONCE(!irqs_disabled());
25508 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25509  
25510         ret = 0;
25511         old_incr = timer->it.cpu.incr;
25512 @@ -1092,7 +1093,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
25513         /*
25514          * Now re-arm for the new expiry time.
25515          */
25516 -       WARN_ON_ONCE(!irqs_disabled());
25517 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25518         arm_timer(timer);
25519         unlock_task_sighand(p, &flags);
25520  
25521 @@ -1183,13 +1184,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
25522   * already updated our counts.  We need to check if any timers fire now.
25523   * Interrupts are disabled.
25524   */
25525 -void run_posix_cpu_timers(struct task_struct *tsk)
25526 +static void __run_posix_cpu_timers(struct task_struct *tsk)
25527  {
25528         LIST_HEAD(firing);
25529         struct k_itimer *timer, *next;
25530         unsigned long flags;
25531  
25532 -       WARN_ON_ONCE(!irqs_disabled());
25533 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
25534  
25535         /*
25536          * The fast path checks that there are no expired thread or thread
25537 @@ -1243,6 +1244,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
25538         }
25539  }
25540  
25541 +#ifdef CONFIG_PREEMPT_RT_BASE
25542 +#include <linux/kthread.h>
25543 +#include <linux/cpu.h>
25544 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
25545 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
25546 +
25547 +static int posix_cpu_timers_thread(void *data)
25548 +{
25549 +       int cpu = (long)data;
25550 +
25551 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
25552 +
25553 +       while (!kthread_should_stop()) {
25554 +               struct task_struct *tsk = NULL;
25555 +               struct task_struct *next = NULL;
25556 +
25557 +               if (cpu_is_offline(cpu))
25558 +                       goto wait_to_die;
25559 +
25560 +               /* grab task list */
25561 +               raw_local_irq_disable();
25562 +               tsk = per_cpu(posix_timer_tasklist, cpu);
25563 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
25564 +               raw_local_irq_enable();
25565 +
25566 +               /* the list may be empty; go back to sleep until woken */
25567 +               if (!tsk) {
25568 +                       set_current_state(TASK_INTERRUPTIBLE);
25569 +                       schedule();
25570 +                       __set_current_state(TASK_RUNNING);
25571 +                       continue;
25572 +               }
25573 +
25574 +               /* Process task list */
25575 +               while (1) {
25576 +                       /* save next */
25577 +                       next = tsk->posix_timer_list;
25578 +
25579 +                       /* run the task's timers, clear its list pointer
25580 +                        * and drop the reference taken when it was queued
25581 +                        */
25582 +                       __run_posix_cpu_timers(tsk);
25583 +                       tsk->posix_timer_list = NULL;
25584 +                       put_task_struct(tsk);
25585 +
25586 +                       /* check if this is the last on the list */
25587 +                       if (next == tsk)
25588 +                               break;
25589 +                       tsk = next;
25590 +               }
25591 +       }
25592 +       return 0;
25593 +
25594 +wait_to_die:
25595 +       /* Wait for kthread_stop */
25596 +       set_current_state(TASK_INTERRUPTIBLE);
25597 +       while (!kthread_should_stop()) {
25598 +               schedule();
25599 +               set_current_state(TASK_INTERRUPTIBLE);
25600 +       }
25601 +       __set_current_state(TASK_RUNNING);
25602 +       return 0;
25603 +}
25604 +
25605 +static inline int __fastpath_timer_check(struct task_struct *tsk)
25606 +{
25607 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
25608 +       if (unlikely(tsk->exit_state))
25609 +               return 0;
25610 +
25611 +       if (!task_cputime_zero(&tsk->cputime_expires))
25612 +                       return 1;
25613 +
25614 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
25615 +                       return 1;
25616 +
25617 +       return 0;
25618 +}
25619 +
25620 +void run_posix_cpu_timers(struct task_struct *tsk)
25621 +{
25622 +       unsigned long cpu = smp_processor_id();
25623 +       struct task_struct *tasklist;
25624 +
25625 +       BUG_ON(!irqs_disabled());
25626 +       if (!per_cpu(posix_timer_task, cpu))
25627 +               return;
25628 +       /* get per-cpu references */
25629 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
25630 +
25631 +       /* check to see if we're already queued */
25632 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
25633 +               get_task_struct(tsk);
25634 +               if (tasklist) {
25635 +                       tsk->posix_timer_list = tasklist;
25636 +               } else {
25637 +                       /*
25638 +                        * The list is terminated by a self-pointing
25639 +                        * task_struct
25640 +                        */
25641 +                       tsk->posix_timer_list = tsk;
25642 +               }
25643 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
25644 +
25645 +               wake_up_process(per_cpu(posix_timer_task, cpu));
25646 +       }
25647 +}
25648 +
25649 +/*
25650 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
25651 + * Here we start the per-cpu posix timer thread for the new CPU.
25652 + */
25653 +static int posix_cpu_thread_call(struct notifier_block *nfb,
25654 +                                unsigned long action, void *hcpu)
25655 +{
25656 +       int cpu = (long)hcpu;
25657 +       struct task_struct *p;
25658 +       struct sched_param param;
25659 +
25660 +       switch (action) {
25661 +       case CPU_UP_PREPARE:
25662 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
25663 +                                       "posixcputmr/%d",cpu);
25664 +               if (IS_ERR(p))
25665 +                       return NOTIFY_BAD;
25666 +               p->flags |= PF_NOFREEZE;
25667 +               kthread_bind(p, cpu);
25668 +               /* Must be high prio to avoid getting starved */
25669 +               param.sched_priority = MAX_RT_PRIO-1;
25670 +               sched_setscheduler(p, SCHED_FIFO, &param);
25671 +               per_cpu(posix_timer_task,cpu) = p;
25672 +               break;
25673 +       case CPU_ONLINE:
25674 +               /* Strictly unnecessary, as the first user will wake it. */
25675 +               wake_up_process(per_cpu(posix_timer_task,cpu));
25676 +               break;
25677 +#ifdef CONFIG_HOTPLUG_CPU
25678 +       case CPU_UP_CANCELED:
25679 +               /* Unbind it from the offline cpu so it can run, then stop it. */
25680 +               kthread_bind(per_cpu(posix_timer_task, cpu),
25681 +                            cpumask_any(cpu_online_mask));
25682 +               kthread_stop(per_cpu(posix_timer_task,cpu));
25683 +               per_cpu(posix_timer_task,cpu) = NULL;
25684 +               break;
25685 +       case CPU_DEAD:
25686 +               kthread_stop(per_cpu(posix_timer_task,cpu));
25687 +               per_cpu(posix_timer_task,cpu) = NULL;
25688 +               break;
25689 +#endif
25690 +       }
25691 +       return NOTIFY_OK;
25692 +}
25693 +
25694 +/* Register at an elevated priority so that the per-cpu timer threads are
25695 + * set up before anything else needs them.
25696 + */
25697 +static struct notifier_block posix_cpu_thread_notifier = {
25698 +       .notifier_call = posix_cpu_thread_call,
25699 +       .priority = 10
25700 +};
25701 +
25702 +static int __init posix_cpu_thread_init(void)
25703 +{
25704 +       void *hcpu = (void *)(long)smp_processor_id();
25705 +       /* Start one for boot CPU. */
25706 +       unsigned long cpu;
25707 +
25708 +       /* init the per-cpu posix_timer_tasklist pointers */
25709 +       for_each_possible_cpu(cpu)
25710 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
25711 +
25712 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
25713 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
25714 +       register_cpu_notifier(&posix_cpu_thread_notifier);
25715 +       return 0;
25716 +}
25717 +early_initcall(posix_cpu_thread_init);
25718 +#else /* CONFIG_PREEMPT_RT_BASE */
25719 +void run_posix_cpu_timers(struct task_struct *tsk)
25720 +{
25721 +       __run_posix_cpu_timers(tsk);
25722 +}
25723 +#endif /* CONFIG_PREEMPT_RT_BASE */
25724 +
25725  /*
25726   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
25727   * The tsk->sighand->siglock must be held by the caller.
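
For reference, the per-cpu queue built by run_posix_cpu_timers() above is a singly linked list chained through task_struct::posix_timer_list and terminated by an entry that points to itself. An illustrative walk showing how that self-pointing sentinel ends traversal (not part of the patch):

#include <linux/sched.h>

static void example_walk_timer_list(struct task_struct *head)
{
	struct task_struct *tsk = head, *next;

	while (tsk) {
		next = tsk->posix_timer_list;
		/* ... expire tsk's timers here ... */
		if (next == tsk)
			break;		/* self-pointing tail: end of list */
		tsk = next;
	}
}
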
25728 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
25729 index f2826c35e918..464a98155a0e 100644
25730 --- a/kernel/time/posix-timers.c
25731 +++ b/kernel/time/posix-timers.c
25732 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
25733  static struct pid *good_sigevent(sigevent_t * event)
25734  {
25735         struct task_struct *rtn = current->group_leader;
25736 +       int sig = event->sigev_signo;
25737  
25738         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
25739                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
25740 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
25741                 return NULL;
25742  
25743         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
25744 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
25745 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
25746 +            sig_kernel_coredump(sig)))
25747                 return NULL;
25748  
25749         return task_pid(rtn);
25750 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
25751         return overrun;
25752  }
25753  
25754 +/*
25755 + * Protected by RCU!
25756 + */
25757 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
25758 +{
25759 +#ifdef CONFIG_PREEMPT_RT_FULL
25760 +       if (kc->timer_set == common_timer_set)
25761 +               hrtimer_wait_for_timer(&timr->it.real.timer);
25762 +       else
25763 +               /* FIXME: Whacky hack for posix-cpu-timers */
25764 +               schedule_timeout(1);
25765 +#endif
25766 +}
25767 +
25768  /* Set a POSIX.1b interval timer. */
25769  /* timr->it_lock is taken. */
25770  static int
25771 @@ -903,6 +919,7 @@ retry:
25772         if (!timr)
25773                 return -EINVAL;
25774  
25775 +       rcu_read_lock();
25776         kc = clockid_to_kclock(timr->it_clock);
25777         if (WARN_ON_ONCE(!kc || !kc->timer_set))
25778                 error = -EINVAL;
25779 @@ -911,9 +928,12 @@ retry:
25780  
25781         unlock_timer(timr, flag);
25782         if (error == TIMER_RETRY) {
25783 +               timer_wait_for_callback(kc, timr);
25784                 rtn = NULL;     // We already got the old time...
25785 +               rcu_read_unlock();
25786                 goto retry;
25787         }
25788 +       rcu_read_unlock();
25789  
25790         if (old_setting && !error &&
25791             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
25792 @@ -951,10 +971,15 @@ retry_delete:
25793         if (!timer)
25794                 return -EINVAL;
25795  
25796 +       rcu_read_lock();
25797         if (timer_delete_hook(timer) == TIMER_RETRY) {
25798                 unlock_timer(timer, flags);
25799 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25800 +                                       timer);
25801 +               rcu_read_unlock();
25802                 goto retry_delete;
25803         }
25804 +       rcu_read_unlock();
25805  
25806         spin_lock(&current->sighand->siglock);
25807         list_del(&timer->list);
25808 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
25809  retry_delete:
25810         spin_lock_irqsave(&timer->it_lock, flags);
25811  
25812 +       /* On RT we can race with a deletion */
25813 +       if (!timer->it_signal) {
25814 +               unlock_timer(timer, flags);
25815 +               return;
25816 +       }
25817 +
25818         if (timer_delete_hook(timer) == TIMER_RETRY) {
25819 +               rcu_read_lock();
25820                 unlock_timer(timer, flags);
25821 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
25822 +                                       timer);
25823 +               rcu_read_unlock();
25824                 goto retry_delete;
25825         }
25826         list_del(&timer->list);
25827 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
25828 index 53d7184da0be..1b4ac3361c3f 100644
25829 --- a/kernel/time/tick-broadcast-hrtimer.c
25830 +++ b/kernel/time/tick-broadcast-hrtimer.c
25831 @@ -106,5 +106,6 @@ void tick_setup_hrtimer_broadcast(void)
25832  {
25833         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
25834         bctimer.function = bc_handler;
25835 +       bctimer.irqsafe = true;
25836         clockevents_register_device(&ce_broadcast_hrtimer);
25837  }
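
This hunk, the tick-sched.c hunk below and the hrtimer_init_sleeper() change above all mark their timers irqsafe, which exempts them from the hrtimer_rt_defer() softirq deferral. As a rough sketch, a hypothetical callback that only performs a wakeup and takes no sleeping locks could opt in the same way (illustration only):

#include <linux/hrtimer.h>

static enum hrtimer_restart example_irqsafe_cb(struct hrtimer *t)
{
	/* must not take sleeping locks: runs in hard irq context on RT */
	return HRTIMER_NORESTART;
}

static void example_irqsafe_setup(struct hrtimer *t)
{
	hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	t->function = example_irqsafe_cb;
	t->irqsafe = 1;		/* bypass hrtimer_rt_defer() on PREEMPT_RT */
}
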
25838 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
25839 index 4fcd99e12aa0..5a47f2e98faf 100644
25840 --- a/kernel/time/tick-common.c
25841 +++ b/kernel/time/tick-common.c
25842 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
25843  static void tick_periodic(int cpu)
25844  {
25845         if (tick_do_timer_cpu == cpu) {
25846 -               write_seqlock(&jiffies_lock);
25847 +               raw_spin_lock(&jiffies_lock);
25848 +               write_seqcount_begin(&jiffies_seq);
25849  
25850                 /* Keep track of the next tick event */
25851                 tick_next_period = ktime_add(tick_next_period, tick_period);
25852  
25853                 do_timer(1);
25854 -               write_sequnlock(&jiffies_lock);
25855 +               write_seqcount_end(&jiffies_seq);
25856 +               raw_spin_unlock(&jiffies_lock);
25857                 update_wall_time();
25858         }
25859  
25860 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
25861                 ktime_t next;
25862  
25863                 do {
25864 -                       seq = read_seqbegin(&jiffies_lock);
25865 +                       seq = read_seqcount_begin(&jiffies_seq);
25866                         next = tick_next_period;
25867 -               } while (read_seqretry(&jiffies_lock, seq));
25868 +               } while (read_seqcount_retry(&jiffies_seq, seq));
25869  
25870                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
25871  
25872 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
25873 index 22c57e191a23..d536824cbd36 100644
25874 --- a/kernel/time/tick-sched.c
25875 +++ b/kernel/time/tick-sched.c
25876 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
25877                 return;
25878  
25879         /* Reevalute with jiffies_lock held */
25880 -       write_seqlock(&jiffies_lock);
25881 +       raw_spin_lock(&jiffies_lock);
25882 +       write_seqcount_begin(&jiffies_seq);
25883  
25884         delta = ktime_sub(now, last_jiffies_update);
25885         if (delta.tv64 >= tick_period.tv64) {
25886 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
25887                 /* Keep the tick_next_period variable up to date */
25888                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
25889         } else {
25890 -               write_sequnlock(&jiffies_lock);
25891 +               write_seqcount_end(&jiffies_seq);
25892 +               raw_spin_unlock(&jiffies_lock);
25893                 return;
25894         }
25895 -       write_sequnlock(&jiffies_lock);
25896 +       write_seqcount_end(&jiffies_seq);
25897 +       raw_spin_unlock(&jiffies_lock);
25898         update_wall_time();
25899  }
25900  
25901 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
25902  {
25903         ktime_t period;
25904  
25905 -       write_seqlock(&jiffies_lock);
25906 +       raw_spin_lock(&jiffies_lock);
25907 +       write_seqcount_begin(&jiffies_seq);
25908         /* Did we start the jiffies update yet ? */
25909         if (last_jiffies_update.tv64 == 0)
25910                 last_jiffies_update = tick_next_period;
25911         period = last_jiffies_update;
25912 -       write_sequnlock(&jiffies_lock);
25913 +       write_seqcount_end(&jiffies_seq);
25914 +       raw_spin_unlock(&jiffies_lock);
25915         return period;
25916  }
25917  
25918 @@ -176,6 +181,11 @@ static bool can_stop_full_tick(void)
25919                 return false;
25920         }
25921  
25922 +       if (!arch_irq_work_has_interrupt()) {
25923 +               trace_tick_stop(0, "missing irq work interrupt\n");
25924 +               return false;
25925 +       }
25926 +
25927         /* sched_clock_tick() needs us? */
25928  #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
25929         /*
25930 @@ -204,6 +214,7 @@ static void nohz_full_kick_work_func(struct irq_work *work)
25931  
25932  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
25933         .func = nohz_full_kick_work_func,
25934 +       .flags = IRQ_WORK_HARD_IRQ,
25935  };
25936  
25937  /*
25938 @@ -578,10 +589,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
25939  
25940         /* Read jiffies and the time when jiffies were updated last */
25941         do {
25942 -               seq = read_seqbegin(&jiffies_lock);
25943 +               seq = read_seqcount_begin(&jiffies_seq);
25944                 basemono = last_jiffies_update.tv64;
25945                 basejiff = jiffies;
25946 -       } while (read_seqretry(&jiffies_lock, seq));
25947 +       } while (read_seqcount_retry(&jiffies_seq, seq));
25948         ts->last_jiffies = basejiff;
25949  
25950         if (rcu_needs_cpu(basemono, &next_rcu) ||
25951 @@ -753,14 +764,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
25952                 return false;
25953  
25954         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
25955 -               static int ratelimit;
25956 -
25957 -               if (ratelimit < 10 &&
25958 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
25959 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
25960 -                               (unsigned int) local_softirq_pending());
25961 -                       ratelimit++;
25962 -               }
25963 +               softirq_check_pending_idle();
25964                 return false;
25965         }
25966  
25967 @@ -1100,6 +1104,7 @@ void tick_setup_sched_timer(void)
25968          * Emulate tick processing via per-CPU hrtimers:
25969          */
25970         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
25971 +       ts->sched_timer.irqsafe = 1;
25972         ts->sched_timer.function = tick_sched_timer;
25973  
25974         /* Get the next period (per cpu) */
25975 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
25976 index 445601c580d6..8744b0d87479 100644
25977 --- a/kernel/time/timekeeping.c
25978 +++ b/kernel/time/timekeeping.c
25979 @@ -2070,8 +2070,10 @@ EXPORT_SYMBOL(hardpps);
25980   */
25981  void xtime_update(unsigned long ticks)
25982  {
25983 -       write_seqlock(&jiffies_lock);
25984 +       raw_spin_lock(&jiffies_lock);
25985 +       write_seqcount_begin(&jiffies_seq);
25986         do_timer(ticks);
25987 -       write_sequnlock(&jiffies_lock);
25988 +       write_seqcount_end(&jiffies_seq);
25989 +       raw_spin_unlock(&jiffies_lock);
25990         update_wall_time();
25991  }
25992 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
25993 index 704f595ce83f..763a3e5121ff 100644
25994 --- a/kernel/time/timekeeping.h
25995 +++ b/kernel/time/timekeeping.h
25996 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
25997  extern void do_timer(unsigned long ticks);
25998  extern void update_wall_time(void);
25999  
26000 -extern seqlock_t jiffies_lock;
26001 +extern raw_spinlock_t jiffies_lock;
26002 +extern seqcount_t jiffies_seq;
26003  
26004  #define CS_NAME_LEN    32
26005  
26006 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
26007 index bbc5d1114583..603699ff9411 100644
26008 --- a/kernel/time/timer.c
26009 +++ b/kernel/time/timer.c
26010 @@ -80,6 +80,9 @@ struct tvec_root {
26011  struct tvec_base {
26012         spinlock_t lock;
26013         struct timer_list *running_timer;
26014 +#ifdef CONFIG_PREEMPT_RT_FULL
26015 +       wait_queue_head_t wait_for_running_timer;
26016 +#endif
26017         unsigned long timer_jiffies;
26018         unsigned long next_timer;
26019         unsigned long active_timers;
26020 @@ -777,6 +780,39 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
26021                 cpu_relax();
26022         }
26023  }
26024 +#ifdef CONFIG_PREEMPT_RT_FULL
26025 +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
26026 +                                                 struct tvec_base *old,
26027 +                                                 struct tvec_base *new)
26028 +{
26029 +       /*
26030 +        * We cannot do what the !RT variant below does: if we were preempted
26031 +        * mid-switch, a waiter spinning in lock_timer_base() would loop forever.
26032 +        */
26033 +       if (spin_trylock(&new->lock)) {
26034 +               WRITE_ONCE(timer->flags,
26035 +                          (timer->flags & ~TIMER_BASEMASK) | new->cpu);
26036 +               spin_unlock(&old->lock);
26037 +               return new;
26038 +       }
26039 +       return old;
26040 +}
26041 +
26042 +#else
26043 +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
26044 +                                                 struct tvec_base *old,
26045 +                                                 struct tvec_base *new)
26046 +{
26047 +       /* See the comment in lock_timer_base() */
26048 +       timer->flags |= TIMER_MIGRATING;
26049 +
26050 +       spin_unlock(&old->lock);
26051 +       spin_lock(&new->lock);
26052 +       WRITE_ONCE(timer->flags,
26053 +                  (timer->flags & ~TIMER_BASEMASK) | new->cpu);
26054 +       return new;
26055 +}
26056 +#endif
26057  
26058  static inline int
26059  __mod_timer(struct timer_list *timer, unsigned long expires,
26060 @@ -807,16 +843,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
26061                  * handler yet has not finished. This also guarantees that
26062                  * the timer is serialized wrt itself.
26063                  */
26064 -               if (likely(base->running_timer != timer)) {
26065 -                       /* See the comment in lock_timer_base() */
26066 -                       timer->flags |= TIMER_MIGRATING;
26067 -
26068 -                       spin_unlock(&base->lock);
26069 -                       base = new_base;
26070 -                       spin_lock(&base->lock);
26071 -                       WRITE_ONCE(timer->flags,
26072 -                                  (timer->flags & ~TIMER_BASEMASK) | base->cpu);
26073 -               }
26074 +               if (likely(base->running_timer != timer))
26075 +                       base = switch_timer_base(timer, base, new_base);
26076         }
26077  
26078         timer->expires = expires;
26079 @@ -1006,6 +1034,33 @@ void add_timer_on(struct timer_list *timer, int cpu)
26080  }
26081  EXPORT_SYMBOL_GPL(add_timer_on);
26082  
26083 +#ifdef CONFIG_PREEMPT_RT_FULL
26084 +/*
26085 + * Wait for a running timer
26086 + */
26087 +static void wait_for_running_timer(struct timer_list *timer)
26088 +{
26089 +       struct tvec_base *base;
26090 +       u32 tf = timer->flags;
26091 +
26092 +       if (tf & TIMER_MIGRATING)
26093 +               return;
26094 +
26095 +       base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
26096 +       wait_event(base->wait_for_running_timer,
26097 +                  base->running_timer != timer);
26098 +}
26099 +
26100 +# define wakeup_timer_waiters(b)       wake_up_all(&(b)->wait_for_running_timer)
26101 +#else
26102 +static inline void wait_for_running_timer(struct timer_list *timer)
26103 +{
26104 +       cpu_relax();
26105 +}
26106 +
26107 +# define wakeup_timer_waiters(b)       do { } while (0)
26108 +#endif
26109 +
26110  /**
26111   * del_timer - deactive a timer.
26112   * @timer: the timer to be deactivated
26113 @@ -1063,7 +1118,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
26114  }
26115  EXPORT_SYMBOL(try_to_del_timer_sync);
26116  
26117 -#ifdef CONFIG_SMP
26118 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
26119  /**
26120   * del_timer_sync - deactivate a timer and wait for the handler to finish.
26121   * @timer: the timer to be deactivated
26122 @@ -1123,7 +1178,7 @@ int del_timer_sync(struct timer_list *timer)
26123                 int ret = try_to_del_timer_sync(timer);
26124                 if (ret >= 0)
26125                         return ret;
26126 -               cpu_relax();
26127 +               wait_for_running_timer(timer);
26128         }
26129  }
26130  EXPORT_SYMBOL(del_timer_sync);
26131 @@ -1248,16 +1303,18 @@ static inline void __run_timers(struct tvec_base *base)
26132                         if (irqsafe) {
26133                                 spin_unlock(&base->lock);
26134                                 call_timer_fn(timer, fn, data);
26135 +                               base->running_timer = NULL;
26136                                 spin_lock(&base->lock);
26137                         } else {
26138                                 spin_unlock_irq(&base->lock);
26139                                 call_timer_fn(timer, fn, data);
26140 +                               base->running_timer = NULL;
26141                                 spin_lock_irq(&base->lock);
26142                         }
26143                 }
26144         }
26145 -       base->running_timer = NULL;
26146         spin_unlock_irq(&base->lock);
26147 +       wakeup_timer_waiters(base);
26148  }
26149  
26150  #ifdef CONFIG_NO_HZ_COMMON
26151 @@ -1390,6 +1447,14 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
26152         if (cpu_is_offline(smp_processor_id()))
26153                 return expires;
26154  
26155 +#ifdef CONFIG_PREEMPT_RT_FULL
26156 +       /*
26157 +        * On PREEMPT_RT we cannot sleep here. As a result we can't take
26158 +        * the base lock to check when the next timer is pending and so
26159 +        * we assume the next jiffy.
26160 +        */
26161 +       return basem + TICK_NSEC;
26162 +#endif
26163         spin_lock(&base->lock);
26164         if (base->active_timers) {
26165                 if (time_before_eq(base->next_timer, base->timer_jiffies))
26166 @@ -1416,13 +1481,13 @@ void update_process_times(int user_tick)
26167  
26168         /* Note: this timer irq context must be accounted for as well. */
26169         account_process_tick(p, user_tick);
26170 +       scheduler_tick();
26171         run_local_timers();
26172         rcu_check_callbacks(user_tick);
26173 -#ifdef CONFIG_IRQ_WORK
26174 +#if defined(CONFIG_IRQ_WORK)
26175         if (in_irq())
26176                 irq_work_tick();
26177  #endif
26178 -       scheduler_tick();
26179         run_posix_cpu_timers(p);
26180  }
26181  
26182 @@ -1433,6 +1498,8 @@ static void run_timer_softirq(struct softirq_action *h)
26183  {
26184         struct tvec_base *base = this_cpu_ptr(&tvec_bases);
26185  
26186 +       irq_work_tick_soft();
26187 +
26188         if (time_after_eq(jiffies, base->timer_jiffies))
26189                 __run_timers(base);
26190  }
26191 @@ -1589,7 +1656,7 @@ static void migrate_timers(int cpu)
26192  
26193         BUG_ON(cpu_online(cpu));
26194         old_base = per_cpu_ptr(&tvec_bases, cpu);
26195 -       new_base = get_cpu_ptr(&tvec_bases);
26196 +       new_base = get_local_ptr(&tvec_bases);
26197         /*
26198          * The caller is globally serialized and nobody else
26199          * takes two locks at once, deadlock is not possible.
26200 @@ -1613,7 +1680,7 @@ static void migrate_timers(int cpu)
26201  
26202         spin_unlock(&old_base->lock);
26203         spin_unlock_irq(&new_base->lock);
26204 -       put_cpu_ptr(&tvec_bases);
26205 +       put_local_ptr(&tvec_bases);
26206  }
26207  
26208  static int timer_cpu_notify(struct notifier_block *self,
26209 @@ -1645,6 +1712,9 @@ static void __init init_timer_cpu(int cpu)
26210  
26211         base->cpu = cpu;
26212         spin_lock_init(&base->lock);
26213 +#ifdef CONFIG_PREEMPT_RT_FULL
26214 +       init_waitqueue_head(&base->wait_for_running_timer);
26215 +#endif
26216  
26217         base->timer_jiffies = jiffies;
26218         base->next_timer = base->timer_jiffies;
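
One visible consequence of the timer.c changes above: on PREEMPT_RT_FULL, del_timer_sync() no longer busy-waits for a running handler but sleeps on the per-base waitqueue until the softirq wakes it. Callers keep the existing rule that they must be in sleepable context; a hypothetical driver-style teardown is unchanged in form (illustration only):

#include <linux/timer.h>

struct example_dev {
	struct timer_list poll_timer;
};

static void example_shutdown(struct example_dev *dev)
{
	/* may now block until a concurrently running handler completes */
	del_timer_sync(&dev->poll_timer);
}
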
26219 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
26220 index e45db6b0d878..364ccd0eb57b 100644
26221 --- a/kernel/trace/Kconfig
26222 +++ b/kernel/trace/Kconfig
26223 @@ -187,6 +187,24 @@ config IRQSOFF_TRACER
26224           enabled. This option and the preempt-off timing option can be
26225           used together or separately.)
26226  
26227 +config INTERRUPT_OFF_HIST
26228 +       bool "Interrupts-off Latency Histogram"
26229 +       depends on IRQSOFF_TRACER
26230 +       help
26231 +         This option generates continuously updated histograms (one per cpu)
26232 +         of the duration of time periods with interrupts disabled. The
26233 +         histograms are disabled by default. To enable them, write a non-zero
26234 +         number to
26235 +
26236 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
26237 +
26238 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
26239 +         per cpu) are generated that accumulate the duration of time periods
26240 +         when both interrupts and preemption are disabled. The histogram data
26241 +         will be located in the debug file system at
26242 +
26243 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
26244 +
26245  config PREEMPT_TRACER
26246         bool "Preemption-off Latency Tracer"
26247         default n
26248 @@ -211,6 +229,24 @@ config PREEMPT_TRACER
26249           enabled. This option and the irqs-off timing option can be
26250           used together or separately.)
26251  
26252 +config PREEMPT_OFF_HIST
26253 +       bool "Preemption-off Latency Histogram"
26254 +       depends on PREEMPT_TRACER
26255 +       help
26256 +         This option generates continuously updated histograms (one per cpu)
26257 +         of the duration of time periods with preemption disabled. The
26258 +         histograms are disabled by default. To enable them, write a non-zero
26259 +         number to
26260 +
26261 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
26262 +
26263 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
26264 +         per cpu) are generated that accumulate the duration of time periods
26265 +         when both interrupts and preemption are disabled. The histogram data
26266 +         will be located in the debug file system at
26267 +
26268 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
26269 +
26270  config SCHED_TRACER
26271         bool "Scheduling Latency Tracer"
26272         select GENERIC_TRACER
26273 @@ -221,6 +257,74 @@ config SCHED_TRACER
26274           This tracer tracks the latency of the highest priority task
26275           to be scheduled in, starting from the point it has woken up.
26276  
26277 +config WAKEUP_LATENCY_HIST
26278 +       bool "Scheduling Latency Histogram"
26279 +       depends on SCHED_TRACER
26280 +       help
26281 +         This option generates continuously updated histograms (one per cpu)
26282 +         of the scheduling latency of the highest priority task.
26283 +         The histograms are disabled by default. To enable them, write a
26284 +         non-zero number to
26285 +
26286 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
26287 +
26288 +         Two different algorithms are used, one to determine the latency of
26289 +         processes that exclusively use the highest priority of the system and
26290 +         another one to determine the latency of processes that share the
26291 +         highest system priority with other processes. The former is used to
26292 +         improve hardware and system software, the latter to optimize the
26293 +         priority design of a given system. The histogram data will be
26294 +         located in the debug file system at
26295 +
26296 +             /sys/kernel/debug/tracing/latency_hist/wakeup
26297 +
26298 +         and
26299 +
26300 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
26301 +
26302 +         If both Scheduling Latency Histogram and Missed Timer Offsets
26303 +         Histogram are selected, additional histogram data will be collected
26304 +         that contain, in addition to the wakeup latency, the timer latency, in
26305 +         case the wakeup was triggered by an expired timer. These histograms
26306 +         are available in the
26307 +
26308 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
26309 +
26310 +         directory. They reflect the apparent interrupt and scheduling latency
26311 +         and are best suited to determining the worst-case latency of a given
26312 +         system. To enable these histograms, write a non-zero number to
26313 +
26314 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
26315 +
26316 +config MISSED_TIMER_OFFSETS_HIST
26317 +       depends on HIGH_RES_TIMERS
26318 +       select GENERIC_TRACER
26319 +       bool "Missed Timer Offsets Histogram"
26320 +       help
26321 +         Generate a histogram of missed timer offsets in microseconds. The
26322 +         histograms are disabled by default. To enable them, write a non-zero
26323 +         number to
26324 +
26325 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
26326 +
26327 +         The histogram data will be located in the debug file system at
26328 +
26329 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
26330 +
26331 +         If both Scheduling Latency Histogram and Missed Timer Offsets
26332 +         Histogram are selected, additional histogram data will be collected
26333 +         that contain, in addition to the wakeup latency, the timer latency, in
26334 +         case the wakeup was triggered by an expired timer. These histograms
26335 +         are available in the
26336 +
26337 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
26338 +
26339 +         directory. They reflect the apparent interrupt and scheduling latency
26340 +         and are best suited to determining the worst-case latency of a given
26341 +         system. To enable these histograms, write a non-zero number to
26342 +
26343 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
26344 +
26345  config ENABLE_DEFAULT_TRACERS
26346         bool "Trace process context switches and events"
26347         depends on !GENERIC_TRACER
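
The help texts above all reduce to the same operation: write a non-zero number to the matching file under latency_hist/enable once debugfs is mounted. A small hypothetical user-space helper for the wakeup histograms (assuming debugfs is mounted at /sys/kernel/debug; echo from a shell works just as well):

#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/debug/tracing/latency_hist/enable/wakeup";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);	/* any non-zero value enables the histograms */
	return fclose(f) ? 1 : 0;
}
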
26348 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
26349 index 05ea5167e6bb..bc08c67301ae 100644
26350 --- a/kernel/trace/Makefile
26351 +++ b/kernel/trace/Makefile
26352 @@ -40,6 +40,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
26353  obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
26354  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
26355  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
26356 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
26357 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
26358 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
26359 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
26360  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
26361  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
26362  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
26363 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
26364 new file mode 100644
26365 index 000000000000..7f6ee70dea41
26366 --- /dev/null
26367 +++ b/kernel/trace/latency_hist.c
26368 @@ -0,0 +1,1178 @@
26369 +/*
26370 + * kernel/trace/latency_hist.c
26371 + *
26372 + * Add support for histograms of preemption-off latency and
26373 + * Add support for histograms of preemption-off latency,
26374 + * interrupt-off latency and wakeup latency; it depends on
26375 + * Real-Time Preemption support.
26376 + *  Copyright (C) 2005 MontaVista Software, Inc.
26377 + *  Yi Yang <yyang@ch.mvista.com>
26378 + *
26379 + *  Converted to work with the new latency tracer.
26380 + *  Copyright (C) 2008 Red Hat, Inc.
26381 + *    Steven Rostedt <srostedt@redhat.com>
26382 + *
26383 + */
26384 +#include <linux/module.h>
26385 +#include <linux/debugfs.h>
26386 +#include <linux/seq_file.h>
26387 +#include <linux/percpu.h>
26388 +#include <linux/kallsyms.h>
26389 +#include <linux/uaccess.h>
26390 +#include <linux/sched.h>
26391 +#include <linux/sched/rt.h>
26392 +#include <linux/slab.h>
26393 +#include <linux/atomic.h>
26394 +#include <asm/div64.h>
26395 +
26396 +#include "trace.h"
26397 +#include <trace/events/sched.h>
26398 +
26399 +#define NSECS_PER_USECS 1000L
26400 +
26401 +#define CREATE_TRACE_POINTS
26402 +#include <trace/events/hist.h>
26403 +
26404 +enum {
26405 +       IRQSOFF_LATENCY = 0,
26406 +       PREEMPTOFF_LATENCY,
26407 +       PREEMPTIRQSOFF_LATENCY,
26408 +       WAKEUP_LATENCY,
26409 +       WAKEUP_LATENCY_SHAREDPRIO,
26410 +       MISSED_TIMER_OFFSETS,
26411 +       TIMERANDWAKEUP_LATENCY,
26412 +       MAX_LATENCY_TYPE,
26413 +};
26414 +
26415 +#define MAX_ENTRY_NUM 10240
26416 +
26417 +struct hist_data {
26418 +       atomic_t hist_mode; /* 0 log, 1 don't log */
26419 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
26420 +       long min_lat;
26421 +       long max_lat;
26422 +       unsigned long long below_hist_bound_samples;
26423 +       unsigned long long above_hist_bound_samples;
26424 +       long long accumulate_lat;
26425 +       unsigned long long total_samples;
26426 +       unsigned long long hist_array[MAX_ENTRY_NUM];
26427 +};
26428 +
26429 +struct enable_data {
26430 +       int latency_type;
26431 +       int enabled;
26432 +};
26433 +
26434 +static char *latency_hist_dir_root = "latency_hist";
26435 +
26436 +#ifdef CONFIG_INTERRUPT_OFF_HIST
26437 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
26438 +static char *irqsoff_hist_dir = "irqsoff";
26439 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
26440 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
26441 +#endif
26442 +
26443 +#ifdef CONFIG_PREEMPT_OFF_HIST
26444 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
26445 +static char *preemptoff_hist_dir = "preemptoff";
26446 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
26447 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
26448 +#endif
26449 +
26450 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
26451 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
26452 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
26453 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
26454 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
26455 +#endif
26456 +
26457 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
26458 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
26459 +static struct enable_data preemptirqsoff_enabled_data = {
26460 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
26461 +       .enabled = 0,
26462 +};
26463 +#endif
26464 +
26465 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26466 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26467 +struct maxlatproc_data {
26468 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
26469 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
26470 +       int pid;
26471 +       int current_pid;
26472 +       int prio;
26473 +       int current_prio;
26474 +       long latency;
26475 +       long timeroffset;
26476 +       cycle_t timestamp;
26477 +};
26478 +#endif
26479 +
26480 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26481 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
26482 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
26483 +static char *wakeup_latency_hist_dir = "wakeup";
26484 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
26485 +static notrace void probe_wakeup_latency_hist_start(void *v,
26486 +       struct task_struct *p);
26487 +static notrace void probe_wakeup_latency_hist_stop(void *v,
26488 +       bool preempt, struct task_struct *prev, struct task_struct *next);
26489 +static notrace void probe_sched_migrate_task(void *,
26490 +       struct task_struct *task, int cpu);
26491 +static struct enable_data wakeup_latency_enabled_data = {
26492 +       .latency_type = WAKEUP_LATENCY,
26493 +       .enabled = 0,
26494 +};
26495 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
26496 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
26497 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
26498 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
26499 +static unsigned long wakeup_pid;
26500 +#endif
26501 +
26502 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26503 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
26504 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
26505 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
26506 +       long long offset, struct task_struct *curr, struct task_struct *task);
26507 +static struct enable_data missed_timer_offsets_enabled_data = {
26508 +       .latency_type = MISSED_TIMER_OFFSETS,
26509 +       .enabled = 0,
26510 +};
26511 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
26512 +static unsigned long missed_timer_offsets_pid;
26513 +#endif
26514 +
26515 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
26516 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26517 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
26518 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
26519 +static struct enable_data timerandwakeup_enabled_data = {
26520 +       .latency_type = TIMERANDWAKEUP_LATENCY,
26521 +       .enabled = 0,
26522 +};
26523 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
26524 +#endif
26525 +
26526 +void notrace latency_hist(int latency_type, int cpu, long latency,
26527 +                         long timeroffset, cycle_t stop,
26528 +                         struct task_struct *p)
26529 +{
26530 +       struct hist_data *my_hist;
26531 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26532 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26533 +       struct maxlatproc_data *mp = NULL;
26534 +#endif
26535 +
26536 +       if (!cpu_possible(cpu) || latency_type < 0 ||
26537 +           latency_type >= MAX_LATENCY_TYPE)
26538 +               return;
26539 +
26540 +       switch (latency_type) {
26541 +#ifdef CONFIG_INTERRUPT_OFF_HIST
26542 +       case IRQSOFF_LATENCY:
26543 +               my_hist = &per_cpu(irqsoff_hist, cpu);
26544 +               break;
26545 +#endif
26546 +#ifdef CONFIG_PREEMPT_OFF_HIST
26547 +       case PREEMPTOFF_LATENCY:
26548 +               my_hist = &per_cpu(preemptoff_hist, cpu);
26549 +               break;
26550 +#endif
26551 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
26552 +       case PREEMPTIRQSOFF_LATENCY:
26553 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
26554 +               break;
26555 +#endif
26556 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26557 +       case WAKEUP_LATENCY:
26558 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
26559 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
26560 +               break;
26561 +       case WAKEUP_LATENCY_SHAREDPRIO:
26562 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
26563 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
26564 +               break;
26565 +#endif
26566 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26567 +       case MISSED_TIMER_OFFSETS:
26568 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
26569 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
26570 +               break;
26571 +#endif
26572 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
26573 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26574 +       case TIMERANDWAKEUP_LATENCY:
26575 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
26576 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
26577 +               break;
26578 +#endif
26579 +
26580 +       default:
26581 +               return;
26582 +       }
26583 +
26584 +       latency += my_hist->offset;
26585 +
26586 +       if (atomic_read(&my_hist->hist_mode) == 0)
26587 +               return;
26588 +
26589 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
26590 +               if (latency < 0)
26591 +                       my_hist->below_hist_bound_samples++;
26592 +               else
26593 +                       my_hist->above_hist_bound_samples++;
26594 +       } else
26595 +               my_hist->hist_array[latency]++;
26596 +
26597 +       if (unlikely(latency > my_hist->max_lat ||
26598 +           my_hist->min_lat == LONG_MAX)) {
26599 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26600 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26601 +               if (latency_type == WAKEUP_LATENCY ||
26602 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
26603 +                   latency_type == MISSED_TIMER_OFFSETS ||
26604 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
26605 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
26606 +                       strncpy(mp->current_comm, current->comm,
26607 +                           sizeof(mp->current_comm));
26608 +                       mp->pid = task_pid_nr(p);
26609 +                       mp->current_pid = task_pid_nr(current);
26610 +                       mp->prio = p->prio;
26611 +                       mp->current_prio = current->prio;
26612 +                       mp->latency = latency;
26613 +                       mp->timeroffset = timeroffset;
26614 +                       mp->timestamp = stop;
26615 +               }
26616 +#endif
26617 +               my_hist->max_lat = latency;
26618 +       }
26619 +       if (unlikely(latency < my_hist->min_lat))
26620 +               my_hist->min_lat = latency;
26621 +       my_hist->total_samples++;
26622 +       my_hist->accumulate_lat += latency;
26623 +}
26624 +
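The accounting above is plain bucketing: the microsecond sample is shifted by the histogram's offset, out-of-range values only bump the below/above counters, and everything else increments one slot of hist_array while the min/max/average bookkeeping is updated. A minimal stand-alone sketch of that bucketing step follows; the MAX_ENTRY_NUM value of 10240 is a placeholder here (the patch defines its own value earlier in this file).

/* Illustrative sketch (not kernel code): mirrors the bucketing in
 * latency_hist() above.  MAX_ENTRY_NUM is a placeholder value. */
#include <stdio.h>

#define MAX_ENTRY_NUM 10240

static unsigned long long hist_array[MAX_ENTRY_NUM];
static unsigned long long below_hist_bound_samples;
static unsigned long long above_hist_bound_samples;

static void account(long latency, long offset)
{
        latency += offset;
        if (latency < 0)
                below_hist_bound_samples++;     /* below the histogram range */
        else if (latency >= MAX_ENTRY_NUM)
                above_hist_bound_samples++;     /* above the histogram range */
        else
                hist_array[latency]++;          /* one more sample in this bucket */
}

int main(void)
{
        account(37, 0);         /* lands in bucket 37 (37 us) */
        account(-5, 0);         /* counted as below the range */
        account(20000, 0);      /* counted as above the range */
        printf("bucket 37: %llu, below: %llu, above: %llu\n",
               hist_array[37], below_hist_bound_samples,
               above_hist_bound_samples);
        return 0;
}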
26625 +static void *l_start(struct seq_file *m, loff_t *pos)
26626 +{
26627 +       loff_t *index_ptr = NULL;
26628 +       loff_t index = *pos;
26629 +       struct hist_data *my_hist = m->private;
26630 +
26631 +       if (index == 0) {
26632 +               char minstr[32], avgstr[32], maxstr[32];
26633 +
26634 +               atomic_dec(&my_hist->hist_mode);
26635 +
26636 +               if (likely(my_hist->total_samples)) {
26637 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
26638 +                           my_hist->total_samples);
26639 +                       snprintf(minstr, sizeof(minstr), "%ld",
26640 +                           my_hist->min_lat - my_hist->offset);
26641 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
26642 +                           avg - my_hist->offset);
26643 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
26644 +                           my_hist->max_lat - my_hist->offset);
26645 +               } else {
26646 +                       strcpy(minstr, "<undef>");
26647 +                       strcpy(avgstr, minstr);
26648 +                       strcpy(maxstr, minstr);
26649 +               }
26650 +
26651 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
26652 +                          "#Average latency: %s microseconds\n"
26653 +                          "#Maximum latency: %s microseconds\n"
26654 +                          "#Total samples: %llu\n"
26655 +                          "#There are %llu samples lower than %ld"
26656 +                          " microseconds.\n"
26657 +                          "#There are %llu samples greater or equal"
26658 +                          " than %ld microseconds.\n"
26659 +                          "#usecs\t%16s\n",
26660 +                          minstr, avgstr, maxstr,
26661 +                          my_hist->total_samples,
26662 +                          my_hist->below_hist_bound_samples,
26663 +                          -my_hist->offset,
26664 +                          my_hist->above_hist_bound_samples,
26665 +                          MAX_ENTRY_NUM - my_hist->offset,
26666 +                          "samples");
26667 +       }
26668 +       if (index < MAX_ENTRY_NUM) {
26669 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
26670 +               if (index_ptr)
26671 +                       *index_ptr = index;
26672 +       }
26673 +
26674 +       return index_ptr;
26675 +}
26676 +
26677 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
26678 +{
26679 +       loff_t *index_ptr = p;
26680 +       struct hist_data *my_hist = m->private;
26681 +
26682 +       if (++*pos >= MAX_ENTRY_NUM) {
26683 +               atomic_inc(&my_hist->hist_mode);
26684 +               return NULL;
26685 +       }
26686 +       *index_ptr = *pos;
26687 +       return index_ptr;
26688 +}
26689 +
26690 +static void l_stop(struct seq_file *m, void *p)
26691 +{
26692 +       kfree(p);
26693 +}
26694 +
26695 +static int l_show(struct seq_file *m, void *p)
26696 +{
26697 +       int index = *(loff_t *) p;
26698 +       struct hist_data *my_hist = m->private;
26699 +
26700 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
26701 +           my_hist->hist_array[index]);
26702 +       return 0;
26703 +}
26704 +
26705 +static const struct seq_operations latency_hist_seq_op = {
26706 +       .start = l_start,
26707 +       .next  = l_next,
26708 +       .stop  = l_stop,
26709 +       .show  = l_show
26710 +};
26711 +
26712 +static int latency_hist_open(struct inode *inode, struct file *file)
26713 +{
26714 +       int ret;
26715 +
26716 +       ret = seq_open(file, &latency_hist_seq_op);
26717 +       if (!ret) {
26718 +               struct seq_file *seq = file->private_data;
26719 +               seq->private = inode->i_private;
26720 +       }
26721 +       return ret;
26722 +}
26723 +
26724 +static const struct file_operations latency_hist_fops = {
26725 +       .open = latency_hist_open,
26726 +       .read = seq_read,
26727 +       .llseek = seq_lseek,
26728 +       .release = seq_release,
26729 +};
26730 +
26731 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26732 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26733 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
26734 +{
26735 +       mp->comm[0] = mp->current_comm[0] = '\0';
26736 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
26737 +           mp->latency = mp->timeroffset = -1;
26738 +       mp->timestamp = 0;
26739 +}
26740 +#endif
26741 +
26742 +static void hist_reset(struct hist_data *hist)
26743 +{
26744 +       atomic_dec(&hist->hist_mode);
26745 +
26746 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
26747 +       hist->below_hist_bound_samples = 0ULL;
26748 +       hist->above_hist_bound_samples = 0ULL;
26749 +       hist->min_lat = LONG_MAX;
26750 +       hist->max_lat = LONG_MIN;
26751 +       hist->total_samples = 0ULL;
26752 +       hist->accumulate_lat = 0LL;
26753 +
26754 +       atomic_inc(&hist->hist_mode);
26755 +}
26756 +
26757 +static ssize_t
26758 +latency_hist_reset(struct file *file, const char __user *a,
26759 +                  size_t size, loff_t *off)
26760 +{
26761 +       int cpu;
26762 +       struct hist_data *hist = NULL;
26763 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26764 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26765 +       struct maxlatproc_data *mp = NULL;
26766 +#endif
26767 +       off_t latency_type = (off_t) file->private_data;
26768 +
26769 +       for_each_online_cpu(cpu) {
26770 +
26771 +               switch (latency_type) {
26772 +#ifdef CONFIG_PREEMPT_OFF_HIST
26773 +               case PREEMPTOFF_LATENCY:
26774 +                       hist = &per_cpu(preemptoff_hist, cpu);
26775 +                       break;
26776 +#endif
26777 +#ifdef CONFIG_INTERRUPT_OFF_HIST
26778 +               case IRQSOFF_LATENCY:
26779 +                       hist = &per_cpu(irqsoff_hist, cpu);
26780 +                       break;
26781 +#endif
26782 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
26783 +               case PREEMPTIRQSOFF_LATENCY:
26784 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
26785 +                       break;
26786 +#endif
26787 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26788 +               case WAKEUP_LATENCY:
26789 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
26790 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
26791 +                       break;
26792 +               case WAKEUP_LATENCY_SHAREDPRIO:
26793 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
26794 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
26795 +                       break;
26796 +#endif
26797 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26798 +               case MISSED_TIMER_OFFSETS:
26799 +                       hist = &per_cpu(missed_timer_offsets, cpu);
26800 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
26801 +                       break;
26802 +#endif
26803 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
26804 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26805 +               case TIMERANDWAKEUP_LATENCY:
26806 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
26807 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
26808 +                       break;
26809 +#endif
26810 +               }
26811 +
26812 +               hist_reset(hist);
26813 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26814 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26815 +               if (latency_type == WAKEUP_LATENCY ||
26816 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
26817 +                   latency_type == MISSED_TIMER_OFFSETS ||
26818 +                   latency_type == TIMERANDWAKEUP_LATENCY)
26819 +                       clear_maxlatprocdata(mp);
26820 +#endif
26821 +       }
26822 +
26823 +       return size;
26824 +}
26825 +
26826 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26827 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26828 +static ssize_t
26829 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
26830 +{
26831 +       char buf[64];
26832 +       int r;
26833 +       unsigned long *this_pid = file->private_data;
26834 +
26835 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
26836 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
26837 +}
26838 +
26839 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
26840 +                     size_t cnt, loff_t *ppos)
26841 +{
26842 +       char buf[64];
26843 +       unsigned long pid;
26844 +       unsigned long *this_pid = file->private_data;
26845 +
26846 +       if (cnt >= sizeof(buf))
26847 +               return -EINVAL;
26848 +
26849 +       if (copy_from_user(&buf, ubuf, cnt))
26850 +               return -EFAULT;
26851 +
26852 +       buf[cnt] = '\0';
26853 +
26854 +       if (kstrtoul(buf, 10, &pid))
26855 +               return -EINVAL;
26856 +
26857 +       *this_pid = pid;
26858 +
26859 +       return cnt;
26860 +}
26861 +#endif
26862 +
26863 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
26864 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
26865 +static ssize_t
26866 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
26867 +{
26868 +       int r;
26869 +       struct maxlatproc_data *mp = file->private_data;
26870 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
26871 +       unsigned long long t;
26872 +       unsigned long usecs, secs;
26873 +       char *buf;
26874 +
26875 +       if (mp->pid == -1 || mp->current_pid == -1) {
26876 +               buf = "(none)\n";
26877 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
26878 +                   strlen(buf));
26879 +       }
26880 +
26881 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
26882 +       if (buf == NULL)
26883 +               return -ENOMEM;
26884 +
26885 +       t = ns2usecs(mp->timestamp);
26886 +       usecs = do_div(t, USEC_PER_SEC);
26887 +       secs = (unsigned long) t;
26888 +       r = snprintf(buf, strmaxlen,
26889 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
26890 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
26891 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
26892 +           secs, usecs);
26893 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
26894 +       kfree(buf);
26895 +       return r;
26896 +}
26897 +#endif
26898 +
26899 +static ssize_t
26900 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
26901 +{
26902 +       char buf[64];
26903 +       struct enable_data *ed = file->private_data;
26904 +       int r;
26905 +
26906 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
26907 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
26908 +}
26909 +
26910 +static ssize_t
26911 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
26912 +{
26913 +       char buf[64];
26914 +       long enable;
26915 +       struct enable_data *ed = file->private_data;
26916 +
26917 +       if (cnt >= sizeof(buf))
26918 +               return -EINVAL;
26919 +
26920 +       if (copy_from_user(&buf, ubuf, cnt))
26921 +               return -EFAULT;
26922 +
26923 +       buf[cnt] = 0;
26924 +
26925 +       if (kstrtoul(buf, 10, &enable))
26926 +               return -EINVAL;
26927 +
26928 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
26929 +               return cnt;
26930 +
26931 +       if (enable) {
26932 +               int ret;
26933 +
26934 +               switch (ed->latency_type) {
26935 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
26936 +               case PREEMPTIRQSOFF_LATENCY:
26937 +                       ret = register_trace_preemptirqsoff_hist(
26938 +                           probe_preemptirqsoff_hist, NULL);
26939 +                       if (ret) {
26940 +                               pr_info("latency hist: Couldn't assign "
26941 +                                   "probe_preemptirqsoff_hist "
26942 +                                   "to trace_preemptirqsoff_hist\n");
26943 +                               return ret;
26944 +                       }
26945 +                       break;
26946 +#endif
26947 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
26948 +               case WAKEUP_LATENCY:
26949 +                       ret = register_trace_sched_wakeup(
26950 +                           probe_wakeup_latency_hist_start, NULL);
26951 +                       if (ret) {
26952 +                               pr_info("wakeup trace: Couldn't assign "
26953 +                                   "probe_wakeup_latency_hist_start "
26954 +                                   "to trace_sched_wakeup\n");
26955 +                               return ret;
26956 +                       }
26957 +                       ret = register_trace_sched_wakeup_new(
26958 +                           probe_wakeup_latency_hist_start, NULL);
26959 +                       if (ret) {
26960 +                               pr_info("wakeup trace: Couldn't assign "
26961 +                                   "probe_wakeup_latency_hist_start "
26962 +                                   "to trace_sched_wakeup_new\n");
26963 +                               unregister_trace_sched_wakeup(
26964 +                                   probe_wakeup_latency_hist_start, NULL);
26965 +                               return ret;
26966 +                       }
26967 +                       ret = register_trace_sched_switch(
26968 +                           probe_wakeup_latency_hist_stop, NULL);
26969 +                       if (ret) {
26970 +                               pr_info("wakeup trace: Couldn't assign "
26971 +                                   "probe_wakeup_latency_hist_stop "
26972 +                                   "to trace_sched_switch\n");
26973 +                               unregister_trace_sched_wakeup(
26974 +                                   probe_wakeup_latency_hist_start, NULL);
26975 +                               unregister_trace_sched_wakeup_new(
26976 +                                   probe_wakeup_latency_hist_start, NULL);
26977 +                               return ret;
26978 +                       }
26979 +                       ret = register_trace_sched_migrate_task(
26980 +                           probe_sched_migrate_task, NULL);
26981 +                       if (ret) {
26982 +                               pr_info("wakeup trace: Couldn't assign "
26983 +                                   "probe_sched_migrate_task "
26984 +                                   "to trace_sched_migrate_task\n");
26985 +                               unregister_trace_sched_wakeup(
26986 +                                   probe_wakeup_latency_hist_start, NULL);
26987 +                               unregister_trace_sched_wakeup_new(
26988 +                                   probe_wakeup_latency_hist_start, NULL);
26989 +                               unregister_trace_sched_switch(
26990 +                                   probe_wakeup_latency_hist_stop, NULL);
26991 +                               return ret;
26992 +                       }
26993 +                       break;
26994 +#endif
26995 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26996 +               case MISSED_TIMER_OFFSETS:
26997 +                       ret = register_trace_hrtimer_interrupt(
26998 +                           probe_hrtimer_interrupt, NULL);
26999 +                       if (ret) {
27000 +                               pr_info("latency hist: Couldn't assign "
27001 +                                   "probe_hrtimer_interrupt "
27002 +                                   "to trace_hrtimer_interrupt\n");
27003 +                               return ret;
27004 +                       }
27005 +                       break;
27006 +#endif
27007 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27008 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27009 +               case TIMERANDWAKEUP_LATENCY:
27010 +                       if (!wakeup_latency_enabled_data.enabled ||
27011 +                           !missed_timer_offsets_enabled_data.enabled)
27012 +                               return -EINVAL;
27013 +                       break;
27014 +#endif
27015 +               default:
27016 +                       break;
27017 +               }
27018 +       } else {
27019 +               switch (ed->latency_type) {
27020 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
27021 +               case PREEMPTIRQSOFF_LATENCY:
27022 +                       {
27023 +                               int cpu;
27024 +
27025 +                               unregister_trace_preemptirqsoff_hist(
27026 +                                   probe_preemptirqsoff_hist, NULL);
27027 +                               for_each_online_cpu(cpu) {
27028 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27029 +                                       per_cpu(hist_irqsoff_counting,
27030 +                                           cpu) = 0;
27031 +#endif
27032 +#ifdef CONFIG_PREEMPT_OFF_HIST
27033 +                                       per_cpu(hist_preemptoff_counting,
27034 +                                           cpu) = 0;
27035 +#endif
27036 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27037 +                                       per_cpu(hist_preemptirqsoff_counting,
27038 +                                           cpu) = 0;
27039 +#endif
27040 +                               }
27041 +                       }
27042 +                       break;
27043 +#endif
27044 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27045 +               case WAKEUP_LATENCY:
27046 +                       {
27047 +                               int cpu;
27048 +
27049 +                               unregister_trace_sched_wakeup(
27050 +                                   probe_wakeup_latency_hist_start, NULL);
27051 +                               unregister_trace_sched_wakeup_new(
27052 +                                   probe_wakeup_latency_hist_start, NULL);
27053 +                               unregister_trace_sched_switch(
27054 +                                   probe_wakeup_latency_hist_stop, NULL);
27055 +                               unregister_trace_sched_migrate_task(
27056 +                                   probe_sched_migrate_task, NULL);
27057 +
27058 +                               for_each_online_cpu(cpu) {
27059 +                                       per_cpu(wakeup_task, cpu) = NULL;
27060 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
27061 +                               }
27062 +                       }
27063 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27064 +                       timerandwakeup_enabled_data.enabled = 0;
27065 +#endif
27066 +                       break;
27067 +#endif
27068 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27069 +               case MISSED_TIMER_OFFSETS:
27070 +                       unregister_trace_hrtimer_interrupt(
27071 +                           probe_hrtimer_interrupt, NULL);
27072 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27073 +                       timerandwakeup_enabled_data.enabled = 0;
27074 +#endif
27075 +                       break;
27076 +#endif
27077 +               default:
27078 +                       break;
27079 +               }
27080 +       }
27081 +       ed->enabled = enable;
27082 +       return cnt;
27083 +}
27084 +
27085 +static const struct file_operations latency_hist_reset_fops = {
27086 +       .open = tracing_open_generic,
27087 +       .write = latency_hist_reset,
27088 +};
27089 +
27090 +static const struct file_operations enable_fops = {
27091 +       .open = tracing_open_generic,
27092 +       .read = show_enable,
27093 +       .write = do_enable,
27094 +};
27095 +
27096 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27097 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27098 +static const struct file_operations pid_fops = {
27099 +       .open = tracing_open_generic,
27100 +       .read = show_pid,
27101 +       .write = do_pid,
27102 +};
27103 +
27104 +static const struct file_operations maxlatproc_fops = {
27105 +       .open = tracing_open_generic,
27106 +       .read = show_maxlatproc,
27107 +};
27108 +#endif
27109 +
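Each histogram type is armed or disarmed through the files backed by enable_fops: writing 1 registers the tracepoint probes in do_enable(), writing 0 tears them down, and the per-type reset files (latency_hist_reset_fops) clear the per-CPU data. As a usage illustration, a minimal user-space sketch that arms wakeup latency accounting; the debugfs mount point and the "latency_hist" root directory name are assumptions here, not taken from this hunk.

/* Illustrative sketch: arm wakeup latency accounting from user space.
 * The path is an assumption (debugfs mounted at /sys/kernel/debug,
 * root directory "latency_hist"); the file itself is created by
 * latency_hist_init() below. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/debug/latency_hist/enable/wakeup", "w");

        if (!f) {
                perror("enable/wakeup");
                return 1;
        }
        fputs("1\n", f);        /* write "0" to disable again */
        return fclose(f) ? 1 : 0;
}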
27110 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
27111 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
27112 +       int starthist)
27113 +{
27114 +       int cpu = raw_smp_processor_id();
27115 +       int time_set = 0;
27116 +
27117 +       if (starthist) {
27118 +               cycle_t uninitialized_var(start);
27119 +
27120 +               if (!preempt_count() && !irqs_disabled())
27121 +                       return;
27122 +
27123 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27124 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
27125 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
27126 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
27127 +                       start = ftrace_now(cpu);
27128 +                       time_set++;
27129 +                       per_cpu(hist_irqsoff_start, cpu) = start;
27130 +               }
27131 +#endif
27132 +
27133 +#ifdef CONFIG_PREEMPT_OFF_HIST
27134 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
27135 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
27136 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
27137 +                       if (!(time_set++))
27138 +                               start = ftrace_now(cpu);
27139 +                       per_cpu(hist_preemptoff_start, cpu) = start;
27140 +               }
27141 +#endif
27142 +
27143 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27144 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
27145 +                   per_cpu(hist_preemptoff_counting, cpu) &&
27146 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
27147 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
27148 +                       if (!time_set)
27149 +                               start = ftrace_now(cpu);
27150 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
27151 +               }
27152 +#endif
27153 +       } else {
27154 +               cycle_t uninitialized_var(stop);
27155 +
27156 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27157 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
27158 +                   per_cpu(hist_irqsoff_counting, cpu)) {
27159 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
27160 +
27161 +                       stop = ftrace_now(cpu);
27162 +                       time_set++;
27163 +                       if (start) {
27164 +                               long latency = ((long) (stop - start)) /
27165 +                                   NSECS_PER_USECS;
27166 +
27167 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
27168 +                                   stop, NULL);
27169 +                       }
27170 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
27171 +               }
27172 +#endif
27173 +
27174 +#ifdef CONFIG_PREEMPT_OFF_HIST
27175 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
27176 +                   per_cpu(hist_preemptoff_counting, cpu)) {
27177 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
27178 +
27179 +                       if (!(time_set++))
27180 +                               stop = ftrace_now(cpu);
27181 +                       if (start) {
27182 +                               long latency = ((long) (stop - start)) /
27183 +                                   NSECS_PER_USECS;
27184 +
27185 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
27186 +                                   0, stop, NULL);
27187 +                       }
27188 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
27189 +               }
27190 +#endif
27191 +
27192 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27193 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
27194 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
27195 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
27196 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
27197 +
27198 +                       if (!time_set)
27199 +                               stop = ftrace_now(cpu);
27200 +                       if (start) {
27201 +                               long latency = ((long) (stop - start)) /
27202 +                                   NSECS_PER_USECS;
27203 +
27204 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
27205 +                                   latency, 0, stop, NULL);
27206 +                       }
27207 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
27208 +               }
27209 +#endif
27210 +       }
27211 +}
27212 +#endif
27213 +
27214 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27215 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
27216 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
27217 +       int cpu)
27218 +{
27219 +       int old_cpu = task_cpu(task);
27220 +
27221 +       if (cpu != old_cpu) {
27222 +               unsigned long flags;
27223 +               struct task_struct *cpu_wakeup_task;
27224 +
27225 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
27226 +
27227 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
27228 +               if (task == cpu_wakeup_task) {
27229 +                       put_task_struct(cpu_wakeup_task);
27230 +                       per_cpu(wakeup_task, old_cpu) = NULL;
27231 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
27232 +                       get_task_struct(cpu_wakeup_task);
27233 +               }
27234 +
27235 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
27236 +       }
27237 +}
27238 +
27239 +static notrace void probe_wakeup_latency_hist_start(void *v,
27240 +       struct task_struct *p)
27241 +{
27242 +       unsigned long flags;
27243 +       struct task_struct *curr = current;
27244 +       int cpu = task_cpu(p);
27245 +       struct task_struct *cpu_wakeup_task;
27246 +
27247 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
27248 +
27249 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
27250 +
27251 +       if (wakeup_pid) {
27252 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
27253 +                   p->prio == curr->prio)
27254 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
27255 +               if (likely(wakeup_pid != task_pid_nr(p)))
27256 +                       goto out;
27257 +       } else {
27258 +               if (likely(!rt_task(p)) ||
27259 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
27260 +                   p->prio > curr->prio)
27261 +                       goto out;
27262 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
27263 +                   p->prio == curr->prio)
27264 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
27265 +       }
27266 +
27267 +       if (cpu_wakeup_task)
27268 +               put_task_struct(cpu_wakeup_task);
27269 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
27270 +       get_task_struct(cpu_wakeup_task);
27271 +       cpu_wakeup_task->preempt_timestamp_hist =
27272 +               ftrace_now(raw_smp_processor_id());
27273 +out:
27274 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
27275 +}
27276 +
27277 +static notrace void probe_wakeup_latency_hist_stop(void *v,
27278 +       bool preempt, struct task_struct *prev, struct task_struct *next)
27279 +{
27280 +       unsigned long flags;
27281 +       int cpu = task_cpu(next);
27282 +       long latency;
27283 +       cycle_t stop;
27284 +       struct task_struct *cpu_wakeup_task;
27285 +
27286 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
27287 +
27288 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
27289 +
27290 +       if (cpu_wakeup_task == NULL)
27291 +               goto out;
27292 +
27293 +       /* Already running? */
27294 +       if (unlikely(current == cpu_wakeup_task))
27295 +               goto out_reset;
27296 +
27297 +       if (next != cpu_wakeup_task) {
27298 +               if (next->prio < cpu_wakeup_task->prio)
27299 +                       goto out_reset;
27300 +
27301 +               if (next->prio == cpu_wakeup_task->prio)
27302 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
27303 +
27304 +               goto out;
27305 +       }
27306 +
27307 +       if (current->prio == cpu_wakeup_task->prio)
27308 +               per_cpu(wakeup_sharedprio, cpu) = 1;
27309 +
27310 +       /*
27311 +        * The task we are waiting for is about to be switched to.
27312 +        * Calculate latency and store it in histogram.
27313 +        */
27314 +       stop = ftrace_now(raw_smp_processor_id());
27315 +
27316 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
27317 +           NSECS_PER_USECS;
27318 +
27319 +       if (per_cpu(wakeup_sharedprio, cpu)) {
27320 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
27321 +                   next);
27322 +               per_cpu(wakeup_sharedprio, cpu) = 0;
27323 +       } else {
27324 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
27325 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27326 +               if (timerandwakeup_enabled_data.enabled) {
27327 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
27328 +                           next->timer_offset + latency, next->timer_offset,
27329 +                           stop, next);
27330 +               }
27331 +#endif
27332 +       }
27333 +
27334 +out_reset:
27335 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27336 +       next->timer_offset = 0;
27337 +#endif
27338 +       put_task_struct(cpu_wakeup_task);
27339 +       per_cpu(wakeup_task, cpu) = NULL;
27340 +out:
27341 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
27342 +}
27343 +#endif
27344 +
27345 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27346 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
27347 +       long long latency_ns, struct task_struct *curr,
27348 +       struct task_struct *task)
27349 +{
27350 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
27351 +           (task->prio < curr->prio ||
27352 +           (task->prio == curr->prio &&
27353 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
27354 +               long latency;
27355 +               cycle_t now;
27356 +
27357 +               if (missed_timer_offsets_pid) {
27358 +                       if (likely(missed_timer_offsets_pid !=
27359 +                           task_pid_nr(task)))
27360 +                               return;
27361 +               }
27362 +
27363 +               now = ftrace_now(cpu);
27364 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
27365 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
27366 +                   task);
27367 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27368 +               task->timer_offset = latency;
27369 +#endif
27370 +       }
27371 +}
27372 +#endif
27373 +
27374 +static __init int latency_hist_init(void)
27375 +{
27376 +       struct dentry *latency_hist_root = NULL;
27377 +       struct dentry *dentry;
27378 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27379 +       struct dentry *dentry_sharedprio;
27380 +#endif
27381 +       struct dentry *entry;
27382 +       struct dentry *enable_root;
27383 +       int i = 0;
27384 +       struct hist_data *my_hist;
27385 +       char name[64];
27386 +       char *cpufmt = "CPU%d";
27387 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27388 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27389 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
27390 +       struct maxlatproc_data *mp = NULL;
27391 +#endif
27392 +
27393 +       dentry = tracing_init_dentry();
27394 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
27395 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
27396 +
27397 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27398 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
27399 +       for_each_possible_cpu(i) {
27400 +               sprintf(name, cpufmt, i);
27401 +               entry = debugfs_create_file(name, 0444, dentry,
27402 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
27403 +               my_hist = &per_cpu(irqsoff_hist, i);
27404 +               atomic_set(&my_hist->hist_mode, 1);
27405 +               my_hist->min_lat = LONG_MAX;
27406 +       }
27407 +       entry = debugfs_create_file("reset", 0644, dentry,
27408 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
27409 +#endif
27410 +
27411 +#ifdef CONFIG_PREEMPT_OFF_HIST
27412 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
27413 +           latency_hist_root);
27414 +       for_each_possible_cpu(i) {
27415 +               sprintf(name, cpufmt, i);
27416 +               entry = debugfs_create_file(name, 0444, dentry,
27417 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
27418 +               my_hist = &per_cpu(preemptoff_hist, i);
27419 +               atomic_set(&my_hist->hist_mode, 1);
27420 +               my_hist->min_lat = LONG_MAX;
27421 +       }
27422 +       entry = debugfs_create_file("reset", 0644, dentry,
27423 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
27424 +#endif
27425 +
27426 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27427 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
27428 +           latency_hist_root);
27429 +       for_each_possible_cpu(i) {
27430 +               sprintf(name, cpufmt, i);
27431 +               entry = debugfs_create_file(name, 0444, dentry,
27432 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
27433 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
27434 +               atomic_set(&my_hist->hist_mode, 1);
27435 +               my_hist->min_lat = LONG_MAX;
27436 +       }
27437 +       entry = debugfs_create_file("reset", 0644, dentry,
27438 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
27439 +#endif
27440 +
27441 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
27442 +       entry = debugfs_create_file("preemptirqsoff", 0644,
27443 +           enable_root, (void *)&preemptirqsoff_enabled_data,
27444 +           &enable_fops);
27445 +#endif
27446 +
27447 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27448 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
27449 +           latency_hist_root);
27450 +       dentry_sharedprio = debugfs_create_dir(
27451 +           wakeup_latency_hist_dir_sharedprio, dentry);
27452 +       for_each_possible_cpu(i) {
27453 +               sprintf(name, cpufmt, i);
27454 +
27455 +               entry = debugfs_create_file(name, 0444, dentry,
27456 +                   &per_cpu(wakeup_latency_hist, i),
27457 +                   &latency_hist_fops);
27458 +               my_hist = &per_cpu(wakeup_latency_hist, i);
27459 +               atomic_set(&my_hist->hist_mode, 1);
27460 +               my_hist->min_lat = LONG_MAX;
27461 +
27462 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
27463 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
27464 +                   &latency_hist_fops);
27465 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
27466 +               atomic_set(&my_hist->hist_mode, 1);
27467 +               my_hist->min_lat = LONG_MAX;
27468 +
27469 +               sprintf(name, cpufmt_maxlatproc, i);
27470 +
27471 +               mp = &per_cpu(wakeup_maxlatproc, i);
27472 +               entry = debugfs_create_file(name, 0444, dentry, mp,
27473 +                   &maxlatproc_fops);
27474 +               clear_maxlatprocdata(mp);
27475 +
27476 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
27477 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
27478 +                   &maxlatproc_fops);
27479 +               clear_maxlatprocdata(mp);
27480 +       }
27481 +       entry = debugfs_create_file("pid", 0644, dentry,
27482 +           (void *)&wakeup_pid, &pid_fops);
27483 +       entry = debugfs_create_file("reset", 0644, dentry,
27484 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
27485 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
27486 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
27487 +       entry = debugfs_create_file("wakeup", 0644,
27488 +           enable_root, (void *)&wakeup_latency_enabled_data,
27489 +           &enable_fops);
27490 +#endif
27491 +
27492 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27493 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
27494 +           latency_hist_root);
27495 +       for_each_possible_cpu(i) {
27496 +               sprintf(name, cpufmt, i);
27497 +               entry = debugfs_create_file(name, 0444, dentry,
27498 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
27499 +               my_hist = &per_cpu(missed_timer_offsets, i);
27500 +               atomic_set(&my_hist->hist_mode, 1);
27501 +               my_hist->min_lat = LONG_MAX;
27502 +
27503 +               sprintf(name, cpufmt_maxlatproc, i);
27504 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
27505 +               entry = debugfs_create_file(name, 0444, dentry, mp,
27506 +                   &maxlatproc_fops);
27507 +               clear_maxlatprocdata(mp);
27508 +       }
27509 +       entry = debugfs_create_file("pid", 0644, dentry,
27510 +           (void *)&missed_timer_offsets_pid, &pid_fops);
27511 +       entry = debugfs_create_file("reset", 0644, dentry,
27512 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
27513 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
27514 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
27515 +           &enable_fops);
27516 +#endif
27517 +
27518 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27519 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27520 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
27521 +           latency_hist_root);
27522 +       for_each_possible_cpu(i) {
27523 +               sprintf(name, cpufmt, i);
27524 +               entry = debugfs_create_file(name, 0444, dentry,
27525 +                   &per_cpu(timerandwakeup_latency_hist, i),
27526 +                   &latency_hist_fops);
27527 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
27528 +               atomic_set(&my_hist->hist_mode, 1);
27529 +               my_hist->min_lat = LONG_MAX;
27530 +
27531 +               sprintf(name, cpufmt_maxlatproc, i);
27532 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
27533 +               entry = debugfs_create_file(name, 0444, dentry, mp,
27534 +                   &maxlatproc_fops);
27535 +               clear_maxlatprocdata(mp);
27536 +       }
27537 +       entry = debugfs_create_file("reset", 0644, dentry,
27538 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
27539 +       entry = debugfs_create_file("timerandwakeup", 0644,
27540 +           enable_root, (void *)&timerandwakeup_enabled_data,
27541 +           &enable_fops);
27542 +#endif
27543 +       return 0;
27544 +}
27545 +
27546 +device_initcall(latency_hist_init);
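Once the init code above has run and a histogram type is enabled, each per-CPU file prints a '#'-prefixed summary header followed by one "usecs  samples" line per bucket (see l_start() and l_show()). A minimal user-space sketch that dumps one of these files follows; the debugfs mount point and the "latency_hist" root directory name are assumptions.

/* Illustrative sketch: dump the CPU0 wakeup latency histogram created by
 * latency_hist_init().  Paths are assumptions (debugfs mounted at
 * /sys/kernel/debug, root directory "latency_hist"). */
#include <stdio.h>

int main(void)
{
        const char *path = "/sys/kernel/debug/latency_hist/wakeup/CPU0";
        char line[256];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return 1;
        }
        /* '#' lines carry min/avg/max and sample counts; the remaining
         * lines are "<usecs>  <samples>" bucket entries. */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}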
27547 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
27548 index 059233abcfcf..cad1a28bfbe2 100644
27549 --- a/kernel/trace/trace.c
27550 +++ b/kernel/trace/trace.c
27551 @@ -1652,6 +1652,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
27552         struct task_struct *tsk = current;
27553  
27554         entry->preempt_count            = pc & 0xff;
27555 +       entry->preempt_lazy_count       = preempt_lazy_count();
27556         entry->pid                      = (tsk) ? tsk->pid : 0;
27557         entry->flags =
27558  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
27559 @@ -1661,8 +1662,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
27560  #endif
27561                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
27562                 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
27563 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
27564 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
27565 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
27566                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
27567 +
27568 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
27569  }
27570  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
27571  
27572 @@ -2555,14 +2559,17 @@ get_total_entries(struct trace_buffer *buf,
27573  
27574  static void print_lat_help_header(struct seq_file *m)
27575  {
27576 -       seq_puts(m, "#                  _------=> CPU#            \n"
27577 -                   "#                 / _-----=> irqs-off        \n"
27578 -                   "#                | / _----=> need-resched    \n"
27579 -                   "#                || / _---=> hardirq/softirq \n"
27580 -                   "#                ||| / _--=> preempt-depth   \n"
27581 -                   "#                |||| /     delay            \n"
27582 -                   "#  cmd     pid   ||||| time  |   caller      \n"
27583 -                   "#     \\   /      |||||  \\    |   /         \n");
27584 +       seq_puts(m, "#                  _--------=> CPU#              \n"
27585 +                   "#                 / _-------=> irqs-off          \n"
27586 +                   "#                | / _------=> need-resched      \n"
27587 +                   "#                || / _-----=> need-resched_lazy \n"
27588 +                   "#                ||| / _----=> hardirq/softirq   \n"
27589 +                   "#                |||| / _---=> preempt-depth     \n"
27590 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
27591 +                   "#                |||||| / _-=> migrate-disable   \n"
27592 +                   "#                ||||||| /     delay             \n"
27593 +                   "# cmd     pid    |||||||| time   |  caller       \n"
27594 +                   "#     \\   /      ||||||||   \\    |  /            \n");
27595  }
27596  
27597  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
27598 @@ -2588,11 +2595,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
27599         print_event_info(buf, m);
27600         seq_puts(m, "#                              _-----=> irqs-off\n"
27601                     "#                             / _----=> need-resched\n"
27602 -                   "#                            | / _---=> hardirq/softirq\n"
27603 -                   "#                            || / _--=> preempt-depth\n"
27604 -                   "#                            ||| /     delay\n"
27605 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
27606 -                   "#              | |       |   ||||       |         |\n");
27607 +                   "#                            |/  _-----=> need-resched_lazy\n"
27608 +                   "#                            || / _---=> hardirq/softirq\n"
27609 +                   "#                            ||| / _--=> preempt-depth\n"
27610 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
27611 +                   "#                            ||||| / _-=> migrate-disable   \n"
27612 +                   "#                            |||||| /    delay\n"
27613 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
27614 +                   "#              | |       |   |||||||      |         |\n");
27615  }
27616  
27617  void
27618 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
27619 index 919d9d07686f..3bf86ece683c 100644
27620 --- a/kernel/trace/trace.h
27621 +++ b/kernel/trace/trace.h
27622 @@ -117,6 +117,7 @@ struct kretprobe_trace_entry_head {
27623   *  NEED_RESCHED       - reschedule is requested
27624   *  HARDIRQ            - inside an interrupt handler
27625   *  SOFTIRQ            - inside a softirq handler
27626 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
27627   */
27628  enum trace_flag_type {
27629         TRACE_FLAG_IRQS_OFF             = 0x01,
27630 @@ -125,6 +126,7 @@ enum trace_flag_type {
27631         TRACE_FLAG_HARDIRQ              = 0x08,
27632         TRACE_FLAG_SOFTIRQ              = 0x10,
27633         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
27634 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x40,
27635  };
27636  
27637  #define TRACE_BUF_SIZE         1024
27638 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
27639 index 996f0fd34312..5bd79b347398 100644
27640 --- a/kernel/trace/trace_events.c
27641 +++ b/kernel/trace/trace_events.c
27642 @@ -188,6 +188,8 @@ static int trace_define_common_fields(void)
27643         __common_field(unsigned char, flags);
27644         __common_field(unsigned char, preempt_count);
27645         __common_field(int, pid);
27646 +       __common_field(unsigned short, migrate_disable);
27647 +       __common_field(unsigned short, padding);
27648  
27649         return ret;
27650  }
27651 @@ -244,6 +246,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
27652  
27653         local_save_flags(fbuffer->flags);
27654         fbuffer->pc = preempt_count();
27655 +       /*
27656 +        * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
27657 +        * preemption (adding one to the preempt_count). Since we are
27658 +        * interested in the preempt_count at the time the tracepoint was
27659 +        * hit, we need to subtract one to offset the increment.
27660 +        */
27661 +       if (IS_ENABLED(CONFIG_PREEMPT))
27662 +               fbuffer->pc--;
27663         fbuffer->trace_file = trace_file;
27664  
27665         fbuffer->event =
27666 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
27667 index be3222b7d72e..553e71254ad6 100644
27668 --- a/kernel/trace/trace_irqsoff.c
27669 +++ b/kernel/trace/trace_irqsoff.c
27670 @@ -13,6 +13,7 @@
27671  #include <linux/uaccess.h>
27672  #include <linux/module.h>
27673  #include <linux/ftrace.h>
27674 +#include <trace/events/hist.h>
27675  
27676  #include "trace.h"
27677  
27678 @@ -424,11 +425,13 @@ void start_critical_timings(void)
27679  {
27680         if (preempt_trace() || irq_trace())
27681                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27682 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
27683  }
27684  EXPORT_SYMBOL_GPL(start_critical_timings);
27685  
27686  void stop_critical_timings(void)
27687  {
27688 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
27689         if (preempt_trace() || irq_trace())
27690                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27691  }
27692 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
27693  #ifdef CONFIG_PROVE_LOCKING
27694  void time_hardirqs_on(unsigned long a0, unsigned long a1)
27695  {
27696 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
27697         if (!preempt_trace() && irq_trace())
27698                 stop_critical_timing(a0, a1);
27699  }
27700 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
27701  {
27702         if (!preempt_trace() && irq_trace())
27703                 start_critical_timing(a0, a1);
27704 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
27705  }
27706  
27707  #else /* !CONFIG_PROVE_LOCKING */
27708 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
27709   */
27710  void trace_hardirqs_on(void)
27711  {
27712 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
27713         if (!preempt_trace() && irq_trace())
27714                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27715  }
27716 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
27717  {
27718         if (!preempt_trace() && irq_trace())
27719                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
27720 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
27721  }
27722  EXPORT_SYMBOL(trace_hardirqs_off);
27723  
27724  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
27725  {
27726 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
27727         if (!preempt_trace() && irq_trace())
27728                 stop_critical_timing(CALLER_ADDR0, caller_addr);
27729  }
27730 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
27731  {
27732         if (!preempt_trace() && irq_trace())
27733                 start_critical_timing(CALLER_ADDR0, caller_addr);
27734 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
27735  }
27736  EXPORT_SYMBOL(trace_hardirqs_off_caller);
27737  
27738 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
27739  #ifdef CONFIG_PREEMPT_TRACER
27740  void trace_preempt_on(unsigned long a0, unsigned long a1)
27741  {
27742 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
27743         if (preempt_trace() && !irq_trace())
27744                 stop_critical_timing(a0, a1);
27745  }
27746  
27747  void trace_preempt_off(unsigned long a0, unsigned long a1)
27748  {
27749 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
27750         if (preempt_trace() && !irq_trace())
27751                 start_critical_timing(a0, a1);
27752  }
27753 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
27754 index 282982195e09..9f19d839a756 100644
27755 --- a/kernel/trace/trace_output.c
27756 +++ b/kernel/trace/trace_output.c
27757 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
27758  {
27759         char hardsoft_irq;
27760         char need_resched;
27761 +       char need_resched_lazy;
27762         char irqs_off;
27763         int hardirq;
27764         int softirq;
27765 @@ -413,6 +414,8 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
27766                 need_resched = '.';
27767                 break;
27768         }
27769 +       need_resched_lazy =
27770 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
27771  
27772         hardsoft_irq =
27773                 (hardirq && softirq) ? 'H' :
27774 @@ -420,14 +423,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
27775                 softirq ? 's' :
27776                 '.';
27777  
27778 -       trace_seq_printf(s, "%c%c%c",
27779 -                        irqs_off, need_resched, hardsoft_irq);
27780 +       trace_seq_printf(s, "%c%c%c%c",
27781 +                        irqs_off, need_resched, need_resched_lazy,
27782 +                        hardsoft_irq);
27783  
27784         if (entry->preempt_count)
27785                 trace_seq_printf(s, "%x", entry->preempt_count);
27786         else
27787                 trace_seq_putc(s, '.');
27788  
27789 +       if (entry->preempt_lazy_count)
27790 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
27791 +       else
27792 +               trace_seq_putc(s, '.');
27793 +
27794 +       if (entry->migrate_disable)
27795 +               trace_seq_printf(s, "%x", entry->migrate_disable);
27796 +       else
27797 +               trace_seq_putc(s, '.');
27798 +
27799         return !trace_seq_has_overflowed(s);
27800  }
27801  
27802 diff --git a/kernel/user.c b/kernel/user.c
27803 index b069ccbfb0b0..1a2e88e98b5e 100644
27804 --- a/kernel/user.c
27805 +++ b/kernel/user.c
27806 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
27807         if (!up)
27808                 return;
27809  
27810 -       local_irq_save(flags);
27811 +       local_irq_save_nort(flags);
27812         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
27813                 free_user(up, flags);
27814         else
27815 -               local_irq_restore(flags);
27816 +               local_irq_restore_nort(flags);
27817  }
27818  
27819  struct user_struct *alloc_uid(kuid_t uid)
27820 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
27821 index 198137b1cadc..47d143740774 100644
27822 --- a/kernel/watchdog.c
27823 +++ b/kernel/watchdog.c
27824 @@ -299,6 +299,8 @@ static int is_softlockup(unsigned long touch_ts)
27825  
27826  #ifdef CONFIG_HARDLOCKUP_DETECTOR
27827  
27828 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
27829 +
27830  static struct perf_event_attr wd_hw_attr = {
27831         .type           = PERF_TYPE_HARDWARE,
27832         .config         = PERF_COUNT_HW_CPU_CYCLES,
27833 @@ -333,6 +335,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
27834                 /* only print hardlockups once */
27835                 if (__this_cpu_read(hard_watchdog_warn) == true)
27836                         return;
27837 +               /*
27838 +                * If early-printk is enabled then make sure we do not
27839 +                * lock up in printk() and kill console logging:
27840 +                */
27841 +               printk_kill();
27842 +
27843 +               raw_spin_lock(&watchdog_output_lock);
27844  
27845                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
27846                 print_modules();
27847 @@ -350,8 +359,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
27848                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
27849                         trigger_allbutself_cpu_backtrace();
27850  
27851 +               raw_spin_unlock(&watchdog_output_lock);
27852                 if (hardlockup_panic)
27853 -                       panic("Hard LOCKUP");
27854 +                       nmi_panic(regs, "Hard LOCKUP");
27855  
27856                 __this_cpu_write(hard_watchdog_warn, true);
27857                 return;
27858 @@ -497,6 +507,7 @@ static void watchdog_enable(unsigned int cpu)
27859         /* kick off the timer for the hardlockup detector */
27860         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
27861         hrtimer->function = watchdog_timer_fn;
27862 +       hrtimer->irqsafe = 1;
27863  
27864         /* Enable the perf event */
27865         watchdog_nmi_enable(cpu);
27866 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
27867 index 2c2f971f3e75..965d5f65e847 100644
27868 --- a/kernel/workqueue.c
27869 +++ b/kernel/workqueue.c
27870 @@ -48,6 +48,8 @@
27871  #include <linux/nodemask.h>
27872  #include <linux/moduleparam.h>
27873  #include <linux/uaccess.h>
27874 +#include <linux/locallock.h>
27875 +#include <linux/delay.h>
27876  
27877  #include "workqueue_internal.h"
27878  
27879 @@ -121,11 +123,16 @@ enum {
27880   *    cpu or grabbing pool->lock is enough for read access.  If
27881   *    POOL_DISASSOCIATED is set, it's identical to L.
27882   *
27883 + *    On RT we need the extra protection via rt_lock_idle_list() for
27884 + *    the list manipulations against read access from
27885 + *    wq_worker_sleeping(). All other places are nicely serialized via
27886 + *    pool->lock.
27887 + *
27888   * A: pool->attach_mutex protected.
27889   *
27890   * PL: wq_pool_mutex protected.
27891   *
27892 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
27893 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
27894   *
27895   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
27896   *
27897 @@ -134,7 +141,7 @@ enum {
27898   *
27899   * WQ: wq->mutex protected.
27900   *
27901 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
27902 + * WR: wq->mutex protected for writes.  RCU protected for reads.
27903   *
27904   * MD: wq_mayday_lock protected.
27905   */
27906 @@ -183,7 +190,7 @@ struct worker_pool {
27907         atomic_t                nr_running ____cacheline_aligned_in_smp;
27908  
27909         /*
27910 -        * Destruction of pool is sched-RCU protected to allow dereferences
27911 +        * Destruction of pool is RCU protected to allow dereferences
27912          * from get_work_pool().
27913          */
27914         struct rcu_head         rcu;
27915 @@ -212,7 +219,7 @@ struct pool_workqueue {
27916         /*
27917          * Release of unbound pwq is punted to system_wq.  See put_pwq()
27918          * and pwq_unbound_release_workfn() for details.  pool_workqueue
27919 -        * itself is also sched-RCU protected so that the first pwq can be
27920 +        * itself is also RCU protected so that the first pwq can be
27921          * determined without grabbing wq->mutex.
27922          */
27923         struct work_struct      unbound_release_work;
27924 @@ -331,6 +338,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
27925  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
27926  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
27927  
27928 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
27929 +
27930  static int worker_thread(void *__worker);
27931  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27932  
27933 @@ -338,20 +347,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27934  #include <trace/events/workqueue.h>
27935  
27936  #define assert_rcu_or_pool_mutex()                                     \
27937 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
27938 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
27939                          !lockdep_is_held(&wq_pool_mutex),              \
27940 -                        "sched RCU or wq_pool_mutex should be held")
27941 +                        "RCU or wq_pool_mutex should be held")
27942  
27943  #define assert_rcu_or_wq_mutex(wq)                                     \
27944 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
27945 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
27946                          !lockdep_is_held(&wq->mutex),                  \
27947 -                        "sched RCU or wq->mutex should be held")
27948 +                        "RCU or wq->mutex should be held")
27949  
27950  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
27951 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
27952 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
27953                          !lockdep_is_held(&wq->mutex) &&                \
27954                          !lockdep_is_held(&wq_pool_mutex),              \
27955 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
27956 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
27957  
27958  #define for_each_cpu_worker_pool(pool, cpu)                            \
27959         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
27960 @@ -363,7 +372,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27961   * @pool: iteration cursor
27962   * @pi: integer used for iteration
27963   *
27964 - * This must be called either with wq_pool_mutex held or sched RCU read
27965 + * This must be called either with wq_pool_mutex held or RCU read
27966   * locked.  If the pool needs to be used beyond the locking in effect, the
27967   * caller is responsible for guaranteeing that the pool stays online.
27968   *
27969 @@ -395,7 +404,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27970   * @pwq: iteration cursor
27971   * @wq: the target workqueue
27972   *
27973 - * This must be called either with wq->mutex held or sched RCU read locked.
27974 + * This must be called either with wq->mutex held or RCU read locked.
27975   * If the pwq needs to be used beyond the locking in effect, the caller is
27976   * responsible for guaranteeing that the pwq stays online.
27977   *
27978 @@ -407,6 +416,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
27979                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
27980                 else
27981  
27982 +#ifdef CONFIG_PREEMPT_RT_BASE
27983 +static inline void rt_lock_idle_list(struct worker_pool *pool)
27984 +{
27985 +       preempt_disable();
27986 +}
27987 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
27988 +{
27989 +       preempt_enable();
27990 +}
27991 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
27992 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
27993 +#else
27994 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
27995 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
27996 +static inline void sched_lock_idle_list(struct worker_pool *pool)
27997 +{
27998 +       spin_lock_irq(&pool->lock);
27999 +}
28000 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
28001 +{
28002 +       spin_unlock_irq(&pool->lock);
28003 +}
28004 +#endif
28005 +
28006 +
28007  #ifdef CONFIG_DEBUG_OBJECTS_WORK
28008  
28009  static struct debug_obj_descr work_debug_descr;
28010 @@ -557,7 +591,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
28011   * @wq: the target workqueue
28012   * @node: the node ID
28013   *
28014 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
28015 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
28016   * read locked.
28017   * If the pwq needs to be used beyond the locking in effect, the caller is
28018   * responsible for guaranteeing that the pwq stays online.
28019 @@ -701,8 +735,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
28020   * @work: the work item of interest
28021   *
28022   * Pools are created and destroyed under wq_pool_mutex, and allows read
28023 - * access under sched-RCU read lock.  As such, this function should be
28024 - * called under wq_pool_mutex or with preemption disabled.
28025 + * access under RCU read lock.  As such, this function should be
28026 + * called under wq_pool_mutex or inside an rcu_read_lock() region.
28027   *
28028   * All fields of the returned pool are accessible as long as the above
28029   * mentioned locking is in effect.  If the returned pool needs to be used
28030 @@ -839,51 +873,44 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
28031   */
28032  static void wake_up_worker(struct worker_pool *pool)
28033  {
28034 -       struct worker *worker = first_idle_worker(pool);
28035 +       struct worker *worker;
28036 +
28037 +       rt_lock_idle_list(pool);
28038 +
28039 +       worker = first_idle_worker(pool);
28040  
28041         if (likely(worker))
28042                 wake_up_process(worker->task);
28043 +
28044 +       rt_unlock_idle_list(pool);
28045  }
28046  
28047  /**
28048 - * wq_worker_waking_up - a worker is waking up
28049 - * @task: task waking up
28050 - * @cpu: CPU @task is waking up to
28051 + * wq_worker_running - a worker is running again
28052 + * @task: task returning from sleep
28053   *
28054 - * This function is called during try_to_wake_up() when a worker is
28055 - * being awoken.
28056 - *
28057 - * CONTEXT:
28058 - * spin_lock_irq(rq->lock)
28059 + * This function is called when a worker returns from schedule()
28060   */
28061 -void wq_worker_waking_up(struct task_struct *task, int cpu)
28062 +void wq_worker_running(struct task_struct *task)
28063  {
28064         struct worker *worker = kthread_data(task);
28065  
28066 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
28067 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
28068 +       if (!worker->sleeping)
28069 +               return;
28070 +       if (!(worker->flags & WORKER_NOT_RUNNING))
28071                 atomic_inc(&worker->pool->nr_running);
28072 -       }
28073 +       worker->sleeping = 0;
28074  }
28075  
28076  /**
28077   * wq_worker_sleeping - a worker is going to sleep
28078   * @task: task going to sleep
28079 - * @cpu: CPU in question, must be the current CPU number
28080 - *
28081 - * This function is called during schedule() when a busy worker is
28082 - * going to sleep.  Worker on the same cpu can be woken up by
28083 - * returning pointer to its task.
28084 - *
28085 - * CONTEXT:
28086 - * spin_lock_irq(rq->lock)
28087 - *
28088 - * Return:
28089 - * Worker task on @cpu to wake up, %NULL if none.
28090 + * This function is called from schedule() when a busy worker is
28091 + * going to sleep.
28092   */
28093 -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
28094 +void wq_worker_sleeping(struct task_struct *task)
28095  {
28096 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
28097 +       struct worker *worker = kthread_data(task);
28098         struct worker_pool *pool;
28099  
28100         /*
28101 @@ -892,29 +919,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
28102          * checking NOT_RUNNING.
28103          */
28104         if (worker->flags & WORKER_NOT_RUNNING)
28105 -               return NULL;
28106 +               return;
28107  
28108         pool = worker->pool;
28109  
28110 -       /* this can only happen on the local cpu */
28111 -       if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
28112 -               return NULL;
28113 +       if (WARN_ON_ONCE(worker->sleeping))
28114 +               return;
28115 +
28116 +       worker->sleeping = 1;
28117  
28118         /*
28119          * The counterpart of the following dec_and_test, implied mb,
28120          * worklist not empty test sequence is in insert_work().
28121          * Please read comment there.
28122 -        *
28123 -        * NOT_RUNNING is clear.  This means that we're bound to and
28124 -        * running on the local cpu w/ rq lock held and preemption
28125 -        * disabled, which in turn means that none else could be
28126 -        * manipulating idle_list, so dereferencing idle_list without pool
28127 -        * lock is safe.
28128          */
28129         if (atomic_dec_and_test(&pool->nr_running) &&
28130 -           !list_empty(&pool->worklist))
28131 -               to_wakeup = first_idle_worker(pool);
28132 -       return to_wakeup ? to_wakeup->task : NULL;
28133 +           !list_empty(&pool->worklist)) {
28134 +               sched_lock_idle_list(pool);
28135 +               wake_up_worker(pool);
28136 +               sched_unlock_idle_list(pool);
28137 +       }
28138  }
28139  
28140  /**
28141 @@ -1108,12 +1132,12 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
28142  {
28143         if (pwq) {
28144                 /*
28145 -                * As both pwqs and pools are sched-RCU protected, the
28146 +                * As both pwqs and pools are RCU protected, the
28147                  * following lock operations are safe.
28148                  */
28149 -               spin_lock_irq(&pwq->pool->lock);
28150 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
28151                 put_pwq(pwq);
28152 -               spin_unlock_irq(&pwq->pool->lock);
28153 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
28154         }
28155  }
28156  
28157 @@ -1215,7 +1239,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28158         struct worker_pool *pool;
28159         struct pool_workqueue *pwq;
28160  
28161 -       local_irq_save(*flags);
28162 +       local_lock_irqsave(pendingb_lock, *flags);
28163  
28164         /* try to steal the timer if it exists */
28165         if (is_dwork) {
28166 @@ -1234,6 +1258,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28167         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
28168                 return 0;
28169  
28170 +       rcu_read_lock();
28171         /*
28172          * The queueing is in progress, or it is already queued. Try to
28173          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
28174 @@ -1272,14 +1297,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28175                 set_work_pool_and_keep_pending(work, pool->id);
28176  
28177                 spin_unlock(&pool->lock);
28178 +               rcu_read_unlock();
28179                 return 1;
28180         }
28181         spin_unlock(&pool->lock);
28182  fail:
28183 -       local_irq_restore(*flags);
28184 +       rcu_read_unlock();
28185 +       local_unlock_irqrestore(pendingb_lock, *flags);
28186         if (work_is_canceling(work))
28187                 return -ENOENT;
28188 -       cpu_relax();
28189 +       cpu_chill();
28190         return -EAGAIN;
28191  }
28192  
28193 @@ -1348,7 +1375,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
28194          * queued or lose PENDING.  Grabbing PENDING and queueing should
28195          * happen with IRQ disabled.
28196          */
28197 -       WARN_ON_ONCE(!irqs_disabled());
28198 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
28199  
28200         debug_work_activate(work);
28201  
28202 @@ -1356,6 +1383,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
28203         if (unlikely(wq->flags & __WQ_DRAINING) &&
28204             WARN_ON_ONCE(!is_chained_work(wq)))
28205                 return;
28206 +
28207 +       rcu_read_lock();
28208  retry:
28209         if (req_cpu == WORK_CPU_UNBOUND)
28210                 cpu = raw_smp_processor_id();
28211 @@ -1412,10 +1441,8 @@ retry:
28212         /* pwq determined, queue */
28213         trace_workqueue_queue_work(req_cpu, pwq, work);
28214  
28215 -       if (WARN_ON(!list_empty(&work->entry))) {
28216 -               spin_unlock(&pwq->pool->lock);
28217 -               return;
28218 -       }
28219 +       if (WARN_ON(!list_empty(&work->entry)))
28220 +               goto out;
28221  
28222         pwq->nr_in_flight[pwq->work_color]++;
28223         work_flags = work_color_to_flags(pwq->work_color);
28224 @@ -1431,7 +1458,9 @@ retry:
28225  
28226         insert_work(pwq, work, worklist, work_flags);
28227  
28228 +out:
28229         spin_unlock(&pwq->pool->lock);
28230 +       rcu_read_unlock();
28231  }
28232  
28233  /**
28234 @@ -1451,14 +1480,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
28235         bool ret = false;
28236         unsigned long flags;
28237  
28238 -       local_irq_save(flags);
28239 +       local_lock_irqsave(pendingb_lock, flags);
28240  
28241         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
28242                 __queue_work(cpu, wq, work);
28243                 ret = true;
28244         }
28245  
28246 -       local_irq_restore(flags);
28247 +       local_unlock_irqrestore(pendingb_lock, flags);
28248         return ret;
28249  }
28250  EXPORT_SYMBOL(queue_work_on);
28251 @@ -1525,14 +1554,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
28252         unsigned long flags;
28253  
28254         /* read the comment in __queue_work() */
28255 -       local_irq_save(flags);
28256 +       local_lock_irqsave(pendingb_lock, flags);
28257  
28258         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
28259                 __queue_delayed_work(cpu, wq, dwork, delay);
28260                 ret = true;
28261         }
28262  
28263 -       local_irq_restore(flags);
28264 +       local_unlock_irqrestore(pendingb_lock, flags);
28265         return ret;
28266  }
28267  EXPORT_SYMBOL(queue_delayed_work_on);
28268 @@ -1567,7 +1596,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
28269  
28270         if (likely(ret >= 0)) {
28271                 __queue_delayed_work(cpu, wq, dwork, delay);
28272 -               local_irq_restore(flags);
28273 +               local_unlock_irqrestore(pendingb_lock, flags);
28274         }
28275  
28276         /* -ENOENT from try_to_grab_pending() becomes %true */
28277 @@ -1600,7 +1629,9 @@ static void worker_enter_idle(struct worker *worker)
28278         worker->last_active = jiffies;
28279  
28280         /* idle_list is LIFO */
28281 +       rt_lock_idle_list(pool);
28282         list_add(&worker->entry, &pool->idle_list);
28283 +       rt_unlock_idle_list(pool);
28284  
28285         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
28286                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
28287 @@ -1633,7 +1664,9 @@ static void worker_leave_idle(struct worker *worker)
28288                 return;
28289         worker_clr_flags(worker, WORKER_IDLE);
28290         pool->nr_idle--;
28291 +       rt_lock_idle_list(pool);
28292         list_del_init(&worker->entry);
28293 +       rt_unlock_idle_list(pool);
28294  }
28295  
28296  static struct worker *alloc_worker(int node)
28297 @@ -1799,7 +1832,9 @@ static void destroy_worker(struct worker *worker)
28298         pool->nr_workers--;
28299         pool->nr_idle--;
28300  
28301 +       rt_lock_idle_list(pool);
28302         list_del_init(&worker->entry);
28303 +       rt_unlock_idle_list(pool);
28304         worker->flags |= WORKER_DIE;
28305         wake_up_process(worker->task);
28306  }
28307 @@ -2716,14 +2751,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
28308  
28309         might_sleep();
28310  
28311 -       local_irq_disable();
28312 +       rcu_read_lock();
28313         pool = get_work_pool(work);
28314         if (!pool) {
28315 -               local_irq_enable();
28316 +               rcu_read_unlock();
28317                 return false;
28318         }
28319  
28320 -       spin_lock(&pool->lock);
28321 +       spin_lock_irq(&pool->lock);
28322         /* see the comment in try_to_grab_pending() with the same code */
28323         pwq = get_work_pwq(work);
28324         if (pwq) {
28325 @@ -2750,10 +2785,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
28326         else
28327                 lock_map_acquire_read(&pwq->wq->lockdep_map);
28328         lock_map_release(&pwq->wq->lockdep_map);
28329 -
28330 +       rcu_read_unlock();
28331         return true;
28332  already_gone:
28333         spin_unlock_irq(&pool->lock);
28334 +       rcu_read_unlock();
28335         return false;
28336  }
28337  
28338 @@ -2840,7 +2876,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
28339  
28340         /* tell other tasks trying to grab @work to back off */
28341         mark_work_canceling(work);
28342 -       local_irq_restore(flags);
28343 +       local_unlock_irqrestore(pendingb_lock, flags);
28344  
28345         flush_work(work);
28346         clear_work_data(work);
28347 @@ -2895,10 +2931,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
28348   */
28349  bool flush_delayed_work(struct delayed_work *dwork)
28350  {
28351 -       local_irq_disable();
28352 +       local_lock_irq(pendingb_lock);
28353         if (del_timer_sync(&dwork->timer))
28354                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
28355 -       local_irq_enable();
28356 +       local_unlock_irq(pendingb_lock);
28357         return flush_work(&dwork->work);
28358  }
28359  EXPORT_SYMBOL(flush_delayed_work);
28360 @@ -2933,7 +2969,7 @@ bool cancel_delayed_work(struct delayed_work *dwork)
28361  
28362         set_work_pool_and_clear_pending(&dwork->work,
28363                                         get_work_pool_id(&dwork->work));
28364 -       local_irq_restore(flags);
28365 +       local_unlock_irqrestore(pendingb_lock, flags);
28366         return ret;
28367  }
28368  EXPORT_SYMBOL(cancel_delayed_work);
28369 @@ -3161,7 +3197,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
28370   * put_unbound_pool - put a worker_pool
28371   * @pool: worker_pool to put
28372   *
28373 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
28374 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
28375   * safe manner.  get_unbound_pool() calls this function on its failure path
28376   * and this function should be able to release pools which went through,
28377   * successfully or not, init_worker_pool().
28378 @@ -3215,8 +3251,8 @@ static void put_unbound_pool(struct worker_pool *pool)
28379         del_timer_sync(&pool->idle_timer);
28380         del_timer_sync(&pool->mayday_timer);
28381  
28382 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
28383 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
28384 +       /* RCU protected to allow dereferences from get_work_pool() */
28385 +       call_rcu(&pool->rcu, rcu_free_pool);
28386  }
28387  
28388  /**
28389 @@ -3323,14 +3359,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
28390         put_unbound_pool(pool);
28391         mutex_unlock(&wq_pool_mutex);
28392  
28393 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
28394 +       call_rcu(&pwq->rcu, rcu_free_pwq);
28395  
28396         /*
28397          * If we're the last pwq going away, @wq is already dead and no one
28398          * is gonna access it anymore.  Schedule RCU free.
28399          */
28400         if (is_last)
28401 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
28402 +               call_rcu(&wq->rcu, rcu_free_wq);
28403  }
28404  
28405  /**
28406 @@ -3983,7 +4019,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
28407                  * The base ref is never dropped on per-cpu pwqs.  Directly
28408                  * schedule RCU free.
28409                  */
28410 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
28411 +               call_rcu(&wq->rcu, rcu_free_wq);
28412         } else {
28413                 /*
28414                  * We're the sole accessor of @wq at this point.  Directly
28415 @@ -4076,7 +4112,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
28416         struct pool_workqueue *pwq;
28417         bool ret;
28418  
28419 -       rcu_read_lock_sched();
28420 +       rcu_read_lock();
28421 +       preempt_disable();
28422  
28423         if (cpu == WORK_CPU_UNBOUND)
28424                 cpu = smp_processor_id();
28425 @@ -4087,7 +4124,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
28426                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
28427  
28428         ret = !list_empty(&pwq->delayed_works);
28429 -       rcu_read_unlock_sched();
28430 +       preempt_enable();
28431 +       rcu_read_unlock();
28432  
28433         return ret;
28434  }
28435 @@ -4113,15 +4151,15 @@ unsigned int work_busy(struct work_struct *work)
28436         if (work_pending(work))
28437                 ret |= WORK_BUSY_PENDING;
28438  
28439 -       local_irq_save(flags);
28440 +       rcu_read_lock();
28441         pool = get_work_pool(work);
28442         if (pool) {
28443 -               spin_lock(&pool->lock);
28444 +               spin_lock_irqsave(&pool->lock, flags);
28445                 if (find_worker_executing_work(pool, work))
28446                         ret |= WORK_BUSY_RUNNING;
28447 -               spin_unlock(&pool->lock);
28448 +               spin_unlock_irqrestore(&pool->lock, flags);
28449         }
28450 -       local_irq_restore(flags);
28451 +       rcu_read_unlock();
28452  
28453         return ret;
28454  }
28455 @@ -4310,7 +4348,7 @@ void show_workqueue_state(void)
28456         unsigned long flags;
28457         int pi;
28458  
28459 -       rcu_read_lock_sched();
28460 +       rcu_read_lock();
28461  
28462         pr_info("Showing busy workqueues and worker pools:\n");
28463  
28464 @@ -4361,7 +4399,7 @@ void show_workqueue_state(void)
28465                 spin_unlock_irqrestore(&pool->lock, flags);
28466         }
28467  
28468 -       rcu_read_unlock_sched();
28469 +       rcu_read_unlock();
28470  }
28471  
28472  /*
28473 @@ -4722,16 +4760,16 @@ bool freeze_workqueues_busy(void)
28474                  * nr_active is monotonically decreasing.  It's safe
28475                  * to peek without lock.
28476                  */
28477 -               rcu_read_lock_sched();
28478 +               rcu_read_lock();
28479                 for_each_pwq(pwq, wq) {
28480                         WARN_ON_ONCE(pwq->nr_active < 0);
28481                         if (pwq->nr_active) {
28482                                 busy = true;
28483 -                               rcu_read_unlock_sched();
28484 +                               rcu_read_unlock();
28485                                 goto out_unlock;
28486                         }
28487                 }
28488 -               rcu_read_unlock_sched();
28489 +               rcu_read_unlock();
28490         }
28491  out_unlock:
28492         mutex_unlock(&wq_pool_mutex);
28493 @@ -4921,7 +4959,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
28494         const char *delim = "";
28495         int node, written = 0;
28496  
28497 -       rcu_read_lock_sched();
28498 +       get_online_cpus();
28499 +       rcu_read_lock();
28500         for_each_node(node) {
28501                 written += scnprintf(buf + written, PAGE_SIZE - written,
28502                                      "%s%d:%d", delim, node,
28503 @@ -4929,7 +4968,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
28504                 delim = " ";
28505         }
28506         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
28507 -       rcu_read_unlock_sched();
28508 +       rcu_read_unlock();
28509 +       put_online_cpus();
28510  
28511         return written;
28512  }
28513 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
28514 index 45215870ac6c..f000c4d6917e 100644
28515 --- a/kernel/workqueue_internal.h
28516 +++ b/kernel/workqueue_internal.h
28517 @@ -43,6 +43,7 @@ struct worker {
28518         unsigned long           last_active;    /* L: last active timestamp */
28519         unsigned int            flags;          /* X: flags */
28520         int                     id;             /* I: worker id */
28521 +       int                     sleeping;       /* None */
28522  
28523         /*
28524          * Opaque string set with work_set_desc().  Printed out with task
28525 @@ -68,7 +69,7 @@ static inline struct worker *current_wq_worker(void)
28526   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
28527   * sched/core.c and workqueue.c.
28528   */
28529 -void wq_worker_waking_up(struct task_struct *task, int cpu);
28530 -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
28531 +void wq_worker_running(struct task_struct *task);
28532 +void wq_worker_sleeping(struct task_struct *task);
28533  
28534  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
28535 diff --git a/lib/Kconfig b/lib/Kconfig
28536 index 1a48744253d7..f75de578cca8 100644
28537 --- a/lib/Kconfig
28538 +++ b/lib/Kconfig
28539 @@ -397,6 +397,7 @@ config CHECK_SIGNATURE
28540  
28541  config CPUMASK_OFFSTACK
28542         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
28543 +       depends on !PREEMPT_RT_FULL
28544         help
28545           Use dynamic allocation for cpumask_var_t, instead of putting
28546           them on the stack.  This is a bit more expensive, but avoids
28547 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
28548 index 547f7f923dbc..8fcdbc2fc6d0 100644
28549 --- a/lib/debugobjects.c
28550 +++ b/lib/debugobjects.c
28551 @@ -309,7 +309,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
28552         struct debug_obj *obj;
28553         unsigned long flags;
28554  
28555 -       fill_pool();
28556 +#ifdef CONFIG_PREEMPT_RT_FULL
28557 +       if (preempt_count() == 0 && !irqs_disabled())
28558 +#endif
28559 +               fill_pool();
28560  
28561         db = get_bucket((unsigned long) addr);
28562  
28563 diff --git a/lib/idr.c b/lib/idr.c
28564 index 6098336df267..9decbe914595 100644
28565 --- a/lib/idr.c
28566 +++ b/lib/idr.c
28567 @@ -30,6 +30,7 @@
28568  #include <linux/idr.h>
28569  #include <linux/spinlock.h>
28570  #include <linux/percpu.h>
28571 +#include <linux/locallock.h>
28572  
28573  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
28574  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
28575 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
28576  static DEFINE_PER_CPU(int, idr_preload_cnt);
28577  static DEFINE_SPINLOCK(simple_ida_lock);
28578  
28579 +#ifdef CONFIG_PREEMPT_RT_FULL
28580 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
28581 +
28582 +static inline void idr_preload_lock(void)
28583 +{
28584 +       local_lock(idr_lock);
28585 +}
28586 +
28587 +static inline void idr_preload_unlock(void)
28588 +{
28589 +       local_unlock(idr_lock);
28590 +}
28591 +
28592 +void idr_preload_end(void)
28593 +{
28594 +       idr_preload_unlock();
28595 +}
28596 +EXPORT_SYMBOL(idr_preload_end);
28597 +#else
28598 +static inline void idr_preload_lock(void)
28599 +{
28600 +       preempt_disable();
28601 +}
28602 +
28603 +static inline void idr_preload_unlock(void)
28604 +{
28605 +       preempt_enable();
28606 +}
28607 +#endif
28608 +
28609 +
28610  /* the maximum ID which can be allocated given idr->layers */
28611  static int idr_max(int layers)
28612  {
28613 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
28614          * context.  See idr_preload() for details.
28615          */
28616         if (!in_interrupt()) {
28617 -               preempt_disable();
28618 +               idr_preload_lock();
28619                 new = __this_cpu_read(idr_preload_head);
28620                 if (new) {
28621                         __this_cpu_write(idr_preload_head, new->ary[0]);
28622                         __this_cpu_dec(idr_preload_cnt);
28623                         new->ary[0] = NULL;
28624                 }
28625 -               preempt_enable();
28626 +               idr_preload_unlock();
28627                 if (new)
28628                         return new;
28629         }
28630 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
28631         idr_mark_full(pa, id);
28632  }
28633  
28634 -
28635  /**
28636   * idr_preload - preload for idr_alloc()
28637   * @gfp_mask: allocation mask to use for preloading
28638 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
28639         WARN_ON_ONCE(in_interrupt());
28640         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
28641  
28642 -       preempt_disable();
28643 +       idr_preload_lock();
28644  
28645         /*
28646          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
28647 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
28648         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
28649                 struct idr_layer *new;
28650  
28651 -               preempt_enable();
28652 +               idr_preload_unlock();
28653                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
28654 -               preempt_disable();
28655 +               idr_preload_lock();
28656                 if (!new)
28657                         break;
28658  
28659 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
28660 index 872a15a2a637..b93a6103fa4d 100644
28661 --- a/lib/locking-selftest.c
28662 +++ b/lib/locking-selftest.c
28663 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
28664  #include "locking-selftest-spin-hardirq.h"
28665  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
28666  
28667 +#ifndef CONFIG_PREEMPT_RT_FULL
28668 +
28669  #include "locking-selftest-rlock-hardirq.h"
28670  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
28671  
28672 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
28673  #include "locking-selftest-wlock-softirq.h"
28674  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
28675  
28676 +#endif
28677 +
28678  #undef E1
28679  #undef E2
28680  
28681 +#ifndef CONFIG_PREEMPT_RT_FULL
28682  /*
28683   * Enabling hardirqs with a softirq-safe lock held:
28684   */
28685 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
28686  #undef E1
28687  #undef E2
28688  
28689 +#endif
28690 +
28691  /*
28692   * Enabling irqs with an irq-safe lock held:
28693   */
28694 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
28695  #include "locking-selftest-spin-hardirq.h"
28696  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
28697  
28698 +#ifndef CONFIG_PREEMPT_RT_FULL
28699 +
28700  #include "locking-selftest-rlock-hardirq.h"
28701  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
28702  
28703 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
28704  #include "locking-selftest-wlock-softirq.h"
28705  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
28706  
28707 +#endif
28708 +
28709  #undef E1
28710  #undef E2
28711  
28712 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
28713  #include "locking-selftest-spin-hardirq.h"
28714  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
28715  
28716 +#ifndef CONFIG_PREEMPT_RT_FULL
28717 +
28718  #include "locking-selftest-rlock-hardirq.h"
28719  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
28720  
28721 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
28722  #include "locking-selftest-wlock-softirq.h"
28723  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
28724  
28725 +#endif
28726 +
28727  #undef E1
28728  #undef E2
28729  #undef E3
28730 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
28731  #include "locking-selftest-spin-hardirq.h"
28732  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
28733  
28734 +#ifndef CONFIG_PREEMPT_RT_FULL
28735 +
28736  #include "locking-selftest-rlock-hardirq.h"
28737  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
28738  
28739 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
28740  #include "locking-selftest-wlock-softirq.h"
28741  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
28742  
28743 +#endif
28744 +
28745  #undef E1
28746  #undef E2
28747  #undef E3
28748  
28749 +#ifndef CONFIG_PREEMPT_RT_FULL
28750 +
28751  /*
28752   * read-lock / write-lock irq inversion.
28753   *
28754 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
28755  #undef E2
28756  #undef E3
28757  
28758 +#endif
28759 +
28760 +#ifndef CONFIG_PREEMPT_RT_FULL
28761 +
28762  /*
28763   * read-lock / write-lock recursion that is actually safe.
28764   */
28765 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
28766  #undef E2
28767  #undef E3
28768  
28769 +#endif
28770 +
28771  /*
28772   * read-lock / write-lock recursion that is unsafe.
28773   */
28774 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
28775  
28776         printk("  --------------------------------------------------------------------------\n");
28777  
28778 +#ifndef CONFIG_PREEMPT_RT_FULL
28779         /*
28780          * irq-context testcases:
28781          */
28782 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
28783  
28784         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
28785  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
28786 +#else
28787 +       /* On -rt, we only do the hardirq context tests for raw spinlocks */
28788 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
28789 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
28790 +
28791 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
28792 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
28793 +
28794 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
28795 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
28796 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
28797 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
28798 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
28799 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
28800 +
28801 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
28802 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
28803 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
28804 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
28805 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
28806 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
28807 +#endif
28808  
28809         ww_tests();
28810  
28811 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
28812 index 6d40944960de..822a2c027e72 100644
28813 --- a/lib/percpu_ida.c
28814 +++ b/lib/percpu_ida.c
28815 @@ -26,6 +26,9 @@
28816  #include <linux/string.h>
28817  #include <linux/spinlock.h>
28818  #include <linux/percpu_ida.h>
28819 +#include <linux/locallock.h>
28820 +
28821 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
28822  
28823  struct percpu_ida_cpu {
28824         /*
28825 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28826         unsigned long flags;
28827         int tag;
28828  
28829 -       local_irq_save(flags);
28830 +       local_lock_irqsave(irq_off_lock, flags);
28831         tags = this_cpu_ptr(pool->tag_cpu);
28832  
28833         /* Fastpath */
28834         tag = alloc_local_tag(tags);
28835         if (likely(tag >= 0)) {
28836 -               local_irq_restore(flags);
28837 +               local_unlock_irqrestore(irq_off_lock, flags);
28838                 return tag;
28839         }
28840  
28841 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28842  
28843                 if (!tags->nr_free)
28844                         alloc_global_tags(pool, tags);
28845 +
28846                 if (!tags->nr_free)
28847                         steal_tags(pool, tags);
28848  
28849 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28850                 }
28851  
28852                 spin_unlock(&pool->lock);
28853 -               local_irq_restore(flags);
28854 +               local_unlock_irqrestore(irq_off_lock, flags);
28855  
28856                 if (tag >= 0 || state == TASK_RUNNING)
28857                         break;
28858 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
28859  
28860                 schedule();
28861  
28862 -               local_irq_save(flags);
28863 +               local_lock_irqsave(irq_off_lock, flags);
28864                 tags = this_cpu_ptr(pool->tag_cpu);
28865         }
28866         if (state != TASK_RUNNING)
28867 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
28868  
28869         BUG_ON(tag >= pool->nr_tags);
28870  
28871 -       local_irq_save(flags);
28872 +       local_lock_irqsave(irq_off_lock, flags);
28873         tags = this_cpu_ptr(pool->tag_cpu);
28874  
28875         spin_lock(&tags->lock);
28876 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
28877                 spin_unlock(&pool->lock);
28878         }
28879  
28880 -       local_irq_restore(flags);
28881 +       local_unlock_irqrestore(irq_off_lock, flags);
28882  }
28883  EXPORT_SYMBOL_GPL(percpu_ida_free);
28884  
28885 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
28886         struct percpu_ida_cpu *remote;
28887         unsigned cpu, i, err = 0;
28888  
28889 -       local_irq_save(flags);
28890 +       local_lock_irqsave(irq_off_lock, flags);
28891         for_each_possible_cpu(cpu) {
28892                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
28893                 spin_lock(&remote->lock);
28894 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
28895         }
28896         spin_unlock(&pool->lock);
28897  out:
28898 -       local_irq_restore(flags);
28899 +       local_unlock_irqrestore(irq_off_lock, flags);
28900         return err;
28901  }
28902  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
28903 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
28904 index 6b79e9026e24..f27e0bcb74f7 100644
28905 --- a/lib/radix-tree.c
28906 +++ b/lib/radix-tree.c
28907 @@ -196,13 +196,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
28908                  * succeed in getting a node here (and never reach
28909                  * kmem_cache_alloc)
28910                  */
28911 -               rtp = this_cpu_ptr(&radix_tree_preloads);
28912 +               rtp = &get_cpu_var(radix_tree_preloads);
28913                 if (rtp->nr) {
28914                         ret = rtp->nodes;
28915                         rtp->nodes = ret->private_data;
28916                         ret->private_data = NULL;
28917                         rtp->nr--;
28918                 }
28919 +               put_cpu_var(radix_tree_preloads);
28920                 /*
28921                  * Update the allocation stack trace as this is more useful
28922                  * for debugging.
28923 @@ -242,6 +243,7 @@ radix_tree_node_free(struct radix_tree_node *node)
28924         call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
28925  }
28926  
28927 +#ifndef CONFIG_PREEMPT_RT_FULL
28928  /*
28929   * Load up this CPU's radix_tree_node buffer with sufficient objects to
28930   * ensure that the addition of a single element in the tree cannot fail.  On
28931 @@ -310,6 +312,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
28932         return 0;
28933  }
28934  EXPORT_SYMBOL(radix_tree_maybe_preload);
28935 +#endif
28936  
28937  /*
28938   *     Return the maximum key which can be store into a
28939 diff --git a/lib/rbtree.c b/lib/rbtree.c
28940 index 1356454e36de..d15d6c4327f1 100644
28941 --- a/lib/rbtree.c
28942 +++ b/lib/rbtree.c
28943 @@ -23,6 +23,7 @@
28944  
28945  #include <linux/rbtree_augmented.h>
28946  #include <linux/export.h>
28947 +#include <linux/rcupdate.h>
28948  
28949  /*
28950   * red-black trees properties:  http://en.wikipedia.org/wiki/Rbtree
28951 @@ -590,3 +591,13 @@ struct rb_node *rb_first_postorder(const struct rb_root *root)
28952         return rb_left_deepest_node(root->rb_node);
28953  }
28954  EXPORT_SYMBOL(rb_first_postorder);
28955 +
28956 +void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
28957 +                                   struct rb_node **rb_link)
28958 +{
28959 +       node->__rb_parent_color = (unsigned long)parent;
28960 +       node->rb_left = node->rb_right = NULL;
28961 +
28962 +       rcu_assign_pointer(*rb_link, node);
28963 +}
28964 +EXPORT_SYMBOL(rb_link_node_rcu);
28965 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
28966 index bafa9933fa76..ebe3b7edd086 100644
28967 --- a/lib/scatterlist.c
28968 +++ b/lib/scatterlist.c
28969 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
28970                         flush_kernel_dcache_page(miter->page);
28971  
28972                 if (miter->__flags & SG_MITER_ATOMIC) {
28973 -                       WARN_ON_ONCE(preemptible());
28974 +                       WARN_ON_ONCE(!pagefault_disabled());
28975                         kunmap_atomic(miter->addr);
28976                 } else
28977                         kunmap(miter->page);
28978 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
28979         if (!sg_miter_skip(&miter, skip))
28980                 return false;
28981  
28982 -       local_irq_save(flags);
28983 +       local_irq_save_nort(flags);
28984  
28985         while (sg_miter_next(&miter) && offset < buflen) {
28986                 unsigned int len;
28987 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
28988  
28989         sg_miter_stop(&miter);
28990  
28991 -       local_irq_restore(flags);
28992 +       local_irq_restore_nort(flags);
28993         return offset;
28994  }
28995  EXPORT_SYMBOL(sg_copy_buffer);
28996 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
28997 index 1afec32de6f2..11fa431046a8 100644
28998 --- a/lib/smp_processor_id.c
28999 +++ b/lib/smp_processor_id.c
29000 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
29001         if (!printk_ratelimit())
29002                 goto out_enable;
29003  
29004 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
29005 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
29006 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
29007 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
29008 +               current->comm, current->pid);
29009  
29010         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
29011         dump_stack();
29012 diff --git a/localversion-rt b/localversion-rt
29013 new file mode 100644
29014 index 000000000000..629e0b4384b8
29015 --- /dev/null
29016 +++ b/localversion-rt
29017 @@ -0,0 +1 @@
29018 +-rt41
29019 diff --git a/mm/Kconfig b/mm/Kconfig
29020 index 97a4e06b15c0..9614351e68b8 100644
29021 --- a/mm/Kconfig
29022 +++ b/mm/Kconfig
29023 @@ -392,7 +392,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
29024  
29025  config TRANSPARENT_HUGEPAGE
29026         bool "Transparent Hugepage Support"
29027 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
29028 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
29029         select COMPACTION
29030         help
29031           Transparent Hugepages allows the kernel to use huge pages and
29032 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
29033 index 9ef80bf441b3..826fed55c1cc 100644
29034 --- a/mm/backing-dev.c
29035 +++ b/mm/backing-dev.c
29036 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
29037  {
29038         unsigned long flags;
29039  
29040 -       local_irq_save(flags);
29041 +       local_irq_save_nort(flags);
29042         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
29043 -               local_irq_restore(flags);
29044 +               local_irq_restore_nort(flags);
29045                 return;
29046         }
29047  
29048 diff --git a/mm/compaction.c b/mm/compaction.c
29049 index dba02dec7195..51963f58a29b 100644
29050 --- a/mm/compaction.c
29051 +++ b/mm/compaction.c
29052 @@ -1430,10 +1430,12 @@ check_drain:
29053                                 cc->migrate_pfn & ~((1UL << cc->order) - 1);
29054  
29055                         if (cc->last_migrated_pfn < current_block_start) {
29056 -                               cpu = get_cpu();
29057 +                               cpu = get_cpu_light();
29058 +                               local_lock_irq(swapvec_lock);
29059                                 lru_add_drain_cpu(cpu);
29060 +                               local_unlock_irq(swapvec_lock);
29061                                 drain_local_pages(zone);
29062 -                               put_cpu();
29063 +                               put_cpu_light();
29064                                 /* No more flushing until we migrate again */
29065                                 cc->last_migrated_pfn = 0;
29066                         }
29067 diff --git a/mm/filemap.c b/mm/filemap.c
29068 index c588d1222b2a..da6a5fbfadd2 100644
29069 --- a/mm/filemap.c
29070 +++ b/mm/filemap.c
29071 @@ -144,9 +144,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
29072                  * node->private_list is protected by
29073                  * mapping->tree_lock.
29074                  */
29075 -               if (!list_empty(&node->private_list))
29076 -                       list_lru_del(&workingset_shadow_nodes,
29077 +               if (!list_empty(&node->private_list)) {
29078 +                       local_lock(workingset_shadow_lock);
29079 +                       list_lru_del(&__workingset_shadow_nodes,
29080                                      &node->private_list);
29081 +                       local_unlock(workingset_shadow_lock);
29082 +               }
29083         }
29084         return 0;
29085  }
29086 @@ -218,7 +221,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
29087         if (!workingset_node_pages(node) &&
29088             list_empty(&node->private_list)) {
29089                 node->private_data = mapping;
29090 -               list_lru_add(&workingset_shadow_nodes, &node->private_list);
29091 +               local_lock(workingset_shadow_lock);
29092 +               list_lru_add(&__workingset_shadow_nodes, &node->private_list);
29093 +               local_unlock(workingset_shadow_lock);
29094         }
29095  }
29096  
29097 diff --git a/mm/highmem.c b/mm/highmem.c
29098 index 123bcd3ed4f2..16e8cf26d38a 100644
29099 --- a/mm/highmem.c
29100 +++ b/mm/highmem.c
29101 @@ -29,10 +29,11 @@
29102  #include <linux/kgdb.h>
29103  #include <asm/tlbflush.h>
29104  
29105 -
29106 +#ifndef CONFIG_PREEMPT_RT_FULL
29107  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
29108  DEFINE_PER_CPU(int, __kmap_atomic_idx);
29109  #endif
29110 +#endif
29111  
29112  /*
29113   * Virtual_count is not a pure "count".
29114 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
29115  unsigned long totalhigh_pages __read_mostly;
29116  EXPORT_SYMBOL(totalhigh_pages);
29117  
29118 -
29119 +#ifndef CONFIG_PREEMPT_RT_FULL
29120  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
29121 +#endif
29122  
29123  unsigned int nr_free_highpages (void)
29124  {
29125 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
29126 index 6b90d184e9c0..ed7aa011ad70 100644
29127 --- a/mm/memcontrol.c
29128 +++ b/mm/memcontrol.c
29129 @@ -67,6 +67,8 @@
29130  #include <net/sock.h>
29131  #include <net/ip.h>
29132  #include <net/tcp_memcontrol.h>
29133 +#include <linux/locallock.h>
29134 +
29135  #include "slab.h"
29136  
29137  #include <asm/uaccess.h>
29138 @@ -87,6 +89,7 @@ int do_swap_account __read_mostly;
29139  #define do_swap_account                0
29140  #endif
29141  
29142 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
29143  static const char * const mem_cgroup_stat_names[] = {
29144         "cache",
29145         "rss",
29146 @@ -1922,14 +1925,17 @@ static void drain_local_stock(struct work_struct *dummy)
29147   */
29148  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
29149  {
29150 -       struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
29151 +       struct memcg_stock_pcp *stock;
29152 +       int cpu = get_cpu_light();
29153 +
29154 +       stock = &per_cpu(memcg_stock, cpu);
29155  
29156         if (stock->cached != memcg) { /* reset if necessary */
29157                 drain_stock(stock);
29158                 stock->cached = memcg;
29159         }
29160         stock->nr_pages += nr_pages;
29161 -       put_cpu_var(memcg_stock);
29162 +       put_cpu_light();
29163  }
29164  
29165  /*
29166 @@ -1945,7 +1951,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
29167                 return;
29168         /* Notify other cpus that system-wide "drain" is running */
29169         get_online_cpus();
29170 -       curcpu = get_cpu();
29171 +       curcpu = get_cpu_light();
29172         for_each_online_cpu(cpu) {
29173                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
29174                 struct mem_cgroup *memcg;
29175 @@ -1962,7 +1968,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
29176                                 schedule_work_on(cpu, &stock->work);
29177                 }
29178         }
29179 -       put_cpu();
29180 +       put_cpu_light();
29181         put_online_cpus();
29182         mutex_unlock(&percpu_charge_mutex);
29183  }
29184 @@ -4700,12 +4706,12 @@ static int mem_cgroup_move_account(struct page *page,
29185  
29186         ret = 0;
29187  
29188 -       local_irq_disable();
29189 +       local_lock_irq(event_lock);
29190         mem_cgroup_charge_statistics(to, page, nr_pages);
29191         memcg_check_events(to, page);
29192         mem_cgroup_charge_statistics(from, page, -nr_pages);
29193         memcg_check_events(from, page);
29194 -       local_irq_enable();
29195 +       local_unlock_irq(event_lock);
29196  out_unlock:
29197         unlock_page(page);
29198  out:
29199 @@ -5495,10 +5501,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
29200                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
29201         }
29202  
29203 -       local_irq_disable();
29204 +       local_lock_irq(event_lock);
29205         mem_cgroup_charge_statistics(memcg, page, nr_pages);
29206         memcg_check_events(memcg, page);
29207 -       local_irq_enable();
29208 +       local_unlock_irq(event_lock);
29209  
29210         if (do_swap_account && PageSwapCache(page)) {
29211                 swp_entry_t entry = { .val = page_private(page) };
29212 @@ -5554,14 +5560,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
29213                 memcg_oom_recover(memcg);
29214         }
29215  
29216 -       local_irq_save(flags);
29217 +       local_lock_irqsave(event_lock, flags);
29218         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
29219         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
29220         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
29221         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
29222         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
29223         memcg_check_events(memcg, dummy_page);
29224 -       local_irq_restore(flags);
29225 +       local_unlock_irqrestore(event_lock, flags);
29226  
29227         if (!mem_cgroup_is_root(memcg))
29228                 css_put_many(&memcg->css, nr_pages);
29229 @@ -5753,6 +5759,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
29230  {
29231         struct mem_cgroup *memcg, *swap_memcg;
29232         unsigned short oldid;
29233 +       unsigned long flags;
29234  
29235         VM_BUG_ON_PAGE(PageLRU(page), page);
29236         VM_BUG_ON_PAGE(page_count(page), page);
29237 @@ -5793,12 +5800,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
29238          * important here to have the interrupts disabled because it is the
29239          * only synchronisation we have for udpating the per-CPU variables.
29240          */
29241 +       local_lock_irqsave(event_lock, flags);
29242 +#ifndef CONFIG_PREEMPT_RT_BASE
29243         VM_BUG_ON(!irqs_disabled());
29244 +#endif
29245         mem_cgroup_charge_statistics(memcg, page, -1);
29246         memcg_check_events(memcg, page);
29247  
29248         if (!mem_cgroup_is_root(memcg))
29249                 css_put(&memcg->css);
29250 +       local_unlock_irqrestore(event_lock, flags);
29251  }
29252  
29253  /**
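Illustrative sketch, not part of the diff: the memcontrol.c hunks above replace open-coded local_irq_disable()/local_irq_enable() and local_irq_save()/restore() sections with the event_lock local lock. Assuming the linux/locallock.h API introduced elsewhere in this series, the pattern reads roughly as below; the names example_lock, example_events and example_account_event() are invented for illustration only.

        #include <linux/locallock.h>

        static DEFINE_LOCAL_IRQ_LOCK(example_lock);
        static DEFINE_PER_CPU(unsigned long, example_events);

        static void example_account_event(void)
        {
                unsigned long flags;

                /*
                 * !RT: behaves like local_irq_save(flags);
                 * RT:  takes a per-CPU spinlock instead, so the section
                 *      stays preemptible and pinned to the current CPU.
                 */
                local_lock_irqsave(example_lock, flags);
                __this_cpu_inc(example_events);
                local_unlock_irqrestore(example_lock, flags);
        }

The refill_stock()/drain_all_stock() changes follow the same idea with get_cpu_light()/put_cpu_light(), which on RT is meant to keep the task on one CPU without disabling preemption.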
29254 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
29255 index f802c2d216a7..b1b6f238e42d 100644
29256 --- a/mm/mmu_context.c
29257 +++ b/mm/mmu_context.c
29258 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
29259         struct task_struct *tsk = current;
29260  
29261         task_lock(tsk);
29262 +       preempt_disable_rt();
29263         active_mm = tsk->active_mm;
29264         if (active_mm != mm) {
29265                 atomic_inc(&mm->mm_count);
29266 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
29267         }
29268         tsk->mm = mm;
29269         switch_mm(active_mm, mm, tsk);
29270 +       preempt_enable_rt();
29271         task_unlock(tsk);
29272  #ifdef finish_arch_post_lock_switch
29273         finish_arch_post_lock_switch();
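Sketch, not part of the diff: use_mm() runs the active_mm bookkeeping plus switch_mm() under task_lock(), which on RT no longer implies that preemption is off, hence the explicit preempt_disable_rt()/preempt_enable_rt() bracket. The helpers are expected to expand roughly as below; the exact definitions live in the preempt.h part of this series and are only approximated here.

        #ifdef CONFIG_PREEMPT_RT_BASE
        # define preempt_disable_rt()           preempt_disable()
        # define preempt_enable_rt()            preempt_enable()
        #else
        # define preempt_disable_rt()           do { } while (0)
        # define preempt_enable_rt()            do { } while (0)
        #endif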
29274 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
29275 index 2bcdfbf8c36d..a500c9e740dd 100644
29276 --- a/mm/page_alloc.c
29277 +++ b/mm/page_alloc.c
29278 @@ -60,6 +60,7 @@
29279  #include <linux/page_ext.h>
29280  #include <linux/hugetlb.h>
29281  #include <linux/sched/rt.h>
29282 +#include <linux/locallock.h>
29283  #include <linux/page_owner.h>
29284  #include <linux/kthread.h>
29285  
29286 @@ -264,6 +265,18 @@ EXPORT_SYMBOL(nr_node_ids);
29287  EXPORT_SYMBOL(nr_online_nodes);
29288  #endif
29289  
29290 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
29291 +
29292 +#ifdef CONFIG_PREEMPT_RT_BASE
29293 +# define cpu_lock_irqsave(cpu, flags)          \
29294 +       local_lock_irqsave_on(pa_lock, flags, cpu)
29295 +# define cpu_unlock_irqrestore(cpu, flags)     \
29296 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
29297 +#else
29298 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
29299 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
29300 +#endif
29301 +
29302  int page_group_by_mobility_disabled __read_mostly;
29303  
29304  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
29305 @@ -786,7 +799,7 @@ static inline int free_pages_check(struct page *page)
29306  }
29307  
29308  /*
29309 - * Frees a number of pages from the PCP lists
29310 + * Frees a number of pages which have been collected from the pcp lists.
29311   * Assumes all pages on list are in same zone, and of same order.
29312   * count is the number of pages to free.
29313   *
29314 @@ -797,18 +810,53 @@ static inline int free_pages_check(struct page *page)
29315   * pinned" detection logic.
29316   */
29317  static void free_pcppages_bulk(struct zone *zone, int count,
29318 -                                       struct per_cpu_pages *pcp)
29319 +                              struct list_head *list)
29320  {
29321 -       int migratetype = 0;
29322 -       int batch_free = 0;
29323         int to_free = count;
29324         unsigned long nr_scanned;
29325 +       unsigned long flags;
29326 +
29327 +       spin_lock_irqsave(&zone->lock, flags);
29328  
29329 -       spin_lock(&zone->lock);
29330         nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
29331         if (nr_scanned)
29332                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
29333  
29334 +       while (!list_empty(list)) {
29335 +               struct page *page = list_first_entry(list, struct page, lru);
29336 +               int mt; /* migratetype of the to-be-freed page */
29337 +
29338 +               /* must delete as __free_one_page list manipulates */
29339 +               list_del(&page->lru);
29340 +
29341 +               mt = get_pcppage_migratetype(page);
29342 +               /* MIGRATE_ISOLATE page should not go to pcplists */
29343 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
29344 +               /* Pageblock could have been isolated meanwhile */
29345 +               if (unlikely(has_isolate_pageblock(zone)))
29346 +                       mt = get_pageblock_migratetype(page);
29347 +
29348 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
29349 +               trace_mm_page_pcpu_drain(page, 0, mt);
29350 +               to_free--;
29351 +       }
29352 +       WARN_ON(to_free != 0);
29353 +       spin_unlock_irqrestore(&zone->lock, flags);
29354 +}
29355 +
29356 +/*
29357 + * Moves a number of pages from the PCP lists to a list which is
29358 + * then freed outside of the locked region.
29359 + *
29360 + * Assumes all pages on list are in same zone, and of same order.
29361 + * count is the number of pages to free.
29362 + */
29363 +static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
29364 +                             struct list_head *dst)
29365 +{
29366 +       int migratetype = 0;
29367 +       int batch_free = 0;
29368 +
29369         while (to_free) {
29370                 struct page *page;
29371                 struct list_head *list;
29372 @@ -824,7 +872,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
29373                         batch_free++;
29374                         if (++migratetype == MIGRATE_PCPTYPES)
29375                                 migratetype = 0;
29376 -                       list = &pcp->lists[migratetype];
29377 +                       list = &src->lists[migratetype];
29378                 } while (list_empty(list));
29379  
29380                 /* This is the only non-empty list. Free them all. */
29381 @@ -832,24 +880,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
29382                         batch_free = to_free;
29383  
29384                 do {
29385 -                       int mt; /* migratetype of the to-be-freed page */
29386 -
29387 -                       page = list_entry(list->prev, struct page, lru);
29388 -                       /* must delete as __free_one_page list manipulates */
29389 +                       page = list_last_entry(list, struct page, lru);
29390                         list_del(&page->lru);
29391  
29392 -                       mt = get_pcppage_migratetype(page);
29393 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
29394 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
29395 -                       /* Pageblock could have been isolated meanwhile */
29396 -                       if (unlikely(has_isolate_pageblock(zone)))
29397 -                               mt = get_pageblock_migratetype(page);
29398 -
29399 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
29400 -                       trace_mm_page_pcpu_drain(page, 0, mt);
29401 +                       list_add(&page->lru, dst);
29402                 } while (--to_free && --batch_free && !list_empty(list));
29403         }
29404 -       spin_unlock(&zone->lock);
29405  }
29406  
29407  static void free_one_page(struct zone *zone,
29408 @@ -858,7 +894,9 @@ static void free_one_page(struct zone *zone,
29409                                 int migratetype)
29410  {
29411         unsigned long nr_scanned;
29412 -       spin_lock(&zone->lock);
29413 +       unsigned long flags;
29414 +
29415 +       spin_lock_irqsave(&zone->lock, flags);
29416         nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
29417         if (nr_scanned)
29418                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
29419 @@ -868,7 +906,7 @@ static void free_one_page(struct zone *zone,
29420                 migratetype = get_pfnblock_migratetype(page, pfn);
29421         }
29422         __free_one_page(page, pfn, zone, order, migratetype);
29423 -       spin_unlock(&zone->lock);
29424 +       spin_unlock_irqrestore(&zone->lock, flags);
29425  }
29426  
29427  static int free_tail_pages_check(struct page *head_page, struct page *page)
29428 @@ -1019,10 +1057,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
29429                 return;
29430  
29431         migratetype = get_pfnblock_migratetype(page, pfn);
29432 -       local_irq_save(flags);
29433 +       local_lock_irqsave(pa_lock, flags);
29434         __count_vm_events(PGFREE, 1 << order);
29435         free_one_page(page_zone(page), page, pfn, order, migratetype);
29436 -       local_irq_restore(flags);
29437 +       local_unlock_irqrestore(pa_lock, flags);
29438  }
29439  
29440  static void __init __free_pages_boot_core(struct page *page,
29441 @@ -1879,16 +1917,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
29442  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
29443  {
29444         unsigned long flags;
29445 +       LIST_HEAD(dst);
29446         int to_drain, batch;
29447  
29448 -       local_irq_save(flags);
29449 +       local_lock_irqsave(pa_lock, flags);
29450         batch = READ_ONCE(pcp->batch);
29451         to_drain = min(pcp->count, batch);
29452         if (to_drain > 0) {
29453 -               free_pcppages_bulk(zone, to_drain, pcp);
29454 +               isolate_pcp_pages(to_drain, pcp, &dst);
29455                 pcp->count -= to_drain;
29456         }
29457 -       local_irq_restore(flags);
29458 +       local_unlock_irqrestore(pa_lock, flags);
29459 +       free_pcppages_bulk(zone, to_drain, &dst);
29460  }
29461  #endif
29462  
29463 @@ -1904,16 +1944,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
29464         unsigned long flags;
29465         struct per_cpu_pageset *pset;
29466         struct per_cpu_pages *pcp;
29467 +       LIST_HEAD(dst);
29468 +       int count;
29469  
29470 -       local_irq_save(flags);
29471 +       cpu_lock_irqsave(cpu, flags);
29472         pset = per_cpu_ptr(zone->pageset, cpu);
29473  
29474         pcp = &pset->pcp;
29475 -       if (pcp->count) {
29476 -               free_pcppages_bulk(zone, pcp->count, pcp);
29477 +       count = pcp->count;
29478 +       if (count) {
29479 +               isolate_pcp_pages(count, pcp, &dst);
29480                 pcp->count = 0;
29481         }
29482 -       local_irq_restore(flags);
29483 +       cpu_unlock_irqrestore(cpu, flags);
29484 +       if (count)
29485 +               free_pcppages_bulk(zone, count, &dst);
29486  }
29487  
29488  /*
29489 @@ -1999,8 +2044,17 @@ void drain_all_pages(struct zone *zone)
29490                 else
29491                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
29492         }
29493 +#ifndef CONFIG_PREEMPT_RT_BASE
29494         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
29495                                                                 zone, 1);
29496 +#else
29497 +       for_each_cpu(cpu, &cpus_with_pcps) {
29498 +               if (zone)
29499 +                       drain_pages_zone(cpu, zone);
29500 +               else
29501 +                       drain_pages(cpu);
29502 +       }
29503 +#endif
29504  }
29505  
29506  #ifdef CONFIG_HIBERNATION
29507 @@ -2056,7 +2110,7 @@ void free_hot_cold_page(struct page *page, bool cold)
29508  
29509         migratetype = get_pfnblock_migratetype(page, pfn);
29510         set_pcppage_migratetype(page, migratetype);
29511 -       local_irq_save(flags);
29512 +       local_lock_irqsave(pa_lock, flags);
29513         __count_vm_event(PGFREE);
29514  
29515         /*
29516 @@ -2082,12 +2136,17 @@ void free_hot_cold_page(struct page *page, bool cold)
29517         pcp->count++;
29518         if (pcp->count >= pcp->high) {
29519                 unsigned long batch = READ_ONCE(pcp->batch);
29520 -               free_pcppages_bulk(zone, batch, pcp);
29521 +               LIST_HEAD(dst);
29522 +
29523 +               isolate_pcp_pages(batch, pcp, &dst);
29524                 pcp->count -= batch;
29525 +               local_unlock_irqrestore(pa_lock, flags);
29526 +               free_pcppages_bulk(zone, batch, &dst);
29527 +               return;
29528         }
29529  
29530  out:
29531 -       local_irq_restore(flags);
29532 +       local_unlock_irqrestore(pa_lock, flags);
29533  }
29534  
29535  /*
29536 @@ -2222,7 +2281,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29537                 struct per_cpu_pages *pcp;
29538                 struct list_head *list;
29539  
29540 -               local_irq_save(flags);
29541 +               local_lock_irqsave(pa_lock, flags);
29542                 pcp = &this_cpu_ptr(zone->pageset)->pcp;
29543                 list = &pcp->lists[migratetype];
29544                 if (list_empty(list)) {
29545 @@ -2254,7 +2313,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29546                          */
29547                         WARN_ON_ONCE(order > 1);
29548                 }
29549 -               spin_lock_irqsave(&zone->lock, flags);
29550 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
29551  
29552                 page = NULL;
29553                 if (alloc_flags & ALLOC_HARDER) {
29554 @@ -2264,11 +2323,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29555                 }
29556                 if (!page)
29557                         page = __rmqueue(zone, order, migratetype, gfp_flags);
29558 -               spin_unlock(&zone->lock);
29559 -               if (!page)
29560 +               if (!page) {
29561 +                       spin_unlock(&zone->lock);
29562                         goto failed;
29563 +               }
29564                 __mod_zone_freepage_state(zone, -(1 << order),
29565                                           get_pcppage_migratetype(page));
29566 +               spin_unlock(&zone->lock);
29567         }
29568  
29569         __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
29570 @@ -2278,13 +2339,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
29571  
29572         __count_zone_vm_events(PGALLOC, zone, 1 << order);
29573         zone_statistics(preferred_zone, zone, gfp_flags);
29574 -       local_irq_restore(flags);
29575 +       local_unlock_irqrestore(pa_lock, flags);
29576  
29577         VM_BUG_ON_PAGE(bad_range(zone, page), page);
29578         return page;
29579  
29580  failed:
29581 -       local_irq_restore(flags);
29582 +       local_unlock_irqrestore(pa_lock, flags);
29583         return NULL;
29584  }
29585  
29586 @@ -5950,6 +6011,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
29587  void __init page_alloc_init(void)
29588  {
29589         hotcpu_notifier(page_alloc_cpu_notify, 0);
29590 +       local_irq_lock_init(pa_lock);
29591  }
29592  
29593  /*
29594 @@ -6844,7 +6906,7 @@ void zone_pcp_reset(struct zone *zone)
29595         struct per_cpu_pageset *pset;
29596  
29597         /* avoid races with drain_pages()  */
29598 -       local_irq_save(flags);
29599 +       local_lock_irqsave(pa_lock, flags);
29600         if (zone->pageset != &boot_pageset) {
29601                 for_each_online_cpu(cpu) {
29602                         pset = per_cpu_ptr(zone->pageset, cpu);
29603 @@ -6853,7 +6915,7 @@ void zone_pcp_reset(struct zone *zone)
29604                 free_percpu(zone->pageset);
29605                 zone->pageset = &boot_pageset;
29606         }
29607 -       local_irq_restore(flags);
29608 +       local_unlock_irqrestore(pa_lock, flags);
29609  }
29610  
29611  #ifdef CONFIG_MEMORY_HOTREMOVE
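Illustrative summary, not part of the diff: the page_alloc.c conversion follows one pattern. While pa_lock (or the per-CPU pageset) is held, pages are only unlinked from the pcp lists into a private list via isolate_pcp_pages(); the buddy freeing then happens afterwards in free_pcppages_bulk(), which now takes zone->lock with spin_lock_irqsave() itself. Pieced together from the hunks above, the hot path of free_hot_cold_page() ends up shaped like this:

        if (pcp->count >= pcp->high) {
                unsigned long batch = READ_ONCE(pcp->batch);
                LIST_HEAD(dst);

                isolate_pcp_pages(batch, pcp, &dst);    /* list surgery only    */
                pcp->count -= batch;
                local_unlock_irqrestore(pa_lock, flags);
                free_pcppages_bulk(zone, batch, &dst);  /* zone->lock, IRQ-safe */
                return;
        }

This keeps the section under the per-CPU lock short and lets the bulk free run preemptibly up to the zone->lock acquisition.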
29612 diff --git a/mm/slab.h b/mm/slab.h
29613 index 7b6087197997..afdc57941179 100644
29614 --- a/mm/slab.h
29615 +++ b/mm/slab.h
29616 @@ -324,7 +324,11 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
29617   * The slab lists for all objects.
29618   */
29619  struct kmem_cache_node {
29620 +#ifdef CONFIG_SLUB
29621 +       raw_spinlock_t list_lock;
29622 +#else
29623         spinlock_t list_lock;
29624 +#endif
29625  
29626  #ifdef CONFIG_SLAB
29627         struct list_head slabs_partial; /* partial list first, better asm code */
29628 diff --git a/mm/slub.c b/mm/slub.c
29629 index 65d5f92d51d2..feb4a445a546 100644
29630 --- a/mm/slub.c
29631 +++ b/mm/slub.c
29632 @@ -1075,7 +1075,7 @@ static noinline struct kmem_cache_node *free_debug_processing(
29633         void *object = head;
29634         int cnt = 0;
29635  
29636 -       spin_lock_irqsave(&n->list_lock, *flags);
29637 +       raw_spin_lock_irqsave(&n->list_lock, *flags);
29638         slab_lock(page);
29639  
29640         if (!check_slab(s, page))
29641 @@ -1136,7 +1136,7 @@ out:
29642  
29643  fail:
29644         slab_unlock(page);
29645 -       spin_unlock_irqrestore(&n->list_lock, *flags);
29646 +       raw_spin_unlock_irqrestore(&n->list_lock, *flags);
29647         slab_fix(s, "Object at 0x%p not freed", object);
29648         return NULL;
29649  }
29650 @@ -1263,6 +1263,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
29651  
29652  #endif /* CONFIG_SLUB_DEBUG */
29653  
29654 +struct slub_free_list {
29655 +       raw_spinlock_t          lock;
29656 +       struct list_head        list;
29657 +};
29658 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
29659 +
29660  /*
29661   * Hooks for other subsystems that check memory allocations. In a typical
29662   * production configuration these hooks all should produce no code at all.
29663 @@ -1399,10 +1405,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
29664         gfp_t alloc_gfp;
29665         void *start, *p;
29666         int idx, order;
29667 +       bool enableirqs = false;
29668  
29669         flags &= gfp_allowed_mask;
29670  
29671         if (gfpflags_allow_blocking(flags))
29672 +               enableirqs = true;
29673 +#ifdef CONFIG_PREEMPT_RT_FULL
29674 +       if (system_state == SYSTEM_RUNNING)
29675 +               enableirqs = true;
29676 +#endif
29677 +       if (enableirqs)
29678                 local_irq_enable();
29679  
29680         flags |= s->allocflags;
29681 @@ -1473,7 +1486,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
29682         page->frozen = 1;
29683  
29684  out:
29685 -       if (gfpflags_allow_blocking(flags))
29686 +       if (enableirqs)
29687                 local_irq_disable();
29688         if (!page)
29689                 return NULL;
29690 @@ -1529,6 +1542,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
29691         __free_kmem_pages(page, order);
29692  }
29693  
29694 +static void free_delayed(struct list_head *h)
29695 +{
29696 +       while (!list_empty(h)) {
29697 +               struct page *page = list_first_entry(h, struct page, lru);
29698 +
29699 +               list_del(&page->lru);
29700 +               __free_slab(page->slab_cache, page);
29701 +       }
29702 +}
29703 +
29704  #define need_reserve_slab_rcu                                          \
29705         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
29706  
29707 @@ -1560,6 +1583,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
29708                 }
29709  
29710                 call_rcu(head, rcu_free_slab);
29711 +       } else if (irqs_disabled()) {
29712 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
29713 +
29714 +               raw_spin_lock(&f->lock);
29715 +               list_add(&page->lru, &f->list);
29716 +               raw_spin_unlock(&f->lock);
29717         } else
29718                 __free_slab(s, page);
29719  }
29720 @@ -1673,7 +1702,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
29721         if (!n || !n->nr_partial)
29722                 return NULL;
29723  
29724 -       spin_lock(&n->list_lock);
29725 +       raw_spin_lock(&n->list_lock);
29726         list_for_each_entry_safe(page, page2, &n->partial, lru) {
29727                 void *t;
29728  
29729 @@ -1698,7 +1727,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
29730                         break;
29731  
29732         }
29733 -       spin_unlock(&n->list_lock);
29734 +       raw_spin_unlock(&n->list_lock);
29735         return object;
29736  }
29737  
29738 @@ -1944,7 +1973,7 @@ redo:
29739                          * that acquire_slab() will see a slab page that
29740                          * is frozen
29741                          */
29742 -                       spin_lock(&n->list_lock);
29743 +                       raw_spin_lock(&n->list_lock);
29744                 }
29745         } else {
29746                 m = M_FULL;
29747 @@ -1955,7 +1984,7 @@ redo:
29748                          * slabs from diagnostic functions will not see
29749                          * any frozen slabs.
29750                          */
29751 -                       spin_lock(&n->list_lock);
29752 +                       raw_spin_lock(&n->list_lock);
29753                 }
29754         }
29755  
29756 @@ -1990,7 +2019,7 @@ redo:
29757                 goto redo;
29758  
29759         if (lock)
29760 -               spin_unlock(&n->list_lock);
29761 +               raw_spin_unlock(&n->list_lock);
29762  
29763         if (m == M_FREE) {
29764                 stat(s, DEACTIVATE_EMPTY);
29765 @@ -2022,10 +2051,10 @@ static void unfreeze_partials(struct kmem_cache *s,
29766                 n2 = get_node(s, page_to_nid(page));
29767                 if (n != n2) {
29768                         if (n)
29769 -                               spin_unlock(&n->list_lock);
29770 +                               raw_spin_unlock(&n->list_lock);
29771  
29772                         n = n2;
29773 -                       spin_lock(&n->list_lock);
29774 +                       raw_spin_lock(&n->list_lock);
29775                 }
29776  
29777                 do {
29778 @@ -2054,7 +2083,7 @@ static void unfreeze_partials(struct kmem_cache *s,
29779         }
29780  
29781         if (n)
29782 -               spin_unlock(&n->list_lock);
29783 +               raw_spin_unlock(&n->list_lock);
29784  
29785         while (discard_page) {
29786                 page = discard_page;
29787 @@ -2093,14 +2122,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
29788                         pobjects = oldpage->pobjects;
29789                         pages = oldpage->pages;
29790                         if (drain && pobjects > s->cpu_partial) {
29791 +                               struct slub_free_list *f;
29792                                 unsigned long flags;
29793 +                               LIST_HEAD(tofree);
29794                                 /*
29795                                  * partial array is full. Move the existing
29796                                  * set to the per node partial list.
29797                                  */
29798                                 local_irq_save(flags);
29799                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
29800 +                               f = this_cpu_ptr(&slub_free_list);
29801 +                               raw_spin_lock(&f->lock);
29802 +                               list_splice_init(&f->list, &tofree);
29803 +                               raw_spin_unlock(&f->lock);
29804                                 local_irq_restore(flags);
29805 +                               free_delayed(&tofree);
29806                                 oldpage = NULL;
29807                                 pobjects = 0;
29808                                 pages = 0;
29809 @@ -2172,7 +2208,22 @@ static bool has_cpu_slab(int cpu, void *info)
29810  
29811  static void flush_all(struct kmem_cache *s)
29812  {
29813 +       LIST_HEAD(tofree);
29814 +       int cpu;
29815 +
29816         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
29817 +       for_each_online_cpu(cpu) {
29818 +               struct slub_free_list *f;
29819 +
29820 +               if (!has_cpu_slab(cpu, s))
29821 +                       continue;
29822 +
29823 +               f = &per_cpu(slub_free_list, cpu);
29824 +               raw_spin_lock_irq(&f->lock);
29825 +               list_splice_init(&f->list, &tofree);
29826 +               raw_spin_unlock_irq(&f->lock);
29827 +               free_delayed(&tofree);
29828 +       }
29829  }
29830  
29831  /*
29832 @@ -2208,10 +2259,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
29833         unsigned long x = 0;
29834         struct page *page;
29835  
29836 -       spin_lock_irqsave(&n->list_lock, flags);
29837 +       raw_spin_lock_irqsave(&n->list_lock, flags);
29838         list_for_each_entry(page, &n->partial, lru)
29839                 x += get_count(page);
29840 -       spin_unlock_irqrestore(&n->list_lock, flags);
29841 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29842         return x;
29843  }
29844  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
29845 @@ -2349,8 +2400,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
29846   * already disabled (which is the case for bulk allocation).
29847   */
29848  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
29849 -                         unsigned long addr, struct kmem_cache_cpu *c)
29850 +                         unsigned long addr, struct kmem_cache_cpu *c,
29851 +                         struct list_head *to_free)
29852  {
29853 +       struct slub_free_list *f;
29854         void *freelist;
29855         struct page *page;
29856  
29857 @@ -2410,6 +2463,13 @@ load_freelist:
29858         VM_BUG_ON(!c->page->frozen);
29859         c->freelist = get_freepointer(s, freelist);
29860         c->tid = next_tid(c->tid);
29861 +
29862 +out:
29863 +       f = this_cpu_ptr(&slub_free_list);
29864 +       raw_spin_lock(&f->lock);
29865 +       list_splice_init(&f->list, to_free);
29866 +       raw_spin_unlock(&f->lock);
29867 +
29868         return freelist;
29869  
29870  new_slab:
29871 @@ -2441,7 +2501,7 @@ new_slab:
29872         deactivate_slab(s, page, get_freepointer(s, freelist));
29873         c->page = NULL;
29874         c->freelist = NULL;
29875 -       return freelist;
29876 +       goto out;
29877  }
29878  
29879  /*
29880 @@ -2453,6 +2513,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
29881  {
29882         void *p;
29883         unsigned long flags;
29884 +       LIST_HEAD(tofree);
29885  
29886         local_irq_save(flags);
29887  #ifdef CONFIG_PREEMPT
29888 @@ -2464,8 +2525,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
29889         c = this_cpu_ptr(s->cpu_slab);
29890  #endif
29891  
29892 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
29893 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
29894         local_irq_restore(flags);
29895 +       free_delayed(&tofree);
29896         return p;
29897  }
29898  
29899 @@ -2652,7 +2714,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
29900  
29901         do {
29902                 if (unlikely(n)) {
29903 -                       spin_unlock_irqrestore(&n->list_lock, flags);
29904 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29905                         n = NULL;
29906                 }
29907                 prior = page->freelist;
29908 @@ -2684,7 +2746,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
29909                                  * Otherwise the list_lock will synchronize with
29910                                  * other processors updating the list of slabs.
29911                                  */
29912 -                               spin_lock_irqsave(&n->list_lock, flags);
29913 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
29914  
29915                         }
29916                 }
29917 @@ -2726,7 +2788,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
29918                 add_partial(n, page, DEACTIVATE_TO_TAIL);
29919                 stat(s, FREE_ADD_PARTIAL);
29920         }
29921 -       spin_unlock_irqrestore(&n->list_lock, flags);
29922 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29923         return;
29924  
29925  slab_empty:
29926 @@ -2741,7 +2803,7 @@ slab_empty:
29927                 remove_full(s, n, page);
29928         }
29929  
29930 -       spin_unlock_irqrestore(&n->list_lock, flags);
29931 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
29932         stat(s, FREE_SLAB);
29933         discard_slab(s, page);
29934  }
29935 @@ -2913,6 +2975,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
29936                           void **p)
29937  {
29938         struct kmem_cache_cpu *c;
29939 +       LIST_HEAD(to_free);
29940         int i;
29941  
29942         /* memcg and kmem_cache debug support */
29943 @@ -2936,7 +2999,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
29944                          * of re-populating per CPU c->freelist
29945                          */
29946                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
29947 -                                           _RET_IP_, c);
29948 +                                           _RET_IP_, c, &to_free);
29949                         if (unlikely(!p[i]))
29950                                 goto error;
29951  
29952 @@ -2948,6 +3011,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
29953         }
29954         c->tid = next_tid(c->tid);
29955         local_irq_enable();
29956 +       free_delayed(&to_free);
29957  
29958         /* Clear memory outside IRQ disabled fastpath loop */
29959         if (unlikely(flags & __GFP_ZERO)) {
29960 @@ -3095,7 +3159,7 @@ static void
29961  init_kmem_cache_node(struct kmem_cache_node *n)
29962  {
29963         n->nr_partial = 0;
29964 -       spin_lock_init(&n->list_lock);
29965 +       raw_spin_lock_init(&n->list_lock);
29966         INIT_LIST_HEAD(&n->partial);
29967  #ifdef CONFIG_SLUB_DEBUG
29968         atomic_long_set(&n->nr_slabs, 0);
29969 @@ -3677,7 +3741,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
29970                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
29971                         INIT_LIST_HEAD(promote + i);
29972  
29973 -               spin_lock_irqsave(&n->list_lock, flags);
29974 +               raw_spin_lock_irqsave(&n->list_lock, flags);
29975  
29976                 /*
29977                  * Build lists of slabs to discard or promote.
29978 @@ -3708,7 +3772,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
29979                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
29980                         list_splice(promote + i, &n->partial);
29981  
29982 -               spin_unlock_irqrestore(&n->list_lock, flags);
29983 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
29984  
29985                 /* Release empty slabs */
29986                 list_for_each_entry_safe(page, t, &discard, lru)
29987 @@ -3884,6 +3948,12 @@ void __init kmem_cache_init(void)
29988  {
29989         static __initdata struct kmem_cache boot_kmem_cache,
29990                 boot_kmem_cache_node;
29991 +       int cpu;
29992 +
29993 +       for_each_possible_cpu(cpu) {
29994 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
29995 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
29996 +       }
29997  
29998         if (debug_guardpage_minorder())
29999                 slub_max_order = 0;
30000 @@ -4127,7 +4197,7 @@ static int validate_slab_node(struct kmem_cache *s,
30001         struct page *page;
30002         unsigned long flags;
30003  
30004 -       spin_lock_irqsave(&n->list_lock, flags);
30005 +       raw_spin_lock_irqsave(&n->list_lock, flags);
30006  
30007         list_for_each_entry(page, &n->partial, lru) {
30008                 validate_slab_slab(s, page, map);
30009 @@ -4149,7 +4219,7 @@ static int validate_slab_node(struct kmem_cache *s,
30010                        s->name, count, atomic_long_read(&n->nr_slabs));
30011  
30012  out:
30013 -       spin_unlock_irqrestore(&n->list_lock, flags);
30014 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
30015         return count;
30016  }
30017  
30018 @@ -4337,12 +4407,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
30019                 if (!atomic_long_read(&n->nr_slabs))
30020                         continue;
30021  
30022 -               spin_lock_irqsave(&n->list_lock, flags);
30023 +               raw_spin_lock_irqsave(&n->list_lock, flags);
30024                 list_for_each_entry(page, &n->partial, lru)
30025                         process_slab(&t, s, page, alloc, map);
30026                 list_for_each_entry(page, &n->full, lru)
30027                         process_slab(&t, s, page, alloc, map);
30028 -               spin_unlock_irqrestore(&n->list_lock, flags);
30029 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
30030         }
30031  
30032         for (i = 0; i < t.count; i++) {
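Illustrative summary, not part of the diff: the slub.c hunks do two related things. kmem_cache_node->list_lock becomes a raw_spinlock_t, since it nests inside contexts that must stay non-sleeping even on RT, and slab pages that become free while interrupts are disabled are parked on a per-CPU slub_free_list instead of being handed back to the page allocator right away. The deferral, reduced to the two halves used in free_slab() and __slab_alloc()/flush_all() above:

        /* producer: runs with IRQs off, must not call __free_slab() here on RT */
        struct slub_free_list *f = this_cpu_ptr(&slub_free_list);

        raw_spin_lock(&f->lock);
        list_add(&page->lru, &f->list);
        raw_spin_unlock(&f->lock);

        /* consumer: runs after interrupts have been enabled again */
        LIST_HEAD(tofree);

        raw_spin_lock_irq(&f->lock);
        list_splice_init(&f->list, &tofree);
        raw_spin_unlock_irq(&f->lock);
        free_delayed(&tofree);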
30033 diff --git a/mm/swap.c b/mm/swap.c
30034 index 39395fb549c0..ad16649221d7 100644
30035 --- a/mm/swap.c
30036 +++ b/mm/swap.c
30037 @@ -31,6 +31,7 @@
30038  #include <linux/memcontrol.h>
30039  #include <linux/gfp.h>
30040  #include <linux/uio.h>
30041 +#include <linux/locallock.h>
30042  #include <linux/hugetlb.h>
30043  #include <linux/page_idle.h>
30044  
30045 @@ -46,6 +47,9 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
30046  static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
30047  static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
30048  
30049 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
30050 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
30051 +
30052  /*
30053   * This path almost never happens for VM activity - pages are normally
30054   * freed via pagevecs.  But it gets used by networking.
30055 @@ -481,11 +485,11 @@ void rotate_reclaimable_page(struct page *page)
30056                 unsigned long flags;
30057  
30058                 page_cache_get(page);
30059 -               local_irq_save(flags);
30060 +               local_lock_irqsave(rotate_lock, flags);
30061                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
30062                 if (!pagevec_add(pvec, page))
30063                         pagevec_move_tail(pvec);
30064 -               local_irq_restore(flags);
30065 +               local_unlock_irqrestore(rotate_lock, flags);
30066         }
30067  }
30068  
30069 @@ -536,12 +540,13 @@ static bool need_activate_page_drain(int cpu)
30070  void activate_page(struct page *page)
30071  {
30072         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
30073 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
30074 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
30075 +                                                      activate_page_pvecs);
30076  
30077                 page_cache_get(page);
30078                 if (!pagevec_add(pvec, page))
30079                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
30080 -               put_cpu_var(activate_page_pvecs);
30081 +               put_locked_var(swapvec_lock, activate_page_pvecs);
30082         }
30083  }
30084  
30085 @@ -567,7 +572,7 @@ void activate_page(struct page *page)
30086  
30087  static void __lru_cache_activate_page(struct page *page)
30088  {
30089 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
30090 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
30091         int i;
30092  
30093         /*
30094 @@ -589,7 +594,7 @@ static void __lru_cache_activate_page(struct page *page)
30095                 }
30096         }
30097  
30098 -       put_cpu_var(lru_add_pvec);
30099 +       put_locked_var(swapvec_lock, lru_add_pvec);
30100  }
30101  
30102  /*
30103 @@ -630,13 +635,13 @@ EXPORT_SYMBOL(mark_page_accessed);
30104  
30105  static void __lru_cache_add(struct page *page)
30106  {
30107 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
30108 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
30109  
30110         page_cache_get(page);
30111         if (!pagevec_space(pvec))
30112                 __pagevec_lru_add(pvec);
30113         pagevec_add(pvec, page);
30114 -       put_cpu_var(lru_add_pvec);
30115 +       put_locked_var(swapvec_lock, lru_add_pvec);
30116  }
30117  
30118  /**
30119 @@ -816,9 +821,15 @@ void lru_add_drain_cpu(int cpu)
30120                 unsigned long flags;
30121  
30122                 /* No harm done if a racing interrupt already did this */
30123 -               local_irq_save(flags);
30124 +#ifdef CONFIG_PREEMPT_RT_BASE
30125 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
30126 +               pagevec_move_tail(pvec);
30127 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
30128 +#else
30129 +               local_lock_irqsave(rotate_lock, flags);
30130                 pagevec_move_tail(pvec);
30131 -               local_irq_restore(flags);
30132 +               local_unlock_irqrestore(rotate_lock, flags);
30133 +#endif
30134         }
30135  
30136         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
30137 @@ -846,26 +857,47 @@ void deactivate_file_page(struct page *page)
30138                 return;
30139  
30140         if (likely(get_page_unless_zero(page))) {
30141 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
30142 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
30143 +                                                      lru_deactivate_file_pvecs);
30144  
30145                 if (!pagevec_add(pvec, page))
30146                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
30147 -               put_cpu_var(lru_deactivate_file_pvecs);
30148 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
30149         }
30150  }
30151  
30152  void lru_add_drain(void)
30153  {
30154 -       lru_add_drain_cpu(get_cpu());
30155 -       put_cpu();
30156 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
30157 +       local_unlock_cpu(swapvec_lock);
30158  }
30159  
30160 +
30161 +#ifdef CONFIG_PREEMPT_RT_BASE
30162 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
30163 +{
30164 +       local_lock_on(swapvec_lock, cpu);
30165 +       lru_add_drain_cpu(cpu);
30166 +       local_unlock_on(swapvec_lock, cpu);
30167 +}
30168 +
30169 +#else
30170 +
30171  static void lru_add_drain_per_cpu(struct work_struct *dummy)
30172  {
30173         lru_add_drain();
30174  }
30175  
30176  static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
30177 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
30178 +{
30179 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
30180 +
30181 +       INIT_WORK(work, lru_add_drain_per_cpu);
30182 +       schedule_work_on(cpu, work);
30183 +       cpumask_set_cpu(cpu, has_work);
30184 +}
30185 +#endif
30186  
30187  void lru_add_drain_all(void)
30188  {
30189 @@ -878,20 +910,17 @@ void lru_add_drain_all(void)
30190         cpumask_clear(&has_work);
30191  
30192         for_each_online_cpu(cpu) {
30193 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
30194 -
30195                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
30196                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
30197                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
30198 -                   need_activate_page_drain(cpu)) {
30199 -                       INIT_WORK(work, lru_add_drain_per_cpu);
30200 -                       schedule_work_on(cpu, work);
30201 -                       cpumask_set_cpu(cpu, &has_work);
30202 -               }
30203 +                   need_activate_page_drain(cpu))
30204 +                       remote_lru_add_drain(cpu, &has_work);
30205         }
30206  
30207 +#ifndef CONFIG_PREEMPT_RT_BASE
30208         for_each_cpu(cpu, &has_work)
30209                 flush_work(&per_cpu(lru_add_drain_work, cpu));
30210 +#endif
30211  
30212         put_online_cpus();
30213         mutex_unlock(&lock);
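Usage sketch, not part of the diff: in swap.c every pagevec that used to be reached through get_cpu_var()/put_cpu_var() is now reached through a local lock, rotate_lock for the rotation path that also runs from interrupt context and swapvec_lock for the LRU-add/activate/deactivate paths (swapvec_lock is non-static, presumably so other files in the series can take it as well). The access pattern, with example_pvecs as a placeholder pagevec name:

        struct pagevec *pvec = &get_locked_var(swapvec_lock, example_pvecs);

        /* !RT: same as get_cpu_var() (preemption off);
         * RT:  per-CPU lock taken, section remains preemptible. */
        if (!pagevec_add(pvec, page))
                __pagevec_lru_add(pvec);
        put_locked_var(swapvec_lock, example_pvecs);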
30214 diff --git a/mm/truncate.c b/mm/truncate.c
30215 index 76e35ad97102..5f196420020c 100644
30216 --- a/mm/truncate.c
30217 +++ b/mm/truncate.c
30218 @@ -56,8 +56,11 @@ static void clear_exceptional_entry(struct address_space *mapping,
30219          * protected by mapping->tree_lock.
30220          */
30221         if (!workingset_node_shadows(node) &&
30222 -           !list_empty(&node->private_list))
30223 -               list_lru_del(&workingset_shadow_nodes, &node->private_list);
30224 +           !list_empty(&node->private_list)) {
30225 +               local_lock(workingset_shadow_lock);
30226 +               list_lru_del(&__workingset_shadow_nodes, &node->private_list);
30227 +               local_unlock(workingset_shadow_lock);
30228 +       }
30229         __radix_tree_delete_node(&mapping->page_tree, node);
30230  unlock:
30231         spin_unlock_irq(&mapping->tree_lock);
30232 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
30233 index 8e3c9c5a3042..68740314ad54 100644
30234 --- a/mm/vmalloc.c
30235 +++ b/mm/vmalloc.c
30236 @@ -821,7 +821,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
30237         struct vmap_block *vb;
30238         struct vmap_area *va;
30239         unsigned long vb_idx;
30240 -       int node, err;
30241 +       int node, err, cpu;
30242         void *vaddr;
30243  
30244         node = numa_node_id();
30245 @@ -864,11 +864,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
30246         BUG_ON(err);
30247         radix_tree_preload_end();
30248  
30249 -       vbq = &get_cpu_var(vmap_block_queue);
30250 +       cpu = get_cpu_light();
30251 +       vbq = this_cpu_ptr(&vmap_block_queue);
30252         spin_lock(&vbq->lock);
30253         list_add_tail_rcu(&vb->free_list, &vbq->free);
30254         spin_unlock(&vbq->lock);
30255 -       put_cpu_var(vmap_block_queue);
30256 +       put_cpu_light();
30257  
30258         return vaddr;
30259  }
30260 @@ -937,6 +938,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
30261         struct vmap_block *vb;
30262         void *vaddr = NULL;
30263         unsigned int order;
30264 +       int cpu;
30265  
30266         BUG_ON(offset_in_page(size));
30267         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
30268 @@ -951,7 +953,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
30269         order = get_order(size);
30270  
30271         rcu_read_lock();
30272 -       vbq = &get_cpu_var(vmap_block_queue);
30273 +       cpu = get_cpu_light();
30274 +       vbq = this_cpu_ptr(&vmap_block_queue);
30275         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
30276                 unsigned long pages_off;
30277  
30278 @@ -974,7 +977,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
30279                 break;
30280         }
30281  
30282 -       put_cpu_var(vmap_block_queue);
30283 +       put_cpu_light();
30284         rcu_read_unlock();
30285  
30286         /* Allocate new block if nothing was found */
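Sketch, not part of the diff: the per-CPU vmap_block_queue is already serialized by its own vbq->lock, so get_cpu_var() was only there to stay on one CPU while picking the queue. get_cpu_light() is the series' replacement that, on RT, is meant to disable migration only rather than preemption, giving the shape:

        get_cpu_light();                /* !RT: get_cpu(); RT: stay on this CPU, stay preemptible */
        vbq = this_cpu_ptr(&vmap_block_queue);

        spin_lock(&vbq->lock);          /* the real serialization */
        list_add_tail_rcu(&vb->free_list, &vbq->free);
        spin_unlock(&vbq->lock);
        put_cpu_light();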
30287 diff --git a/mm/vmstat.c b/mm/vmstat.c
30288 index c54fd2924f25..64416fd7c209 100644
30289 --- a/mm/vmstat.c
30290 +++ b/mm/vmstat.c
30291 @@ -226,6 +226,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
30292         long x;
30293         long t;
30294  
30295 +       preempt_disable_rt();
30296         x = delta + __this_cpu_read(*p);
30297  
30298         t = __this_cpu_read(pcp->stat_threshold);
30299 @@ -235,6 +236,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
30300                 x = 0;
30301         }
30302         __this_cpu_write(*p, x);
30303 +       preempt_enable_rt();
30304  }
30305  EXPORT_SYMBOL(__mod_zone_page_state);
30306  
30307 @@ -267,6 +269,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
30308         s8 __percpu *p = pcp->vm_stat_diff + item;
30309         s8 v, t;
30310  
30311 +       preempt_disable_rt();
30312         v = __this_cpu_inc_return(*p);
30313         t = __this_cpu_read(pcp->stat_threshold);
30314         if (unlikely(v > t)) {
30315 @@ -275,6 +278,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
30316                 zone_page_state_add(v + overstep, zone, item);
30317                 __this_cpu_write(*p, -overstep);
30318         }
30319 +       preempt_enable_rt();
30320  }
30321  
30322  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
30323 @@ -289,6 +293,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
30324         s8 __percpu *p = pcp->vm_stat_diff + item;
30325         s8 v, t;
30326  
30327 +       preempt_disable_rt();
30328         v = __this_cpu_dec_return(*p);
30329         t = __this_cpu_read(pcp->stat_threshold);
30330         if (unlikely(v < - t)) {
30331 @@ -297,6 +302,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
30332                 zone_page_state_add(v - overstep, zone, item);
30333                 __this_cpu_write(*p, overstep);
30334         }
30335 +       preempt_enable_rt();
30336  }
30337  
30338  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
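Annotated excerpt, not part of the diff: the vmstat counter updates are read-modify-write sequences on __this_cpu data; mainline relies on callers already having preemption or interrupts off, which RT no longer guarantees, so each update is bracketed with preempt_disable_rt()/preempt_enable_rt() (a no-op in non-RT builds, see the sketch after the mmu_context.c hunk). The window that must stay on one CPU in __mod_zone_page_state() is essentially:

        preempt_disable_rt();                           /* no-op unless RT            */
        x = delta + __this_cpu_read(*p);                /* read ...                   */
        t = __this_cpu_read(pcp->stat_threshold);
        /* ... threshold check folds x into the zone-wide counter if exceeded ... */
        __this_cpu_write(*p, x);                        /* ... write back on same CPU */
        preempt_enable_rt();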
30339 diff --git a/mm/workingset.c b/mm/workingset.c
30340 index df66f426fdcf..6db7b243fa0d 100644
30341 --- a/mm/workingset.c
30342 +++ b/mm/workingset.c
30343 @@ -264,7 +264,8 @@ void workingset_activation(struct page *page)
30344   * point where they would still be useful.
30345   */
30346  
30347 -struct list_lru workingset_shadow_nodes;
30348 +struct list_lru __workingset_shadow_nodes;
30349 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
30350  
30351  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
30352                                         struct shrink_control *sc)
30353 @@ -274,9 +275,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
30354         unsigned long pages;
30355  
30356         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
30357 -       local_irq_disable();
30358 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
30359 -       local_irq_enable();
30360 +       local_lock_irq(workingset_shadow_lock);
30361 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
30362 +       local_unlock_irq(workingset_shadow_lock);
30363  
30364         pages = node_present_pages(sc->nid);
30365         /*
30366 @@ -361,9 +362,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
30367         spin_unlock(&mapping->tree_lock);
30368         ret = LRU_REMOVED_RETRY;
30369  out:
30370 -       local_irq_enable();
30371 +       local_unlock_irq(workingset_shadow_lock);
30372         cond_resched();
30373 -       local_irq_disable();
30374 +       local_lock_irq(workingset_shadow_lock);
30375         spin_lock(lru_lock);
30376         return ret;
30377  }
30378 @@ -374,10 +375,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
30379         unsigned long ret;
30380  
30381         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
30382 -       local_irq_disable();
30383 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
30384 +       local_lock_irq(workingset_shadow_lock);
30385 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
30386                                     shadow_lru_isolate, NULL);
30387 -       local_irq_enable();
30388 +       local_unlock_irq(workingset_shadow_lock);
30389         return ret;
30390  }
30391  
30392 @@ -398,7 +399,7 @@ static int __init workingset_init(void)
30393  {
30394         int ret;
30395  
30396 -       ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
30397 +       ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
30398         if (ret)
30399                 goto err;
30400         ret = register_shrinker(&workingset_shadow_shrinker);
30401 @@ -406,7 +407,7 @@ static int __init workingset_init(void)
30402                 goto err_list_lru;
30403         return 0;
30404  err_list_lru:
30405 -       list_lru_destroy(&workingset_shadow_nodes);
30406 +       list_lru_destroy(&__workingset_shadow_nodes);
30407  err:
30408         return ret;
30409  }
30410 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
30411 index c1ea19478119..529552c3716d 100644
30412 --- a/mm/zsmalloc.c
30413 +++ b/mm/zsmalloc.c
30414 @@ -64,6 +64,7 @@
30415  #include <linux/debugfs.h>
30416  #include <linux/zsmalloc.h>
30417  #include <linux/zpool.h>
30418 +#include <linux/locallock.h>
30419  
30420  /*
30421   * This must be power of 2 and greater than of equal to sizeof(link_free).
30422 @@ -403,6 +404,7 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
30423  
30424  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
30425  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
30426 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
30427  
30428  static int is_first_page(struct page *page)
30429  {
30430 @@ -1289,7 +1291,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
30431         class = pool->size_class[class_idx];
30432         off = obj_idx_to_offset(page, obj_idx, class->size);
30433  
30434 -       area = &get_cpu_var(zs_map_area);
30435 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
30436         area->vm_mm = mm;
30437         if (off + class->size <= PAGE_SIZE) {
30438                 /* this object is contained entirely within a page */
30439 @@ -1342,7 +1344,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
30440  
30441                 __zs_unmap_object(area, pages, off, class->size);
30442         }
30443 -       put_cpu_var(zs_map_area);
30444 +       put_locked_var(zs_map_area_lock, zs_map_area);
30445         unpin_tag(handle);
30446  }
30447  EXPORT_SYMBOL_GPL(zs_unmap_object);
30448 diff --git a/net/core/dev.c b/net/core/dev.c
30449 index 0989fea88c44..4d5f550f01f5 100644
30450 --- a/net/core/dev.c
30451 +++ b/net/core/dev.c
30452 @@ -186,6 +186,7 @@ static unsigned int napi_gen_id;
30453  static DEFINE_HASHTABLE(napi_hash, 8);
30454  
30455  static seqcount_t devnet_rename_seq;
30456 +static DEFINE_MUTEX(devnet_rename_mutex);
30457  
30458  static inline void dev_base_seq_inc(struct net *net)
30459  {
30460 @@ -207,14 +208,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
30461  static inline void rps_lock(struct softnet_data *sd)
30462  {
30463  #ifdef CONFIG_RPS
30464 -       spin_lock(&sd->input_pkt_queue.lock);
30465 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
30466  #endif
30467  }
30468  
30469  static inline void rps_unlock(struct softnet_data *sd)
30470  {
30471  #ifdef CONFIG_RPS
30472 -       spin_unlock(&sd->input_pkt_queue.lock);
30473 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
30474  #endif
30475  }
30476  
30477 @@ -884,7 +885,8 @@ retry:
30478         strcpy(name, dev->name);
30479         rcu_read_unlock();
30480         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
30481 -               cond_resched();
30482 +               mutex_lock(&devnet_rename_mutex);
30483 +               mutex_unlock(&devnet_rename_mutex);
30484                 goto retry;
30485         }
30486  
30487 @@ -1153,20 +1155,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
30488         if (dev->flags & IFF_UP)
30489                 return -EBUSY;
30490  
30491 -       write_seqcount_begin(&devnet_rename_seq);
30492 +       mutex_lock(&devnet_rename_mutex);
30493 +       __raw_write_seqcount_begin(&devnet_rename_seq);
30494  
30495 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
30496 -               write_seqcount_end(&devnet_rename_seq);
30497 -               return 0;
30498 -       }
30499 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
30500 +               goto outunlock;
30501  
30502         memcpy(oldname, dev->name, IFNAMSIZ);
30503  
30504         err = dev_get_valid_name(net, dev, newname);
30505 -       if (err < 0) {
30506 -               write_seqcount_end(&devnet_rename_seq);
30507 -               return err;
30508 -       }
30509 +       if (err < 0)
30510 +               goto outunlock;
30511  
30512         if (oldname[0] && !strchr(oldname, '%'))
30513                 netdev_info(dev, "renamed from %s\n", oldname);
30514 @@ -1179,11 +1178,12 @@ rollback:
30515         if (ret) {
30516                 memcpy(dev->name, oldname, IFNAMSIZ);
30517                 dev->name_assign_type = old_assign_type;
30518 -               write_seqcount_end(&devnet_rename_seq);
30519 -               return ret;
30520 +               err = ret;
30521 +               goto outunlock;
30522         }
30523  
30524 -       write_seqcount_end(&devnet_rename_seq);
30525 +       __raw_write_seqcount_end(&devnet_rename_seq);
30526 +       mutex_unlock(&devnet_rename_mutex);
30527  
30528         netdev_adjacent_rename_links(dev, oldname);
30529  
30530 @@ -1204,7 +1204,8 @@ rollback:
30531                 /* err >= 0 after dev_alloc_name() or stores the first errno */
30532                 if (err >= 0) {
30533                         err = ret;
30534 -                       write_seqcount_begin(&devnet_rename_seq);
30535 +                       mutex_lock(&devnet_rename_mutex);
30536 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
30537                         memcpy(dev->name, oldname, IFNAMSIZ);
30538                         memcpy(oldname, newname, IFNAMSIZ);
30539                         dev->name_assign_type = old_assign_type;
30540 @@ -1217,6 +1218,11 @@ rollback:
30541         }
30542  
30543         return err;
30544 +
30545 +outunlock:
30546 +       __raw_write_seqcount_end(&devnet_rename_seq);
30547 +       mutex_unlock(&devnet_rename_mutex);
30548 +       return err;
30549  }
30550  
30551  /**
30552 @@ -2246,6 +2252,7 @@ static inline void __netif_reschedule(struct Qdisc *q)
30553         sd->output_queue_tailp = &q->next_sched;
30554         raise_softirq_irqoff(NET_TX_SOFTIRQ);
30555         local_irq_restore(flags);
30556 +       preempt_check_resched_rt();
30557  }
30558  
30559  void __netif_schedule(struct Qdisc *q)
30560 @@ -2327,6 +2334,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
30561         __this_cpu_write(softnet_data.completion_queue, skb);
30562         raise_softirq_irqoff(NET_TX_SOFTIRQ);
30563         local_irq_restore(flags);
30564 +       preempt_check_resched_rt();
30565  }
30566  EXPORT_SYMBOL(__dev_kfree_skb_irq);
30567  
30568 @@ -2883,7 +2891,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
30569          * This permits __QDISC___STATE_RUNNING owner to get the lock more
30570          * often and dequeue packets faster.
30571          */
30572 +#ifdef CONFIG_PREEMPT_RT_FULL
30573 +       contended = true;
30574 +#else
30575         contended = qdisc_is_running(q);
30576 +#endif
30577         if (unlikely(contended))
30578                 spin_lock(&q->busylock);
30579  
30580 @@ -2943,9 +2955,44 @@ static void skb_update_prio(struct sk_buff *skb)
30581  #define skb_update_prio(skb)
30582  #endif
30583  
30584 +#ifdef CONFIG_PREEMPT_RT_FULL
30585 +
30586 +static inline int xmit_rec_read(void)
30587 +{
30588 +       return current->xmit_recursion;
30589 +}
30590 +
30591 +static inline void xmit_rec_inc(void)
30592 +{
30593 +       current->xmit_recursion++;
30594 +}
30595 +
30596 +static inline void xmit_rec_dec(void)
30597 +{
30598 +       current->xmit_recursion--;
30599 +}
30600 +
30601 +#else
30602 +
30603  DEFINE_PER_CPU(int, xmit_recursion);
30604  EXPORT_SYMBOL(xmit_recursion);
30605  
30606 +static inline int xmit_rec_read(void)
30607 +{
30608 +       return __this_cpu_read(xmit_recursion);
30609 +}
30610 +
30611 +static inline void xmit_rec_inc(void)
30612 +{
30613 +       __this_cpu_inc(xmit_recursion);
30614 +}
30615 +
30616 +static inline void xmit_rec_dec(void)
30617 +{
30618 +       __this_cpu_dec(xmit_recursion);
30619 +}
30620 +#endif
30621 +
30622  #define RECURSION_LIMIT 10
30623  
30624  /**
30625 @@ -3138,7 +3185,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
30626  
30627                 if (txq->xmit_lock_owner != cpu) {
30628  
30629 -                       if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
30630 +                       if (xmit_rec_read() > RECURSION_LIMIT)
30631                                 goto recursion_alert;
30632  
30633                         skb = validate_xmit_skb(skb, dev);
30634 @@ -3148,9 +3195,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
30635                         HARD_TX_LOCK(dev, txq, cpu);
30636  
30637                         if (!netif_xmit_stopped(txq)) {
30638 -                               __this_cpu_inc(xmit_recursion);
30639 +                               xmit_rec_inc();
30640                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
30641 -                               __this_cpu_dec(xmit_recursion);
30642 +                               xmit_rec_dec();
30643                                 if (dev_xmit_complete(rc)) {
30644                                         HARD_TX_UNLOCK(dev, txq);
30645                                         goto out;
30646 @@ -3524,6 +3571,7 @@ drop:
30647         rps_unlock(sd);
30648  
30649         local_irq_restore(flags);
30650 +       preempt_check_resched_rt();
30651  
30652         atomic_long_inc(&skb->dev->rx_dropped);
30653         kfree_skb(skb);
30654 @@ -3542,7 +3590,7 @@ static int netif_rx_internal(struct sk_buff *skb)
30655                 struct rps_dev_flow voidflow, *rflow = &voidflow;
30656                 int cpu;
30657  
30658 -               preempt_disable();
30659 +               migrate_disable();
30660                 rcu_read_lock();
30661  
30662                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
30663 @@ -3552,13 +3600,13 @@ static int netif_rx_internal(struct sk_buff *skb)
30664                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
30665  
30666                 rcu_read_unlock();
30667 -               preempt_enable();
30668 +               migrate_enable();
30669         } else
30670  #endif
30671         {
30672                 unsigned int qtail;
30673 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
30674 -               put_cpu();
30675 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
30676 +               put_cpu_light();
30677         }
30678         return ret;
30679  }
30680 @@ -3592,16 +3640,44 @@ int netif_rx_ni(struct sk_buff *skb)
30681  
30682         trace_netif_rx_ni_entry(skb);
30683  
30684 -       preempt_disable();
30685 +       local_bh_disable();
30686         err = netif_rx_internal(skb);
30687 -       if (local_softirq_pending())
30688 -               do_softirq();
30689 -       preempt_enable();
30690 +       local_bh_enable();
30691  
30692         return err;
30693  }
30694  EXPORT_SYMBOL(netif_rx_ni);
30695  
30696 +#ifdef CONFIG_PREEMPT_RT_FULL
30697 +/*
30698 + * RT runs ksoftirqd as a real time thread and the root_lock is a
30699 + * "sleeping spinlock". If the trylock fails then we can go into an
30700 + * infinite loop when ksoftirqd preempted the task which actually
30701 + * holds the lock, because we requeue q and raise NET_TX softirq
30702 + * causing ksoftirqd to loop forever.
30703 + *
30704 + * It's safe to use spin_lock on RT here as softirqs run in thread
30705 + * context and cannot deadlock against the thread which is holding
30706 + * root_lock.
30707 + *
30708 + * On !RT the trylock might fail, but there we bail out from the
30709 + * softirq loop after 10 attempts which we can't do on RT. And the
30710 + * task holding root_lock cannot be preempted, so the only downside of
30711 + * that trylock is that we need 10 loops to decide that we should have
30712 + * given up in the first one :)
30713 + */
30714 +static inline int take_root_lock(spinlock_t *lock)
30715 +{
30716 +       spin_lock(lock);
30717 +       return 1;
30718 +}
30719 +#else
30720 +static inline int take_root_lock(spinlock_t *lock)
30721 +{
30722 +       return spin_trylock(lock);
30723 +}
30724 +#endif
30725 +
30726  static void net_tx_action(struct softirq_action *h)
30727  {
30728         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
30729 @@ -3643,7 +3719,7 @@ static void net_tx_action(struct softirq_action *h)
30730                         head = head->next_sched;
30731  
30732                         root_lock = qdisc_lock(q);
30733 -                       if (spin_trylock(root_lock)) {
30734 +                       if (take_root_lock(root_lock)) {
30735                                 smp_mb__before_atomic();
30736                                 clear_bit(__QDISC_STATE_SCHED,
30737                                           &q->state);
30738 @@ -4065,7 +4141,7 @@ static void flush_backlog(void *arg)
30739         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
30740                 if (skb->dev == dev) {
30741                         __skb_unlink(skb, &sd->input_pkt_queue);
30742 -                       kfree_skb(skb);
30743 +                       __skb_queue_tail(&sd->tofree_queue, skb);
30744                         input_queue_head_incr(sd);
30745                 }
30746         }
30747 @@ -4074,10 +4150,13 @@ static void flush_backlog(void *arg)
30748         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
30749                 if (skb->dev == dev) {
30750                         __skb_unlink(skb, &sd->process_queue);
30751 -                       kfree_skb(skb);
30752 +                       __skb_queue_tail(&sd->tofree_queue, skb);
30753                         input_queue_head_incr(sd);
30754                 }
30755         }
30756 +
30757 +       if (!skb_queue_empty(&sd->tofree_queue))
30758 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
30759  }
30760  
30761  static int napi_gro_complete(struct sk_buff *skb)
30762 @@ -4531,6 +4610,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
30763                 sd->rps_ipi_list = NULL;
30764  
30765                 local_irq_enable();
30766 +               preempt_check_resched_rt();
30767  
30768                 /* Send pending IPI's to kick RPS processing on remote cpus. */
30769                 while (remsd) {
30770 @@ -4544,6 +4624,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
30771         } else
30772  #endif
30773                 local_irq_enable();
30774 +       preempt_check_resched_rt();
30775  }
30776  
30777  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
30778 @@ -4625,6 +4706,7 @@ void __napi_schedule(struct napi_struct *n)
30779         local_irq_save(flags);
30780         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
30781         local_irq_restore(flags);
30782 +       preempt_check_resched_rt();
30783  }
30784  EXPORT_SYMBOL(__napi_schedule);
30785  
30786 @@ -4901,7 +4983,7 @@ static void net_rx_action(struct softirq_action *h)
30787         list_splice_tail(&repoll, &list);
30788         list_splice(&list, &sd->poll_list);
30789         if (!list_empty(&sd->poll_list))
30790 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
30791 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
30792  
30793         net_rps_action_and_irq_enable(sd);
30794  }
30795 @@ -7234,7 +7316,7 @@ EXPORT_SYMBOL(free_netdev);
30796  void synchronize_net(void)
30797  {
30798         might_sleep();
30799 -       if (rtnl_is_locked())
30800 +       if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
30801                 synchronize_rcu_expedited();
30802         else
30803                 synchronize_rcu();
30804 @@ -7475,16 +7557,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
30805  
30806         raise_softirq_irqoff(NET_TX_SOFTIRQ);
30807         local_irq_enable();
30808 +       preempt_check_resched_rt();
30809  
30810         /* Process offline CPU's input_pkt_queue */
30811         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
30812                 netif_rx_ni(skb);
30813                 input_queue_head_incr(oldsd);
30814         }
30815 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
30816 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
30817                 netif_rx_ni(skb);
30818                 input_queue_head_incr(oldsd);
30819         }
30820 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
30821 +               kfree_skb(skb);
30822 +       }
30823  
30824         return NOTIFY_OK;
30825  }
30826 @@ -7786,8 +7872,9 @@ static int __init net_dev_init(void)
30827         for_each_possible_cpu(i) {
30828                 struct softnet_data *sd = &per_cpu(softnet_data, i);
30829  
30830 -               skb_queue_head_init(&sd->input_pkt_queue);
30831 -               skb_queue_head_init(&sd->process_queue);
30832 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
30833 +               skb_queue_head_init_raw(&sd->process_queue);
30834 +               skb_queue_head_init_raw(&sd->tofree_queue);
30835                 INIT_LIST_HEAD(&sd->poll_list);
30836                 sd->output_queue_tailp = &sd->output_queue;
30837  #ifdef CONFIG_RPS
30838 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
30839 index 4968b5ddea69..c8d778f405dc 100644
30840 --- a/net/core/skbuff.c
30841 +++ b/net/core/skbuff.c
30842 @@ -63,6 +63,7 @@
30843  #include <linux/errqueue.h>
30844  #include <linux/prefetch.h>
30845  #include <linux/if_vlan.h>
30846 +#include <linux/locallock.h>
30847  
30848  #include <net/protocol.h>
30849  #include <net/dst.h>
30850 @@ -351,6 +352,8 @@ EXPORT_SYMBOL(build_skb);
30851  
30852  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
30853  static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
30854 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
30855 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
30856  
30857  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
30858  {
30859 @@ -358,10 +361,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
30860         unsigned long flags;
30861         void *data;
30862  
30863 -       local_irq_save(flags);
30864 +       local_lock_irqsave(netdev_alloc_lock, flags);
30865         nc = this_cpu_ptr(&netdev_alloc_cache);
30866         data = __alloc_page_frag(nc, fragsz, gfp_mask);
30867 -       local_irq_restore(flags);
30868 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
30869         return data;
30870  }
30871  
30872 @@ -380,9 +383,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
30873  
30874  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
30875  {
30876 -       struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
30877 +       struct page_frag_cache *nc;
30878 +       void *data;
30879  
30880 -       return __alloc_page_frag(nc, fragsz, gfp_mask);
30881 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30882 +       data = __alloc_page_frag(nc, fragsz, gfp_mask);
30883 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30884 +       return data;
30885  }
30886  
30887  void *napi_alloc_frag(unsigned int fragsz)
30888 @@ -429,13 +436,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
30889         if (sk_memalloc_socks())
30890                 gfp_mask |= __GFP_MEMALLOC;
30891  
30892 -       local_irq_save(flags);
30893 +       local_lock_irqsave(netdev_alloc_lock, flags);
30894  
30895         nc = this_cpu_ptr(&netdev_alloc_cache);
30896         data = __alloc_page_frag(nc, len, gfp_mask);
30897         pfmemalloc = nc->pfmemalloc;
30898  
30899 -       local_irq_restore(flags);
30900 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
30901  
30902         if (unlikely(!data))
30903                 return NULL;
30904 @@ -476,9 +483,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
30905  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
30906                                  gfp_t gfp_mask)
30907  {
30908 -       struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
30909 +       struct page_frag_cache *nc;
30910         struct sk_buff *skb;
30911         void *data;
30912 +       bool pfmemalloc;
30913  
30914         len += NET_SKB_PAD + NET_IP_ALIGN;
30915  
30916 @@ -496,7 +504,11 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
30917         if (sk_memalloc_socks())
30918                 gfp_mask |= __GFP_MEMALLOC;
30919  
30920 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30921         data = __alloc_page_frag(nc, len, gfp_mask);
30922 +       pfmemalloc = nc->pfmemalloc;
30923 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
30924 +
30925         if (unlikely(!data))
30926                 return NULL;
30927  
30928 @@ -507,7 +519,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
30929         }
30930  
30931         /* use OR instead of assignment to avoid clearing of bits in mask */
30932 -       if (nc->pfmemalloc)
30933 +       if (pfmemalloc)
30934                 skb->pfmemalloc = 1;
30935         skb->head_frag = 1;
30936  
30937 diff --git a/net/core/sock.c b/net/core/sock.c
30938 index 0d91f7dca751..9c3234299fc3 100644
30939 --- a/net/core/sock.c
30940 +++ b/net/core/sock.c
30941 @@ -2435,12 +2435,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
30942         if (sk->sk_lock.owned)
30943                 __lock_sock(sk);
30944         sk->sk_lock.owned = 1;
30945 -       spin_unlock(&sk->sk_lock.slock);
30946 +       spin_unlock_bh(&sk->sk_lock.slock);
30947         /*
30948          * The sk_lock has mutex_lock() semantics here:
30949          */
30950         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
30951 -       local_bh_enable();
30952  }
30953  EXPORT_SYMBOL(lock_sock_nested);
30954  
30955 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
30956 index 36e26977c908..ff2593269089 100644
30957 --- a/net/ipv4/icmp.c
30958 +++ b/net/ipv4/icmp.c
30959 @@ -69,6 +69,7 @@
30960  #include <linux/jiffies.h>
30961  #include <linux/kernel.h>
30962  #include <linux/fcntl.h>
30963 +#include <linux/sysrq.h>
30964  #include <linux/socket.h>
30965  #include <linux/in.h>
30966  #include <linux/inet.h>
30967 @@ -77,6 +78,7 @@
30968  #include <linux/string.h>
30969  #include <linux/netfilter_ipv4.h>
30970  #include <linux/slab.h>
30971 +#include <linux/locallock.h>
30972  #include <net/snmp.h>
30973  #include <net/ip.h>
30974  #include <net/route.h>
30975 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
30976   *
30977   *     On SMP we have one ICMP socket per-cpu.
30978   */
30979 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
30980 +
30981  static struct sock *icmp_sk(struct net *net)
30982  {
30983         return *this_cpu_ptr(net->ipv4.icmp_sk);
30984 @@ -215,12 +219,14 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
30985  
30986         local_bh_disable();
30987  
30988 +       local_lock(icmp_sk_lock);
30989         sk = icmp_sk(net);
30990  
30991         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
30992                 /* This can happen if the output path signals a
30993                  * dst_link_failure() for an outgoing ICMP packet.
30994                  */
30995 +               local_unlock(icmp_sk_lock);
30996                 local_bh_enable();
30997                 return NULL;
30998         }
30999 @@ -230,6 +236,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
31000  static inline void icmp_xmit_unlock(struct sock *sk)
31001  {
31002         spin_unlock_bh(&sk->sk_lock.slock);
31003 +       local_unlock(icmp_sk_lock);
31004  }
31005  
31006  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
31007 @@ -358,6 +365,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
31008         struct sock *sk;
31009         struct sk_buff *skb;
31010  
31011 +       local_lock(icmp_sk_lock);
31012         sk = icmp_sk(dev_net((*rt)->dst.dev));
31013         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
31014                            icmp_param->data_len+icmp_param->head_len,
31015 @@ -380,6 +388,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
31016                 skb->ip_summed = CHECKSUM_NONE;
31017                 ip_push_pending_frames(sk, fl4);
31018         }
31019 +       local_unlock(icmp_sk_lock);
31020  }
31021  
31022  /*
31023 @@ -891,6 +900,30 @@ static bool icmp_redirect(struct sk_buff *skb)
31024  }
31025  
31026  /*
31027 + * 32bit and 64bit have different timestamp length, so we check for
31028 + * the cookie at offset 20 and verify it is repeated at offset 50
31029 + */
31030 +#define CO_POS0                20
31031 +#define CO_POS1                50
31032 +#define CO_SIZE                sizeof(int)
31033 +#define ICMP_SYSRQ_SIZE        57
31034 +
31035 +/*
31036 + * We got a ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
31037 + * pattern and if it matches send the next byte as a trigger to sysrq.
31038 + */
31039 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
31040 +{
31041 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
31042 +       char *p = skb->data;
31043 +
31044 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
31045 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
31046 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
31047 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
31048 +}
31049 +
31050 +/*
31051   *     Handle ICMP_ECHO ("ping") requests.
31052   *
31053   *     RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
31054 @@ -917,6 +950,11 @@ static bool icmp_echo(struct sk_buff *skb)
31055                 icmp_param.data_len        = skb->len;
31056                 icmp_param.head_len        = sizeof(struct icmphdr);
31057                 icmp_reply(&icmp_param, skb);
31058 +
31059 +               if (skb->len == ICMP_SYSRQ_SIZE &&
31060 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
31061 +                       icmp_check_sysrq(net, skb);
31062 +               }
31063         }
31064         /* should there be an ICMP stat for ignored echos? */
31065         return true;
31066 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
31067 index a0bd7a55193e..1866f910263f 100644
31068 --- a/net/ipv4/sysctl_net_ipv4.c
31069 +++ b/net/ipv4/sysctl_net_ipv4.c
31070 @@ -818,6 +818,13 @@ static struct ctl_table ipv4_net_table[] = {
31071                 .proc_handler   = proc_dointvec
31072         },
31073         {
31074 +               .procname       = "icmp_echo_sysrq",
31075 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
31076 +               .maxlen         = sizeof(int),
31077 +               .mode           = 0644,
31078 +               .proc_handler   = proc_dointvec
31079 +       },
31080 +       {
31081                 .procname       = "icmp_ignore_bogus_error_responses",
31082                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
31083                 .maxlen         = sizeof(int),
31084 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
31085 index b5853cac3269..de922d86ba2c 100644
31086 --- a/net/ipv4/tcp_ipv4.c
31087 +++ b/net/ipv4/tcp_ipv4.c
31088 @@ -62,6 +62,7 @@
31089  #include <linux/init.h>
31090  #include <linux/times.h>
31091  #include <linux/slab.h>
31092 +#include <linux/locallock.h>
31093  
31094  #include <net/net_namespace.h>
31095  #include <net/icmp.h>
31096 @@ -566,6 +567,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
31097  }
31098  EXPORT_SYMBOL(tcp_v4_send_check);
31099  
31100 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
31101  /*
31102   *     This routine will send an RST to the other tcp.
31103   *
31104 @@ -687,10 +689,13 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
31105                 arg.bound_dev_if = sk->sk_bound_dev_if;
31106  
31107         arg.tos = ip_hdr(skb)->tos;
31108 +
31109 +       local_lock(tcp_sk_lock);
31110         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
31111                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
31112                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
31113                               &arg, arg.iov[0].iov_len);
31114 +       local_unlock(tcp_sk_lock);
31115  
31116         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
31117         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
31118 @@ -772,10 +777,12 @@ static void tcp_v4_send_ack(struct net *net,
31119         if (oif)
31120                 arg.bound_dev_if = oif;
31121         arg.tos = tos;
31122 +       local_lock(tcp_sk_lock);
31123         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
31124                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
31125                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
31126                               &arg, arg.iov[0].iov_len);
31127 +       local_unlock(tcp_sk_lock);
31128  
31129         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
31130  }
31131 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
31132 index a3bb8f7f5fc5..3be977394a80 100644
31133 --- a/net/mac80211/rx.c
31134 +++ b/net/mac80211/rx.c
31135 @@ -3574,7 +3574,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb,
31136         struct ieee80211_supported_band *sband;
31137         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
31138  
31139 -       WARN_ON_ONCE(softirq_count() == 0);
31140 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
31141  
31142         if (WARN_ON(status->band >= IEEE80211_NUM_BANDS))
31143                 goto drop;
31144 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
31145 index f39276d1c2d7..10880c89d62f 100644
31146 --- a/net/netfilter/core.c
31147 +++ b/net/netfilter/core.c
31148 @@ -22,11 +22,17 @@
31149  #include <linux/proc_fs.h>
31150  #include <linux/mutex.h>
31151  #include <linux/slab.h>
31152 +#include <linux/locallock.h>
31153  #include <net/net_namespace.h>
31154  #include <net/sock.h>
31155  
31156  #include "nf_internals.h"
31157  
31158 +#ifdef CONFIG_PREEMPT_RT_BASE
31159 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
31160 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
31161 +#endif
31162 +
31163  static DEFINE_MUTEX(afinfo_mutex);
31164  
31165  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
31166 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
31167 index a86f26d05bc2..77276e3ff2a2 100644
31168 --- a/net/packet/af_packet.c
31169 +++ b/net/packet/af_packet.c
31170 @@ -63,6 +63,7 @@
31171  #include <linux/if_packet.h>
31172  #include <linux/wireless.h>
31173  #include <linux/kernel.h>
31174 +#include <linux/delay.h>
31175  #include <linux/kmod.h>
31176  #include <linux/slab.h>
31177  #include <linux/vmalloc.h>
31178 @@ -695,7 +696,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
31179         if (BLOCK_NUM_PKTS(pbd)) {
31180                 while (atomic_read(&pkc->blk_fill_in_prog)) {
31181                         /* Waiting for skb_copy_bits to finish... */
31182 -                       cpu_relax();
31183 +                       cpu_chill();
31184                 }
31185         }
31186  
31187 @@ -957,7 +958,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
31188                 if (!(status & TP_STATUS_BLK_TMO)) {
31189                         while (atomic_read(&pkc->blk_fill_in_prog)) {
31190                                 /* Waiting for skb_copy_bits to finish... */
31191 -                               cpu_relax();
31192 +                               cpu_chill();
31193                         }
31194                 }
31195                 prb_close_block(pkc, pbd, po, status);
31196 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
31197 index a2340748ec86..19123a97b354 100644
31198 --- a/net/rds/ib_rdma.c
31199 +++ b/net/rds/ib_rdma.c
31200 @@ -34,6 +34,7 @@
31201  #include <linux/slab.h>
31202  #include <linux/rculist.h>
31203  #include <linux/llist.h>
31204 +#include <linux/delay.h>
31205  
31206  #include "rds.h"
31207  #include "ib.h"
31208 @@ -313,7 +314,7 @@ static inline void wait_clean_list_grace(void)
31209         for_each_online_cpu(cpu) {
31210                 flag = &per_cpu(clean_list_grace, cpu);
31211                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
31212 -                       cpu_relax();
31213 +                       cpu_chill();
31214         }
31215  }
31216  
31217 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
31218 index aa4725038f94..00b81cab28f3 100644
31219 --- a/net/sched/sch_generic.c
31220 +++ b/net/sched/sch_generic.c
31221 @@ -893,7 +893,7 @@ void dev_deactivate_many(struct list_head *head)
31222         /* Wait for outstanding qdisc_run calls. */
31223         list_for_each_entry(dev, head, close_list)
31224                 while (some_qdisc_is_busy(dev))
31225 -                       yield();
31226 +                       msleep(1);
31227  }
31228  
31229  void dev_deactivate(struct net_device *dev)
31230 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
31231 index a6cbb2104667..5b69bb580617 100644
31232 --- a/net/sunrpc/svc_xprt.c
31233 +++ b/net/sunrpc/svc_xprt.c
31234 @@ -340,7 +340,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
31235                 goto out;
31236         }
31237  
31238 -       cpu = get_cpu();
31239 +       cpu = get_cpu_light();
31240         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
31241  
31242         atomic_long_inc(&pool->sp_stats.packets);
31243 @@ -376,7 +376,7 @@ redo_search:
31244  
31245                 atomic_long_inc(&pool->sp_stats.threads_woken);
31246                 wake_up_process(rqstp->rq_task);
31247 -               put_cpu();
31248 +               put_cpu_light();
31249                 goto out;
31250         }
31251         rcu_read_unlock();
31252 @@ -397,7 +397,7 @@ redo_search:
31253                 goto redo_search;
31254         }
31255         rqstp = NULL;
31256 -       put_cpu();
31257 +       put_cpu_light();
31258  out:
31259         trace_svc_xprt_do_enqueue(xprt, rqstp);
31260  }
31261 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
31262 index 6fdc97ef6023..523e0420d7f0 100755
31263 --- a/scripts/mkcompile_h
31264 +++ b/scripts/mkcompile_h
31265 @@ -4,7 +4,8 @@ TARGET=$1
31266  ARCH=$2
31267  SMP=$3
31268  PREEMPT=$4
31269 -CC=$5
31270 +RT=$5
31271 +CC=$6
31272  
31273  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
31274  
31275 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
31276  CONFIG_FLAGS=""
31277  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
31278  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
31279 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
31280  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
31281  
31282  # Truncate to maximum length
31283 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
31284 index 4ba64fd49759..34e50186885d 100644
31285 --- a/sound/core/pcm_native.c
31286 +++ b/sound/core/pcm_native.c
31287 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
31288  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
31289  {
31290         if (!substream->pcm->nonatomic)
31291 -               local_irq_disable();
31292 +               local_irq_disable_nort();
31293         snd_pcm_stream_lock(substream);
31294  }
31295  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
31296 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
31297  {
31298         snd_pcm_stream_unlock(substream);
31299         if (!substream->pcm->nonatomic)
31300 -               local_irq_enable();
31301 +               local_irq_enable_nort();
31302  }
31303  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
31304  
31305 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
31306  {
31307         unsigned long flags = 0;
31308         if (!substream->pcm->nonatomic)
31309 -               local_irq_save(flags);
31310 +               local_irq_save_nort(flags);
31311         snd_pcm_stream_lock(substream);
31312         return flags;
31313  }
31314 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
31315  {
31316         snd_pcm_stream_unlock(substream);
31317         if (!substream->pcm->nonatomic)
31318 -               local_irq_restore(flags);
31319 +               local_irq_restore_nort(flags);
31320  }
31321  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
31322  
31323 diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
31324 index 4f70d12e392d..9378d0919ed8 100644
31325 --- a/virt/kvm/async_pf.c
31326 +++ b/virt/kvm/async_pf.c
31327 @@ -98,8 +98,8 @@ static void async_pf_execute(struct work_struct *work)
31328          * This memory barrier pairs with prepare_to_wait's set_current_state()
31329          */
31330         smp_mb();
31331 -       if (waitqueue_active(&vcpu->wq))
31332 -               wake_up_interruptible(&vcpu->wq);
31333 +       if (swait_active(&vcpu->wq))
31334 +               swake_up(&vcpu->wq);
31335  
31336         mmput(mm);
31337         kvm_put_kvm(vcpu->kvm);
31338 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
31339 index 336ed267c407..7748ca386e60 100644
31340 --- a/virt/kvm/kvm_main.c
31341 +++ b/virt/kvm/kvm_main.c
31342 @@ -228,8 +228,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
31343         vcpu->kvm = kvm;
31344         vcpu->vcpu_id = id;
31345         vcpu->pid = NULL;
31346 -       vcpu->halt_poll_ns = 0;
31347 -       init_waitqueue_head(&vcpu->wq);
31348 +       init_swait_queue_head(&vcpu->wq);
31349         kvm_async_pf_vcpu_init(vcpu);
31350  
31351         vcpu->pre_pcpu = -1;
31352 @@ -2005,7 +2004,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
31353  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
31354  {
31355         ktime_t start, cur;
31356 -       DEFINE_WAIT(wait);
31357 +       DECLARE_SWAITQUEUE(wait);
31358         bool waited = false;
31359         u64 block_ns;
31360  
31361 @@ -2030,7 +2029,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
31362         kvm_arch_vcpu_blocking(vcpu);
31363  
31364         for (;;) {
31365 -               prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
31366 +               prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
31367  
31368                 if (kvm_vcpu_check_block(vcpu) < 0)
31369                         break;
31370 @@ -2039,7 +2038,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
31371                 schedule();
31372         }
31373  
31374 -       finish_wait(&vcpu->wq, &wait);
31375 +       finish_swait(&vcpu->wq, &wait);
31376         cur = ktime_get();
31377  
31378         kvm_arch_vcpu_unblocking(vcpu);
31379 @@ -2071,11 +2070,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
31380  {
31381         int me;
31382         int cpu = vcpu->cpu;
31383 -       wait_queue_head_t *wqp;
31384 +       struct swait_queue_head *wqp;
31385  
31386         wqp = kvm_arch_vcpu_wq(vcpu);
31387 -       if (waitqueue_active(wqp)) {
31388 -               wake_up_interruptible(wqp);
31389 +       if (swait_active(wqp)) {
31390 +               swake_up(wqp);
31391                 ++vcpu->stat.halt_wakeup;
31392         }
31393  
31394 @@ -2176,7 +2175,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
31395                                 continue;
31396                         if (vcpu == me)
31397                                 continue;
31398 -                       if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
31399 +                       if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
31400                                 continue;
31401                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
31402                                 continue;