1 diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
2 index 2cc08d4a326e..e28f7f29f2b3 100644
3 --- a/Documentation/trace/events.txt
4 +++ b/Documentation/trace/events.txt
5 @@ -517,1550 +517,4 @@ The following commands are supported:
6 totals derived from one or more trace event format fields and/or
7 event counts (hitcount).
9 - The format of a hist trigger is as follows:
11 - hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
12 - [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
13 - [:clear][:name=histname1] [if <filter>]
15 - When a matching event is hit, an entry is added to a hash table
16 - using the key(s) and value(s) named. Keys and values correspond to
17 - fields in the event's format description. Values must correspond to
18 - numeric fields - on an event hit, the value(s) will be added to a
19 - sum kept for that field. The special string 'hitcount' can be used
20 - in place of an explicit value field - this is simply a count of
21 - event hits. If 'values' isn't specified, an implicit 'hitcount'
22 - value will be automatically created and used as the only value.
23 - Keys can be any field, or the special string 'stacktrace', which
24 - will use the event's kernel stacktrace as the key. The keywords
25 - 'keys' or 'key' can be used to specify keys, and the keywords
26 - 'values', 'vals', or 'val' can be used to specify values. Compound
27 - keys consisting of up to two fields can be specified by the 'keys'
28 - keyword. Hashing a compound key produces a unique entry in the
29 - table for each unique combination of component keys, and can be
30 - useful for providing more fine-grained summaries of event data.
31 - Additionally, sort keys consisting of up to two fields can be
32 - specified by the 'sort' keyword. If more than one field is
33 - specified, the result will be a 'sort within a sort': the first key
34 - is taken to be the primary sort key and the second the secondary
35 - key. If a hist trigger is given a name using the 'name' parameter,
36 - its histogram data will be shared with other triggers of the same
37 - name, and trigger hits will update this common data. Only triggers
38 - with 'compatible' fields can be combined in this way; triggers are
39 - 'compatible' if they name the same number and type of fields, and
40 - those fields also have the same names.
41 - Note that any two events always share the compatible 'hitcount' and
42 - 'stacktrace' fields and can therefore be combined using those
43 - fields, however pointless that may be.
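- For example, the following sketch (illustrative only, using the
- 'name' syntax from the format description above) attaches a shared
- trigger named 'pids' to two events with the compatible 'common_pid'
- field; reading either event's 'hist' file would then display the
- combined data:
-
- # echo 'hist:keys=common_pid.execname:name=pids' > \
- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
-
- # echo 'hist:keys=common_pid.execname:name=pids' > \
- /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger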
45 - 'hist' triggers add a 'hist' file to each event's subdirectory.
46 - Reading the 'hist' file for the event will dump the hash table in
47 - its entirety to stdout. If there are multiple hist triggers
48 - attached to an event, there will be a table for each trigger in the
49 - output. The table displayed for a named trigger will be the same as
50 - any other instance having the same name. Each printed hash table
51 - entry is a simple list of the keys and values comprising the entry;
52 - keys are printed first and are delineated by curly braces, and are
53 - followed by the set of value fields for the entry. By default,
54 - numeric fields are displayed as base-10 integers. This can be
55 - modified by appending any of the following modifiers to the field name:
58 - .hex display a number as a hex value
59 - .sym display an address as a symbol
60 - .sym-offset display an address as a symbol and offset
61 - .syscall display a syscall id as a system call name
62 - .execname display a common_pid as a program name
64 - Note that in general the semantics of a given field aren't
65 - interpreted when applying a modifier to it, but there are some
66 - restrictions to be aware of in this regard:
68 - - only the 'hex' modifier can be used for values (because values
69 - are essentially sums, and the other modifiers don't make sense in that context)
71 - - the 'execname' modifier can only be used on a 'common_pid'. The
72 - reason for this is that the execname is simply the 'comm' value
73 - saved for the 'current' process when an event was triggered,
74 - which is the same as the common_pid value saved by the event
75 - tracing code. Trying to apply that comm value to other pid
76 - values wouldn't be correct, and typically events that care save
77 - pid-specific comm fields in the event itself.
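- As an example consistent with both restrictions, the following
- illustrative sketch applies '.execname' to common_pid (a key) and
- '.hex' to a value field:
-
- # echo 'hist:keys=common_pid.execname:vals=bytes_req.hex' > \
- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger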
79 - A typical usage scenario would be the following to enable a hist
80 - trigger, read its current contents, and then turn it off:
82 - # echo 'hist:keys=skbaddr.hex:vals=len' > \
83 - /sys/kernel/debug/tracing/events/net/netif_rx/trigger
85 - # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
87 - # echo '!hist:keys=skbaddr.hex:vals=len' > \
88 - /sys/kernel/debug/tracing/events/net/netif_rx/trigger
90 - The trigger file itself can be read to show the details of the
91 - currently attached hist trigger. This information is also displayed
92 - at the top of the 'hist' file when read.
94 - By default, the size of the hash table is 2048 entries. The 'size'
95 - parameter can be used to specify more or fewer than that. The units
96 - are in terms of hashtable entries - if a run uses more entries than
97 - specified, the results will show the number of 'drops', the number
98 - of hits that were ignored. The size should be a power of 2 between
99 - 128 and 131072 (any non-power-of-2 number specified will be rounded up).
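- For example, the following sketch (an illustrative size) would allow
- the netif_rx trigger shown earlier to use up to 8192 entries:
-
- # echo 'hist:keys=skbaddr.hex:vals=len:size=8192' > \
- /sys/kernel/debug/tracing/events/net/netif_rx/trigger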
102 - The 'sort' parameter can be used to specify a value field to sort
103 - on. The default if unspecified is 'hitcount' and the default sort
104 - order is 'ascending'. To sort in the opposite direction, append
105 - '.descending' to the sort key.
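- For example, this illustrative variant of the earlier netif_rx
- trigger sorts its entries by total length, largest first:
-
- # echo 'hist:keys=skbaddr.hex:vals=len:sort=len.descending' > \
- /sys/kernel/debug/tracing/events/net/netif_rx/trigger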
107 - The 'pause' parameter can be used to pause an existing hist trigger
108 - or to start a hist trigger but not log any events until told to do
109 - so. 'continue' or 'cont' can be used to start or restart a paused hist trigger.
112 - The 'clear' parameter will clear the contents of a running hist
113 - trigger and leave its current paused/active state unchanged.
115 - Note that the 'pause', 'cont', and 'clear' parameters should be
116 - applied using the 'append' shell operator ('>>') if applied to an
117 - existing trigger, rather than via the '>' operator, which will cause
118 - the trigger to be removed through truncation.
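- For example, assuming the netif_rx trigger shown earlier is still
- attached, it could be paused and later resumed as follows (note the
- use of '>>'):
-
- # echo 'hist:keys=skbaddr.hex:vals=len:pause' >> \
- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
-
- # echo 'hist:keys=skbaddr.hex:vals=len:cont' >> \
- /sys/kernel/debug/tracing/events/net/netif_rx/trigger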
120 -- enable_hist/disable_hist
122 - The enable_hist and disable_hist triggers can be used to have one
123 - event conditionally start and stop another event's already-attached
124 - hist trigger. Any number of enable_hist and disable_hist triggers
125 - can be attached to a given event, allowing that event to kick off
126 - and stop aggregations on a host of other events.
128 - The format is very similar to the enable/disable_event triggers:
130 - enable_hist:<system>:<event>[:count]
131 - disable_hist:<system>:<event>[:count]
133 - Instead of enabling or disabling the tracing of the target event
134 - into the trace buffer as the enable/disable_event triggers do, the
135 - enable/disable_hist triggers enable or disable the aggregation of
136 - the target event into a hash table.
138 - A typical usage scenario for the enable_hist/disable_hist triggers
139 - would be to first set up a paused hist trigger on some event,
140 - followed by an enable_hist/disable_hist pair that turns the hist
141 - aggregation on and off when conditions of interest are hit:
143 - # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
144 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
146 - # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
147 - /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
149 - # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
150 - /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
152 - The above sets up an initially paused hist trigger which is unpaused
153 - and starts aggregating events when a given program is executed, and
154 - which stops aggregating when the process exits and the hist trigger is paused.
157 - The examples below provide a more concrete illustration of the
158 - concepts and typical usage patterns discussed above.
161 -6.2 'hist' trigger examples
162 ----------------------------
164 - The first set of examples creates aggregations using the kmalloc
165 - event. The fields that can be used for the hist trigger are listed
166 - in the kmalloc event's format file:
168 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
172 - field:unsigned short common_type; offset:0; size:2; signed:0;
173 - field:unsigned char common_flags; offset:2; size:1; signed:0;
174 - field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
175 - field:int common_pid; offset:4; size:4; signed:1;
177 - field:unsigned long call_site; offset:8; size:8; signed:0;
178 - field:const void * ptr; offset:16; size:8; signed:0;
179 - field:size_t bytes_req; offset:24; size:8; signed:0;
180 - field:size_t bytes_alloc; offset:32; size:8; signed:0;
181 - field:gfp_t gfp_flags; offset:40; size:4; signed:0;
183 - We'll start by creating a hist trigger that generates a simple table
184 - that lists the total number of bytes requested for each function in
185 - the kernel that made one or more calls to kmalloc:
187 - # echo 'hist:key=call_site:val=bytes_req' > \
188 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
190 - This tells the tracing system to create a 'hist' trigger using the
191 - call_site field of the kmalloc event as the key for the table, which
192 - just means that each unique call_site address will have an entry
193 - created for it in the table. The 'val=bytes_req' parameter tells
194 - the hist trigger that for each unique entry (call_site) in the
195 - table, it should keep a running total of the number of bytes
196 - requested by that call_site.
198 - We'll let it run for a while and then dump the contents of the 'hist'
199 - file in the kmalloc event's subdirectory (for readability, a number
200 - of entries have been omitted):
202 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
203 - # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
205 - { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
206 - { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
207 - { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
208 - { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
209 - { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
210 - { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
211 - { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
212 - { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
213 - { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
214 - { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
218 - { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
219 - { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
220 - { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
221 - { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
222 - { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
223 - { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
224 - { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
225 - { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
226 - { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
227 - { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
228 - { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
229 - { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
236 - The output displays a line for each entry, beginning with the key
237 - specified in the trigger, followed by the value(s) also specified in
238 - the trigger. At the beginning of the output is a line that displays
239 - the trigger info, which can also be displayed by reading the 'trigger' file:
242 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
243 - hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
245 - At the end of the output are a few lines that display the overall
246 - totals for the run. The 'Hits' field shows the total number of
247 - times the event trigger was hit, the 'Entries' field shows the total
248 - number of used entries in the hash table, and the 'Dropped' field
249 - shows the number of hits that were dropped because the number of
250 - used entries for the run exceeded the maximum number of entries
251 - allowed for the table (normally 0, but if not, a hint that you may
252 - want to increase the size of the table using the 'size' parameter).
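- For reference, those totals appear at the end of the output in the
- following form (the numbers here are hypothetical):
-
- Totals:
-     Hits: 4610
-     Entries: 45
-     Dropped: 0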
254 - Notice in the above output that there's an extra field, 'hitcount',
255 - which wasn't specified in the trigger. Also notice that in the
256 - trigger info output, there's a parameter, 'sort=hitcount', which
257 - wasn't specified in the trigger either. The reason for that is that
258 - every trigger implicitly keeps a count of the total number of hits
259 - attributed to a given entry, called the 'hitcount'. That hitcount
260 - information is explicitly displayed in the output, and in the
261 - absence of a user-specified sort parameter, is used as the default sort key.
264 - The value 'hitcount' can be used in place of an explicit value in
265 - the 'values' parameter if you don't really need to have any
266 - particular field summed and are mainly interested in hit
269 - To turn the hist trigger off, simply call up the trigger in the
270 - command history and re-execute it with a '!' prepended:
272 - # echo '!hist:key=call_site:val=bytes_req' > \
273 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
275 - Finally, notice that the call_site as displayed in the output above
276 - isn't really very useful. It's an address, but normally addresses
277 - are displayed in hex. To have a numeric field displayed as a hex
278 - value, simply append '.hex' to the field name in the trigger:
280 - # echo 'hist:key=call_site.hex:val=bytes_req' > \
281 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
283 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
284 - # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
286 - { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
287 - { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
288 - { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
289 - { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
290 - { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
291 - { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
292 - { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
293 - { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
294 - { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
295 - { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
296 - { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
297 - { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
301 - { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
302 - { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
303 - { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
304 - { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
305 - { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
306 - { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
307 - { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
308 - { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
309 - { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
310 - { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
311 - { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
318 - Even that's only marginally more useful - while hex values do look
319 - more like addresses, what users are typically more interested in
320 - when looking at text addresses are the corresponding symbols
321 - instead. To have an address displayed as a symbolic value, simply
322 - append '.sym' or '.sym-offset' to the field name in the trigger:
325 - # echo 'hist:key=call_site.sym:val=bytes_req' > \
326 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
328 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
329 - # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
331 - { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
332 - { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
333 - { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
334 - { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
335 - { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
336 - { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
337 - { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
338 - { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
339 - { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
340 - { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
341 - { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
342 - { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
343 - { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
344 - { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
348 - { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
349 - { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
350 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
351 - { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
352 - { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
353 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
354 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
355 - { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
356 - { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
357 - { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
358 - { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
359 - { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
360 - { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
367 - Because the default sort key above is 'hitcount', the above shows
368 - the list of call_sites by increasing hitcount, so that at the bottom
369 - we see the functions that made the most kmalloc calls during the
370 - run. If instead we wanted to see the top kmalloc callers in
371 - terms of the number of bytes requested rather than the number of
372 - calls, and we wanted the top caller to appear at the top, we can use
373 - the 'sort' parameter, along with the 'descending' modifier:
375 - # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
376 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
378 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
379 - # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
381 - { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
382 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
383 - { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
384 - { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
385 - { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
386 - { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
387 - { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
388 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
389 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
390 - { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
391 - { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
392 - { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
393 - { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
397 - { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
398 - { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
399 - { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
400 - { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
401 - { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
402 - { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
403 - { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
404 - { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
405 - { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
406 - { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
413 - To display the offset and size information in addition to the symbol
414 - name, just use '.sym-offset' instead:
416 - # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
417 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
419 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
420 - # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
422 - { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
423 - { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
424 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
425 - { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
426 - { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
427 - { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
428 - { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
429 - { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
430 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
431 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
432 - { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
433 - { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
437 - { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
438 - { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
439 - { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
440 - { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
441 - { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
442 - { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
443 - { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
450 - We can also add multiple fields to the 'values' parameter. For
451 - example, we might want to see the total number of bytes allocated
452 - alongside bytes requested, and display the result sorted by bytes
453 - allocated in descending order:
455 - # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
456 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
458 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
459 - # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
461 - { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
462 - { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
463 - { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
464 - { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
465 - { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
466 - { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
467 - { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
468 - { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
469 - { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
470 - { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
471 - { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
472 - { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
476 - { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
477 - { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
478 - { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
479 - { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
480 - { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
481 - { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
482 - { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
483 - { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
484 - { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
485 - { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
492 - Finally, to finish off our kmalloc example, instead of simply having
493 - the hist trigger display symbolic call_sites, we can have the hist
494 - trigger additionally display the complete set of kernel stack traces
495 - that led to each call_site. To do that, we simply use the special
496 - value 'stacktrace' for the key parameter:
498 - # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
499 - /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
501 - The above trigger will use the kernel stack trace in effect when an
502 - event is triggered as the key for the hash table. This allows the
503 - enumeration of every kernel callpath that led up to a particular
504 - event, along with a running total of any of the event fields for
505 - that event. Here we tally bytes requested and bytes allocated for
506 - every callpath in the system that led up to a kmalloc (in this case
507 - every callpath to a kmalloc for a kernel compile):
509 - # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
510 - # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
513 - __kmalloc_track_caller+0x10b/0x1a0
515 - hidraw_report_event+0x8a/0x120 [hid]
516 - hid_report_raw_event+0x3ea/0x440 [hid]
517 - hid_input_report+0x112/0x190 [hid]
518 - hid_irq_in+0xc2/0x260 [usbhid]
519 - __usb_hcd_giveback_urb+0x72/0x120
520 - usb_giveback_urb_bh+0x9e/0xe0
521 - tasklet_hi_action+0xf8/0x100
522 - __do_softirq+0x114/0x2c0
525 - ret_from_intr+0x0/0x30
526 - cpuidle_enter+0x17/0x20
527 - cpu_startup_entry+0x315/0x3e0
528 - rest_init+0x7c/0x80
529 - } hitcount: 3 bytes_req: 21 bytes_alloc: 24
531 - __kmalloc_track_caller+0x10b/0x1a0
533 - hidraw_report_event+0x8a/0x120 [hid]
534 - hid_report_raw_event+0x3ea/0x440 [hid]
535 - hid_input_report+0x112/0x190 [hid]
536 - hid_irq_in+0xc2/0x260 [usbhid]
537 - __usb_hcd_giveback_urb+0x72/0x120
538 - usb_giveback_urb_bh+0x9e/0xe0
539 - tasklet_hi_action+0xf8/0x100
540 - __do_softirq+0x114/0x2c0
543 - ret_from_intr+0x0/0x30
544 - } hitcount: 3 bytes_req: 21 bytes_alloc: 24
546 - kmem_cache_alloc_trace+0xeb/0x150
547 - aa_alloc_task_context+0x27/0x40
548 - apparmor_cred_prepare+0x1f/0x50
549 - security_prepare_creds+0x16/0x20
550 - prepare_creds+0xdf/0x1a0
551 - SyS_capset+0xb5/0x200
552 - system_call_fastpath+0x12/0x6a
553 - } hitcount: 1 bytes_req: 32 bytes_alloc: 32
558 - __kmalloc+0x11b/0x1b0
559 - i915_gem_execbuffer2+0x6c/0x2c0 [i915]
560 - drm_ioctl+0x349/0x670 [drm]
561 - do_vfs_ioctl+0x2f0/0x4f0
562 - SyS_ioctl+0x81/0xa0
563 - system_call_fastpath+0x12/0x6a
564 - } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
566 - __kmalloc+0x11b/0x1b0
567 - load_elf_phdrs+0x76/0xa0
568 - load_elf_binary+0x102/0x1650
569 - search_binary_handler+0x97/0x1d0
570 - do_execveat_common.isra.34+0x551/0x6e0
571 - SyS_execve+0x3a/0x50
572 - return_from_execve+0x0/0x23
573 - } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
575 - kmem_cache_alloc_trace+0xeb/0x150
576 - apparmor_file_alloc_security+0x27/0x40
577 - security_file_alloc+0x16/0x20
578 - get_empty_filp+0x93/0x1c0
579 - path_openat+0x31/0x5f0
580 - do_filp_open+0x3a/0x90
581 - do_sys_open+0x128/0x220
583 - system_call_fastpath+0x12/0x6a
584 - } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
586 - __kmalloc+0x11b/0x1b0
587 - seq_buf_alloc+0x1b/0x50
588 - seq_read+0x2cc/0x370
589 - proc_reg_read+0x3d/0x80
590 - __vfs_read+0x28/0xe0
591 - vfs_read+0x86/0x140
593 - system_call_fastpath+0x12/0x6a
594 - } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
601 - If you key a hist trigger on common_pid, for example in order to
602 - gather and display sorted totals for each process, you can use the
603 - special .execname modifier to display the executable names for the
604 - processes in the table rather than raw pids. The example below
605 - keeps a per-process sum of total bytes read:
607 - # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
608 - /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
610 - # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
611 - # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
613 - { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
614 - { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
615 - { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
616 - { common_pid: bash [ 8710] } hitcount: 3 count: 66369
617 - { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
618 - { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
619 - { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
620 - { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
621 - { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
622 - { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
623 - { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
627 - { common_pid: postgres [ 1892] } hitcount: 2 count: 32
628 - { common_pid: postgres [ 1891] } hitcount: 2 count: 32
629 - { common_pid: gmain [ 8704] } hitcount: 2 count: 32
630 - { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
631 - { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
632 - { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
633 - { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
634 - { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
635 - { common_pid: init [ 1] } hitcount: 2 count: 2
642 - Similarly, if you key a hist trigger on syscall id, for example to
643 - gather and display a list of systemwide syscall hits, you can use
644 - the special .syscall modifier to display the syscall names rather
645 - than raw ids. The example below keeps a running total of syscall
646 - counts for the system during the run:
648 - # echo 'hist:key=id.syscall:val=hitcount' > \
649 - /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
651 - # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
652 - # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
654 - { id: sys_fsync [ 74] } hitcount: 1
655 - { id: sys_newuname [ 63] } hitcount: 1
656 - { id: sys_prctl [157] } hitcount: 1
657 - { id: sys_statfs [137] } hitcount: 1
658 - { id: sys_symlink [ 88] } hitcount: 1
659 - { id: sys_sendmmsg [307] } hitcount: 1
660 - { id: sys_semctl [ 66] } hitcount: 1
661 - { id: sys_readlink [ 89] } hitcount: 3
662 - { id: sys_bind [ 49] } hitcount: 3
663 - { id: sys_getsockname [ 51] } hitcount: 3
664 - { id: sys_unlink [ 87] } hitcount: 3
665 - { id: sys_rename [ 82] } hitcount: 4
666 - { id: unknown_syscall [ 58] } hitcount: 4
667 - { id: sys_connect [ 42] } hitcount: 4
668 - { id: sys_getpid [ 39] } hitcount: 4
672 - { id: sys_rt_sigprocmask [ 14] } hitcount: 952
673 - { id: sys_futex [202] } hitcount: 1534
674 - { id: sys_write [ 1] } hitcount: 2689
675 - { id: sys_setitimer [ 38] } hitcount: 2797
676 - { id: sys_read [ 0] } hitcount: 3202
677 - { id: sys_select [ 23] } hitcount: 3773
678 - { id: sys_writev [ 20] } hitcount: 4531
679 - { id: sys_poll [ 7] } hitcount: 8314
680 - { id: sys_recvmsg [ 47] } hitcount: 13738
681 - { id: sys_ioctl [ 16] } hitcount: 21843
688 - The syscall counts above provide a rough overall picture of system
689 - call activity on the system; we can see for example that the most
690 - popular system call on this system was the 'sys_ioctl' system call.
692 - We can use 'compound' keys to refine that number and provide some
693 - further insight as to which processes exactly contribute to the
694 - overall ioctl count.
696 - The command below keeps a hitcount for every unique combination of
697 - system call id and pid - the end result is essentially a table
698 - that keeps a per-pid sum of system call hits. The results are
699 - sorted using the system call id as the primary key, and the
700 - hitcount sum as the secondary key:
702 - # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
703 - /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
705 - # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
706 - # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
708 - { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
709 - { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
710 - { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
711 - { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
712 - { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
713 - { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
714 - { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
715 - { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
716 - { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
717 - { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
721 - { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
722 - { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
723 - { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
724 - { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
725 - { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
729 - { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
730 - { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
731 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
732 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
733 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
734 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
735 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
736 - { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
737 - { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
738 - { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
739 - { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
746 - The above list does give us a breakdown of the ioctl syscall by
747 - pid, but it also gives us quite a bit more than that, which we
748 - don't really care about at the moment. Since we know the syscall
749 - id for sys_ioctl (16, displayed next to the sys_ioctl name), we
750 - can use that to filter out all the other syscalls:
752 - # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
753 - /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
755 - # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
756 - # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
758 - { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
759 - { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
760 - { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
761 - { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
762 - { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
763 - { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
764 - { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
765 - { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
766 - { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
770 - { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
771 - { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
772 - { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
773 - { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
774 - { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
775 - { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
782 - The above output shows that 'compiz' and 'Xorg' are far and away
783 - the heaviest ioctl callers (which might lead to questions about
784 - whether they really need to be making all those calls and to
785 - possible avenues for further investigation).
787 - The compound key examples used a key and a sum value (hitcount) to
788 - sort the output, but we can just as easily use two keys instead.
789 - Here's an example where we use a compound key composed of the
790 - common_pid and size event fields. Sorting with pid as the primary
791 - key and 'size' as the secondary key allows us to display an
792 - ordered summary of the recvfrom sizes, with counts, received by each process:
795 - # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
796 - /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
798 - # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
799 - # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
801 - { common_pid: smbd [ 784], size: 4 } hitcount: 1
802 - { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
803 - { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
804 - { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
805 - { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
806 - { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
807 - { common_pid: compiz [ 2994], size: 8 } hitcount: 1
808 - { common_pid: compiz [ 2994], size: 20 } hitcount: 11
809 - { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
810 - { common_pid: firefox [ 8817], size: 4 } hitcount: 1
811 - { common_pid: firefox [ 8817], size: 8 } hitcount: 5
812 - { common_pid: firefox [ 8817], size: 588 } hitcount: 2
813 - { common_pid: firefox [ 8817], size: 628 } hitcount: 1
814 - { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
815 - { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
816 - { common_pid: firefox [ 8822], size: 8 } hitcount: 2
817 - { common_pid: firefox [ 8822], size: 160 } hitcount: 2
818 - { common_pid: firefox [ 8822], size: 320 } hitcount: 2
819 - { common_pid: firefox [ 8822], size: 352 } hitcount: 1
823 - { common_pid: pool [ 8923], size: 1960 } hitcount: 10
824 - { common_pid: pool [ 8923], size: 2048 } hitcount: 10
825 - { common_pid: pool [ 8924], size: 1960 } hitcount: 10
826 - { common_pid: pool [ 8924], size: 2048 } hitcount: 10
827 - { common_pid: pool [ 8928], size: 1964 } hitcount: 4
828 - { common_pid: pool [ 8928], size: 1965 } hitcount: 2
829 - { common_pid: pool [ 8928], size: 2048 } hitcount: 6
830 - { common_pid: pool [ 8929], size: 1982 } hitcount: 1
831 - { common_pid: pool [ 8929], size: 2048 } hitcount: 1
838 - The above example also illustrates the fact that although a compound
839 - key is treated as a single entity for hashing purposes, the sub-keys
840 - it's composed of can be accessed independently.
842 - The next example uses a string field as the hash key and
843 - demonstrates how you can manually pause and continue a hist trigger.
844 - In this example, we'll aggregate fork counts and don't expect a
845 - large number of entries in the hash table, so we'll drop it to a
846 - much smaller number, say 256:
848 - # echo 'hist:key=child_comm:val=hitcount:size=256' > \
849 - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
851 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
852 - # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
854 - { child_comm: dconf worker } hitcount: 1
855 - { child_comm: ibus-daemon } hitcount: 1
856 - { child_comm: whoopsie } hitcount: 1
857 - { child_comm: smbd } hitcount: 1
858 - { child_comm: gdbus } hitcount: 1
859 - { child_comm: kthreadd } hitcount: 1
860 - { child_comm: dconf worker } hitcount: 1
861 - { child_comm: evolution-alarm } hitcount: 2
862 - { child_comm: Socket Thread } hitcount: 2
863 - { child_comm: postgres } hitcount: 2
864 - { child_comm: bash } hitcount: 3
865 - { child_comm: compiz } hitcount: 3
866 - { child_comm: evolution-sourc } hitcount: 4
867 - { child_comm: dhclient } hitcount: 4
868 - { child_comm: pool } hitcount: 5
869 - { child_comm: nm-dispatcher.a } hitcount: 8
870 - { child_comm: firefox } hitcount: 8
871 - { child_comm: dbus-daemon } hitcount: 8
872 - { child_comm: glib-pacrunner } hitcount: 10
873 - { child_comm: evolution } hitcount: 23
880 - If we want to pause the hist trigger, we can simply append :pause to
881 - the command that started the trigger. Notice that the trigger info
882 - displays as [paused]:
884 - # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
885 - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
887 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
888 - # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
890 - { child_comm: dconf worker } hitcount: 1
891 - { child_comm: kthreadd } hitcount: 1
892 - { child_comm: dconf worker } hitcount: 1
893 - { child_comm: gdbus } hitcount: 1
894 - { child_comm: ibus-daemon } hitcount: 1
895 - { child_comm: Socket Thread } hitcount: 2
896 - { child_comm: evolution-alarm } hitcount: 2
897 - { child_comm: smbd } hitcount: 2
898 - { child_comm: bash } hitcount: 3
899 - { child_comm: whoopsie } hitcount: 3
900 - { child_comm: compiz } hitcount: 3
901 - { child_comm: evolution-sourc } hitcount: 4
902 - { child_comm: pool } hitcount: 5
903 - { child_comm: postgres } hitcount: 6
904 - { child_comm: firefox } hitcount: 8
905 - { child_comm: dhclient } hitcount: 10
906 - { child_comm: emacs } hitcount: 12
907 - { child_comm: dbus-daemon } hitcount: 20
908 - { child_comm: nm-dispatcher.a } hitcount: 20
909 - { child_comm: evolution } hitcount: 35
910 - { child_comm: glib-pacrunner } hitcount: 59
917 - To manually continue having the trigger aggregate events, append
918 - :cont instead. Notice that the trigger info displays as [active]
919 - again, and the data has changed:
921 - # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
922 - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
924 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
925 - # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
927 - { child_comm: dconf worker } hitcount: 1
928 - { child_comm: dconf worker } hitcount: 1
929 - { child_comm: kthreadd } hitcount: 1
930 - { child_comm: gdbus } hitcount: 1
931 - { child_comm: ibus-daemon } hitcount: 1
932 - { child_comm: Socket Thread } hitcount: 2
933 - { child_comm: evolution-alarm } hitcount: 2
934 - { child_comm: smbd } hitcount: 2
935 - { child_comm: whoopsie } hitcount: 3
936 - { child_comm: compiz } hitcount: 3
937 - { child_comm: evolution-sourc } hitcount: 4
938 - { child_comm: bash } hitcount: 5
939 - { child_comm: pool } hitcount: 5
940 - { child_comm: postgres } hitcount: 6
941 - { child_comm: firefox } hitcount: 8
942 - { child_comm: dhclient } hitcount: 11
943 - { child_comm: emacs } hitcount: 12
944 - { child_comm: dbus-daemon } hitcount: 22
945 - { child_comm: nm-dispatcher.a } hitcount: 22
946 - { child_comm: evolution } hitcount: 35
947 - { child_comm: glib-pacrunner } hitcount: 59
954 - The previous example showed how to start and stop a hist trigger by
955 - appending 'pause' and 'continue' to the hist trigger command. A
956 - hist trigger can also be started in a paused state by initially
957 - starting the trigger with ':pause' appended. This allows you to
958 - start the trigger only when you're ready to start collecting data
959 - and not before. For example, you could start the trigger in a
960 - paused state, then unpause it and do something you want to measure,
961 - then pause the trigger again when done.
963 - Of course, doing this manually can be difficult and error-prone, but
964 - it is possible to automatically start and stop a hist trigger based
965 - on some condition, via the enable_hist and disable_hist triggers.
967 - For example, suppose we wanted to take a look at the relative
968 - weights in terms of skb length for each callpath that leads to a
969 - netif_receive_skb event when downloading a decent-sized file using wget.
972 - First we set up an initially paused stacktrace trigger on the
973 - netif_receive_skb event:
975 - # echo 'hist:key=stacktrace:vals=len:pause' > \
976 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
978 - Next, we set up an 'enable_hist' trigger on the sched_process_exec
979 - event, with an 'if filename==/usr/bin/wget' filter. The effect of
980 - this new trigger is that it will 'unpause' the hist trigger we just
981 - set up on netif_receive_skb if and only if it sees a
982 - sched_process_exec event with a filename of '/usr/bin/wget'. When
983 - that happens, all netif_receive_skb events are aggregated into a
984 - hash table keyed on stacktrace:
986 - # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
987 - /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
989 - The aggregation continues until the netif_receive_skb hist trigger is paused
990 - again, which is what the following disable_hist event does by
991 - creating a similar setup on the sched_process_exit event, using the
992 - filter 'comm==wget':
994 - # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
995 - /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
997 - Whenever a process exits and the comm field of the disable_hist
998 - trigger filter matches 'comm==wget', the netif_receive_skb hist
999 - trigger is disabled.
1001 - The overall effect is that netif_receive_skb events are aggregated
1002 - into the hash table for only the duration of the wget. Executing a
1003 - wget command and then listing the 'hist' file will display the
1004 - output generated by the wget command:
1006 - $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
1008 - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1009 - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
1012 - __netif_receive_skb_core+0x46d/0x990
1013 - __netif_receive_skb+0x18/0x60
1014 - netif_receive_skb_internal+0x23/0x90
1015 - napi_gro_receive+0xc8/0x100
1016 - ieee80211_deliver_skb+0xd6/0x270 [mac80211]
1017 - ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
1018 - ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
1019 - ieee80211_rx+0x31d/0x900 [mac80211]
1020 - iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
1021 - iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
1022 - iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
1023 - irq_thread_fn+0x20/0x50
1024 - irq_thread+0x11f/0x150
1026 - ret_from_fork+0x42/0x70
1027 - } hitcount: 85 len: 28884
1029 - __netif_receive_skb_core+0x46d/0x990
1030 - __netif_receive_skb+0x18/0x60
1031 - netif_receive_skb_internal+0x23/0x90
1032 - napi_gro_complete+0xa4/0xe0
1033 - dev_gro_receive+0x23a/0x360
1034 - napi_gro_receive+0x30/0x100
1035 - ieee80211_deliver_skb+0xd6/0x270 [mac80211]
1036 - ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
1037 - ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
1038 - ieee80211_rx+0x31d/0x900 [mac80211]
1039 - iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
1040 - iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
1041 - iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
1042 - irq_thread_fn+0x20/0x50
1043 - irq_thread+0x11f/0x150
1045 - } hitcount: 98 len: 664329
1047 - __netif_receive_skb_core+0x46d/0x990
1048 - __netif_receive_skb+0x18/0x60
1049 - process_backlog+0xa8/0x150
1050 - net_rx_action+0x15d/0x340
1051 - __do_softirq+0x114/0x2c0
1052 - do_softirq_own_stack+0x1c/0x30
1053 - do_softirq+0x65/0x70
1054 - __local_bh_enable_ip+0xb5/0xc0
1055 - ip_finish_output+0x1f4/0x840
1056 - ip_output+0x6b/0xc0
1057 - ip_local_out_sk+0x31/0x40
1058 - ip_send_skb+0x1a/0x50
1059 - udp_send_skb+0x173/0x2a0
1060 - udp_sendmsg+0x2bf/0x9f0
1061 - inet_sendmsg+0x64/0xa0
1062 - sock_sendmsg+0x3d/0x50
1063 - } hitcount: 115 len: 13030
1065 - __netif_receive_skb_core+0x46d/0x990
1066 - __netif_receive_skb+0x18/0x60
1067 - netif_receive_skb_internal+0x23/0x90
1068 - napi_gro_complete+0xa4/0xe0
1069 - napi_gro_flush+0x6d/0x90
1070 - iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
1071 - irq_thread_fn+0x20/0x50
1072 - irq_thread+0x11f/0x150
1074 - ret_from_fork+0x42/0x70
1075 - } hitcount: 934 len: 5512212
1082 - The above shows all the netif_receive_skb callpaths and their total
1083 - lengths for the duration of the wget command.
1085 - The 'clear' hist trigger param can be used to clear the hash table.
1086 - Suppose we wanted to try another run of the previous example but
1087 - this time also wanted to see the complete list of events that went
1088 - into the histogram. In order to avoid having to set everything up
1089 - again, we can just clear the histogram first:
1091 - # echo 'hist:key=stacktrace:vals=len:clear' >> \
1092 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1094 - Just to verify that it is in fact cleared, here's what we now see in the 'hist' file:
1097 - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1098 - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
1105 - Since we want to see the detailed list of every netif_receive_skb
1106 - event occurring during the new run, which are in fact the same
1107 - events being aggregated into the hash table, we add some additional
1108 - 'enable_event' and 'disable_event' triggers to the triggering
1109 - sched_process_exec and sched_process_exit events, as follows:
1111 - # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
1112 - /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1114 - # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
1115 - /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1117 - If you read the trigger files for the sched_process_exec and
1118 - sched_process_exit triggers, you should see two triggers for each:
1119 - one enabling/disabling the hist aggregation and the other
1120 - enabling/disabling the logging of events:
1122 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1123 - enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
1124 - enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
1126 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1127 - disable_event:net:netif_receive_skb:unlimited if comm==wget
1128 - disable_hist:net:netif_receive_skb:unlimited if comm==wget
1130 - In other words, whenever either of the sched_process_exec or
1131 - sched_process_exit events is hit and matches 'wget', it enables or
1132 - disables both the histogram and the event log, and what you end up
1133 - with is a hash table and set of events just covering the specified
1134 - duration. Run the wget command again:
1136 - $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
1138 - Displaying the 'hist' file should show something similar to what you
1139 - saw in the last run, but this time you should also see the
1140 - individual events in the trace file:
1142 - # cat /sys/kernel/debug/tracing/trace
1144 - # tracer: nop
1145 - #
1146 - # entries-in-buffer/entries-written: 183/1426   #P:4
1147 - #
1148 - #                              _-----=> irqs-off
1149 - #                             / _----=> need-resched
1150 - #                            | / _---=> hardirq/softirq
1151 - #                            || / _--=> preempt-depth
1152 - #                            ||| /     delay
1153 - #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
1154 - #              | |       |   ||||       |         |
1155 - wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
1156 - wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
1157 - dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
1158 - dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
1159 - ##### CPU 2 buffer started ####
1160 - irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
1161 - irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
1162 - irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
1163 - irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
1164 - irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
1169 - The following example demonstrates how multiple hist triggers can be
1170 - attached to a given event. This capability can be useful for
1171 - creating a set of different summaries derived from the same set of
1172 - events, or for comparing the effects of different filters, among other things:
1175 - # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
1176 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1177 - # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
1178 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1179 - # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
1180 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1181 - # echo 'hist:keys=skbaddr.hex:vals=len' >> \
1182 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1183 - # echo 'hist:keys=len:vals=common_preempt_count' >> \
1184 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1186 - The above set of commands creates four triggers differing only in
1187 - their filters, along with a completely different though fairly
1188 - nonsensical trigger. Note that in order to append multiple hist
1189 - triggers to the same file, you should use the '>>' operator to
1190 - append them ('>' will also add the new hist trigger, but will remove
1191 - any existing hist triggers beforehand).
1193 - Displaying the contents of the 'hist' file for the event shows the
1194 - contents of all five histograms:
1196 - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1200 - # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
1203 - { len: 176 } hitcount: 1 common_preempt_count: 0
1204 - { len: 223 } hitcount: 1 common_preempt_count: 0
1205 - { len: 4854 } hitcount: 1 common_preempt_count: 0
1206 - { len: 395 } hitcount: 1 common_preempt_count: 0
1207 - { len: 177 } hitcount: 1 common_preempt_count: 0
1208 - { len: 446 } hitcount: 1 common_preempt_count: 0
1209 - { len: 1601 } hitcount: 1 common_preempt_count: 0
1213 - { len: 1280 } hitcount: 66 common_preempt_count: 0
1214 - { len: 116 } hitcount: 81 common_preempt_count: 40
1215 - { len: 708 } hitcount: 112 common_preempt_count: 0
1216 - { len: 46 } hitcount: 221 common_preempt_count: 0
1217 - { len: 1264 } hitcount: 458 common_preempt_count: 0
1227 - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1230 - { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
1231 - { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
1232 - { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
1233 - { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
1234 - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
1235 - { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
1236 - { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
1237 - { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
1238 - { skbaddr: ffff880100065900 } hitcount: 1 len: 46
1239 - { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
1240 - { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
1241 - { skbaddr: ffff880100064700 } hitcount: 1 len: 365
1242 - { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
1246 - { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
1247 - { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
1248 - { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
1249 - { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
1250 - { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
1251 - { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
1252 - { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
1253 - { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
1254 - { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
1264 - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
1276 - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
1279 - { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
1280 - { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
1281 - { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
1282 - { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
1283 - { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
1284 - { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
1285 - { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
1286 - { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
1287 - { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
1288 - { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
1289 - { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
1290 - { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
1300 - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
1309 - Named triggers can be used to have triggers share a common set of
1310 - histogram data. This capability is mostly useful for combining the
1311 - output of events generated by tracepoints contained inside inline
1312 - functions, but names can be used in a hist trigger on any event.
1313 - For example, these two triggers when hit will update the same 'len'
1314 - field in the shared 'foo' histogram data:
1316 - # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
1317 - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1318 - # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
1319 - /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1321 - You can see that they're updating common histogram data by reading
1322 - each event's hist files at the same time:
1324 - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
1325 - cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1329 - # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1332 - { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
1333 - { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
1334 - { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
1335 - { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
1336 - { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
1337 - { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
1338 - { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
1339 - { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
1340 - { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
1341 - { skbaddr: ffff880064505000 } hitcount: 1 len: 46
1342 - { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
1343 - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
1344 - { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
1345 - { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
1346 - { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
1347 - { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
1348 - { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
1349 - { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
1350 - { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
1351 - { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
1352 - { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
1353 - { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
1354 - { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
1355 - { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
1356 - { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
1357 - { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
1358 - { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
1359 - { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
1360 - { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
1361 - { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
1362 - { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
1363 - { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
1364 - { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
1365 - { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
1366 - { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
1367 - { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
1368 - { skbaddr: ffff880064504400 } hitcount: 4 len: 184
1369 - { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
1370 - { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
1371 - { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
1372 - { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
1373 - { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
1381 - # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1384 - { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
1385 - { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
1386 - { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
1387 - { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
1388 - { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
1389 - { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
1390 - { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
1391 - { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
1392 - { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
1393 - { skbaddr: ffff880064505000 } hitcount: 1 len: 46
1394 - { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
1395 - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
1396 - { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
1397 - { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
1398 - { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
1399 - { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
1400 - { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
1401 - { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
1402 - { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
1403 - { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
1404 - { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
1405 - { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
1406 - { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
1407 - { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
1408 - { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
1409 - { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
1410 - { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
1411 - { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
1412 - { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
1413 - { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
1414 - { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
1415 - { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
1416 - { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
1417 - { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
1418 - { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
1419 - { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
1420 - { skbaddr: ffff880064504400 } hitcount: 4 len: 184
1421 - { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
1422 - { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
1423 - { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
1424 - { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
1425 - { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
1432 - And here's an example that shows how to combine histogram data from
1433 - any two events even if they don't share any 'compatible' fields
1434 - other than 'hitcount' and 'stacktrace'. These commands create a
1435 - couple of triggers named 'bar' using those fields:
1437 - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
1438 - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
1439 - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
1440 - /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1442 - And displaying the output of either shows some interesting if
1443 - somewhat confusing output:
1445 - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
1446 - # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1450 - # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
1454 - _do_fork+0x18e/0x330
1455 - kernel_thread+0x29/0x30
1456 - kthreadd+0x154/0x1b0
1457 - ret_from_fork+0x3f/0x70
1460 - netif_rx_internal+0xb2/0xd0
1461 - netif_rx_ni+0x20/0x70
1462 - dev_loopback_xmit+0xaa/0xd0
1463 - ip_mc_output+0x126/0x240
1464 - ip_local_out_sk+0x31/0x40
1465 - igmp_send_report+0x1e9/0x230
1466 - igmp_timer_expire+0xe9/0x120
1467 - call_timer_fn+0x39/0xf0
1468 - run_timer_softirq+0x1e1/0x290
1469 - __do_softirq+0xfd/0x290
1470 - irq_exit+0x98/0xb0
1471 - smp_apic_timer_interrupt+0x4a/0x60
1472 - apic_timer_interrupt+0x6d/0x80
1473 - cpuidle_enter+0x17/0x20
1474 - call_cpuidle+0x3b/0x60
1475 - cpu_startup_entry+0x22d/0x310
1478 - netif_rx_internal+0xb2/0xd0
1479 - netif_rx_ni+0x20/0x70
1480 - dev_loopback_xmit+0xaa/0xd0
1481 - ip_mc_output+0x17f/0x240
1482 - ip_local_out_sk+0x31/0x40
1483 - ip_send_skb+0x1a/0x50
1484 - udp_send_skb+0x13e/0x270
1485 - udp_sendmsg+0x2bf/0x980
1486 - inet_sendmsg+0x67/0xa0
1487 - sock_sendmsg+0x38/0x50
1488 - SYSC_sendto+0xef/0x170
1489 - SyS_sendto+0xe/0x10
1490 - entry_SYSCALL_64_fastpath+0x12/0x6a
1493 - netif_rx_internal+0xb2/0xd0
1494 - netif_rx+0x1c/0x60
1495 - loopback_xmit+0x6c/0xb0
1496 - dev_hard_start_xmit+0x219/0x3a0
1497 - __dev_queue_xmit+0x415/0x4f0
1498 - dev_queue_xmit_sk+0x13/0x20
1499 - ip_finish_output2+0x237/0x340
1500 - ip_finish_output+0x113/0x1d0
1501 - ip_output+0x66/0xc0
1502 - ip_local_out_sk+0x31/0x40
1503 - ip_send_skb+0x1a/0x50
1504 - udp_send_skb+0x16d/0x270
1505 - udp_sendmsg+0x2bf/0x980
1506 - inet_sendmsg+0x67/0xa0
1507 - sock_sendmsg+0x38/0x50
1508 - ___sys_sendmsg+0x14e/0x270
1511 - netif_rx_internal+0xb2/0xd0
1512 - netif_rx+0x1c/0x60
1513 - loopback_xmit+0x6c/0xb0
1514 - dev_hard_start_xmit+0x219/0x3a0
1515 - __dev_queue_xmit+0x415/0x4f0
1516 - dev_queue_xmit_sk+0x13/0x20
1517 - ip_finish_output2+0x237/0x340
1518 - ip_finish_output+0x113/0x1d0
1519 - ip_output+0x66/0xc0
1520 - ip_local_out_sk+0x31/0x40
1521 - ip_send_skb+0x1a/0x50
1522 - udp_send_skb+0x16d/0x270
1523 - udp_sendmsg+0x2bf/0x980
1524 - inet_sendmsg+0x67/0xa0
1525 - sock_sendmsg+0x38/0x50
1526 - ___sys_sendmsg+0x269/0x270
1529 - netif_rx_internal+0xb2/0xd0
1530 - netif_rx+0x1c/0x60
1531 - loopback_xmit+0x6c/0xb0
1532 - dev_hard_start_xmit+0x219/0x3a0
1533 - __dev_queue_xmit+0x415/0x4f0
1534 - dev_queue_xmit_sk+0x13/0x20
1535 - ip_finish_output2+0x237/0x340
1536 - ip_finish_output+0x113/0x1d0
1537 - ip_output+0x66/0xc0
1538 - ip_local_out_sk+0x31/0x40
1539 - ip_send_skb+0x1a/0x50
1540 - udp_send_skb+0x16d/0x270
1541 - udp_sendmsg+0x2bf/0x980
1542 - inet_sendmsg+0x67/0xa0
1543 - sock_sendmsg+0x38/0x50
1544 - SYSC_sendto+0xef/0x170
1547 - _do_fork+0x18e/0x330
1548 - SyS_clone+0x19/0x20
1549 - entry_SYSCALL_64_fastpath+0x12/0x6a
1556 + See Documentation/trace/histogram.txt for details and examples.
1557 diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
1558 index d4601df6e72e..54213e5c23f6 100644
1559 --- a/Documentation/trace/ftrace.txt
1560 +++ b/Documentation/trace/ftrace.txt
1561 @@ -539,6 +539,30 @@ of ftrace. Here is a list of some of the key files:
1563 - See events.txt for more information.
1565 + timestamp_mode:
1567 + Certain tracers may change the timestamp mode used when
1568 + logging trace events into the event buffer. Events with
1569 + different modes can coexist within a buffer but the mode in
1570 + effect when an event is logged determines which timestamp mode
1571 + is used for that event. The default timestamp mode is
1572 + 'delta'.
1574 + Usual timestamp modes for tracing:
1576 + # cat timestamp_mode
1577 + [delta] absolute
1579 + The timestamp mode with the square brackets around it is the
1580 + one in effect.
1582 + delta: Default timestamp mode - timestamp is a delta against
1583 + a per-buffer timestamp.
1585 + absolute: The timestamp is a full timestamp, not a delta
1586 + against some other value. As such it takes up more
1587 + space and is less efficient.
1589 - hwlat_detector:
1591 - Directory for the Hardware Latency Detector.
1592 diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt
1593 new file mode 100644
1594 index 000000000000..6e05510afc28
1595 --- /dev/null
1596 +++ b/Documentation/trace/histogram.txt
1598 + Event Histograms
1600 + Documentation written by Tom Zanussi
1602 +1. Introduction
1603 +===============
1605 + Histogram triggers are special event triggers that can be used to
1606 + aggregate trace event data into histograms. For information on
1607 + trace events and event triggers, see Documentation/trace/events.txt.
1610 +2. Histogram Trigger Command
1611 +============================
1613 + A histogram trigger command is an event trigger command that
1614 + aggregates event hits into a hash table keyed on one or more trace
1615 + event format fields (or stacktrace) and a set of running totals
1616 + derived from one or more trace event format fields and/or event
1617 + counts (hitcount).
1619 + The format of a hist trigger is as follows:
1621 + hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
1622 + [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
1623 + [:clear][:name=histname1] [if <filter>]
1625 + When a matching event is hit, an entry is added to a hash table
1626 + using the key(s) and value(s) named. Keys and values correspond to
1627 + fields in the event's format description. Values must correspond to
1628 + numeric fields - on an event hit, the value(s) will be added to a
1629 + sum kept for that field. The special string 'hitcount' can be used
1630 + in place of an explicit value field - this is simply a count of
1631 + event hits. If 'values' isn't specified, an implicit 'hitcount'
1632 + value will be automatically created and used as the only value.
1633 + Keys can be any field, or the special string 'stacktrace', which
1634 + will use the event's kernel stacktrace as the key. The keywords
1635 + 'keys' or 'key' can be used to specify keys, and the keywords
1636 + 'values', 'vals', or 'val' can be used to specify values. Compound
1637 + keys consisting of up to two fields can be specified by the 'keys'
1638 + keyword. Hashing a compound key produces a unique entry in the
1639 + table for each unique combination of component keys, and can be
1640 + useful for providing more fine-grained summaries of event data.
1641 + Additionally, sort keys consisting of up to two fields can be
1642 + specified by the 'sort' keyword. If more than one field is
1643 + specified, the result will be a 'sort within a sort': the first key
1644 + is taken to be the primary sort key and the second the secondary
1645 + key. If a hist trigger is given a name using the 'name' parameter,
1646 + its histogram data will be shared with other triggers of the same
1647 + name, and trigger hits will update this common data. Only triggers
1648 + with 'compatible' fields can be combined in this way; triggers are
1649 + 'compatible' if the fields named in the trigger share the same
1650 + number and type of fields and those fields also have the same names.
1651 + Note that any two events always share the compatible 'hitcount' and
1652 + 'stacktrace' fields and can therefore be combined using those
1653 + fields, however pointless that may be.
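+ For example, the following sketch (using the same event and fields
+ as a fuller example below) specifies a compound key made up of two
+ fields and a two-field 'sort within a sort':
+ # echo 'hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid,size' > \
+ /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger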
1655 + 'hist' triggers add a 'hist' file to each event's subdirectory.
1656 + Reading the 'hist' file for the event will dump the hash table in
1657 + its entirety to stdout. If there are multiple hist triggers
1658 + attached to an event, there will be a table for each trigger in the
1659 + output. The table displayed for a named trigger will be the same as
1660 + any other instance having the same name. Each printed hash table
1661 + entry is a simple list of the keys and values comprising the entry;
1662 + keys are printed first and are delineated by curly braces, and are
1663 + followed by the set of value fields for the entry. By default,
1664 + numeric fields are displayed as base-10 integers. This can be
1665 + modified by appending any of the following modifiers to the field
1666 + name:
1668 + .hex display a number as a hex value
1669 + .sym display an address as a symbol
1670 + .sym-offset display an address as a symbol and offset
1671 + .syscall display a syscall id as a system call name
1672 + .execname display a common_pid as a program name
1673 + .log2 display log2 value rather than raw number
1674 + .usecs display a common_timestamp in microseconds
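+ For example, appending '.syscall' to the 'id' key of the
+ raw_syscalls/sys_enter event (a usage expanded on in the examples
+ below) displays syscall names rather than raw syscall ids in the
+ output:
+ # echo 'hist:keys=id.syscall:vals=hitcount' > \
+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger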
1676 + Note that in general the semantics of a given field aren't
1677 + interpreted when applying a modifier to it, but there are some
1678 + restrictions to be aware of in this regard:
1680 + - only the 'hex' modifier can be used for values (because values
1681 + are essentially sums, and the other modifiers don't make sense
1682 + in that context).
1683 + - the 'execname' modifier can only be used on a 'common_pid'. The
1684 + reason for this is that the execname is simply the 'comm' value
1685 + saved for the 'current' process when an event was triggered,
1686 + which is the same as the common_pid value saved by the event
1687 + tracing code. Trying to apply that comm value to other pid
1688 + values wouldn't be correct, and typically events that care save
1689 + pid-specific comm fields in the event itself.
1691 + A typical usage scenario would be the following, which enables a
1692 + hist trigger, reads its current contents, and then turns it off:
1694 + # echo 'hist:keys=skbaddr.hex:vals=len' > \
1695 + /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1697 + # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1699 + # echo '!hist:keys=skbaddr.hex:vals=len' > \
1700 + /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1702 + The trigger file itself can be read to show the details of the
1703 + currently attached hist trigger. This information is also displayed
1704 + at the top of the 'hist' file when read.
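+ For example, with the netif_rx trigger above attached, reading its
+ trigger file should show something like:
+ # cat /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+ hist:keys=skbaddr.hex:vals=len:sort=hitcount:size=2048 [active]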
1706 + By default, the size of the hash table is 2048 entries. The 'size'
1707 + parameter can be used to specify more or fewer than that. The units
1708 + are in terms of hashtable entries - if a run uses more entries than
1709 + specified, the results will show the number of 'drops', the number
1710 + of hits that were ignored. The size should be a power of 2 between
1711 + 128 and 131072 (any non-power-of-2 number specified will be rounded
1712 + up).
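+ For example, this sketch attaches the kmalloc trigger used in the
+ examples below with an 8192-entry table (any power of 2 in the
+ allowed range would do):
+ # echo 'hist:keys=call_site:vals=bytes_req:size=8192' > \
+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger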
1714 + The 'sort' parameter can be used to specify a value field to sort
1715 + on. The default if unspecified is 'hitcount' and the default sort
1716 + order is 'ascending'. To sort in the opposite direction, append
1717 + '.descending' to the sort key.
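+ For example, to sort that same kmalloc histogram by the 'bytes_req'
+ sum, largest first:
+ # echo 'hist:keys=call_site:vals=bytes_req:sort=bytes_req.descending' > \
+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger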
1719 + The 'pause' parameter can be used to pause an existing hist trigger
1720 + or to start a hist trigger but not log any events until told to do
1721 + so. 'continue' or 'cont' can be used to start or restart a paused
1722 + hist trigger.
1724 + The 'clear' parameter will clear the contents of a running hist
1725 + trigger and leave its current paused/active state unchanged.
1727 + Note that the 'pause', 'cont', and 'clear' parameters should be
1728 + applied to an existing trigger using the 'append' shell operator
1729 + ('>>'), as in the example below, rather than via the '>' operator,
1730 + which will cause the trigger to be removed through truncation.
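+ For example, assuming a plain 'hist:keys=call_site:vals=bytes_req'
+ trigger is already attached, the following pauses, restarts, and
+ finally clears it without removing it:
+ # echo 'hist:keys=call_site:vals=bytes_req:pause' >> \
+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+ # echo 'hist:keys=call_site:vals=bytes_req:cont' >> \
+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+ # echo 'hist:keys=call_site:vals=bytes_req:clear' >> \
+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger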
1732 +- enable_hist/disable_hist
1734 + The enable_hist and disable_hist triggers can be used to have one
1735 + event conditionally start and stop another event's already-attached
1736 + hist trigger. Any number of enable_hist and disable_hist triggers
1737 + can be attached to a given event, allowing that event to kick off
1738 + and stop aggregations on a host of other events.
1740 + The format is very similar to the enable/disable_event triggers:
1742 + enable_hist:<system>:<event>[:count]
1743 + disable_hist:<system>:<event>[:count]
1745 + Instead of enabling or disabling the tracing of the target event
1746 + into the trace buffer as the enable/disable_event triggers do, the
1747 + enable/disable_hist triggers enable or disable the aggregation of
1748 + the target event into a hash table.
1750 + A typical usage scenario for the enable_hist/disable_hist triggers
1751 + would be to first set up a paused hist trigger on some event,
1752 + followed by an enable_hist/disable_hist pair that turns the hist
1753 + aggregation on and off when conditions of interest are hit:
1755 + # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
1756 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1758 + # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
1759 + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1761 + # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
1762 + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1764 + The above sets up an initially paused hist trigger which is unpaused
1765 + and starts aggregating events when a given program is executed, and
1766 + which stops aggregating when the process exits and the hist trigger
1767 + is paused.
1769 + The examples below provide a more concrete illustration of the
1770 + concepts and typical usage patterns discussed above.
1772 + 'special' event fields
1773 + ------------------------
1775 + There are a number of 'special event fields' available for use as
1776 + keys or values in a hist trigger. These look like and behave as if
1777 + they were actual event fields, but aren't really part of the event's
1778 + field definition or format file. They are however available for any
1779 + event, and can be used anywhere an actual event field could be.
1782 + common_timestamp u64 - timestamp (from ring buffer) associated
1783 + with the event, in nanoseconds. May be
1784 + modified by .usecs to have timestamps
1785 + interpreted as microseconds.
1786 + cpu int - the cpu on which the event occurred.
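+ For example, 'cpu' can be used as a key like any ordinary event
+ field; the sketch below (using sched_switch purely as an
+ illustration) keeps a per-cpu count of event hits:
+ # echo 'hist:keys=cpu:vals=hitcount' > \
+ /sys/kernel/debug/tracing/events/sched/sched_switch/trigger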
1788 + Extended error information
1789 + --------------------------
1791 + For some error conditions encountered when invoking a hist trigger
1792 + command, extended error information is available via the
1793 + corresponding event's 'hist' file. Reading the hist file after an
1794 + error will display more detailed information about what went wrong,
1795 + if information is available. This extended error information will
1796 + be available until the next hist trigger command for that event.
1798 + If available for a given error condition, the extended error
1799 + information and usage take the following form:
1801 + # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
1802 + echo: write error: Invalid argument
1804 + # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist
1805 + ERROR: Couldn't yyy: zzz
1808 +2.1 'hist' trigger examples
1809 +---------------------------
1811 + The first set of examples creates aggregations using the kmalloc
1812 + event. The fields that can be used for the hist trigger are listed
1813 + in the kmalloc event's format file:
1815 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
1819 + field:unsigned short common_type; offset:0; size:2; signed:0;
1820 + field:unsigned char common_flags; offset:2; size:1; signed:0;
1821 + field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
1822 + field:int common_pid; offset:4; size:4; signed:1;
1824 + field:unsigned long call_site; offset:8; size:8; signed:0;
1825 + field:const void * ptr; offset:16; size:8; signed:0;
1826 + field:size_t bytes_req; offset:24; size:8; signed:0;
1827 + field:size_t bytes_alloc; offset:32; size:8; signed:0;
1828 + field:gfp_t gfp_flags; offset:40; size:4; signed:0;
1830 + We'll start by creating a hist trigger that generates a simple table
1831 + that lists the total number of bytes requested for each function in
1832 + the kernel that made one or more calls to kmalloc:
1834 + # echo 'hist:key=call_site:val=bytes_req' > \
1835 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1837 + This tells the tracing system to create a 'hist' trigger using the
1838 + call_site field of the kmalloc event as the key for the table, which
1839 + just means that each unique call_site address will have an entry
1840 + created for it in the table. The 'val=bytes_req' parameter tells
1841 + the hist trigger that for each unique entry (call_site) in the
1842 + table, it should keep a running total of the number of bytes
1843 + requested by that call_site.
1845 + We'll let it run for a while and then dump the contents of the 'hist'
1846 + file in the kmalloc event's subdirectory (for readability, a number
1847 + of entries have been omitted):
1849 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
1850 + # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
1852 + { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
1853 + { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
1854 + { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
1855 + { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
1856 + { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
1857 + { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
1858 + { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
1859 + { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
1860 + { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
1861 + { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
1865 + { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
1866 + { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
1867 + { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
1868 + { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
1869 + { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
1870 + { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
1871 + { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
1872 + { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
1873 + { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
1874 + { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
1875 + { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
1876 + { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
1883 + The output displays a line for each entry, beginning with the key
1884 + specified in the trigger, followed by the value(s) also specified in
1885 + the trigger. At the beginning of the output is a line that displays
1886 + the trigger info, which can also be displayed by reading the
1887 + 'trigger' file:
1889 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1890 + hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
1892 + At the end of the output are a few lines that display the overall
1893 + totals for the run. The 'Hits' field shows the total number of
1894 + times the event trigger was hit, the 'Entries' field shows the total
1895 + number of used entries in the hash table, and the 'Dropped' field
1896 + shows the number of hits that were dropped because the number of
1897 + used entries for the run exceeded the maximum number of entries
1898 + allowed for the table (normally 0, but if not, a hint that you may
1899 + want to increase the size of the table using the 'size' parameter).
1901 + Notice in the above output that there's an extra field, 'hitcount',
1902 + which wasn't specified in the trigger. Also notice that in the
1903 + trigger info output, there's a parameter, 'sort=hitcount', which
1904 + wasn't specified in the trigger either. The reason for that is that
1905 + every trigger implicitly keeps a count of the total number of hits
1906 + attributed to a given entry, called the 'hitcount'. That hitcount
1907 + information is explicitly displayed in the output, and in the
1908 + absence of a user-specified sort parameter, is used as the default
1909 + sort key.
1911 + The value 'hitcount' can be used in place of an explicit value in
1912 + the 'values' parameter if you don't really need to have any
1913 + particular field summed and are mainly interested in hit
1914 + frequencies.
1916 + To turn the hist trigger off, simply call up the trigger in the
1917 + command history and re-execute it with a '!' prepended:
1919 + # echo '!hist:key=call_site:val=bytes_req' > \
1920 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1922 + Finally, notice that the call_site as displayed in the output above
1923 + isn't really very useful. It's an address, but normally addresses
1924 + are displayed in hex. To have a numeric field displayed as a hex
1925 + value, simply append '.hex' to the field name in the trigger:
1927 + # echo 'hist:key=call_site.hex:val=bytes_req' > \
1928 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1930 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
1931 + # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
1933 + { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
1934 + { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
1935 + { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
1936 + { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
1937 + { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
1938 + { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
1939 + { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
1940 + { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
1941 + { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
1942 + { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
1943 + { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
1944 + { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
1948 + { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
1949 + { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
1950 + { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
1951 + { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
1952 + { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
1953 + { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
1954 + { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
1955 + { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
1956 + { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
1957 + { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
1958 + { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
1965 + Even that's only marginally more useful - while hex values do look
1966 + more like addresses, what users are typically more interested in
1967 + when looking at text addresses are the corresponding symbols
1968 + instead. To have an address displayed as a symbolic value, simply
1969 + append '.sym' or '.sym-offset' to the field name in the trigger:
1972 + # echo 'hist:key=call_site.sym:val=bytes_req' > \
1973 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1975 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
1976 + # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
1978 + { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
1979 + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
1980 + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
1981 + { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
1982 + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
1983 + { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
1984 + { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
1985 + { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
1986 + { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
1987 + { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
1988 + { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
1989 + { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
1990 + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
1991 + { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
1995 + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
1996 + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
1997 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
1998 + { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
1999 + { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
2000 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
2001 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
2002 + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
2003 + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
2004 + { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
2005 + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
2006 + { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
2007 + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
2014 + Because the default sort key above is 'hitcount', the above shows
2015 + the list of call_sites by increasing hitcount, so that at the bottom
2016 + we see the functions that made the most kmalloc calls during the
2017 + run. If instead we wanted to see the top kmalloc callers in terms
2018 + of the number of bytes requested rather than the number of calls,
2019 + and we wanted the top caller to appear at the top, we can use the
2020 + 'sort' parameter, along with the 'descending' modifier:
2022 + # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
2023 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2025 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2026 + # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
2028 + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
2029 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
2030 + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
2031 + { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
2032 + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
2033 + { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
2034 + { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
2035 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
2036 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
2037 + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
2038 + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
2039 + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
2040 + { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
2044 + { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
2045 + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
2046 + { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
2047 + { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
2048 + { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
2049 + { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
2050 + { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
2051 + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
2052 + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
2053 + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
2060 + To display the offset and size information in addition to the symbol
2061 + name, just use 'sym-offset' instead:
2063 + # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
2064 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2066 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2067 + # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
2069 + { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
2070 + { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
2071 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
2072 + { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
2073 + { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
2074 + { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
2075 + { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
2076 + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
2077 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
2078 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
2079 + { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
2080 + { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
2084 + { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
2085 + { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
2086 + { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
2087 + { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
2088 + { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
2089 + { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
2090 + { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
2097 + We can also add multiple fields to the 'values' parameter. For
2098 + example, we might want to see the total number of bytes allocated
2099 + alongside bytes requested, and display the result sorted by bytes
2100 + allocated in descending order:
2102 + # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
2103 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2105 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2106 + # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
2108 + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
2109 + { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
2110 + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
2111 + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
2112 + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
2113 + { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
2114 + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
2115 + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
2116 + { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
2117 + { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
2118 + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
2119 + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
2123 + { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
2124 + { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2125 + { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2126 + { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2127 + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2128 + { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
2129 + { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
2130 + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
2131 + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
2132 + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
2139 + Finally, to finish off our kmalloc example, instead of simply having
2140 + the hist trigger display symbolic call_sites, we can have the hist
2141 + trigger additionally display the complete set of kernel stack traces
2142 + that led to each call_site. To do that, we simply use the special
2143 + value 'stacktrace' for the key parameter:
2145 + # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
2146 + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2148 + The above trigger will use the kernel stack trace in effect when an
2149 + event is triggered as the key for the hash table. This allows the
2150 + enumeration of every kernel callpath that led up to a particular
2151 + event, along with a running total of any of the event fields for
2152 + that event. Here we tally bytes requested and bytes allocated for
2153 + every callpath in the system that led up to a kmalloc (in this case
2154 + every callpath to a kmalloc for a kernel compile):
2156 + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2157 + # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
2159 + { stacktrace:
2160 + __kmalloc_track_caller+0x10b/0x1a0
2162 + hidraw_report_event+0x8a/0x120 [hid]
2163 + hid_report_raw_event+0x3ea/0x440 [hid]
2164 + hid_input_report+0x112/0x190 [hid]
2165 + hid_irq_in+0xc2/0x260 [usbhid]
2166 + __usb_hcd_giveback_urb+0x72/0x120
2167 + usb_giveback_urb_bh+0x9e/0xe0
2168 + tasklet_hi_action+0xf8/0x100
2169 + __do_softirq+0x114/0x2c0
2170 + irq_exit+0xa5/0xb0
2172 + ret_from_intr+0x0/0x30
2173 + cpuidle_enter+0x17/0x20
2174 + cpu_startup_entry+0x315/0x3e0
2175 + rest_init+0x7c/0x80
2176 + } hitcount: 3 bytes_req: 21 bytes_alloc: 24
2177 + { stacktrace:
2178 + __kmalloc_track_caller+0x10b/0x1a0
2180 + hidraw_report_event+0x8a/0x120 [hid]
2181 + hid_report_raw_event+0x3ea/0x440 [hid]
2182 + hid_input_report+0x112/0x190 [hid]
2183 + hid_irq_in+0xc2/0x260 [usbhid]
2184 + __usb_hcd_giveback_urb+0x72/0x120
2185 + usb_giveback_urb_bh+0x9e/0xe0
2186 + tasklet_hi_action+0xf8/0x100
2187 + __do_softirq+0x114/0x2c0
2188 + irq_exit+0xa5/0xb0
2190 + ret_from_intr+0x0/0x30
2191 + } hitcount: 3 bytes_req: 21 bytes_alloc: 24
2192 + { stacktrace:
2193 + kmem_cache_alloc_trace+0xeb/0x150
2194 + aa_alloc_task_context+0x27/0x40
2195 + apparmor_cred_prepare+0x1f/0x50
2196 + security_prepare_creds+0x16/0x20
2197 + prepare_creds+0xdf/0x1a0
2198 + SyS_capset+0xb5/0x200
2199 + system_call_fastpath+0x12/0x6a
2200 + } hitcount: 1 bytes_req: 32 bytes_alloc: 32
2204 + { stacktrace:
2205 + __kmalloc+0x11b/0x1b0
2206 + i915_gem_execbuffer2+0x6c/0x2c0 [i915]
2207 + drm_ioctl+0x349/0x670 [drm]
2208 + do_vfs_ioctl+0x2f0/0x4f0
2209 + SyS_ioctl+0x81/0xa0
2210 + system_call_fastpath+0x12/0x6a
2211 + } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
2212 + { stacktrace:
2213 + __kmalloc+0x11b/0x1b0
2214 + load_elf_phdrs+0x76/0xa0
2215 + load_elf_binary+0x102/0x1650
2216 + search_binary_handler+0x97/0x1d0
2217 + do_execveat_common.isra.34+0x551/0x6e0
2218 + SyS_execve+0x3a/0x50
2219 + return_from_execve+0x0/0x23
2220 + } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
2221 + { stacktrace:
2222 + kmem_cache_alloc_trace+0xeb/0x150
2223 + apparmor_file_alloc_security+0x27/0x40
2224 + security_file_alloc+0x16/0x20
2225 + get_empty_filp+0x93/0x1c0
2226 + path_openat+0x31/0x5f0
2227 + do_filp_open+0x3a/0x90
2228 + do_sys_open+0x128/0x220
2229 + SyS_open+0x1e/0x20
2230 + system_call_fastpath+0x12/0x6a
2231 + } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
2232 + { stacktrace:
2233 + __kmalloc+0x11b/0x1b0
2234 + seq_buf_alloc+0x1b/0x50
2235 + seq_read+0x2cc/0x370
2236 + proc_reg_read+0x3d/0x80
2237 + __vfs_read+0x28/0xe0
2238 + vfs_read+0x86/0x140
2239 + SyS_read+0x46/0xb0
2240 + system_call_fastpath+0x12/0x6a
2241 + } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
2248 + If you key a hist trigger on common_pid, for example in order to
2249 + gather and display sorted totals for each process, you can use the
2250 + special .execname modifier to display the executable names for the
2251 + processes in the table rather than raw pids. The example below
2252 + keeps a per-process sum of total bytes read:
2254 + # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
2255 + /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
2257 + # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
2258 + # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
2260 + { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
2261 + { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
2262 + { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
2263 + { common_pid: bash [ 8710] } hitcount: 3 count: 66369
2264 + { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
2265 + { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
2266 + { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
2267 + { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
2268 + { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
2269 + { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
2270 + { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
2274 + { common_pid: postgres [ 1892] } hitcount: 2 count: 32
2275 + { common_pid: postgres [ 1891] } hitcount: 2 count: 32
2276 + { common_pid: gmain [ 8704] } hitcount: 2 count: 32
2277 + { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
2278 + { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
2279 + { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
2280 + { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
2281 + { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
2282 + { common_pid: init [ 1] } hitcount: 2 count: 2
2289 + Similarly, if you key a hist trigger on syscall id, for example to
2290 + gather and display a list of systemwide syscall hits, you can use
2291 + the special .syscall modifier to display the syscall names rather
2292 + than raw ids. The example below keeps a running total of syscall
2293 + counts for the system during the run:
2295 + # echo 'hist:key=id.syscall:val=hitcount' > \
2296 + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
2298 + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
2299 + # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
2301 + { id: sys_fsync [ 74] } hitcount: 1
2302 + { id: sys_newuname [ 63] } hitcount: 1
2303 + { id: sys_prctl [157] } hitcount: 1
2304 + { id: sys_statfs [137] } hitcount: 1
2305 + { id: sys_symlink [ 88] } hitcount: 1
2306 + { id: sys_sendmmsg [307] } hitcount: 1
2307 + { id: sys_semctl [ 66] } hitcount: 1
2308 + { id: sys_readlink [ 89] } hitcount: 3
2309 + { id: sys_bind [ 49] } hitcount: 3
2310 + { id: sys_getsockname [ 51] } hitcount: 3
2311 + { id: sys_unlink [ 87] } hitcount: 3
2312 + { id: sys_rename [ 82] } hitcount: 4
2313 + { id: unknown_syscall [ 58] } hitcount: 4
2314 + { id: sys_connect [ 42] } hitcount: 4
2315 + { id: sys_getpid [ 39] } hitcount: 4
2319 + { id: sys_rt_sigprocmask [ 14] } hitcount: 952
2320 + { id: sys_futex [202] } hitcount: 1534
2321 + { id: sys_write [ 1] } hitcount: 2689
2322 + { id: sys_setitimer [ 38] } hitcount: 2797
2323 + { id: sys_read [ 0] } hitcount: 3202
2324 + { id: sys_select [ 23] } hitcount: 3773
2325 + { id: sys_writev [ 20] } hitcount: 4531
2326 + { id: sys_poll [ 7] } hitcount: 8314
2327 + { id: sys_recvmsg [ 47] } hitcount: 13738
2328 + { id: sys_ioctl [ 16] } hitcount: 21843
2335 + The syscall counts above provide a rough overall picture of system
2336 + call activity on the system; we can see for example that the most
2337 + popular system call on this system was the 'sys_ioctl' system call.
2339 + We can use 'compound' keys to refine that number and provide some
2340 + further insight as to which processes exactly contribute to the
2341 + overall ioctl count.
2343 + The command below keeps a hitcount for every unique combination of
2344 + system call id and pid - the end result is essentially a table
2345 + that keeps a per-pid sum of system call hits. The results are
2346 + sorted using the system call id as the primary key, and the
2347 + hitcount sum as the secondary key:
2349 + # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
2350 + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
2352 + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
2353 + # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
2355 + { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
2356 + { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
2357 + { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
2358 + { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
2359 + { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
2360 + { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
2361 + { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
2362 + { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
2363 + { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
2364 + { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
2368 + { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
2369 + { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
2370 + { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
2371 + { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
2372 + { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
2376 + { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
2377 + { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
2378 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
2379 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
2380 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
2381 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
2382 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
2383 + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
2384 + { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
2385 + { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
2386 + { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
2393 + The above list does give us a breakdown of the ioctl syscall by
2394 + pid, but it also gives us quite a bit more than that, which we
2395 + don't really care about at the moment. Since we know the syscall
2396 + id for sys_ioctl (16, displayed next to the sys_ioctl name), we
2397 + can use that to filter out all the other syscalls:
2399 + # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
2400 + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
2402 + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
2403 + # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
2405 + { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
2406 + { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
2407 + { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
2408 + { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
2409 + { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
2410 + { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
2411 + { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
2412 + { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
2413 + { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
2417 + { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
2418 + { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
2419 + { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
2420 + { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
2421 + { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
2422 + { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
2429 + The above output shows that 'compiz' and 'Xorg' are far and away
2430 + the heaviest ioctl callers (which might lead to questions about
2431 + whether they really need to be making all those calls and to
2432 + possible avenues for further investigation.)
2434 + The compound key examples used a key and a sum value (hitcount) to
2435 + sort the output, but we can just as easily use two keys instead.
2436 + Here's an example where we use a compound key composed of the
2437 + common_pid and size event fields. Sorting with pid as the primary
2438 + key and 'size' as the secondary key allows us to display an
2439 + ordered summary of the recvfrom sizes, with counts, received by
2440 + each process:
2442 + # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
2443 + /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
2445 + # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
2446 + # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
2448 + { common_pid: smbd [ 784], size: 4 } hitcount: 1
2449 + { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
2450 + { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
2451 + { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
2452 + { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
2453 + { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
2454 + { common_pid: compiz [ 2994], size: 8 } hitcount: 1
2455 + { common_pid: compiz [ 2994], size: 20 } hitcount: 11
2456 + { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
2457 + { common_pid: firefox [ 8817], size: 4 } hitcount: 1
2458 + { common_pid: firefox [ 8817], size: 8 } hitcount: 5
2459 + { common_pid: firefox [ 8817], size: 588 } hitcount: 2
2460 + { common_pid: firefox [ 8817], size: 628 } hitcount: 1
2461 + { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
2462 + { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
2463 + { common_pid: firefox [ 8822], size: 8 } hitcount: 2
2464 + { common_pid: firefox [ 8822], size: 160 } hitcount: 2
2465 + { common_pid: firefox [ 8822], size: 320 } hitcount: 2
2466 + { common_pid: firefox [ 8822], size: 352 } hitcount: 1
2470 + { common_pid: pool [ 8923], size: 1960 } hitcount: 10
2471 + { common_pid: pool [ 8923], size: 2048 } hitcount: 10
2472 + { common_pid: pool [ 8924], size: 1960 } hitcount: 10
2473 + { common_pid: pool [ 8924], size: 2048 } hitcount: 10
2474 + { common_pid: pool [ 8928], size: 1964 } hitcount: 4
2475 + { common_pid: pool [ 8928], size: 1965 } hitcount: 2
2476 + { common_pid: pool [ 8928], size: 2048 } hitcount: 6
2477 + { common_pid: pool [ 8929], size: 1982 } hitcount: 1
2478 + { common_pid: pool [ 8929], size: 2048 } hitcount: 1
2485 + The above example also illustrates the fact that although a compound
2486 + key is treated as a single entity for hashing purposes, the sub-keys
2487 + it's composed of can be accessed independently.
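+ One consequence, sketched here (any field named in 'keys' can also
+ be named in 'sort', as the earlier 'sort=id,hitcount' example
+ showed), is that the same data could be re-sorted purely by size,
+ ignoring pid, simply by using 'sort=size':
+ # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=size' > \
+ /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger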
2489 + The next example uses a string field as the hash key and
2490 + demonstrates how you can manually pause and continue a hist trigger.
2491 + In this example, we'll aggregate fork counts and don't expect a
2492 + large number of entries in the hash table, so we'll drop it to a
2493 + much smaller number, say 256:
2495 + # echo 'hist:key=child_comm:val=hitcount:size=256' > \
2496 + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
2498 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
2499 + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
2501 + { child_comm: dconf worker } hitcount: 1
2502 + { child_comm: ibus-daemon } hitcount: 1
2503 + { child_comm: whoopsie } hitcount: 1
2504 + { child_comm: smbd } hitcount: 1
2505 + { child_comm: gdbus } hitcount: 1
2506 + { child_comm: kthreadd } hitcount: 1
2507 + { child_comm: dconf worker } hitcount: 1
2508 + { child_comm: evolution-alarm } hitcount: 2
2509 + { child_comm: Socket Thread } hitcount: 2
2510 + { child_comm: postgres } hitcount: 2
2511 + { child_comm: bash } hitcount: 3
2512 + { child_comm: compiz } hitcount: 3
2513 + { child_comm: evolution-sourc } hitcount: 4
2514 + { child_comm: dhclient } hitcount: 4
2515 + { child_comm: pool } hitcount: 5
2516 + { child_comm: nm-dispatcher.a } hitcount: 8
2517 + { child_comm: firefox } hitcount: 8
2518 + { child_comm: dbus-daemon } hitcount: 8
2519 + { child_comm: glib-pacrunner } hitcount: 10
2520 + { child_comm: evolution } hitcount: 23
2527 + If we want to pause the hist trigger, we can simply append :pause to
2528 + the command that started the trigger. Notice that the trigger info
2529 + displays as [paused]:
2531 + # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
2532 + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
2534 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
2535 + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
2537 + { child_comm: dconf worker } hitcount: 1
2538 + { child_comm: kthreadd } hitcount: 1
2539 + { child_comm: dconf worker } hitcount: 1
2540 + { child_comm: gdbus } hitcount: 1
2541 + { child_comm: ibus-daemon } hitcount: 1
2542 + { child_comm: Socket Thread } hitcount: 2
2543 + { child_comm: evolution-alarm } hitcount: 2
2544 + { child_comm: smbd } hitcount: 2
2545 + { child_comm: bash } hitcount: 3
2546 + { child_comm: whoopsie } hitcount: 3
2547 + { child_comm: compiz } hitcount: 3
2548 + { child_comm: evolution-sourc } hitcount: 4
2549 + { child_comm: pool } hitcount: 5
2550 + { child_comm: postgres } hitcount: 6
2551 + { child_comm: firefox } hitcount: 8
2552 + { child_comm: dhclient } hitcount: 10
2553 + { child_comm: emacs } hitcount: 12
2554 + { child_comm: dbus-daemon } hitcount: 20
2555 + { child_comm: nm-dispatcher.a } hitcount: 20
2556 + { child_comm: evolution } hitcount: 35
2557 + { child_comm: glib-pacrunner } hitcount: 59
2564 + To manually continue having the trigger aggregate events, append
2565 + :cont instead. Notice that the trigger info displays as [active]
2566 + again, and the data has changed:
2568 + # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
2569 + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
2571 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
2572 + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
2574 + { child_comm: dconf worker } hitcount: 1
2575 + { child_comm: dconf worker } hitcount: 1
2576 + { child_comm: kthreadd } hitcount: 1
2577 + { child_comm: gdbus } hitcount: 1
2578 + { child_comm: ibus-daemon } hitcount: 1
2579 + { child_comm: Socket Thread } hitcount: 2
2580 + { child_comm: evolution-alarm } hitcount: 2
2581 + { child_comm: smbd } hitcount: 2
2582 + { child_comm: whoopsie } hitcount: 3
2583 + { child_comm: compiz } hitcount: 3
2584 + { child_comm: evolution-sourc } hitcount: 4
2585 + { child_comm: bash } hitcount: 5
2586 + { child_comm: pool } hitcount: 5
2587 + { child_comm: postgres } hitcount: 6
2588 + { child_comm: firefox } hitcount: 8
2589 + { child_comm: dhclient } hitcount: 11
2590 + { child_comm: emacs } hitcount: 12
2591 + { child_comm: dbus-daemon } hitcount: 22
2592 + { child_comm: nm-dispatcher.a } hitcount: 22
2593 + { child_comm: evolution } hitcount: 35
2594 + { child_comm: glib-pacrunner } hitcount: 59
2601 + The previous example showed how to start and stop a hist trigger by
2602 + appending 'pause' and 'continue' to the hist trigger command. A
2603 + hist trigger can also be started in a paused state by initially
2604 + starting the trigger with ':pause' appended. This allows you to
2605 + start the trigger only when you're ready to start collecting data
2606 + and not before. For example, you could start the trigger in a
2607 + paused state, then unpause it and do something you want to measure,
2608 + then pause the trigger again when done.
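+ As a minimal sketch reusing the fork example above, starting a
+ brand-new trigger in the paused state just means including ':pause'
+ in the initial command:
+ # echo 'hist:key=child_comm:val=hitcount:size=256:pause' > \
+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger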
2610 + Of course, doing this manually can be difficult and error-prone, but
2611 + it is possible to automatically start and stop a hist trigger based
2612 + on some condition, via the enable_hist and disable_hist triggers.
2614 + For example, suppose we wanted to take a look at the relative
2615 + weights in terms of skb length for each callpath that leads to a
2616 + netif_receive_skb event when downloading a decent-sized file using
2617 + wget.
2619 + First we set up an initially paused stacktrace trigger on the
2620 + netif_receive_skb event:
2622 + # echo 'hist:key=stacktrace:vals=len:pause' > \
2623 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2625 + Next, we set up an 'enable_hist' trigger on the sched_process_exec
2626 + event, with an 'if filename==/usr/bin/wget' filter. The effect of
2627 + this new trigger is that it will 'unpause' the hist trigger we just
2628 + set up on netif_receive_skb if and only if it sees a
2629 + sched_process_exec event with a filename of '/usr/bin/wget'. When
2630 + that happens, all netif_receive_skb events are aggregated into a
2631 + hash table keyed on stacktrace:
2633 + # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
2634 + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
2636 + The aggregation continues until the netif_receive_skb hist trigger
2637 + is paused again, which is what the following disable_hist event does by
2638 + creating a similar setup on the sched_process_exit event, using the
2639 + filter 'comm==wget':
2641 + # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
2642 + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
2644 + Whenever a process exits and its comm field matches the
2645 + 'comm==wget' filter of the disable_hist trigger, the
2646 + netif_receive_skb hist trigger is disabled.
2648 + The overall effect is that netif_receive_skb events are aggregated
2649 + into the hash table for only the duration of the wget. Executing a
2650 + wget command and then listing the 'hist' file will display the
2651 + output generated by the wget command:
2653 + $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
2655 + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
2656 + # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
2659 + __netif_receive_skb_core+0x46d/0x990
2660 + __netif_receive_skb+0x18/0x60
2661 + netif_receive_skb_internal+0x23/0x90
2662 + napi_gro_receive+0xc8/0x100
2663 + ieee80211_deliver_skb+0xd6/0x270 [mac80211]
2664 + ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
2665 + ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
2666 + ieee80211_rx+0x31d/0x900 [mac80211]
2667 + iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
2668 + iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
2669 + iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
2670 + irq_thread_fn+0x20/0x50
2671 + irq_thread+0x11f/0x150
2673 + ret_from_fork+0x42/0x70
2674 + } hitcount: 85 len: 28884
2676 + __netif_receive_skb_core+0x46d/0x990
2677 + __netif_receive_skb+0x18/0x60
2678 + netif_receive_skb_internal+0x23/0x90
2679 + napi_gro_complete+0xa4/0xe0
2680 + dev_gro_receive+0x23a/0x360
2681 + napi_gro_receive+0x30/0x100
2682 + ieee80211_deliver_skb+0xd6/0x270 [mac80211]
2683 + ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
2684 + ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
2685 + ieee80211_rx+0x31d/0x900 [mac80211]
2686 + iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
2687 + iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
2688 + iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
2689 + irq_thread_fn+0x20/0x50
2690 + irq_thread+0x11f/0x150
2692 + } hitcount: 98 len: 664329
2694 + __netif_receive_skb_core+0x46d/0x990
2695 + __netif_receive_skb+0x18/0x60
2696 + process_backlog+0xa8/0x150
2697 + net_rx_action+0x15d/0x340
2698 + __do_softirq+0x114/0x2c0
2699 + do_softirq_own_stack+0x1c/0x30
2700 + do_softirq+0x65/0x70
2701 + __local_bh_enable_ip+0xb5/0xc0
2702 + ip_finish_output+0x1f4/0x840
2703 + ip_output+0x6b/0xc0
2704 + ip_local_out_sk+0x31/0x40
2705 + ip_send_skb+0x1a/0x50
2706 + udp_send_skb+0x173/0x2a0
2707 + udp_sendmsg+0x2bf/0x9f0
2708 + inet_sendmsg+0x64/0xa0
2709 + sock_sendmsg+0x3d/0x50
2710 + } hitcount: 115 len: 13030
2712 + __netif_receive_skb_core+0x46d/0x990
2713 + __netif_receive_skb+0x18/0x60
2714 + netif_receive_skb_internal+0x23/0x90
2715 + napi_gro_complete+0xa4/0xe0
2716 + napi_gro_flush+0x6d/0x90
2717 + iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
2718 + irq_thread_fn+0x20/0x50
2719 + irq_thread+0x11f/0x150
2721 + ret_from_fork+0x42/0x70
2722 + } hitcount: 934 len: 5512212
2729 + The above shows all the netif_receive_skb callpaths and their total
2730 + lengths for the duration of the wget command.
2732 + The 'clear' hist trigger param can be used to clear the hash table.
2733 + Suppose we wanted to try another run of the previous example but
2734 + this time also wanted to see the complete list of events that went
2735 + into the histogram. In order to avoid having to set everything up
2736 + again, we can just clear the histogram first:
2738 + # echo 'hist:key=stacktrace:vals=len:clear' >> \
2739 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2741 + Just to verify that it is in fact cleared, here's what we now see in
2742 + the hist file:
2744 + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
2745 + # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
2752 + Since we want to see the detailed list of every netif_receive_skb
2753 + event occurring during the new run, which are in fact the same
2754 + events being aggregated into the hash table, we add some additional
2755 + 'enable_event' events to the triggering sched_process_exec and
2756 + sched_process_exit events as such:
2758 + # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
2759 + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
2761 + # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
2762 + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
2764 + If you read the trigger files for the sched_process_exec and
2765 + sched_process_exit triggers, you should see two triggers for each:
2766 + one enabling/disabling the hist aggregation and the other
2767 + enabling/disabling the logging of events:
2769 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
2770 + enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
2771 + enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
2773 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
2774 + enable_event:net:netif_receive_skb:unlimited if comm==wget
2775 + disable_hist:net:netif_receive_skb:unlimited if comm==wget
2777 + In other words, whenever either of the sched_process_exec or
2778 + sched_process_exit events is hit and matches 'wget', it enables or
2779 + disables both the histogram and the event log, and what you end up
2780 + with is a hash table and set of events just covering the specified
2781 + duration. Run the wget command again:
2783 + $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
2785 + Displaying the 'hist' file should show something similar to what you
2786 + saw in the last run, but this time you should also see the
2787 + individual events in the trace file:
2789 + # cat /sys/kernel/debug/tracing/trace
2793 + # entries-in-buffer/entries-written: 183/1426 #P:4
2795 + # _-----=> irqs-off
2796 + # / _----=> need-resched
2797 + # | / _---=> hardirq/softirq
2798 + # || / _--=> preempt-depth
2800 + # TASK-PID CPU# |||| TIMESTAMP FUNCTION
2802 + wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
2803 + wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
2804 + dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
2805 + dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
2806 + ##### CPU 2 buffer started ####
2807 + irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
2808 + irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
2809 + irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
2810 + irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
2811 + irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
2816 + The following example demonstrates how multiple hist triggers can be
2817 + attached to a given event. This capability can be useful for
2818 + creating a set of different summaries derived from the same set of
2819 + events, or for comparing the effects of different filters, among
2820 + other things:
2822 + # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
2823 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2824 + # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
2825 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2826 + # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
2827 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2828 + # echo 'hist:keys=skbaddr.hex:vals=len' >> \
2829 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2830 + # echo 'hist:keys=len:vals=common_preempt_count' >> \
2831 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2833 + The above set of commands creates four triggers differing only in
2834 + their filters, along with a completely different though fairly
2835 + nonsensical trigger. Note that in order to append multiple hist
2836 + triggers to the same file, you should use the '>>' operator to
2837 + append them ('>' will also add the new hist trigger, but will remove
2838 + any existing hist triggers beforehand).
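+ As a quick sanity check, reading the event's 'trigger' file should
+ list all five hist triggers, one per line, in a form similar to the
+ '# trigger info:' lines shown below:
+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger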
2840 + Displaying the contents of the 'hist' file for the event shows the
2841 + contents of all five histograms:
2843 + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
2847 + # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
2850 + { len: 176 } hitcount: 1 common_preempt_count: 0
2851 + { len: 223 } hitcount: 1 common_preempt_count: 0
2852 + { len: 4854 } hitcount: 1 common_preempt_count: 0
2853 + { len: 395 } hitcount: 1 common_preempt_count: 0
2854 + { len: 177 } hitcount: 1 common_preempt_count: 0
2855 + { len: 446 } hitcount: 1 common_preempt_count: 0
2856 + { len: 1601 } hitcount: 1 common_preempt_count: 0
2860 + { len: 1280 } hitcount: 66 common_preempt_count: 0
2861 + { len: 116 } hitcount: 81 common_preempt_count: 40
2862 + { len: 708 } hitcount: 112 common_preempt_count: 0
2863 + { len: 46 } hitcount: 221 common_preempt_count: 0
2864 + { len: 1264 } hitcount: 458 common_preempt_count: 0
2874 + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
2877 + { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
2878 + { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
2879 + { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
2880 + { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
2881 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
2882 + { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
2883 + { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
2884 + { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
2885 + { skbaddr: ffff880100065900 } hitcount: 1 len: 46
2886 + { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
2887 + { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
2888 + { skbaddr: ffff880100064700 } hitcount: 1 len: 365
2889 + { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
2893 + { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
2894 + { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
2895 + { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
2896 + { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
2897 + { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
2898 + { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
2899 + { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
2900 + { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
2901 + { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
2911 + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
2923 + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
2926 + { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
2927 + { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
2928 + { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
2929 + { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
2930 + { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
2931 + { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
2932 + { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
2933 + { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
2934 + { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
2935 + { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
2936 + { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
2937 + { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
2947 + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
2956 + Named triggers can be used to have triggers share a common set of
2957 + histogram data. This capability is mostly useful for combining the
2958 + output of events generated by tracepoints contained inside inline
2959 + functions, but names can be used in a hist trigger on any event.
2960 + For example, these two triggers when hit will update the same 'len'
2961 + field in the shared 'foo' histogram data:
2963 + # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
2964 + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2965 + # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
2966 + /sys/kernel/debug/tracing/events/net/netif_rx/trigger
2968 + You can see that they're updating common histogram data by reading
2969 + each event's hist files at the same time:
2971 + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
2972 + cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
2976 + # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
2979 + { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
2980 + { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
2981 + { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
2982 + { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
2983 + { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
2984 + { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
2985 + { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
2986 + { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
2987 + { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
2988 + { skbaddr: ffff880064505000 } hitcount: 1 len: 46
2989 + { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
2990 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
2991 + { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
2992 + { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
2993 + { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
2994 + { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
2995 + { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
2996 + { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
2997 + { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
2998 + { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
2999 + { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
3000 + { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
3001 + { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
3002 + { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
3003 + { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
3004 + { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
3005 + { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
3006 + { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
3007 + { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
3008 + { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
3009 + { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
3010 + { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
3011 + { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
3012 + { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
3013 + { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
3014 + { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
3015 + { skbaddr: ffff880064504400 } hitcount: 4 len: 184
3016 + { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
3017 + { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
3018 + { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
3019 + { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
3020 + { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
3028 + # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
3031 + { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
3032 + { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
3033 + { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
3034 + { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
3035 + { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
3036 + { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
3037 + { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
3038 + { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
3039 + { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
3040 + { skbaddr: ffff880064505000 } hitcount: 1 len: 46
3041 + { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
3042 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
3043 + { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
3044 + { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
3045 + { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
3046 + { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
3047 + { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
3048 + { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
3049 + { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
3050 + { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
3051 + { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
3052 + { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
3053 + { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
3054 + { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
3055 + { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
3056 + { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
3057 + { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
3058 + { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
3059 + { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
3060 + { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
3061 + { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
3062 + { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
3063 + { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
3064 + { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
3065 + { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
3066 + { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
3067 + { skbaddr: ffff880064504400 } hitcount: 4 len: 184
3068 + { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
3069 + { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
3070 + { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
3071 + { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
3072 + { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
3079 + And here's an example that shows how to combine histogram data from
3080 + any two events even if they don't share any 'compatible' fields
3081 + other than 'hitcount' and 'stacktrace'. These commands create a
3082 + couple of triggers named 'bar' using those fields:
3084 + # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
3085 + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
3086 + # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
3087 + /sys/kernel/debug/tracing/events/net/netif_rx/trigger
3089 + And displaying the output of either shows some interesting if
3090 + somewhat confusing output:
3092 + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
3093 + # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
3097 + # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
3101 + _do_fork+0x18e/0x330
3102 + kernel_thread+0x29/0x30
3103 + kthreadd+0x154/0x1b0
3104 + ret_from_fork+0x3f/0x70
3107 + netif_rx_internal+0xb2/0xd0
3108 + netif_rx_ni+0x20/0x70
3109 + dev_loopback_xmit+0xaa/0xd0
3110 + ip_mc_output+0x126/0x240
3111 + ip_local_out_sk+0x31/0x40
3112 + igmp_send_report+0x1e9/0x230
3113 + igmp_timer_expire+0xe9/0x120
3114 + call_timer_fn+0x39/0xf0
3115 + run_timer_softirq+0x1e1/0x290
3116 + __do_softirq+0xfd/0x290
3117 + irq_exit+0x98/0xb0
3118 + smp_apic_timer_interrupt+0x4a/0x60
3119 + apic_timer_interrupt+0x6d/0x80
3120 + cpuidle_enter+0x17/0x20
3121 + call_cpuidle+0x3b/0x60
3122 + cpu_startup_entry+0x22d/0x310
3125 + netif_rx_internal+0xb2/0xd0
3126 + netif_rx_ni+0x20/0x70
3127 + dev_loopback_xmit+0xaa/0xd0
3128 + ip_mc_output+0x17f/0x240
3129 + ip_local_out_sk+0x31/0x40
3130 + ip_send_skb+0x1a/0x50
3131 + udp_send_skb+0x13e/0x270
3132 + udp_sendmsg+0x2bf/0x980
3133 + inet_sendmsg+0x67/0xa0
3134 + sock_sendmsg+0x38/0x50
3135 + SYSC_sendto+0xef/0x170
3136 + SyS_sendto+0xe/0x10
3137 + entry_SYSCALL_64_fastpath+0x12/0x6a
3140 + netif_rx_internal+0xb2/0xd0
3141 + netif_rx+0x1c/0x60
3142 + loopback_xmit+0x6c/0xb0
3143 + dev_hard_start_xmit+0x219/0x3a0
3144 + __dev_queue_xmit+0x415/0x4f0
3145 + dev_queue_xmit_sk+0x13/0x20
3146 + ip_finish_output2+0x237/0x340
3147 + ip_finish_output+0x113/0x1d0
3148 + ip_output+0x66/0xc0
3149 + ip_local_out_sk+0x31/0x40
3150 + ip_send_skb+0x1a/0x50
3151 + udp_send_skb+0x16d/0x270
3152 + udp_sendmsg+0x2bf/0x980
3153 + inet_sendmsg+0x67/0xa0
3154 + sock_sendmsg+0x38/0x50
3155 + ___sys_sendmsg+0x14e/0x270
3158 + netif_rx_internal+0xb2/0xd0
3159 + netif_rx+0x1c/0x60
3160 + loopback_xmit+0x6c/0xb0
3161 + dev_hard_start_xmit+0x219/0x3a0
3162 + __dev_queue_xmit+0x415/0x4f0
3163 + dev_queue_xmit_sk+0x13/0x20
3164 + ip_finish_output2+0x237/0x340
3165 + ip_finish_output+0x113/0x1d0
3166 + ip_output+0x66/0xc0
3167 + ip_local_out_sk+0x31/0x40
3168 + ip_send_skb+0x1a/0x50
3169 + udp_send_skb+0x16d/0x270
3170 + udp_sendmsg+0x2bf/0x980
3171 + inet_sendmsg+0x67/0xa0
3172 + sock_sendmsg+0x38/0x50
3173 + ___sys_sendmsg+0x269/0x270
3176 + netif_rx_internal+0xb2/0xd0
3177 + netif_rx+0x1c/0x60
3178 + loopback_xmit+0x6c/0xb0
3179 + dev_hard_start_xmit+0x219/0x3a0
3180 + __dev_queue_xmit+0x415/0x4f0
3181 + dev_queue_xmit_sk+0x13/0x20
3182 + ip_finish_output2+0x237/0x340
3183 + ip_finish_output+0x113/0x1d0
3184 + ip_output+0x66/0xc0
3185 + ip_local_out_sk+0x31/0x40
3186 + ip_send_skb+0x1a/0x50
3187 + udp_send_skb+0x16d/0x270
3188 + udp_sendmsg+0x2bf/0x980
3189 + inet_sendmsg+0x67/0xa0
3190 + sock_sendmsg+0x38/0x50
3191 + SYSC_sendto+0xef/0x170
3194 + _do_fork+0x18e/0x330
3195 + SyS_clone+0x19/0x20
3196 + entry_SYSCALL_64_fastpath+0x12/0x6a
3205 +2.2 Inter-event hist triggers
3206 +-----------------------------
3208 +Inter-event hist triggers are hist triggers that combine values from
3209 +one or more other events and create a histogram using that data. Data
3210 +from an inter-event histogram can in turn become the source for
3211 +further combined histograms, thus providing a chain of related
3212 +histograms, which is important for some applications.
3214 +The most important example of an inter-event quantity that can be used
3215 +in this manner is latency, which is simply a difference in timestamps
3216 +between two events. Although latency is the most important
3217 +inter-event quantity, note that because the support is completely
3218 +general across the trace event subsystem, any event field can be used
3219 +in an inter-event quantity.
3221 +An example of a histogram that combines data from other histograms
3222 +into a useful chain would be a 'wakeupswitch latency' histogram that
3223 +combines a 'wakeup latency' histogram and a 'switch latency'
3224 +histogram.
3226 +Normally, a hist trigger specification consists of a (possibly
3227 +compound) key along with one or more numeric values, which are
3228 +continually updated sums associated with that key. A histogram
3229 +specification in this case consists of individual key and value
3230 +specifications that refer to trace event fields associated with a
3231 +single event.
3233 +The inter-event hist trigger extension allows fields from multiple
3234 +events to be referenced and combined into a multi-event histogram
3235 +specification. In support of this overall goal, a few enabling
3236 +features have been added to the hist trigger support:
3238 + - In order to compute an inter-event quantity, a value from one
3239 + event needs to be saved and then referenced from another event. This
3240 + requires the introduction of support for histogram 'variables'.
3242 + - The computation of inter-event quantities and their combination
3243 + require some minimal amount of support for applying simple
3244 + expressions to variables (+ and -).
3246 + - A histogram consisting of inter-event quantities isn't logically a
3247 + histogram on either event (so having the 'hist' file for either
3248 + event host the histogram output doesn't really make sense). To
3249 + address the idea that the histogram is associated with a
3250 + combination of events, support is added allowing the creation of
3251 + 'synthetic' events that are events derived from other events.
3252 + These synthetic events are full-fledged events just like any other
3253 + and can be used as such, as for instance to create the
3254 + 'combination' histograms mentioned previously.
3256 + - A set of 'actions' can be associated with histogram entries -
3257 + these can be used to generate the previously mentioned synthetic
3258 + events, but can also be used for other purposes, such as for
3259 + example saving context when a 'max' latency has been hit.
3261 + - Trace events don't have a 'timestamp' associated with them, but
3262 + there is an implicit timestamp saved along with an event in the
3263 + underlying ftrace ring buffer. This timestamp is now exposed as a
3264 + synthetic field named 'common_timestamp' which can be used in
3265 + histograms as if it were any other event field; it isn't an actual
3266 + field in the trace format but rather is a synthesized value that
3267 + nonetheless can be used as if it were an actual field. By default
3268 + it is in units of nanoseconds; appending '.usecs' to a
3269 + common_timestamp field changes the units to microseconds.
3271 +A note on inter-event timestamps: If common_timestamp is used in a
3272 +histogram, the trace buffer is automatically switched over to using
3273 +absolute timestamps and the "global" trace clock, in order to avoid
3274 +bogus timestamp differences with other clocks that aren't coherent
3275 +across CPUs. This can be overridden by specifying one of the other
3276 +trace clocks instead, using the "clock=XXX" hist trigger attribute,
3277 +where XXX is any of the clocks listed in the tracing/trace_clock
3278 +pseudo-file.
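+For example, this sketch (it assumes 'mono' appears in the system's
+tracing/trace_clock file) would keep inter-event timestamps on the
+monotonic clock instead:
+ # echo 'hist:keys=pid:ts0=common_timestamp.usecs:clock=mono ...' >> event/trigger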
3280 +These features are described in more detail in the following sections.
3282 +2.2.1 Histogram Variables
3283 +-------------------------
3285 +Variables are simply named locations used for saving and retrieving
3286 +values between matching events. A 'matching' event is defined as an
3287 +event that has a matching key - if a variable is saved for a histogram
3288 +entry corresponding to that key, any subsequent event with a matching
3289 +key can access that variable.
3291 +A variable's value is normally available to any subsequent event until
3292 +it is set to something else by a subsequent event. The one exception
3293 +to that rule is that any variable used in an expression is essentially
3294 +'read-once' - once it's used by an expression in a subsequent event,
3295 +it's reset to its 'unset' state, which means it can't be used again
3296 +unless it's set again. This ensures not only that an event doesn't
3297 +use an uninitialized variable in a calculation, but that that variable
3298 +is used only once and not for any unrelated subsequent match.
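+For example, in the wakeup latency sketches later in this section,
+once a sched_switch event consumes '$ts0' in an expression, a second
+sched_switch for the same pid can't reuse the stale timestamp; it has
+to wait for the next sched_waking to set ts0 again.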
3300 +The basic syntax for saving a variable is to prefix any event field
3301 +with a unique variable name (one not corresponding to any keyword)
3302 +and an '=' sign.
3304 +Either keys or values can be saved and retrieved in this way. This
3305 +creates a variable named 'ts0' for a histogram entry with the key
3306 +'next_pid':
3308 + # echo 'hist:keys=next_pid:vals=$ts0:ts0=common_timestamp ...' >> \
3309 + event/trigger
3311 +The ts0 variable can be accessed by any subsequent event having the
3312 +same pid as 'next_pid'.
3314 +Variable references are formed by prepending the variable name with
3315 +the '$' sign. Thus for example, the ts0 variable above would be
3316 +referenced as '$ts0' in expressions.
3318 +Because 'vals=' is used, the common_timestamp variable value above
3319 +will also be summed as a normal histogram value would (though for a
3320 +timestamp it makes little sense).
3322 +The below shows that a key value can also be saved in the same way:
3324 + # echo 'hist:timer_pid=common_pid:key=timer_pid ...' >> event/trigger
3326 +If a variable isn't a key variable or prefixed with 'vals=', the
3327 +associated event field will be saved in a variable but won't be summed
3328 +as a value:
3330 + # echo 'hist:keys=next_pid:ts1=common_timestamp ...' >> event/trigger
3332 +Multiple variables can be assigned at the same time. The below would
3333 +result in both ts0 and b being created as variables, with both
3334 +common_timestamp and field1 additionally being summed as values:
3336 + # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ...' >> \
3337 + event/trigger
3339 +Note that variable assignments can appear either preceding or
3340 +following their use. The command below behaves identically to the
3341 +command above:
3343 + # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ...' >> \
3344 + event/trigger
3346 +Any number of variables not bound to a 'vals=' prefix can also be
3347 +assigned by simply separating them with colons. Below is the same
3348 +thing but without the values being summed in the histogram:
3350 + # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ...' >> event/trigger
3352 +Variables set as above can be referenced and used in expressions on
3353 +another event.
3355 +For example, here's how a latency can be calculated:
3357 + # echo 'hist:keys=pid,prio:ts0=common_timestamp ...' >> event1/trigger
3358 + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ...' >> event2/trigger
3360 +In the first line above, the event's timestamp is saved into the
3361 +variable ts0. In the next line, ts0 is subtracted from the second
3362 +event's timestamp to produce the latency, which is then assigned into
3363 +yet another variable, 'wakeup_lat'. The hist trigger below in turn
3364 +makes use of the wakeup_lat variable to compute a combined latency
3365 +using the same key and variable from yet another event:
3367 + # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger
3369 +2.2.2 Synthetic Events
3370 +----------------------
3372 +Synthetic events are user-defined events generated from hist trigger
3373 +variables or fields associated with one or more other events. Their
3374 +purpose is to provide a mechanism for displaying data spanning
3375 +multiple events consistent with the existing and already familiar
3376 +usage for normal events.
3378 +To define a synthetic event, the user writes a simple specification
3379 +consisting of the name of the new event along with one or more
3380 +variables and their types, which can be any valid field type,
3381 +separated by semicolons, to the tracing/synthetic_events file.
3383 +For instance, the following creates a new event named 'wakeup_latency'
3384 +with 3 fields: lat, pid, and prio. Each of those fields is simply a
3385 +variable reference to a variable on another event:
3387 + # echo 'wakeup_latency \
3388 +         u64 lat; \
3389 +         pid_t pid; \
3390 +         int prio' >> \
3391 + /sys/kernel/debug/tracing/synthetic_events
3393 +Reading the tracing/synthetic_events file lists all the currently
3394 +defined synthetic events, in this case the event defined above:
3396 + # cat /sys/kernel/debug/tracing/synthetic_events
3397 + wakeup_latency u64 lat; pid_t pid; int prio
3399 +An existing synthetic event definition can be removed by prepending
3400 +the command that defined it with a '!':
3402 + # echo '!wakeup_latency u64 lat pid_t pid int prio' >> \
3403 + /sys/kernel/debug/tracing/synthetic_events
3405 +At this point, there isn't yet an actual 'wakeup_latency' event
3406 +instantiated in the event subsystem - for this to happen, a 'hist
3407 +trigger action' needs to be instantiated and bound to actual fields
3408 +and variables defined on other events (see Section 2.2.3 below).
3410 +Once that is done, an event instance is created, and a histogram can
3411 +be defined using it:
3413 + # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \
3414 + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
3416 +The new event is created under the tracing/events/synthetic/ directory
3417 +and looks and behaves just like any other event:
3419 + # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency
3420 + enable filter format hist id trigger
3422 +Like any other event, once a histogram is enabled for the event, the
3423 +output can be displayed by reading the event's 'hist' file.
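+For instance, assuming the 'wakeup_latency' trigger above has been
+set up and has accumulated data, its histogram is read back in the
+usual way:
+ # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist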
3425 +2.2.3 Hist trigger 'actions'
3426 +----------------------------
3428 +A hist trigger 'action' is a function that's executed whenever a
3429 +histogram entry is added or updated.
3431 +The default 'action' if no special function is explicitly specified is
3432 +as it always has been, to simply update the set of values associated
3433 +with an entry. Some applications, however, may want to perform
3434 +additional actions at that point, such as generate another event, or
3435 +compare and save a maximum.
3437 +The following additional actions are available. To specify an action
3438 +for a given event, simply specify the action between colons in the
3439 +hist trigger specification.
3441 + - onmatch(matching.event).<synthetic_event_name>(param list)
3443 + The 'onmatch(matching.event).<synthetic_event_name>(params)' hist
3444 + trigger action is invoked whenever an event matches and the
3445 + histogram entry would be added or updated. It causes the named
3446 + synthetic event to be generated with the values given in the
3447 + 'param list'. The result is the generation of a synthetic event
3448 + that consists of the values contained in those variables at the
3449 + time the invoking event was hit.
3451 + The 'param list' consists of one or more parameters which may be
3452 + either variables or fields defined on either the 'matching.event'
3453 + or the target event. The variables or fields specified in the
3454 + param list may be either fully-qualified or unqualified. If a
3455 + variable is specified as unqualified, it must be unique between
3456 + the two events. A field name used as a param can be unqualified
3457 + if it refers to the target event, but must be fully qualified if
3458 + it refers to the matching event. A fully-qualified name is of the
3459 + form 'system.event_name.$var_name' or 'system.event_name.field'.
3461 + The 'matching.event' specification is simply the fully qualified
3462 + event name of the event that matches the target event for the
3463 + onmatch() functionality, in the form 'system.event_name'.
3465 + Finally, the number and type of variables/fields in the 'param
3466 + list' must match the number and types of the fields in the
3467 + synthetic event being generated.
3469 + As an example, the below defines a simple synthetic event and uses
3470 + a variable defined on the sched_wakeup_new event as a parameter
3471 + when invoking the synthetic event. Here we define the synthetic
3472 + event:
3474 + # echo 'wakeup_new_test pid_t pid' >> \
3475 + /sys/kernel/debug/tracing/synthetic_events
3477 + # cat /sys/kernel/debug/tracing/synthetic_events
3478 + wakeup_new_test pid_t pid
3480 + The following hist trigger both defines the missing testpid
3481 + variable and specifies an onmatch() action that generates a
3482 + wakeup_new_test synthetic event whenever a sched_wakeup_new event
3483 + occurs, which because of the 'if comm == "cyclictest"' filter only
3484 + happens when the executable is cyclictest:
3486 + # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\
3487 + wakeup_new_test($testpid) if comm=="cyclictest"' >> \
3488 + /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger
3490 + Creating and displaying a histogram based on those events is now
3491 + just a matter of using the fields and new synthetic event in the
3492 + tracing/events/synthetic directory, as usual:
3494 + # echo 'hist:keys=pid:sort=pid' >> \
3495 + /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/trigger
3497 + Running 'cyclictest' should cause wakeup_new events to generate
3498 + wakeup_new_test synthetic events which should result in histogram
3499 + output in the wakeup_new_test event's hist file:
3501 + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/hist
3503 + A more typical usage would be to use two events to calculate a
3504 + latency. The following example uses a set of hist triggers to
3505 + produce a 'wakeup_latency' histogram:
3507 + First, we define a 'wakeup_latency' synthetic event:
3509 + # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \
3510 + /sys/kernel/debug/tracing/synthetic_events
3512 + Next, we specify that whenever we see a sched_waking event for a
3513 + cyclictest thread, the timestamp is saved in a 'ts0' variable:
3515 + # echo 'hist:keys=$saved_pid:saved_pid=pid:ts0=common_timestamp.usecs \
3516 + if comm=="cyclictest"' >> \
3517 + /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
3519 + Then, when the corresponding thread is actually scheduled onto the
3520 + CPU by a sched_switch event, calculate the latency and use that
3521 + along with another variable and an event field to generate a
3522 + wakeup_latency synthetic event:
3524 + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\
3525 + onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,\
3526 + $saved_pid,next_prio) if next_comm=="cyclictest"' >> \
3527 + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
3529 + We also need to create a histogram on the wakeup_latency synthetic
3530 + event in order to aggregate the generated synthetic event data:
3532 + # echo 'hist:keys=pid,prio,lat:sort=pid,lat' >> \
3533 + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
3535 + Finally, once we've run cyclictest to actually generate some
3536 + events, we can see the output by looking at the wakeup_latency
3537 + synthetic event's hist file:
3539 + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist
3541 + - onmax(var).save(field,...)
3543 + The 'onmax(var).save(field,...)' hist trigger action is invoked
3544 + whenever the value of 'var' associated with a histogram entry
3545 + exceeds the current maximum contained in that variable.
3547 + The end result is that the trace event fields specified as the
3548 + onmax.save() params will be saved if 'var' exceeds the current
3549 + maximum for that hist trigger entry. This allows context from the
3550 + event that exhibited the new maximum to be saved for later
3551 + reference. When the histogram is displayed, additional fields
3552 + displaying the saved values will be printed.
3554 + As an example, the below defines a couple of hist triggers, one for
3555 + sched_waking and another for sched_switch, keyed on pid. Whenever
3556 + a sched_waking occurs, the timestamp is saved in the entry
3557 + corresponding to the current pid, and when the scheduler switches
3558 + back to that pid, the timestamp difference is calculated. If the
3559 + resulting latency, stored in wakeup_lat, exceeds the current
3560 + maximum latency, the values specified in the save() fields are
3561 + recorded:
3563 + # echo 'hist:keys=pid:ts0=common_timestamp.usecs \
3564 + if comm=="cyclictest"' >> \
3565 + /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
3567 + # echo 'hist:keys=next_pid:\
3568 + wakeup_lat=common_timestamp.usecs-$ts0:\
3569 + onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \
3570 + if next_comm=="cyclictest"' >> \
3571 + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
3573 + When the histogram is displayed, the max value and the saved
3574 + values corresponding to the max are displayed following the rest
3575 + of the fields:
3577 + # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist
3578 + { next_pid: 2255 } hitcount: 239
3579 + common_timestamp-ts0: 0
3581 + next_comm: cyclictest
3582 + prev_pid: 0 prev_prio: 120 prev_comm: swapper/1
3584 + { next_pid: 2256 } hitcount: 2355
3585 + common_timestamp-ts0: 0
3586 + max: 49 next_comm: cyclictest
3587 + prev_pid: 0 prev_prio: 120 prev_comm: swapper/0
3593 diff --git a/arch/Kconfig b/arch/Kconfig
3594 index 40dc31fea90c..7c6108479209 100644
3597 @@ -20,6 +20,7 @@ config OPROFILE
3598 tristate "OProfile system profiling"
3599 depends on PROFILING
3600 depends on HAVE_OPROFILE
3601 + depends on !PREEMPT_RT_FULL
3603 select RING_BUFFER_ALLOW_SWAP
3605 diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h
3606 index 1d5716bc060b..6883bc952d22 100644
3607 --- a/arch/alpha/include/asm/spinlock_types.h
3608 +++ b/arch/alpha/include/asm/spinlock_types.h
3610 #ifndef _ALPHA_SPINLOCK_TYPES_H
3611 #define _ALPHA_SPINLOCK_TYPES_H
3613 -#ifndef __LINUX_SPINLOCK_TYPES_H
3614 -# error "please don't include this file directly"
3618 volatile unsigned int lock;
3620 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
3621 index d1346a160760..558b0995e94a 100644
3622 --- a/arch/arm/Kconfig
3623 +++ b/arch/arm/Kconfig
3624 @@ -45,7 +45,7 @@ config ARM
3625 select HARDIRQS_SW_RESEND
3626 select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
3627 select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
3628 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
3629 + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
3630 select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
3631 select HAVE_ARCH_MMAP_RND_BITS if MMU
3632 select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
3633 @@ -85,6 +85,7 @@ config ARM
3634 select HAVE_PERF_EVENTS
3635 select HAVE_PERF_REGS
3636 select HAVE_PERF_USER_STACK_DUMP
3637 + select HAVE_PREEMPT_LAZY
3638 select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
3639 select HAVE_REGS_AND_STACK_ACCESS_API
3640 select HAVE_SYSCALL_TRACEPOINTS
3641 @@ -2164,7 +2165,7 @@ config NEON
3643 config KERNEL_MODE_NEON
3644 bool "Support for NEON in kernel mode"
3645 - depends on NEON && AEABI
3646 + depends on NEON && AEABI && !PREEMPT_RT_BASE
3648 Say Y to include support for NEON in kernel mode.
3650 diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
3651 index b6f319606e30..ad377ef73739 100644
3652 --- a/arch/arm/include/asm/irq.h
3653 +++ b/arch/arm/include/asm/irq.h
3657 #ifndef __ASSEMBLY__
3658 +#include <linux/cpumask.h>
3662 extern void migrate_irqs(void);
3663 diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h
3664 index 5976958647fe..a37c0803954b 100644
3665 --- a/arch/arm/include/asm/spinlock_types.h
3666 +++ b/arch/arm/include/asm/spinlock_types.h
3668 #ifndef __ASM_SPINLOCK_TYPES_H
3669 #define __ASM_SPINLOCK_TYPES_H
3671 -#ifndef __LINUX_SPINLOCK_TYPES_H
3672 -# error "please don't include this file directly"
3675 #define TICKET_SHIFT 16
3678 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
3679 index d3e937dcee4d..6ab96a2ce1f8 100644
3680 --- a/arch/arm/include/asm/switch_to.h
3681 +++ b/arch/arm/include/asm/switch_to.h
3684 #include <linux/thread_info.h>
3686 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
3687 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
3688 +#else
3689 +static inline void
3690 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
3691 +#endif
3694 * For v7 SMP cores running a preemptible kernel we may be pre-empted
3695 * during a TLB maintenance operation, so execute an inner-shareable dsb
3696 @@ -26,6 +33,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
3697 #define switch_to(prev,next,last) \
3699 __complete_pending_tlbi(); \
3700 + switch_kmaps(prev, next); \
3701 last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \
3704 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
3705 index 57d2ad9c75ca..cdfb6855943b 100644
3706 --- a/arch/arm/include/asm/thread_info.h
3707 +++ b/arch/arm/include/asm/thread_info.h
3708 @@ -49,6 +49,7 @@ struct cpu_context_save {
3709 struct thread_info {
3710 unsigned long flags; /* low level flags */
3711 int preempt_count; /* 0 => preemptable, <0 => bug */
3712 + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
3713 mm_segment_t addr_limit; /* address limit */
3714 struct task_struct *task; /* main task structure */
3715 __u32 cpu; /* cpu */
3716 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
3717 #define TIF_SYSCALL_TRACE 4 /* syscall trace active */
3718 #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
3719 #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
3720 -#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
3721 +#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
3722 +#define TIF_NEED_RESCHED_LAZY 7
3724 #define TIF_NOHZ 12 /* in adaptive nohz mode */
3725 #define TIF_USING_IWMMXT 17
3726 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
3727 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
3728 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
3729 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
3730 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
3731 #define _TIF_UPROBE (1 << TIF_UPROBE)
3732 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
3733 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
3734 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
3735 * Change these and you break ASM code in entry-common.S
3737 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
3738 - _TIF_NOTIFY_RESUME | _TIF_UPROBE)
3739 + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
3740 + _TIF_NEED_RESCHED_LAZY)
3742 #endif /* __KERNEL__ */
3743 #endif /* __ASM_ARM_THREAD_INFO_H */
3744 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
3745 index 608008229c7d..3866da3f7bb7 100644
3746 --- a/arch/arm/kernel/asm-offsets.c
3747 +++ b/arch/arm/kernel/asm-offsets.c
3748 @@ -65,6 +65,7 @@ int main(void)
3750 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
3751 DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
3752 + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
3753 DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
3754 DEFINE(TI_TASK, offsetof(struct thread_info, task));
3755 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
3756 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
3757 index fbc707626b3e..b434c59d2b64 100644
3758 --- a/arch/arm/kernel/entry-armv.S
3759 +++ b/arch/arm/kernel/entry-armv.S
3760 @@ -220,11 +220,18 @@ __irq_svc:
3762 #ifdef CONFIG_PREEMPT
3763 ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
3764 - ldr r0, [tsk, #TI_FLAGS] @ get flags
3765 teq r8, #0 @ if preempt count != 0
3766 + bne 1f @ return from exception
3767 + ldr r0, [tsk, #TI_FLAGS] @ get flags
3768 + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
3769 + blne svc_preempt @ preempt!
3771 + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
3772 + teq r8, #0 @ if preempt lazy count != 0
3773 movne r0, #0 @ force flags to 0
3774 - tst r0, #_TIF_NEED_RESCHED
3775 + tst r0, #_TIF_NEED_RESCHED_LAZY
3780 svc_exit r5, irq = 1 @ return from exception
3781 @@ -239,8 +246,14 @@ svc_preempt:
3782 1: bl preempt_schedule_irq @ irq en/disable is done inside
3783 ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
3784 tst r0, #_TIF_NEED_RESCHED
3786 + tst r0, #_TIF_NEED_RESCHED_LAZY
3789 + ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
3790 + teq r0, #0 @ if preempt lazy count != 0
3797 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
3798 index 54c10503d71f..3fdeade24e3f 100644
3799 --- a/arch/arm/kernel/entry-common.S
3800 +++ b/arch/arm/kernel/entry-common.S
3801 @@ -53,7 +53,9 @@ ret_fast_syscall:
3803 blne addr_limit_check_failed
3804 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
3805 - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
3806 + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
3807 + bne fast_work_pending
3808 + tst r1, #_TIF_SECCOMP
3809 bne fast_work_pending
3812 @@ -83,8 +85,11 @@ ret_fast_syscall:
3814 blne addr_limit_check_failed
3815 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
3816 - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
3817 + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
3818 + bne do_slower_path
3819 + tst r1, #_TIF_SECCOMP
3823 ENDPROC(ret_fast_syscall)
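
The split flag test above is an instruction-encoding fix, not a behavioural change: an ARM data-processing immediate is an 8-bit value with an even rotation, and once TIF_NEED_RESCHED_LAZY takes bit 7 and TIF_SECCOMP moves to bit 8 (see the thread_info.h hunk above), the combined work mask no longer fits one tst immediate. In C terms (interpretation, not patch text):

    /* old layout: all tested work bits sat in bits 0..7, so            */
    /* (_TIF_SYSCALL_WORK | _TIF_WORK_MASK) encoded as one immediate.   */
    /* new layout: TIF_NEED_RESCHED_LAZY = 7, TIF_SECCOMP = 8, so the   */
    /* mask spans nine bits; _TIF_SECCOMP is masked out and tested by   */
    /* the second tst instruction instead.                              */
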
3825 diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
3826 index a50dc00d79a2..d0a05a3bdb96 100644
3827 --- a/arch/arm/kernel/patch.c
3828 +++ b/arch/arm/kernel/patch.c
3829 @@ -16,7 +16,7 @@ struct patch {
3833 -static DEFINE_SPINLOCK(patch_lock);
3834 +static DEFINE_RAW_SPINLOCK(patch_lock);
3836 static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
3837 __acquires(&patch_lock)
3838 @@ -33,7 +33,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
3842 - spin_lock_irqsave(&patch_lock, *flags);
3843 + raw_spin_lock_irqsave(&patch_lock, *flags);
3845 __acquire(&patch_lock);
3847 @@ -48,7 +48,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
3848 clear_fixmap(fixmap);
3851 - spin_unlock_irqrestore(&patch_lock, *flags);
3852 + raw_spin_unlock_irqrestore(&patch_lock, *flags);
3854 __release(&patch_lock);
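
patch_lock is taken in the text-poking path with interrupts disabled; on PREEMPT_RT a plain spinlock_t is a sleeping rt_mutex, which is illegal there, so the lock becomes raw and keeps spinning. A hedged sketch of the pattern that every raw-spinlock conversion in this series follows (demo_* names are illustrative):

    #include <linux/spinlock.h>

    static DEFINE_RAW_SPINLOCK(demo_lock);          /* still spins on RT */

    static void demo_atomic_path(void)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&demo_lock, flags);
            /* short, bounded critical section; must never sleep */
            raw_spin_unlock_irqrestore(&demo_lock, flags);
    }
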
3856 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
3857 index d96714e1858c..cf4e1452d4b4 100644
3858 --- a/arch/arm/kernel/process.c
3859 +++ b/arch/arm/kernel/process.c
3860 @@ -325,6 +325,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
3865 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not
3866 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
3867 + * fail.
3868 + */
3869 +static int __init vectors_user_mapping_init_page(void)
3871 + struct page *page;
3872 + unsigned long addr = 0xffff0000;
3877 + pgd = pgd_offset_k(addr);
3878 + pud = pud_offset(pgd, addr);
3879 + pmd = pmd_offset(pud, addr);
3880 + page = pmd_page(*(pmd));
3882 + pgtable_page_ctor(page);
3886 +late_initcall(vectors_user_mapping_init_page);
3888 #ifdef CONFIG_KUSER_HELPERS
3890 * The vectors page is always readable from user space for the
3891 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
3892 index cdfe52b15a0a..198cf8bf0b37 100644
3893 --- a/arch/arm/kernel/signal.c
3894 +++ b/arch/arm/kernel/signal.c
3895 @@ -615,7 +615,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
3897 trace_hardirqs_off();
3899 - if (likely(thread_flags & _TIF_NEED_RESCHED)) {
3900 + if (likely(thread_flags & (_TIF_NEED_RESCHED |
3901 + _TIF_NEED_RESCHED_LAZY))) {
3904 if (unlikely(!user_mode(regs)))
3905 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
3906 index e61af0600133..d8f2e77d5651 100644
3907 --- a/arch/arm/kernel/smp.c
3908 +++ b/arch/arm/kernel/smp.c
3909 @@ -237,8 +237,6 @@ int __cpu_disable(void)
3910 flush_cache_louis();
3911 local_flush_tlb_all();
3913 - clear_tasks_mm_cpumask(cpu);
3918 @@ -256,6 +254,7 @@ void __cpu_die(unsigned int cpu)
3920 pr_debug("CPU%u: shutdown\n", cpu);
3922 + clear_tasks_mm_cpumask(cpu);
3924 * platform_cpu_kill() is generally expected to do the powering off
3925 * and/or cutting of clocks to the dying CPU. Optionally, this may
3926 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
3927 index 0bee233fef9a..314cfb232a63 100644
3928 --- a/arch/arm/kernel/unwind.c
3929 +++ b/arch/arm/kernel/unwind.c
3930 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
3931 static const struct unwind_idx *__origin_unwind_idx;
3932 extern const struct unwind_idx __stop_unwind_idx[];
3934 -static DEFINE_SPINLOCK(unwind_lock);
3935 +static DEFINE_RAW_SPINLOCK(unwind_lock);
3936 static LIST_HEAD(unwind_tables);
3938 /* Convert a prel31 symbol to an absolute address */
3939 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
3940 /* module unwind tables */
3941 struct unwind_table *table;
3943 - spin_lock_irqsave(&unwind_lock, flags);
3944 + raw_spin_lock_irqsave(&unwind_lock, flags);
3945 list_for_each_entry(table, &unwind_tables, list) {
3946 if (addr >= table->begin_addr &&
3947 addr < table->end_addr) {
3948 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
3952 - spin_unlock_irqrestore(&unwind_lock, flags);
3953 + raw_spin_unlock_irqrestore(&unwind_lock, flags);
3956 pr_debug("%s: idx = %p\n", __func__, idx);
3957 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
3958 tab->begin_addr = text_addr;
3959 tab->end_addr = text_addr + text_size;
3961 - spin_lock_irqsave(&unwind_lock, flags);
3962 + raw_spin_lock_irqsave(&unwind_lock, flags);
3963 list_add_tail(&tab->list, &unwind_tables);
3964 - spin_unlock_irqrestore(&unwind_lock, flags);
3965 + raw_spin_unlock_irqrestore(&unwind_lock, flags);
3969 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
3973 - spin_lock_irqsave(&unwind_lock, flags);
3974 + raw_spin_lock_irqsave(&unwind_lock, flags);
3975 list_del(&tab->list);
3976 - spin_unlock_irqrestore(&unwind_lock, flags);
3977 + raw_spin_unlock_irqrestore(&unwind_lock, flags);
3981 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
3982 index 5a03bffe7226..3080ea833d19 100644
3983 --- a/arch/arm/mach-exynos/platsmp.c
3984 +++ b/arch/arm/mach-exynos/platsmp.c
3985 @@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void)
3986 return (void __iomem *)(S5P_VA_SCU);
3989 -static DEFINE_SPINLOCK(boot_lock);
3990 +static DEFINE_RAW_SPINLOCK(boot_lock);
3992 static void exynos_secondary_init(unsigned int cpu)
3994 @@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu)
3996 * Synchronise with the boot thread.
3998 - spin_lock(&boot_lock);
3999 - spin_unlock(&boot_lock);
4000 + raw_spin_lock(&boot_lock);
4001 + raw_spin_unlock(&boot_lock);
4004 int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
4005 @@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
4006 * Set synchronisation state between this boot processor
4007 * and the secondary one
4009 - spin_lock(&boot_lock);
4010 + raw_spin_lock(&boot_lock);
4013 * The secondary processor is waiting to be released from
4014 @@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
4017 printk(KERN_ERR "cpu1 power enable failed");
4018 - spin_unlock(&boot_lock);
4019 + raw_spin_unlock(&boot_lock);
4023 @@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
4024 * calibrations, then wait for it to finish
4027 - spin_unlock(&boot_lock);
4028 + raw_spin_unlock(&boot_lock);
4030 return pen_release != -1 ? ret : 0;
4032 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
4033 index f66815c3dd07..00524abd963f 100644
4034 --- a/arch/arm/mach-hisi/platmcpm.c
4035 +++ b/arch/arm/mach-hisi/platmcpm.c
4038 static void __iomem *sysctrl, *fabric;
4039 static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
4040 -static DEFINE_SPINLOCK(boot_lock);
4041 +static DEFINE_RAW_SPINLOCK(boot_lock);
4042 static u32 fabric_phys_addr;
4044 * [0]: bootwrapper physical address
4045 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
4046 if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
4049 - spin_lock_irq(&boot_lock);
4050 + raw_spin_lock_irq(&boot_lock);
4052 if (hip04_cpu_table[cluster][cpu])
4054 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
4057 hip04_cpu_table[cluster][cpu]++;
4058 - spin_unlock_irq(&boot_lock);
4059 + raw_spin_unlock_irq(&boot_lock);
4063 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
4064 cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
4065 cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
4067 - spin_lock(&boot_lock);
4068 + raw_spin_lock(&boot_lock);
4069 hip04_cpu_table[cluster][cpu]--;
4070 if (hip04_cpu_table[cluster][cpu] == 1) {
4071 /* A power_up request went ahead of us. */
4072 - spin_unlock(&boot_lock);
4073 + raw_spin_unlock(&boot_lock);
4075 } else if (hip04_cpu_table[cluster][cpu] > 1) {
4076 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
4077 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
4080 last_man = hip04_cluster_is_down(cluster);
4081 - spin_unlock(&boot_lock);
4082 + raw_spin_unlock(&boot_lock);
4084 /* Since it's Cortex A15, disable L2 prefetching. */
4086 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
4087 cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
4089 count = TIMEOUT_MSEC / POLL_MSEC;
4090 - spin_lock_irq(&boot_lock);
4091 + raw_spin_lock_irq(&boot_lock);
4092 for (tries = 0; tries < count; tries++) {
4093 if (hip04_cpu_table[cluster][cpu])
4095 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
4096 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
4097 if (data & CORE_WFI_STATUS(cpu))
4099 - spin_unlock_irq(&boot_lock);
4100 + raw_spin_unlock_irq(&boot_lock);
4101 /* Wait for clean L2 when the whole cluster is down. */
4103 - spin_lock_irq(&boot_lock);
4104 + raw_spin_lock_irq(&boot_lock);
4108 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
4110 if (hip04_cluster_is_down(cluster))
4111 hip04_set_snoop_filter(cluster, 0);
4112 - spin_unlock_irq(&boot_lock);
4113 + raw_spin_unlock_irq(&boot_lock);
4116 - spin_unlock_irq(&boot_lock);
4117 + raw_spin_unlock_irq(&boot_lock);
4121 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
4122 index 1c73694c871a..ac4d2f030b87 100644
4123 --- a/arch/arm/mach-omap2/omap-smp.c
4124 +++ b/arch/arm/mach-omap2/omap-smp.c
4125 @@ -69,7 +69,7 @@ static const struct omap_smp_config omap5_cfg __initconst = {
4126 .startup_addr = omap5_secondary_startup,
4129 -static DEFINE_SPINLOCK(boot_lock);
4130 +static DEFINE_RAW_SPINLOCK(boot_lock);
4132 void __iomem *omap4_get_scu_base(void)
4134 @@ -177,8 +177,8 @@ static void omap4_secondary_init(unsigned int cpu)
4136 * Synchronise with the boot thread.
4138 - spin_lock(&boot_lock);
4139 - spin_unlock(&boot_lock);
4140 + raw_spin_lock(&boot_lock);
4141 + raw_spin_unlock(&boot_lock);
4144 static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
4145 @@ -191,7 +191,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
4146 * Set synchronisation state between this boot processor
4147 * and the secondary one
4149 - spin_lock(&boot_lock);
4150 + raw_spin_lock(&boot_lock);
4153 * Update the AuxCoreBoot0 with boot state for secondary core.
4154 @@ -270,7 +270,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
4155 * Now the secondary core is starting up let it run its
4156 * calibrations, then wait for it to finish
4158 - spin_unlock(&boot_lock);
4159 + raw_spin_unlock(&boot_lock);
4163 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
4164 index 75ef5d4be554..c17c86e5d860 100644
4165 --- a/arch/arm/mach-prima2/platsmp.c
4166 +++ b/arch/arm/mach-prima2/platsmp.c
4169 static void __iomem *clk_base;
4171 -static DEFINE_SPINLOCK(boot_lock);
4172 +static DEFINE_RAW_SPINLOCK(boot_lock);
4174 static void sirfsoc_secondary_init(unsigned int cpu)
4176 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
4178 * Synchronise with the boot thread.
4180 - spin_lock(&boot_lock);
4181 - spin_unlock(&boot_lock);
4182 + raw_spin_lock(&boot_lock);
4183 + raw_spin_unlock(&boot_lock);
4186 static const struct of_device_id clk_ids[] = {
4187 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
4188 /* make sure write buffer is drained */
4191 - spin_lock(&boot_lock);
4192 + raw_spin_lock(&boot_lock);
4195 * The secondary processor is waiting to be released from
4196 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
4197 * now the secondary core is starting up let it run its
4198 * calibrations, then wait for it to finish
4200 - spin_unlock(&boot_lock);
4201 + raw_spin_unlock(&boot_lock);
4203 return pen_release != -1 ? -ENOSYS : 0;
4205 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
4206 index 5494c9e0c909..e8ce157d3548 100644
4207 --- a/arch/arm/mach-qcom/platsmp.c
4208 +++ b/arch/arm/mach-qcom/platsmp.c
4211 extern void secondary_startup_arm(void);
4213 -static DEFINE_SPINLOCK(boot_lock);
4214 +static DEFINE_RAW_SPINLOCK(boot_lock);
4216 #ifdef CONFIG_HOTPLUG_CPU
4217 static void qcom_cpu_die(unsigned int cpu)
4218 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
4220 * Synchronise with the boot thread.
4222 - spin_lock(&boot_lock);
4223 - spin_unlock(&boot_lock);
4224 + raw_spin_lock(&boot_lock);
4225 + raw_spin_unlock(&boot_lock);
4228 static int scss_release_secondary(unsigned int cpu)
4229 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
4230 * set synchronisation state between this boot processor
4231 * and the secondary one
4233 - spin_lock(&boot_lock);
4234 + raw_spin_lock(&boot_lock);
4237 * Send the secondary CPU a soft interrupt, thereby causing
4238 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
4239 * now the secondary core is starting up let it run its
4240 * calibrations, then wait for it to finish
4242 - spin_unlock(&boot_lock);
4243 + raw_spin_unlock(&boot_lock);
4247 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
4248 index 39038a03836a..6da5c93872bf 100644
4249 --- a/arch/arm/mach-spear/platsmp.c
4250 +++ b/arch/arm/mach-spear/platsmp.c
4251 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
4252 sync_cache_w(&pen_release);
4255 -static DEFINE_SPINLOCK(boot_lock);
4256 +static DEFINE_RAW_SPINLOCK(boot_lock);
4258 static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
4260 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
4262 * Synchronise with the boot thread.
4264 - spin_lock(&boot_lock);
4265 - spin_unlock(&boot_lock);
4266 + raw_spin_lock(&boot_lock);
4267 + raw_spin_unlock(&boot_lock);
4270 static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
4271 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
4272 * set synchronisation state between this boot processor
4273 * and the secondary one
4275 - spin_lock(&boot_lock);
4276 + raw_spin_lock(&boot_lock);
4279 * The secondary processor is waiting to be released from
4280 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
4281 * now the secondary core is starting up let it run its
4282 * calibrations, then wait for it to finish
4284 - spin_unlock(&boot_lock);
4285 + raw_spin_unlock(&boot_lock);
4287 return pen_release != -1 ? -ENOSYS : 0;
4289 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
4290 index 231f19e17436..a3419b7003e6 100644
4291 --- a/arch/arm/mach-sti/platsmp.c
4292 +++ b/arch/arm/mach-sti/platsmp.c
4293 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
4294 sync_cache_w(&pen_release);
4297 -static DEFINE_SPINLOCK(boot_lock);
4298 +static DEFINE_RAW_SPINLOCK(boot_lock);
4300 static void sti_secondary_init(unsigned int cpu)
4302 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
4304 * Synchronise with the boot thread.
4306 - spin_lock(&boot_lock);
4307 - spin_unlock(&boot_lock);
4308 + raw_spin_lock(&boot_lock);
4309 + raw_spin_unlock(&boot_lock);
4312 static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
4313 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
4314 * set synchronisation state between this boot processor
4315 * and the secondary one
4317 - spin_lock(&boot_lock);
4318 + raw_spin_lock(&boot_lock);
4321 * The secondary processor is waiting to be released from
4322 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
4323 * now the secondary core is starting up let it run its
4324 * calibrations, then wait for it to finish
4326 - spin_unlock(&boot_lock);
4327 + raw_spin_unlock(&boot_lock);
4329 return pen_release != -1 ? -ENOSYS : 0;
4331 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
4332 index 49b1b8048635..b261967ea028 100644
4333 --- a/arch/arm/mm/fault.c
4334 +++ b/arch/arm/mm/fault.c
4335 @@ -437,6 +437,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
4336 if (addr < TASK_SIZE)
4337 return do_page_fault(addr, fsr, regs);
4339 + if (interrupts_enabled(regs))
4340 + local_irq_enable();
4342 if (user_mode(regs))
4345 @@ -504,6 +507,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
4347 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
4349 + if (interrupts_enabled(regs))
4350 + local_irq_enable();
4352 do_bad_area(addr, fsr, regs);
4355 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
4356 index d02f8187b1cc..542692dbd40a 100644
4357 --- a/arch/arm/mm/highmem.c
4358 +++ b/arch/arm/mm/highmem.c
4359 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
4363 +static unsigned int fixmap_idx(int type)
4365 + return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4368 void *kmap(struct page *page)
4371 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
4373 void *kmap_atomic(struct page *page)
4375 + pte_t pte = mk_pte(page, kmap_prot);
4377 unsigned long vaddr;
4381 - preempt_disable();
4382 + preempt_disable_nort();
4383 pagefault_disable();
4384 if (!PageHighMem(page))
4385 return page_address(page);
4386 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
4388 type = kmap_atomic_idx_push();
4390 - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4391 + idx = fixmap_idx(type);
4392 vaddr = __fix_to_virt(idx);
4393 #ifdef CONFIG_DEBUG_HIGHMEM
4395 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
4396 * in place, so the contained TLB flush ensures the TLB is updated
4397 * with the new mapping.
4399 - set_fixmap_pte(idx, mk_pte(page, kmap_prot));
4400 +#ifdef CONFIG_PREEMPT_RT_FULL
4401 + current->kmap_pte[type] = pte;
4403 + set_fixmap_pte(idx, pte);
4405 return (void *)vaddr;
4407 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
4409 if (kvaddr >= (void *)FIXADDR_START) {
4410 type = kmap_atomic_idx();
4411 - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4412 + idx = fixmap_idx(type);
4414 if (cache_is_vivt())
4415 __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
4416 +#ifdef CONFIG_PREEMPT_RT_FULL
4417 + current->kmap_pte[type] = __pte(0);
4419 #ifdef CONFIG_DEBUG_HIGHMEM
4420 BUG_ON(vaddr != __fix_to_virt(idx));
4421 - set_fixmap_pte(idx, __pte(0));
4423 (void) idx; /* to kill a warning */
4425 + set_fixmap_pte(idx, __pte(0));
4426 kmap_atomic_idx_pop();
4427 } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
4428 /* this address was obtained through kmap_high_get() */
4429 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
4433 + preempt_enable_nort();
4435 EXPORT_SYMBOL(__kunmap_atomic);
4437 void *kmap_atomic_pfn(unsigned long pfn)
4439 + pte_t pte = pfn_pte(pfn, kmap_prot);
4440 unsigned long vaddr;
4442 struct page *page = pfn_to_page(pfn);
4444 - preempt_disable();
4445 + preempt_disable_nort();
4446 pagefault_disable();
4447 if (!PageHighMem(page))
4448 return page_address(page);
4450 type = kmap_atomic_idx_push();
4451 - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4452 + idx = fixmap_idx(type);
4453 vaddr = __fix_to_virt(idx);
4454 #ifdef CONFIG_DEBUG_HIGHMEM
4455 BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
4457 - set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
4458 +#ifdef CONFIG_PREEMPT_RT_FULL
4459 + current->kmap_pte[type] = pte;
4461 + set_fixmap_pte(idx, pte);
4463 return (void *)vaddr;
4465 +#if defined CONFIG_PREEMPT_RT_FULL
4466 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
4471 + * Clear @prev_p's kmap_atomic mappings
4473 + for (i = 0; i < prev_p->kmap_idx; i++) {
4474 + int idx = fixmap_idx(i);
4476 + set_fixmap_pte(idx, __pte(0));
4479 + * Restore @next_p's kmap_atomic mappings
4481 + for (i = 0; i < next_p->kmap_idx; i++) {
4482 + int idx = fixmap_idx(i);
4484 + if (!pte_none(next_p->kmap_pte[i]))
4485 + set_fixmap_pte(idx, next_p->kmap_pte[i]);
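
Because kmap_atomic() no longer disables preemption on RT (preempt_disable_nort() is a no-op there), a task can be switched out while it owns fixmap slots; the per-task kmap_pte[] copy above lets switch_kmaps() tear down the outgoing task's slots and re-install the incoming task's, so the mapping stays valid across the switch. The caller-visible contract is unchanged; a minimal usage sketch (function name is illustrative):

    #include <linux/highmem.h>
    #include <linux/string.h>

    static void copy_into_highpage(struct page *page, const void *buf)
    {
            void *dst = kmap_atomic(page);  /* page faults off; on RT, preemption is not */
            memcpy(dst, buf, PAGE_SIZE);    /* mapping survives an RT task switch */
            kunmap_atomic(dst);
    }
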
4489 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
4490 index c2366510187a..6b60f582b738 100644
4491 --- a/arch/arm/plat-versatile/platsmp.c
4492 +++ b/arch/arm/plat-versatile/platsmp.c
4493 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
4494 sync_cache_w(&pen_release);
4497 -static DEFINE_SPINLOCK(boot_lock);
4498 +static DEFINE_RAW_SPINLOCK(boot_lock);
4500 void versatile_secondary_init(unsigned int cpu)
4502 @@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu)
4504 * Synchronise with the boot thread.
4506 - spin_lock(&boot_lock);
4507 - spin_unlock(&boot_lock);
4508 + raw_spin_lock(&boot_lock);
4509 + raw_spin_unlock(&boot_lock);
4512 int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
4513 @@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
4514 * Set synchronisation state between this boot processor
4515 * and the secondary one
4517 - spin_lock(&boot_lock);
4518 + raw_spin_lock(&boot_lock);
4521 * This is really belt and braces; we hold unintended secondary
4522 @@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
4523 * now the secondary core is starting up let it run its
4524 * calibrations, then wait for it to finish
4526 - spin_unlock(&boot_lock);
4527 + raw_spin_unlock(&boot_lock);
4529 return pen_release != -1 ? -ENOSYS : 0;
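
All of the platsmp.c conversions above are one idiom repeated per SoC: the boot CPU holds boot_lock while it releases a secondary from the holding pen, and the secondary takes and drops the lock once to synchronise before its scheduler state exists, which is why the lock must stay a raw spinlock on RT (the secondary cannot sleep on an rt_mutex yet). Condensed sketch of the shared pattern (demo_* names are illustrative):

    static DEFINE_RAW_SPINLOCK(boot_lock);

    void demo_secondary_init(unsigned int cpu)      /* runs on the new CPU */
    {
            raw_spin_lock(&boot_lock);              /* wait for the boot CPU */
            raw_spin_unlock(&boot_lock);
    }

    int demo_boot_secondary(unsigned int cpu)       /* runs on the boot CPU */
    {
            raw_spin_lock(&boot_lock);
            /* ... release the pen, send the IPI, wait for the CPU ... */
            raw_spin_unlock(&boot_lock);
            return 0;
    }
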
4531 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
4532 index c30cd78b6918..458d2033ffde 100644
4533 --- a/arch/arm64/Kconfig
4534 +++ b/arch/arm64/Kconfig
4535 @@ -103,6 +103,7 @@ config ARM64
4536 select HAVE_PERF_EVENTS
4537 select HAVE_PERF_REGS
4538 select HAVE_PERF_USER_STACK_DUMP
4539 + select HAVE_PREEMPT_LAZY
4540 select HAVE_REGS_AND_STACK_ACCESS_API
4541 select HAVE_RCU_TABLE_FREE
4542 select HAVE_SYSCALL_TRACEPOINTS
4543 diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
4544 index 70c517aa4501..2a5f05b5a19a 100644
4545 --- a/arch/arm64/crypto/Kconfig
4546 +++ b/arch/arm64/crypto/Kconfig
4547 @@ -19,19 +19,19 @@ config CRYPTO_SHA512_ARM64
4549 config CRYPTO_SHA1_ARM64_CE
4550 tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
4551 - depends on KERNEL_MODE_NEON
4552 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4556 config CRYPTO_SHA2_ARM64_CE
4557 tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
4558 - depends on KERNEL_MODE_NEON
4559 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4561 select CRYPTO_SHA256_ARM64
4563 config CRYPTO_GHASH_ARM64_CE
4564 tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
4565 - depends on KERNEL_MODE_NEON
4566 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4568 select CRYPTO_GF128MUL
4570 @@ -39,7 +39,7 @@ config CRYPTO_GHASH_ARM64_CE
4572 config CRYPTO_CRCT10DIF_ARM64_CE
4573 tristate "CRCT10DIF digest algorithm using PMULL instructions"
4574 - depends on KERNEL_MODE_NEON && CRC_T10DIF
4575 + depends on KERNEL_MODE_NEON && CRC_T10DIF && !PREEMPT_RT_BASE
4578 config CRYPTO_CRC32_ARM64_CE
4579 @@ -53,13 +53,13 @@ config CRYPTO_AES_ARM64
4581 config CRYPTO_AES_ARM64_CE
4582 tristate "AES core cipher using ARMv8 Crypto Extensions"
4583 - depends on ARM64 && KERNEL_MODE_NEON
4584 + depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4585 select CRYPTO_ALGAPI
4586 select CRYPTO_AES_ARM64
4588 config CRYPTO_AES_ARM64_CE_CCM
4589 tristate "AES in CCM mode using ARMv8 Crypto Extensions"
4590 - depends on ARM64 && KERNEL_MODE_NEON
4591 + depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4592 select CRYPTO_ALGAPI
4593 select CRYPTO_AES_ARM64_CE
4594 select CRYPTO_AES_ARM64
4595 @@ -67,7 +67,7 @@ config CRYPTO_AES_ARM64_CE_CCM
4597 config CRYPTO_AES_ARM64_CE_BLK
4598 tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
4599 - depends on KERNEL_MODE_NEON
4600 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4601 select CRYPTO_BLKCIPHER
4602 select CRYPTO_AES_ARM64_CE
4603 select CRYPTO_AES_ARM64
4604 @@ -75,7 +75,7 @@ config CRYPTO_AES_ARM64_CE_BLK
4606 config CRYPTO_AES_ARM64_NEON_BLK
4607 tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
4608 - depends on KERNEL_MODE_NEON
4609 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4610 select CRYPTO_BLKCIPHER
4611 select CRYPTO_AES_ARM64
4613 @@ -83,13 +83,13 @@ config CRYPTO_AES_ARM64_NEON_BLK
4615 config CRYPTO_CHACHA20_NEON
4616 tristate "NEON accelerated ChaCha20 symmetric cipher"
4617 - depends on KERNEL_MODE_NEON
4618 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4619 select CRYPTO_BLKCIPHER
4620 select CRYPTO_CHACHA20
4622 config CRYPTO_AES_ARM64_BS
4623 tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
4624 - depends on KERNEL_MODE_NEON
4625 + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4626 select CRYPTO_BLKCIPHER
4627 select CRYPTO_AES_ARM64_NEON_BLK
4628 select CRYPTO_AES_ARM64
4629 diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
4630 index 34b4e3d46aab..ae055cdad8cf 100644
4631 --- a/arch/arm64/crypto/crc32-ce-glue.c
4632 +++ b/arch/arm64/crypto/crc32-ce-glue.c
4633 @@ -208,7 +208,8 @@ static struct shash_alg crc32_pmull_algs[] = { {
4635 static int __init crc32_pmull_mod_init(void)
4637 - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
4638 + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
4639 + !IS_ENABLED(CONFIG_PREEMPT_RT_BASE) && (elf_hwcap & HWCAP_PMULL)) {
4640 crc32_pmull_algs[0].update = crc32_pmull_update;
4641 crc32_pmull_algs[1].update = crc32c_pmull_update;
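
The added !PREEMPT_RT_BASE dependencies and the crc32_pmull_mod_init() check exist because arm64 kernel-mode NEON brackets the whole SIMD operation with kernel_neon_begin()/kernel_neon_end(), and that pair disables preemption for the duration, an unbounded preempt-off region on large requests. Shape of the excluded pattern (body is illustrative):

    kernel_neon_begin();    /* preemption disabled from here ...           */
    /* ... potentially very large SIMD crypto operation ...                */
    kernel_neon_end();      /* ... to here: unacceptable latency on RT     */
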
4643 diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h
4644 index 55be59a35e3f..ba0cf1361f65 100644
4645 --- a/arch/arm64/include/asm/spinlock_types.h
4646 +++ b/arch/arm64/include/asm/spinlock_types.h
4648 #ifndef __ASM_SPINLOCK_TYPES_H
4649 #define __ASM_SPINLOCK_TYPES_H
4651 -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H)
4652 -# error "please don't include this file directly"
4655 #include <linux/types.h>
4657 #define TICKET_SHIFT 16
4658 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
4659 index fc786d344e46..b833258b7594 100644
4660 --- a/arch/arm64/include/asm/thread_info.h
4661 +++ b/arch/arm64/include/asm/thread_info.h
4662 @@ -43,6 +43,7 @@ struct thread_info {
4663 u64 ttbr0; /* saved TTBR0_EL1 */
4665 int preempt_count; /* 0 => preemptable, <0 => bug */
4666 + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
4669 #define INIT_THREAD_INFO(tsk) \
4670 @@ -82,6 +83,7 @@ void arch_setup_new_exec(void);
4671 #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
4672 #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
4673 #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */
4674 +#define TIF_NEED_RESCHED_LAZY 6
4676 #define TIF_SYSCALL_TRACE 8
4677 #define TIF_SYSCALL_AUDIT 9
4678 @@ -98,6 +100,7 @@ void arch_setup_new_exec(void);
4679 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
4680 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
4681 #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
4682 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
4683 #define _TIF_NOHZ (1 << TIF_NOHZ)
4684 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
4685 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
4686 @@ -109,8 +112,9 @@ void arch_setup_new_exec(void);
4688 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
4689 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
4690 - _TIF_UPROBE | _TIF_FSCHECK)
4691 + _TIF_UPROBE | _TIF_FSCHECK | _TIF_NEED_RESCHED_LAZY)
4693 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
4694 #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
4695 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
4697 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
4698 index b5e43b01b396..ae26a1664436 100644
4699 --- a/arch/arm64/kernel/asm-offsets.c
4700 +++ b/arch/arm64/kernel/asm-offsets.c
4701 @@ -39,6 +39,7 @@ int main(void)
4703 DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
4704 DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
4705 + DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count));
4706 DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit));
4707 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
4708 DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
4709 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
4710 index c1ffa95c0ad2..c60ecb5a3916 100644
4711 --- a/arch/arm64/kernel/entry.S
4712 +++ b/arch/arm64/kernel/entry.S
4713 @@ -637,11 +637,16 @@ el1_irq:
4715 #ifdef CONFIG_PREEMPT
4716 ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count
4717 - cbnz w24, 1f // preempt count != 0
4718 + cbnz w24, 2f // preempt count != 0
4719 ldr x0, [tsk, #TSK_TI_FLAGS] // get flags
4720 - tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
4722 + tbnz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
4724 + ldr w24, [tsk, #TSK_TI_PREEMPT_LAZY] // get preempt lazy count
4725 + cbnz w24, 2f // preempt lazy count != 0
4726 + tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling?
4731 #ifdef CONFIG_TRACE_IRQFLAGS
4732 bl trace_hardirqs_on
4733 @@ -655,6 +660,7 @@ el1_preempt:
4734 1: bl preempt_schedule_irq // irq en/disable is done inside
4735 ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS
4736 tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
4737 + tbnz x0, #TIF_NEED_RESCHED_LAZY, 1b // needs rescheduling?
4741 diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
4742 index 43442b3a463f..81bf9545a589 100644
4743 --- a/arch/arm64/kernel/signal.c
4744 +++ b/arch/arm64/kernel/signal.c
4745 @@ -756,7 +756,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
4746 /* Check valid user FS if needed */
4747 addr_limit_user_check();
4749 - if (thread_flags & _TIF_NEED_RESCHED) {
4750 + if (thread_flags & _TIF_NEED_RESCHED_MASK) {
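
_TIF_NEED_RESCHED_MASK collapses both flags on the return-to-user path: lazy preemption only defers preemption inside the kernel, so once control is about to leave the kernel even a lazy request must be acted on. Equivalent spelled-out form of the changed test:

    if (thread_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
            schedule();
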
4754 diff --git a/arch/blackfin/include/asm/spinlock_types.h b/arch/blackfin/include/asm/spinlock_types.h
4755 index 1a33608c958b..103b34d3dcf6 100644
4756 --- a/arch/blackfin/include/asm/spinlock_types.h
4757 +++ b/arch/blackfin/include/asm/spinlock_types.h
4759 #ifndef __ASM_SPINLOCK_TYPES_H
4760 #define __ASM_SPINLOCK_TYPES_H
4762 -#ifndef __LINUX_SPINLOCK_TYPES_H
4763 -# error "please don't include this file directly"
4766 #include <asm/rwlock.h>
4769 diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h
4770 index 7a906b5214a4..d8f596fec022 100644
4771 --- a/arch/hexagon/include/asm/spinlock_types.h
4772 +++ b/arch/hexagon/include/asm/spinlock_types.h
4774 #ifndef _ASM_SPINLOCK_TYPES_H
4775 #define _ASM_SPINLOCK_TYPES_H
4777 -#ifndef __LINUX_SPINLOCK_TYPES_H
4778 -# error "please don't include this file directly"
4782 volatile unsigned int lock;
4784 diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h
4785 index 6e345fefcdca..681408d6816f 100644
4786 --- a/arch/ia64/include/asm/spinlock_types.h
4787 +++ b/arch/ia64/include/asm/spinlock_types.h
4789 #ifndef _ASM_IA64_SPINLOCK_TYPES_H
4790 #define _ASM_IA64_SPINLOCK_TYPES_H
4792 -#ifndef __LINUX_SPINLOCK_TYPES_H
4793 -# error "please don't include this file directly"
4797 volatile unsigned int lock;
4799 diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
4800 index 555b11180156..6866201a7603 100644
4801 --- a/arch/ia64/kernel/mca.c
4802 +++ b/arch/ia64/kernel/mca.c
4803 @@ -1824,7 +1824,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
4806 p->state = TASK_UNINTERRUPTIBLE;
4807 - cpumask_set_cpu(cpu, &p->cpus_allowed);
4808 + cpumask_set_cpu(cpu, &p->cpus_mask);
4809 INIT_LIST_HEAD(&p->tasks);
4810 p->parent = p->real_parent = p->group_leader = p;
4811 INIT_LIST_HEAD(&p->children);
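
This and the following cpus_allowed fix-ups (mips, powerpc/spufs, tile) retarget callers at the split affinity representation used by this series: each task owns cpus_mask, and readers go through the cpus_ptr indirection, which migrate_disable() can temporarily aim at a one-CPU mask. Sketch of the arrangement these hunks assume (fields as named in the hunks; comments are interpretation):

    struct task_struct {
            /* ... */
            const cpumask_t *cpus_ptr;   /* what readers/schedulers consult */
            cpumask_t        cpus_mask;  /* the task's own affinity storage */
            /* normally cpus_ptr == &cpus_mask; migrate_disable() points it */
            /* at a single-CPU mask until migrate_enable()                  */
    };
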
4812 diff --git a/arch/m32r/include/asm/spinlock_types.h b/arch/m32r/include/asm/spinlock_types.h
4813 index bb0d17b64198..fc6afa42fe11 100644
4814 --- a/arch/m32r/include/asm/spinlock_types.h
4815 +++ b/arch/m32r/include/asm/spinlock_types.h
4817 #ifndef _ASM_M32R_SPINLOCK_TYPES_H
4818 #define _ASM_M32R_SPINLOCK_TYPES_H
4820 -#ifndef __LINUX_SPINLOCK_TYPES_H
4821 -# error "please don't include this file directly"
4827 diff --git a/arch/metag/include/asm/spinlock_types.h b/arch/metag/include/asm/spinlock_types.h
4828 index cd197f1bed59..adc26e9797c5 100644
4829 --- a/arch/metag/include/asm/spinlock_types.h
4830 +++ b/arch/metag/include/asm/spinlock_types.h
4832 #ifndef _ASM_METAG_SPINLOCK_TYPES_H
4833 #define _ASM_METAG_SPINLOCK_TYPES_H
4835 -#ifndef __LINUX_SPINLOCK_TYPES_H
4836 -# error "please don't include this file directly"
4840 volatile unsigned int lock;
4842 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
4843 index c82457b0e733..7bb1838508de 100644
4844 --- a/arch/mips/Kconfig
4845 +++ b/arch/mips/Kconfig
4846 @@ -2519,7 +2519,7 @@ config MIPS_ASID_BITS_VARIABLE
4849 bool "High Memory Support"
4850 - depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
4851 + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
4853 config CPU_SUPPORTS_HIGHMEM
4855 diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h
4856 index e610473d61b8..1428b4febbc9 100644
4857 --- a/arch/mips/include/asm/switch_to.h
4858 +++ b/arch/mips/include/asm/switch_to.h
4859 @@ -42,7 +42,7 @@ extern struct task_struct *ll_task;
4860 * inline to try to keep the overhead down. If we have been forced to run on
4861 * a "CPU" with an FPU because of a previous high level of FP computation,
4862 * but did not actually use the FPU during the most recent time-slice (CU1
4863 - * isn't set), we undo the restriction on cpus_allowed.
4864 + * isn't set), we undo the restriction on cpus_mask.
4866 * We're not calling set_cpus_allowed() here, because we have no need to
4867 * force prompt migration - we're already switching the current CPU to a
4868 @@ -57,7 +57,7 @@ do { \
4869 test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \
4870 (!(KSTK_STATUS(prev) & ST0_CU1))) { \
4871 clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \
4872 - prev->cpus_allowed = prev->thread.user_cpus_allowed; \
4873 + prev->cpus_mask = prev->thread.user_cpus_allowed; \
4875 next->thread.emulated_fp = 0; \
4877 diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
4878 index a7c0f97e4b0d..1a08428eedcf 100644
4879 --- a/arch/mips/kernel/mips-mt-fpaff.c
4880 +++ b/arch/mips/kernel/mips-mt-fpaff.c
4881 @@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
4885 - cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed);
4886 + cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
4887 cpumask_and(&mask, &allowed, cpu_active_mask);
4890 diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
4891 index 583aed906933..24ad7aaca5eb 100644
4892 --- a/arch/mips/kernel/traps.c
4893 +++ b/arch/mips/kernel/traps.c
4894 @@ -1193,12 +1193,12 @@ static void mt_ase_fp_affinity(void)
4895 * restricted the allowed set to exclude any CPUs with FPUs,
4896 * we'll skip the procedure.
4898 -	if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) {
4899 +	if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
4902 			current->thread.user_cpus_allowed
4903 -				= current->cpus_allowed;
4904 -			cpumask_and(&tmask, &current->cpus_allowed,
4905 +				= current->cpus_mask;
4906 +			cpumask_and(&tmask, &current->cpus_mask,
4908 set_cpus_allowed_ptr(current, &tmask);
4909 set_thread_flag(TIF_FPUBOUND);
4910 diff --git a/arch/mn10300/include/asm/spinlock_types.h b/arch/mn10300/include/asm/spinlock_types.h
4911 index 32abdc89bbc7..c45230a12d60 100644
4912 --- a/arch/mn10300/include/asm/spinlock_types.h
4913 +++ b/arch/mn10300/include/asm/spinlock_types.h
4915 #ifndef _ASM_SPINLOCK_TYPES_H
4916 #define _ASM_SPINLOCK_TYPES_H
4918 -#ifndef __LINUX_SPINLOCK_TYPES_H
4919 -# error "please don't include this file directly"
4922 typedef struct arch_spinlock {
4925 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
4926 index fe418226df7f..b5658e925465 100644
4927 --- a/arch/powerpc/Kconfig
4928 +++ b/arch/powerpc/Kconfig
4929 @@ -111,10 +111,11 @@ config LOCKDEP_SUPPORT
4931 config RWSEM_GENERIC_SPINLOCK
4933 + default y if PREEMPT_RT_FULL
4935 config RWSEM_XCHGADD_ALGORITHM
4938 + default y if !PREEMPT_RT_FULL
4940 config GENERIC_LOCKBREAK
4942 @@ -215,6 +216,7 @@ config PPC
4943 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
4944 select HAVE_PERF_REGS
4945 select HAVE_PERF_USER_STACK_DUMP
4946 + select HAVE_PREEMPT_LAZY
4947 select HAVE_RCU_TABLE_FREE if SMP
4948 select HAVE_REGS_AND_STACK_ACCESS_API
4949 select HAVE_SYSCALL_TRACEPOINTS
4950 @@ -390,7 +392,7 @@ menu "Kernel options"
4953 bool "High memory support"
4955 + depends on PPC32 && !PREEMPT_RT_FULL
4957 source kernel/Kconfig.hz
4958 source kernel/Kconfig.preempt
4959 diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
4960 index 87adaf13b7e8..7305cb6a53e4 100644
4961 --- a/arch/powerpc/include/asm/spinlock_types.h
4962 +++ b/arch/powerpc/include/asm/spinlock_types.h
4964 #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H
4965 #define _ASM_POWERPC_SPINLOCK_TYPES_H
4967 -#ifndef __LINUX_SPINLOCK_TYPES_H
4968 -# error "please don't include this file directly"
4972 volatile unsigned int slock;
4974 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
4975 index a264c3ad366b..020afb8329a1 100644
4976 --- a/arch/powerpc/include/asm/thread_info.h
4977 +++ b/arch/powerpc/include/asm/thread_info.h
4978 @@ -36,6 +36,8 @@ struct thread_info {
4979 int cpu; /* cpu we're on */
4980 int preempt_count; /* 0 => preemptable,
4982 + int preempt_lazy_count; /* 0 => preemptable,
4984 unsigned long local_flags; /* private flags for thread */
4985 #ifdef CONFIG_LIVEPATCH
4986 unsigned long *livepatch_sp;
4987 @@ -81,8 +83,7 @@ static inline struct thread_info *current_thread_info(void)
4988 #define TIF_SYSCALL_TRACE 0 /* syscall trace active */
4989 #define TIF_SIGPENDING 1 /* signal pending */
4990 #define TIF_NEED_RESCHED 2 /* rescheduling necessary */
4991 -#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling
4992 - TIF_NEED_RESCHED */
4993 +#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling necessary */
4994 #define TIF_32BIT 4 /* 32 bit binary */
4995 #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */
4996 #define TIF_PATCH_PENDING 6 /* pending live patching update */
4997 @@ -101,6 +102,8 @@ static inline struct thread_info *current_thread_info(void)
4998 #if defined(CONFIG_PPC64)
4999 #define TIF_ELF2ABI 18 /* function descriptors must die! */
5001 +#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling
5002 + TIF_NEED_RESCHED */
5004 /* as above, but as bit values */
5005 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
5006 @@ -120,14 +123,16 @@ static inline struct thread_info *current_thread_info(void)
5007 #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
5008 #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE)
5009 #define _TIF_NOHZ (1<<TIF_NOHZ)
5010 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
5011 #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
5012 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
5015 #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
5016 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
5017 - _TIF_RESTORE_TM | _TIF_PATCH_PENDING)
5018 + _TIF_RESTORE_TM | _TIF_PATCH_PENDING | _TIF_NEED_RESCHED_LAZY)
5019 #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
5020 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
5022 /* Bits in local_flags */
5023 /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
5024 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
5025 index 2e5ea300258a..a2cb40098d7c 100644
5026 --- a/arch/powerpc/kernel/asm-offsets.c
5027 +++ b/arch/powerpc/kernel/asm-offsets.c
5028 @@ -156,6 +156,7 @@ int main(void)
5029 OFFSET(TI_FLAGS, thread_info, flags);
5030 OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags);
5031 OFFSET(TI_PREEMPT, thread_info, preempt_count);
5032 + OFFSET(TI_PREEMPT_LAZY, thread_info, preempt_lazy_count);
5033 OFFSET(TI_TASK, thread_info, task);
5034 OFFSET(TI_CPU, thread_info, cpu);
5036 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
5037 index e780e1fbf6c2..dc7fe90ff6a9 100644
5038 --- a/arch/powerpc/kernel/entry_32.S
5039 +++ b/arch/powerpc/kernel/entry_32.S
5040 @@ -866,7 +866,14 @@ resume_kernel:
5041 cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
5043 andi. r8,r8,_TIF_NEED_RESCHED
5045 + lwz r0,TI_PREEMPT_LAZY(r9)
5046 + cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
5048 + lwz r0,TI_FLAGS(r9)
5049 + andi. r0,r0,_TIF_NEED_RESCHED_LAZY
5053 andi. r0,r3,MSR_EE /* interrupts off? */
5054 beq restore /* don't schedule if so */
5055 @@ -877,11 +884,11 @@ resume_kernel:
5057 bl trace_hardirqs_off
5059 -1: bl preempt_schedule_irq
5060 +2: bl preempt_schedule_irq
5061 CURRENT_THREAD_INFO(r9, r1)
5063 - andi. r0,r3,_TIF_NEED_RESCHED
5065 + andi. r0,r3,_TIF_NEED_RESCHED_MASK
5067 #ifdef CONFIG_TRACE_IRQFLAGS
5068 /* And now, to properly rebalance the above, we tell lockdep they
5069 * are being turned back on, which will happen when we return
5070 @@ -1204,7 +1211,7 @@ global_dbcr0:
5071 #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
5073 do_work: /* r10 contains MSR_KERNEL here */
5074 - andi. r0,r9,_TIF_NEED_RESCHED
5075 + andi. r0,r9,_TIF_NEED_RESCHED_MASK
5078 do_resched: /* r10 contains MSR_KERNEL here */
5079 @@ -1225,7 +1232,7 @@ recheck:
5080 MTMSRD(r10) /* disable interrupts */
5081 CURRENT_THREAD_INFO(r9, r1)
5083 - andi. r0,r9,_TIF_NEED_RESCHED
5084 + andi. r0,r9,_TIF_NEED_RESCHED_MASK
5086 andi. r0,r9,_TIF_USER_WORK_MASK
5088 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
5089 index c194f4c8e66b..117c1f6cab66 100644
5090 --- a/arch/powerpc/kernel/entry_64.S
5091 +++ b/arch/powerpc/kernel/entry_64.S
5092 @@ -690,7 +690,7 @@ _GLOBAL(ret_from_except_lite)
5096 -1: andi. r0,r4,_TIF_NEED_RESCHED
5097 +1: andi. r0,r4,_TIF_NEED_RESCHED_MASK
5099 bl restore_interrupts
5101 @@ -752,10 +752,18 @@ resume_kernel:
5103 #ifdef CONFIG_PREEMPT
5104 /* Check if we need to preempt */
5105 + lwz r8,TI_PREEMPT(r9)
5106 + cmpwi 0,r8,0 /* if non-zero, just restore regs and return */
5108 andi. r0,r4,_TIF_NEED_RESCHED
5111 + andi. r0,r4,_TIF_NEED_RESCHED_LAZY
5113 + lwz r8,TI_PREEMPT_LAZY(r9)
5115 /* Check that preempt_count() == 0 and interrupts are enabled */
5116 - lwz r8,TI_PREEMPT(r9)
5121 @@ -772,7 +780,7 @@ resume_kernel:
5122 /* Re-test flags and eventually loop */
5123 CURRENT_THREAD_INFO(r9, r1)
5125 - andi. r0,r4,_TIF_NEED_RESCHED
5126 + andi. r0,r4,_TIF_NEED_RESCHED_MASK
5130 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
5131 index 0ce8b0e5d7ba..375adb3048fc 100644
5132 --- a/arch/powerpc/kernel/irq.c
5133 +++ b/arch/powerpc/kernel/irq.c
5134 @@ -693,6 +693,7 @@ void irq_ctx_init(void)
5138 +#ifndef CONFIG_PREEMPT_RT_FULL
5139 void do_softirq_own_stack(void)
5141 struct thread_info *curtp, *irqtp;
5142 @@ -710,6 +711,7 @@ void do_softirq_own_stack(void)
5144 set_bits(irqtp->flags, &curtp->flags);
5148 irq_hw_number_t virq_to_hw(unsigned int virq)
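
On PREEMPT_RT_FULL softirqs run in thread context (in the raising task or ksoftirqd), never on a borrowed per-CPU stack, so the arch helper that switches stacks is compiled out; the same #ifndef CONFIG_PREEMPT_RT_FULL guard recurs below for misc_32.S/misc_64.S and for the sh and sparc copies. Guard shape, for reference:

    #ifndef CONFIG_PREEMPT_RT_FULL
    void do_softirq_own_stack(void)
    {
            /* switch to the per-CPU softirq stack, run __do_softirq(), switch back */
    }
    #endif
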
5150 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
5151 index 3f7a9a2d2435..1795359d27b6 100644
5152 --- a/arch/powerpc/kernel/misc_32.S
5153 +++ b/arch/powerpc/kernel/misc_32.S
5155 * We store the saved ksp_limit in the unused part
5156 * of the STACK_FRAME_OVERHEAD
5158 +#ifndef CONFIG_PREEMPT_RT_FULL
5159 _GLOBAL(call_do_softirq)
5162 @@ -57,6 +58,7 @@ _GLOBAL(call_do_softirq)
5163 stw r10,THREAD+KSP_LIMIT(r2)
5169 * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
5170 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
5171 index 3280953a82cf..dd2a80d190c4 100644
5172 --- a/arch/powerpc/kernel/misc_64.S
5173 +++ b/arch/powerpc/kernel/misc_64.S
5178 +#ifndef CONFIG_PREEMPT_RT_FULL
5179 _GLOBAL(call_do_softirq)
5182 @@ -41,6 +42,7 @@ _GLOBAL(call_do_softirq)
5188 _GLOBAL(call_do_irq)
5190 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
5191 index 648160334abf..9d24331fc9b4 100644
5192 --- a/arch/powerpc/kvm/Kconfig
5193 +++ b/arch/powerpc/kvm/Kconfig
5194 @@ -177,6 +177,7 @@ config KVM_E500MC
5196 bool "KVM in-kernel MPIC emulation"
5197 depends on KVM && E500
5198 + depends on !PREEMPT_RT_FULL
5199 select HAVE_KVM_IRQCHIP
5200 select HAVE_KVM_IRQFD
5201 select HAVE_KVM_IRQ_ROUTING
5202 diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
5203 index 1fbb5da17dd2..ca86366d5424 100644
5204 --- a/arch/powerpc/platforms/cell/spufs/sched.c
5205 +++ b/arch/powerpc/platforms/cell/spufs/sched.c
5206 @@ -141,7 +141,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
5207 * runqueue. The context will be rescheduled on the proper node
5208 * if it is timesliced or preempted.
5210 -	cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed);
5211 + cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
5213 /* Save the current cpu id for spu interrupt routing. */
5214 ctx->last_ran = raw_smp_processor_id();
5215 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
5216 index e48462447ff0..2670cee66064 100644
5217 --- a/arch/powerpc/platforms/ps3/device-init.c
5218 +++ b/arch/powerpc/platforms/ps3/device-init.c
5219 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
5221 pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
5223 - res = wait_event_interruptible(dev->done.wait,
5224 + res = swait_event_interruptible(dev->done.wait,
5225 dev->done.done || kthread_should_stop());
5226 if (kthread_should_stop())
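
dev->done here is a struct completion, and this series reimplements completions on top of simple waitqueues (swait), whose wake side uses a raw lock and is therefore safe from the driver's hard-IRQ notification handler on RT; the open-coded wait on dev->done.wait has to follow suit. Minimal sketch of the swait pairing (demo_* names are illustrative):

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
    static bool demo_done;

    static int demo_wait(void)              /* sleeps until woken */
    {
            return swait_event_interruptible(demo_wq, demo_done);
    }

    static void demo_complete(void)         /* safe from hard-IRQ context on RT */
    {
            demo_done = true;
            swake_up(&demo_wq);
    }
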
5228 diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h
5229 index 1861a0c5dd47..74092ebaca3c 100644
5230 --- a/arch/s390/include/asm/spinlock_types.h
5231 +++ b/arch/s390/include/asm/spinlock_types.h
5233 #ifndef __ASM_SPINLOCK_TYPES_H
5234 #define __ASM_SPINLOCK_TYPES_H
5236 -#ifndef __LINUX_SPINLOCK_TYPES_H
5237 -# error "please don't include this file directly"
5242 } __attribute__ ((aligned (4))) arch_spinlock_t;
5243 diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h
5244 index e82369f286a2..22ca9a98bbb8 100644
5245 --- a/arch/sh/include/asm/spinlock_types.h
5246 +++ b/arch/sh/include/asm/spinlock_types.h
5248 #ifndef __ASM_SH_SPINLOCK_TYPES_H
5249 #define __ASM_SH_SPINLOCK_TYPES_H
5251 -#ifndef __LINUX_SPINLOCK_TYPES_H
5252 -# error "please don't include this file directly"
5256 volatile unsigned int lock;
5258 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
5259 index 245dbeb20afe..e298c82d2a69 100644
5260 --- a/arch/sh/kernel/irq.c
5261 +++ b/arch/sh/kernel/irq.c
5262 @@ -148,6 +148,7 @@ void irq_ctx_exit(int cpu)
5263 hardirq_ctx[cpu] = NULL;
5266 +#ifndef CONFIG_PREEMPT_RT_FULL
5267 void do_softirq_own_stack(void)
5269 struct thread_info *curctx;
5270 @@ -175,6 +176,7 @@ void do_softirq_own_stack(void)
5271 "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
5276 static inline void handle_one_irq(unsigned int irq)
5278 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
5279 index 4e83f950713e..7f9d71523763 100644
5280 --- a/arch/sparc/Kconfig
5281 +++ b/arch/sparc/Kconfig
5282 @@ -206,12 +206,10 @@ config NR_CPUS
5283 source kernel/Kconfig.hz
5285 config RWSEM_GENERIC_SPINLOCK
5287 - default y if SPARC32
5288 + def_bool PREEMPT_RT_FULL
5290 config RWSEM_XCHGADD_ALGORITHM
5292 - default y if SPARC64
5293 + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
5295 config GENERIC_HWEIGHT
5297 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
5298 index d66dde833f5e..f87b3f8f4d43 100644
5299 --- a/arch/sparc/kernel/irq_64.c
5300 +++ b/arch/sparc/kernel/irq_64.c
5301 @@ -855,6 +855,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
5302 set_irq_regs(old_regs);
5305 +#ifndef CONFIG_PREEMPT_RT_FULL
5306 void do_softirq_own_stack(void)
5308 void *orig_sp, *sp = softirq_stack[smp_processor_id()];
5309 @@ -869,6 +870,7 @@ void do_softirq_own_stack(void)
5310 __asm__ __volatile__("mov %0, %%sp"
5315 #ifdef CONFIG_HOTPLUG_CPU
5316 void fixup_irqs(void)
5317 diff --git a/arch/tile/include/asm/setup.h b/arch/tile/include/asm/setup.h
5318 index 2a0347af0702..670fa2f4cfc3 100644
5319 --- a/arch/tile/include/asm/setup.h
5320 +++ b/arch/tile/include/asm/setup.h
5321 @@ -49,7 +49,7 @@ int hardwall_ipi_valid(int cpu);
5323 /* Hook hardwall code into changes in affinity. */
5324 #define arch_set_cpus_allowed(p, new_mask) do { \
5325 - if (!cpumask_equal(&p->cpus_allowed, new_mask)) \
5326 + if (!cpumask_equal(p->cpus_ptr, new_mask)) \
5327 hardwall_deactivate_all(p); \
5330 diff --git a/arch/tile/include/asm/spinlock_types.h b/arch/tile/include/asm/spinlock_types.h
5331 index a71f59b49c50..9311c6ff2abc 100644
5332 --- a/arch/tile/include/asm/spinlock_types.h
5333 +++ b/arch/tile/include/asm/spinlock_types.h
5335 #ifndef _ASM_TILE_SPINLOCK_TYPES_H
5336 #define _ASM_TILE_SPINLOCK_TYPES_H
5338 -#ifndef __LINUX_SPINLOCK_TYPES_H
5339 -# error "please don't include this file directly"
5344 /* Low 15 bits are "next"; high 15 bits are "current". */
5345 diff --git a/arch/tile/kernel/hardwall.c b/arch/tile/kernel/hardwall.c
5346 index 2fd1694ac1d0..98f4fb696289 100644
5347 --- a/arch/tile/kernel/hardwall.c
5348 +++ b/arch/tile/kernel/hardwall.c
5349 @@ -590,12 +590,12 @@ static int hardwall_activate(struct hardwall_info *info)
5350 * Get our affinity; if we're not bound to this tile uniquely,
5351 * we can't access the network registers.
5353 - if (cpumask_weight(&p->cpus_allowed) != 1)
5354 + if (p->nr_cpus_allowed != 1)
5357 /* Make sure we are bound to a cpu assigned to this resource. */
5358 cpu = smp_processor_id();
5359 - BUG_ON(cpumask_first(&p->cpus_allowed) != cpu);
5360 + BUG_ON(cpumask_first(p->cpus_ptr) != cpu);
5361 if (!cpumask_test_cpu(cpu, &info->cpumask))
5364 @@ -621,17 +621,17 @@ static int hardwall_activate(struct hardwall_info *info)
5365 * Deactivate a task's hardwall. Must hold lock for hardwall_type.
5366 * This method may be called from exit_thread(), so we don't want to
5367 * rely on too many fields of struct task_struct still being valid.
5368 - * We assume the cpus_allowed, pid, and comm fields are still valid.
5369 + * We assume the nr_cpus_allowed, pid, and comm fields are still valid.
5371 static void _hardwall_deactivate(struct hardwall_type *hwt,
5372 struct task_struct *task)
5374 struct thread_struct *ts = &task->thread;
5376 - if (cpumask_weight(&task->cpus_allowed) != 1) {
5377 + if (task->nr_cpus_allowed != 1) {
5378 pr_err("pid %d (%s) releasing %s hardwall with an affinity mask containing %d cpus!\n",
5379 task->pid, task->comm, hwt->name,
5380 - cpumask_weight(&task->cpus_allowed));
5381 + task->nr_cpus_allowed);
5385 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
5386 index 2af0af33362a..7764f936d6ab 100644
5387 --- a/arch/x86/Kconfig
5388 +++ b/arch/x86/Kconfig
5389 @@ -169,6 +169,7 @@ config X86
5390 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
5391 select HAVE_PERF_REGS
5392 select HAVE_PERF_USER_STACK_DUMP
5393 + select HAVE_PREEMPT_LAZY
5394 select HAVE_RCU_TABLE_FREE
5395 select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE
5396 select HAVE_REGS_AND_STACK_ACCESS_API
5397 @@ -257,8 +258,11 @@ config ARCH_MAY_HAVE_PC_FDC
5399 depends on ISA_DMA_API
5401 +config RWSEM_GENERIC_SPINLOCK
5402 + def_bool PREEMPT_RT_FULL
5404 config RWSEM_XCHGADD_ALGORITHM
5406 + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
5408 config GENERIC_CALIBRATE_DELAY
5410 @@ -933,7 +937,7 @@ config IOMMU_HELPER
5412 bool "Enable Maximum number of SMP Processors and NUMA Nodes"
5413 depends on X86_64 && SMP && DEBUG_KERNEL
5414 - select CPUMASK_OFFSTACK
5415 + select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
5417 Enable maximum number of CPUS and NUMA Nodes for this architecture.
5419 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
5420 index c690ddc78c03..7a3138d33e33 100644
5421 --- a/arch/x86/crypto/aesni-intel_glue.c
5422 +++ b/arch/x86/crypto/aesni-intel_glue.c
5423 @@ -387,14 +387,14 @@ static int ecb_encrypt(struct skcipher_request *req)
5425 err = skcipher_walk_virt(&walk, req, true);
5427 - kernel_fpu_begin();
5428 while ((nbytes = walk.nbytes)) {
5429 + kernel_fpu_begin();
5430 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5431 nbytes & AES_BLOCK_MASK);
5433 nbytes &= AES_BLOCK_SIZE - 1;
5434 err = skcipher_walk_done(&walk, nbytes);
5440 @@ -409,14 +409,14 @@ static int ecb_decrypt(struct skcipher_request *req)
5442 err = skcipher_walk_virt(&walk, req, true);
5444 - kernel_fpu_begin();
5445 while ((nbytes = walk.nbytes)) {
5446 + kernel_fpu_begin();
5447 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5448 nbytes & AES_BLOCK_MASK);
5450 nbytes &= AES_BLOCK_SIZE - 1;
5451 err = skcipher_walk_done(&walk, nbytes);
5457 @@ -431,14 +431,14 @@ static int cbc_encrypt(struct skcipher_request *req)
5459 err = skcipher_walk_virt(&walk, req, true);
5461 - kernel_fpu_begin();
5462 while ((nbytes = walk.nbytes)) {
5463 + kernel_fpu_begin();
5464 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5465 nbytes & AES_BLOCK_MASK, walk.iv);
5467 nbytes &= AES_BLOCK_SIZE - 1;
5468 err = skcipher_walk_done(&walk, nbytes);
5474 @@ -453,14 +453,14 @@ static int cbc_decrypt(struct skcipher_request *req)
5476 err = skcipher_walk_virt(&walk, req, true);
5478 - kernel_fpu_begin();
5479 while ((nbytes = walk.nbytes)) {
5480 + kernel_fpu_begin();
5481 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5482 nbytes & AES_BLOCK_MASK, walk.iv);
5484 nbytes &= AES_BLOCK_SIZE - 1;
5485 err = skcipher_walk_done(&walk, nbytes);
5491 @@ -510,18 +510,20 @@ static int ctr_crypt(struct skcipher_request *req)
5493 err = skcipher_walk_virt(&walk, req, true);
5495 - kernel_fpu_begin();
5496 while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
5497 + kernel_fpu_begin();
5498 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5499 nbytes & AES_BLOCK_MASK, walk.iv);
5501 nbytes &= AES_BLOCK_SIZE - 1;
5502 err = skcipher_walk_done(&walk, nbytes);
5505 + kernel_fpu_begin();
5506 ctr_crypt_final(ctx, &walk);
5508 err = skcipher_walk_done(&walk, 0);
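
Hoisting kernel_fpu_begin() into the loop does not change what is computed; it bounds the preempt-disabled FPU region to a single scatterlist chunk and lets skcipher_walk_done(), which can allocate or fault, run with preemption enabled. The resulting loop shape, condensed:

    while ((nbytes = walk.nbytes)) {
            kernel_fpu_begin();             /* preempt off for this chunk only */
            /* ... SIMD work on walk.src/walk.dst ... */
            kernel_fpu_end();               /* preempt on before walk_done()   */
            err = skcipher_walk_done(&walk, nbytes & (AES_BLOCK_SIZE - 1));
    }
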
5514 diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
5515 index 60907c139c4e..0902db7d326a 100644
5516 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
5517 +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
5518 @@ -206,6 +206,20 @@ struct crypt_priv {
5522 +#ifdef CONFIG_PREEMPT_RT_FULL
5523 +static void camellia_fpu_end_rt(struct crypt_priv *ctx)
5525 + bool fpu_enabled = ctx->fpu_enabled;
5529 + camellia_fpu_end(fpu_enabled);
5530 + ctx->fpu_enabled = false;
5533 +static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
5536 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5538 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
5539 @@ -221,16 +235,19 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5542 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
5543 + kernel_fpu_resched();
5544 camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
5545 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5546 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5549 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5550 + kernel_fpu_resched();
5551 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
5552 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5553 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5555 + camellia_fpu_end_rt(ctx);
5557 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5558 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
5559 @@ -251,16 +268,19 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5562 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
5563 + kernel_fpu_resched();
5564 camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
5565 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5566 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5569 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5570 + kernel_fpu_resched();
5571 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
5572 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5573 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5575 + camellia_fpu_end_rt(ctx);
5577 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5578 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
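The camellia callbacks keep the FPU across the run of parallel chunks but call kernel_fpu_resched() once per chunk, then drop the FPU section entirely before the scalar tail. In outline, a sketch using the same names as the hunks above:

    while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
        kernel_fpu_resched();        /* end/begin only if a resched is due */
        camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
        srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
        nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
    }
    camellia_fpu_end_rt(ctx);        /* no-op unless PREEMPT_RT_FULL */
    for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
        camellia_enc_blk(ctx->ctx, srcdst, srcdst);  /* scalar tail, no FPU */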
5579 diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
5580 index d96429da88eb..3b8e91841039 100644
5581 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c
5582 +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
5583 @@ -210,6 +210,21 @@ struct crypt_priv {
5587 +#ifdef CONFIG_PREEMPT_RT_FULL
5588 +static void camellia_fpu_end_rt(struct crypt_priv *ctx)
5589 +{
5590 + bool fpu_enabled = ctx->fpu_enabled;
5591 +
5592 + if (!fpu_enabled)
5593 + return;
5594 + camellia_fpu_end(fpu_enabled);
5595 + ctx->fpu_enabled = false;
5596 +}
5597 +
5598 +#else
5599 +static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
5600 +#endif
5601 +
5602 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5604 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
5605 @@ -225,10 +240,12 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5608 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5609 + kernel_fpu_resched();
5610 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
5611 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5612 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5614 + camellia_fpu_end_rt(ctx);
5616 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5617 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
5618 @@ -249,10 +266,12 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5621 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5622 + kernel_fpu_resched();
5623 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
5624 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5625 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5627 + camellia_fpu_end_rt(ctx);
5629 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5630 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
5631 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
5632 index 575292a33bdf..0a4b0a222b18 100644
5633 --- a/arch/x86/crypto/cast5_avx_glue.c
5634 +++ b/arch/x86/crypto/cast5_avx_glue.c
5635 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
5636 static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
5639 - bool fpu_enabled = false;
5640 + bool fpu_enabled;
5641 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
5642 const unsigned int bsize = CAST5_BLOCK_SIZE;
5643 unsigned int nbytes;
5644 @@ -73,7 +73,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
5645 u8 *wsrc = walk->src.virt.addr;
5646 u8 *wdst = walk->dst.virt.addr;
5648 - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
5649 + fpu_enabled = cast5_fpu_begin(false, nbytes);
5651 /* Process multi-block batch */
5652 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
5653 @@ -102,10 +102,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
5654 } while (nbytes >= bsize);
5657 + cast5_fpu_end(fpu_enabled);
5658 err = blkcipher_walk_done(desc, walk, nbytes);
5661 - cast5_fpu_end(fpu_enabled);
5665 @@ -226,7 +225,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
5666 static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5667 struct scatterlist *src, unsigned int nbytes)
5669 - bool fpu_enabled = false;
5670 + bool fpu_enabled;
5671 struct blkcipher_walk walk;
5674 @@ -235,12 +234,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5675 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
5677 while ((nbytes = walk.nbytes)) {
5678 - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
5679 + fpu_enabled = cast5_fpu_begin(false, nbytes);
5680 nbytes = __cbc_decrypt(desc, &walk);
5681 + cast5_fpu_end(fpu_enabled);
5682 err = blkcipher_walk_done(desc, &walk, nbytes);
5685 - cast5_fpu_end(fpu_enabled);
5689 @@ -309,7 +307,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
5690 static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5691 struct scatterlist *src, unsigned int nbytes)
5693 - bool fpu_enabled = false;
5694 + bool fpu_enabled;
5695 struct blkcipher_walk walk;
5698 @@ -318,13 +316,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5699 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
5701 while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
5702 - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
5703 + fpu_enabled = cast5_fpu_begin(false, nbytes);
5704 nbytes = __ctr_crypt(desc, &walk);
5705 + cast5_fpu_end(fpu_enabled);
5706 err = blkcipher_walk_done(desc, &walk, nbytes);
5709 - cast5_fpu_end(fpu_enabled);
5712 ctr_crypt_final(desc, &walk);
5713 err = blkcipher_walk_done(desc, &walk, 0);
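Passing false instead of the accumulated fpu_enabled value means each walk iteration makes a fresh decision to open its own FPU section, and cast5_fpu_end() now runs inside the loop, before blkcipher_walk_done(), rather than once after it. A sketch of the reshaped loop:

    while ((nbytes = walk.nbytes)) {
        fpu_enabled = cast5_fpu_begin(false, nbytes); /* per-step decision */
        nbytes = __cbc_decrypt(desc, &walk);
        cast5_fpu_end(fpu_enabled);                   /* end before walk_done() */
        err = blkcipher_walk_done(desc, &walk, nbytes);
    }

The same restructuring is applied to ecb_crypt() and ctr_crypt() above.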
5714 diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
5715 index 50e684768c55..8caf9ba8c1da 100644
5716 --- a/arch/x86/crypto/cast6_avx_glue.c
5717 +++ b/arch/x86/crypto/cast6_avx_glue.c
5718 @@ -205,19 +205,33 @@ struct crypt_priv {
5722 +#ifdef CONFIG_PREEMPT_RT_FULL
5723 +static void cast6_fpu_end_rt(struct crypt_priv *ctx)
5724 +{
5725 + bool fpu_enabled = ctx->fpu_enabled;
5726 +
5727 + if (!fpu_enabled)
5728 + return;
5729 + cast6_fpu_end(fpu_enabled);
5730 + ctx->fpu_enabled = false;
5731 +}
5732 +
5733 +#else
5734 +static void cast6_fpu_end_rt(struct crypt_priv *ctx) { }
5735 +#endif
5736 +
5737 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5739 const unsigned int bsize = CAST6_BLOCK_SIZE;
5740 struct crypt_priv *ctx = priv;
5743 - ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5745 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
5746 + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5747 cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
5748 + cast6_fpu_end_rt(ctx);
5749 return;
5750 }
5752 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5753 __cast6_encrypt(ctx->ctx, srcdst, srcdst);
5755 @@ -228,10 +242,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5756 struct crypt_priv *ctx = priv;
5759 - ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5761 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
5762 + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5763 cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
5764 + cast6_fpu_end_rt(ctx);
5765 return;
5766 }
5768 diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
5769 index 1e6af1b35f7b..e7809fd2a4fd 100644
5770 --- a/arch/x86/crypto/chacha20_glue.c
5771 +++ b/arch/x86/crypto/chacha20_glue.c
5772 @@ -81,23 +81,24 @@ static int chacha20_simd(struct skcipher_request *req)
5774 crypto_chacha20_init(state, ctx, walk.iv);
5776 - kernel_fpu_begin();
5778 while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
5779 + kernel_fpu_begin();
5781 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
5782 rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
5783 + kernel_fpu_end();
5784 err = skcipher_walk_done(&walk,
5785 walk.nbytes % CHACHA20_BLOCK_SIZE);
5786 }
5788 if (walk.nbytes) {
5789 + kernel_fpu_begin();
5790 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
5791 walk.nbytes);
5792 + kernel_fpu_end();
5793 err = skcipher_walk_done(&walk, 0);
5801 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
5802 index d61e57960fe0..c67560d9718a 100644
5803 --- a/arch/x86/crypto/glue_helper.c
5804 +++ b/arch/x86/crypto/glue_helper.c
5805 @@ -40,7 +40,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
5806 void *ctx = crypto_blkcipher_ctx(desc->tfm);
5807 const unsigned int bsize = 128 / 8;
5808 unsigned int nbytes, i, func_bytes;
5809 - bool fpu_enabled = false;
5810 + bool fpu_enabled;
5813 err = blkcipher_walk_virt(desc, walk);
5814 @@ -50,7 +50,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
5815 u8 *wdst = walk->dst.virt.addr;
5817 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5818 - desc, fpu_enabled, nbytes);
5819 + desc, false, nbytes);
5821 for (i = 0; i < gctx->num_funcs; i++) {
5822 func_bytes = bsize * gctx->funcs[i].num_blocks;
5823 @@ -72,10 +72,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
5827 + glue_fpu_end(fpu_enabled);
5828 err = blkcipher_walk_done(desc, walk, nbytes);
5831 - glue_fpu_end(fpu_enabled);
5835 @@ -192,7 +192,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
5836 struct scatterlist *src, unsigned int nbytes)
5838 const unsigned int bsize = 128 / 8;
5839 - bool fpu_enabled = false;
5840 + bool fpu_enabled;
5841 struct blkcipher_walk walk;
5844 @@ -201,12 +201,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
5846 while ((nbytes = walk.nbytes)) {
5847 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5848 - desc, fpu_enabled, nbytes);
5849 + desc, false, nbytes);
5850 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
5851 + glue_fpu_end(fpu_enabled);
5852 err = blkcipher_walk_done(desc, &walk, nbytes);
5855 - glue_fpu_end(fpu_enabled);
5858 EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
5859 @@ -275,7 +275,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
5860 struct scatterlist *src, unsigned int nbytes)
5862 const unsigned int bsize = 128 / 8;
5863 - bool fpu_enabled = false;
5864 + bool fpu_enabled;
5865 struct blkcipher_walk walk;
5868 @@ -284,13 +284,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
5870 while ((nbytes = walk.nbytes) >= bsize) {
5871 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5872 - desc, fpu_enabled, nbytes);
5873 + desc, false, nbytes);
5874 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
5875 + glue_fpu_end(fpu_enabled);
5876 err = blkcipher_walk_done(desc, &walk, nbytes);
5879 - glue_fpu_end(fpu_enabled);
5882 glue_ctr_crypt_final_128bit(
5883 gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
5884 @@ -380,7 +379,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
5885 void *tweak_ctx, void *crypt_ctx)
5887 const unsigned int bsize = 128 / 8;
5888 - bool fpu_enabled = false;
5889 + bool fpu_enabled;
5890 struct blkcipher_walk walk;
5893 @@ -393,21 +392,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
5895 /* set minimum length to bsize, for tweak_fn */
5896 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5897 - desc, fpu_enabled,
5898 + desc, false,
5899 nbytes < bsize ? bsize : nbytes);
5901 /* calculate first value of T */
5902 tweak_fn(tweak_ctx, walk.iv, walk.iv);
5903 + glue_fpu_end(fpu_enabled);
5906 + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5907 + desc, false, nbytes);
5908 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
5910 + glue_fpu_end(fpu_enabled);
5911 err = blkcipher_walk_done(desc, &walk, nbytes);
5912 nbytes = walk.nbytes;
5915 - glue_fpu_end(fpu_enabled);
5919 EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
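All the glue_helper paths are reshaped the same way: glue_fpu_begin() is handed false so that each walk iteration opens its own FPU section, and glue_fpu_end() moves inside the loop, ahead of blkcipher_walk_done(). Sketched for the CTR path:

    while ((nbytes = walk.nbytes) >= bsize) {
        fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
                                     desc, false, nbytes);
        nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
        glue_fpu_end(fpu_enabled);                 /* end per iteration */
        err = blkcipher_walk_done(desc, &walk, nbytes);
    }

XTS gets one extra begin/end pair so that computing the first tweak value is its own short FPU section, separate from the per-iteration sections of the main loop.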
5920 diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
5921 index 870f6d812a2d..5c806bf39f1d 100644
5922 --- a/arch/x86/crypto/serpent_avx2_glue.c
5923 +++ b/arch/x86/crypto/serpent_avx2_glue.c
5924 @@ -184,6 +184,21 @@ struct crypt_priv {
5928 +#ifdef CONFIG_PREEMPT_RT_FULL
5929 +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
5930 +{
5931 + bool fpu_enabled = ctx->fpu_enabled;
5932 +
5933 + if (!fpu_enabled)
5934 + return;
5935 + serpent_fpu_end(fpu_enabled);
5936 + ctx->fpu_enabled = false;
5937 +}
5938 +
5939 +#else
5940 +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
5941 +#endif
5942 +
5943 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5945 const unsigned int bsize = SERPENT_BLOCK_SIZE;
5946 @@ -199,10 +214,12 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5949 while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
5950 + kernel_fpu_resched();
5951 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
5952 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
5953 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
5955 + serpent_fpu_end_rt(ctx);
5957 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5958 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
5959 @@ -223,10 +240,12 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5962 while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
5963 + kernel_fpu_resched();
5964 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
5965 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
5966 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
5968 + serpent_fpu_end_rt(ctx);
5970 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5971 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
5972 diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
5973 index 6f778d3daa22..46dcbdbd0518 100644
5974 --- a/arch/x86/crypto/serpent_avx_glue.c
5975 +++ b/arch/x86/crypto/serpent_avx_glue.c
5976 @@ -218,16 +218,31 @@ struct crypt_priv {
5980 +#ifdef CONFIG_PREEMPT_RT_FULL
5981 +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
5982 +{
5983 + bool fpu_enabled = ctx->fpu_enabled;
5984 +
5985 + if (!fpu_enabled)
5986 + return;
5987 + serpent_fpu_end(fpu_enabled);
5988 + ctx->fpu_enabled = false;
5989 +}
5990 +
5991 +#else
5992 +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
5993 +#endif
5994 +
5995 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5997 const unsigned int bsize = SERPENT_BLOCK_SIZE;
5998 struct crypt_priv *ctx = priv;
6001 - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6003 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6004 + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6005 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
6006 + serpent_fpu_end_rt(ctx);
6007 return;
6008 }
6010 @@ -241,10 +256,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6011 struct crypt_priv *ctx = priv;
6014 - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6016 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6017 + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6018 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
6019 + serpent_fpu_end_rt(ctx);
6020 return;
6021 }
6023 diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
6024 index ac0e831943f5..d35f607d067f 100644
6025 --- a/arch/x86/crypto/serpent_sse2_glue.c
6026 +++ b/arch/x86/crypto/serpent_sse2_glue.c
6027 @@ -187,16 +187,31 @@ struct crypt_priv {
6031 +#ifdef CONFIG_PREEMPT_RT_FULL
6032 +static void serpent_fpu_end_rt(struct crypt_priv *ctx)
6033 +{
6034 + bool fpu_enabled = ctx->fpu_enabled;
6035 +
6036 + if (!fpu_enabled)
6037 + return;
6038 + serpent_fpu_end(fpu_enabled);
6039 + ctx->fpu_enabled = false;
6040 +}
6041 +
6042 +#else
6043 +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
6044 +#endif
6045 +
6046 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6048 const unsigned int bsize = SERPENT_BLOCK_SIZE;
6049 struct crypt_priv *ctx = priv;
6052 - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6054 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6055 + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6056 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
6057 + serpent_fpu_end_rt(ctx);
6058 return;
6059 }
6061 @@ -210,10 +225,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6062 struct crypt_priv *ctx = priv;
6065 - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6067 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6068 + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6069 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
6070 + serpent_fpu_end_rt(ctx);
6071 return;
6072 }
6074 diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
6075 index b7a3904b953c..de00fe24927e 100644
6076 --- a/arch/x86/crypto/twofish_avx_glue.c
6077 +++ b/arch/x86/crypto/twofish_avx_glue.c
6078 @@ -218,6 +218,21 @@ struct crypt_priv {
6082 +#ifdef CONFIG_PREEMPT_RT_FULL
6083 +static void twofish_fpu_end_rt(struct crypt_priv *ctx)
6084 +{
6085 + bool fpu_enabled = ctx->fpu_enabled;
6086 +
6087 + if (!fpu_enabled)
6088 + return;
6089 + twofish_fpu_end(fpu_enabled);
6090 + ctx->fpu_enabled = false;
6091 +}
6092 +
6093 +#else
6094 +static void twofish_fpu_end_rt(struct crypt_priv *ctx) { }
6095 +#endif
6096 +
6097 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6099 const unsigned int bsize = TF_BLOCK_SIZE;
6100 @@ -228,12 +243,16 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6102 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
6103 twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
6104 + twofish_fpu_end_rt(ctx);
6105 return;
6106 }
6108 - for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
6109 + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
6110 + kernel_fpu_resched();
6111 twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
6112 + }
6114 + twofish_fpu_end_rt(ctx);
6115 nbytes %= bsize * 3;
6117 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
6118 @@ -250,11 +269,15 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6120 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
6121 twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
6122 + twofish_fpu_end_rt(ctx);
6123 return;
6124 }
6126 - for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
6127 + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
6128 + kernel_fpu_resched();
6129 twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
6130 + }
6131 + twofish_fpu_end_rt(ctx);
6133 nbytes %= bsize * 3;
6135 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
6136 index 60e21ccfb6d6..0e27f35febe7 100644
6137 --- a/arch/x86/entry/common.c
6138 +++ b/arch/x86/entry/common.c
6139 @@ -133,7 +133,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
6141 #define EXIT_TO_USERMODE_LOOP_FLAGS \
6142 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
6143 - _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
6144 + _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
6146 static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
6148 @@ -148,9 +148,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
6149 /* We have work to do. */
6152 - if (cached_flags & _TIF_NEED_RESCHED)
6153 + if (cached_flags & _TIF_NEED_RESCHED_MASK)
6154 schedule();
6155 +
6156 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
6157 + if (unlikely(current->forced_info.si_signo)) {
6158 + struct task_struct *t = current;
6159 + force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
6160 + t->forced_info.si_signo = 0;
6161 + }
6162 +#endif
6163 if (cached_flags & _TIF_UPROBE)
6164 uprobe_notify_resume(regs);
6166 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
6167 index 60c4c342316c..cd0c7c56e2dd 100644
6168 --- a/arch/x86/entry/entry_32.S
6169 +++ b/arch/x86/entry/entry_32.S
6170 @@ -350,8 +350,25 @@ END(ret_from_exception)
6171 ENTRY(resume_kernel)
6172 DISABLE_INTERRUPTS(CLBR_ANY)
6174 + # preempt count == 0 + NEED_RS set?
6175 cmpl $0, PER_CPU_VAR(__preempt_count)
6176 +#ifndef CONFIG_PREEMPT_LAZY
6177 jnz restore_all
6178 +#else
6179 + jz test_int_off
6180 +
6181 + # at least preempt count == 0 ?
6182 + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
6183 + jne restore_all
6184 +
6185 + movl PER_CPU_VAR(current_task), %ebp
6186 + cmpl $0,TASK_TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ?
6187 + jnz restore_all
6188 +
6189 + testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
6190 + jz restore_all
6191 +test_int_off:
6192 +#endif
6193 testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
6195 call preempt_schedule_irq
6196 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
6197 index 164cd7529f0b..75d42cb8a7c9 100644
6198 --- a/arch/x86/entry/entry_64.S
6199 +++ b/arch/x86/entry/entry_64.S
6200 @@ -633,7 +633,23 @@ retint_kernel:
6201 btl $9, EFLAGS(%rsp) /* were interrupts off? */
6203 0: cmpl $0, PER_CPU_VAR(__preempt_count)
6204 +#ifndef CONFIG_PREEMPT_LAZY
6205 jnz 1f
6206 +#else
6207 + jz do_preempt_schedule_irq
6208 +
6209 + # at least preempt count == 0 ?
6210 + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
6211 + jnz 1f
6212 +
6213 + movq PER_CPU_VAR(current_task), %rcx
6214 + cmpl $0, TASK_TI_preempt_lazy_count(%rcx)
6215 + jnz 1f
6216 +
6217 + bt $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
6218 + jnc 1f
6219 +do_preempt_schedule_irq:
6220 +#endif
6221 call preempt_schedule_irq
6224 @@ -988,6 +1004,7 @@ bad_gs:
6228 +#ifndef CONFIG_PREEMPT_RT_FULL
6229 /* Call softirq on interrupt stack. Interrupts are off. */
6230 ENTRY(do_softirq_own_stack)
6232 @@ -998,6 +1015,7 @@ ENTRY(do_softirq_own_stack)
6235 ENDPROC(do_softirq_own_stack)
6239 idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
6240 diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
6241 index a9caac9d4a72..18b31f22ca5d 100644
6242 --- a/arch/x86/include/asm/fpu/api.h
6243 +++ b/arch/x86/include/asm/fpu/api.h
6244 @@ -25,6 +25,7 @@ extern void __kernel_fpu_begin(void);
6245 extern void __kernel_fpu_end(void);
6246 extern void kernel_fpu_begin(void);
6247 extern void kernel_fpu_end(void);
6248 +extern void kernel_fpu_resched(void);
6249 extern bool irq_fpu_usable(void);
6252 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
6253 index 7f2dbd91fc74..22992c837795 100644
6254 --- a/arch/x86/include/asm/preempt.h
6255 +++ b/arch/x86/include/asm/preempt.h
6256 @@ -86,17 +86,46 @@ static __always_inline void __preempt_count_sub(int val)
6257 * a decrement which hits zero means we have no preempt_count and should
6260 -static __always_inline bool __preempt_count_dec_and_test(void)
6261 +static __always_inline bool ____preempt_count_dec_and_test(void)
6263 GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
6266 +static __always_inline bool __preempt_count_dec_and_test(void)
6267 +{
6268 + if (____preempt_count_dec_and_test())
6269 + return true;
6270 +#ifdef CONFIG_PREEMPT_LAZY
6271 + if (current_thread_info()->preempt_lazy_count)
6272 + return false;
6273 + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
6274 +#else
6275 + return false;
6276 +#endif
6277 +}
6278 +
6279 /*
6280 * Returns true when we need to resched and can (barring IRQ state).
6282 static __always_inline bool should_resched(int preempt_offset)
6284 +#ifdef CONFIG_PREEMPT_LAZY
6285 + u32 tmp;
6286 +
6287 + tmp = raw_cpu_read_4(__preempt_count);
6288 + if (tmp == preempt_offset)
6289 + return true;
6290 +
6291 + /* preempt count == 0 ? */
6292 + tmp &= ~PREEMPT_NEED_RESCHED;
6293 + if (tmp)
6294 + return false;
6295 + if (current_thread_info()->preempt_lazy_count)
6296 + return false;
6297 + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
6298 +#else
6299 return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
6300 +#endif
6301 }
6303 #ifdef CONFIG_PREEMPT
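Condensed, the lazy-preemption rule these helpers implement is: a zero preempt count alone no longer forces a reschedule; TIF_NEED_RESCHED is always honored, while TIF_NEED_RESCHED_LAZY counts only when preempt_lazy_count is zero. A hypothetical predicate, not part of the patch itself, expressing just that rule:

    /* Hypothetical condensation of the lazy-preempt decision. */
    static inline bool lazy_resched_allowed(void)
    {
        if (test_thread_flag(TIF_NEED_RESCHED))
            return true;                 /* hard request: always honored */
        if (current_thread_info()->preempt_lazy_count)
            return false;                /* lazy preemption blocked here */
        return test_thread_flag(TIF_NEED_RESCHED_LAZY);
    }

The assembly in entry_32.S/entry_64.S above encodes the same checks on the interrupt-return path.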
6304 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
6305 index 5f9012ff52ed..39117e57caf2 100644
6306 --- a/arch/x86/include/asm/signal.h
6307 +++ b/arch/x86/include/asm/signal.h
6308 @@ -28,6 +28,19 @@ typedef struct {
6309 #define SA_IA32_ABI 0x02000000u
6310 #define SA_X32_ABI 0x01000000u
6313 + * Because some traps use the IST stack, we must keep preemption
6314 + * disabled while calling do_trap(), but do_trap() may call
6315 + * force_sig_info() which will grab the signal spin_locks for the
6316 + * task, which in PREEMPT_RT_FULL are mutexes. By defining
6317 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
6318 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
6321 +#if defined(CONFIG_PREEMPT_RT_FULL)
6322 +#define ARCH_RT_DELAYS_SIGNAL_SEND
6325 #ifndef CONFIG_COMPAT
6326 typedef sigset_t compat_sigset_t;
6328 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
6329 index 371b3a4af000..06613a805b25 100644
6330 --- a/arch/x86/include/asm/stackprotector.h
6331 +++ b/arch/x86/include/asm/stackprotector.h
6334 static __always_inline void boot_init_stack_canary(void)
6335 {
6336 - u64 canary;
6337 + u64 uninitialized_var(canary);
6338 u64 tsc;
6340 #ifdef CONFIG_X86_64
6341 @@ -71,8 +71,14 @@ static __always_inline void boot_init_stack_canary(void)
6342 * of randomness. The TSC only matters for very early init,
6343 * there it already has some randomness on most systems. Later
6344 * on during the bootup the random pool has true entropy too.
6345 + * For preempt-rt we need to weaken the randomness a bit, as
6346 + * we can't call into the random generator from atomic context
6347 + * due to locking constraints. We just leave canary
6348 + * uninitialized and use the TSC based randomness on top of it.
6350 +#ifndef CONFIG_PREEMPT_RT_FULL
6351 get_random_bytes(&canary, sizeof(canary));
6352 +#endif
6353 tsc = rdtsc();
6354 canary += tsc + (tsc << 32UL);
6355 canary &= CANARY_MASK;
6356 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
6357 index 95ff2d7f553f..b1c9129f64fc 100644
6358 --- a/arch/x86/include/asm/thread_info.h
6359 +++ b/arch/x86/include/asm/thread_info.h
6360 @@ -56,11 +56,14 @@ struct task_struct;
6361 struct thread_info {
6362 unsigned long flags; /* low level flags */
6363 u32 status; /* thread synchronous flags */
6364 + int preempt_lazy_count; /* 0 => lazy preemptable
6365 + <0 => BUG */
6368 #define INIT_THREAD_INFO(tsk) \
6369 { \
6370 .flags = 0, \
6371 + .preempt_lazy_count = 0, \
6372 }
6374 #define init_stack (init_thread_union.stack)
6375 @@ -69,6 +72,10 @@ struct thread_info {
6377 #include <asm/asm-offsets.h>
6379 +#define GET_THREAD_INFO(reg) \
6380 + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
6381 + _ASM_SUB $(THREAD_SIZE),reg ;
6386 @@ -85,6 +92,7 @@ struct thread_info {
6387 #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
6388 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
6389 #define TIF_SECCOMP 8 /* secure computing */
6390 +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
6391 #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
6392 #define TIF_UPROBE 12 /* breakpointed or singlestepping */
6393 #define TIF_PATCH_PENDING 13 /* pending live patching update */
6394 @@ -112,6 +120,7 @@ struct thread_info {
6395 #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
6396 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
6397 #define _TIF_SECCOMP (1 << TIF_SECCOMP)
6398 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
6399 #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
6400 #define _TIF_UPROBE (1 << TIF_UPROBE)
6401 #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
6402 @@ -153,6 +162,8 @@ struct thread_info {
6403 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
6404 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
6406 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
6408 #define STACK_WARN (THREAD_SIZE/8)
6411 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
6412 index 96a8a68f9c79..c9af5afebc4a 100644
6413 --- a/arch/x86/kernel/apic/io_apic.c
6414 +++ b/arch/x86/kernel/apic/io_apic.c
6415 @@ -1688,19 +1688,20 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
6419 -static inline bool ioapic_irqd_mask(struct irq_data *data)
6420 +static inline bool ioapic_prepare_move(struct irq_data *data)
6422 /* If we are moving the irq we need to mask it */
6423 if (unlikely(irqd_is_setaffinity_pending(data))) {
6424 - mask_ioapic_irq(data);
6425 + if (!irqd_irq_masked(data))
6426 + mask_ioapic_irq(data);
6432 -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
6433 +static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
6435 - if (unlikely(masked)) {
6436 + if (unlikely(moveit)) {
6437 /* Only migrate the irq if the ack has been received.
6439 * On rare occasions the broadcast level triggered ack gets
6440 @@ -1729,15 +1730,17 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
6442 if (!io_apic_level_ack_pending(data->chip_data))
6443 irq_move_masked_irq(data);
6444 - unmask_ioapic_irq(data);
6445 + /* If the irq is masked in the core, leave it */
6446 + if (!irqd_irq_masked(data))
6447 + unmask_ioapic_irq(data);
6451 -static inline bool ioapic_irqd_mask(struct irq_data *data)
6452 +static inline bool ioapic_prepare_move(struct irq_data *data)
6456 -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
6457 +static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
6461 @@ -1746,11 +1749,11 @@ static void ioapic_ack_level(struct irq_data *irq_data)
6463 struct irq_cfg *cfg = irqd_cfg(irq_data);
6464 unsigned long v;
6465 - bool masked;
6466 + bool moveit;
6467 int i;
6469 irq_complete_move(cfg);
6470 - masked = ioapic_irqd_mask(irq_data);
6471 + moveit = ioapic_prepare_move(irq_data);
6474 * It appears there is an erratum which affects at least version 0x11
6475 @@ -1805,7 +1808,7 @@ static void ioapic_ack_level(struct irq_data *irq_data)
6476 eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
6479 - ioapic_irqd_unmask(irq_data, masked);
6480 + ioapic_finish_move(irq_data, moveit);
6483 static void ioapic_ir_ack_level(struct irq_data *irq_data)
6484 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
6485 index 76417a9aab73..62c3e27c8e1c 100644
6486 --- a/arch/x86/kernel/asm-offsets.c
6487 +++ b/arch/x86/kernel/asm-offsets.c
6488 @@ -38,6 +38,7 @@ void common(void) {
6491 OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
6492 + OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
6493 OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
6496 @@ -94,6 +95,7 @@ void common(void) {
6499 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
6500 + DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
6502 /* TLB state for the entry code */
6503 OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
6504 diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
6505 index 7f85b76f43bc..9e74b805070f 100644
6506 --- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
6507 +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
6509 #include <linux/slab.h>
6510 #include <linux/kmod.h>
6511 #include <linux/poll.h>
6512 +#include <linux/swork.h>
6514 #include "mce-internal.h"
6516 @@ -86,13 +87,43 @@ static void mce_do_trigger(struct work_struct *work)
6518 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
6521 -void mce_work_trigger(void)
6522 +static void __mce_work_trigger(struct swork_event *event)
6525 schedule_work(&mce_trigger_work);
6528 +#ifdef CONFIG_PREEMPT_RT_FULL
6529 +static bool notify_work_ready __read_mostly;
6530 +static struct swork_event notify_work;
6532 +static int mce_notify_work_init(void)
6533 +{
6534 + int err;
6535 +
6536 + err = swork_get();
6537 + if (err)
6538 + return err;
6539 +
6540 + INIT_SWORK(&notify_work, __mce_work_trigger);
6541 + notify_work_ready = true;
6542 + return 0;
6543 +}
6545 +void mce_work_trigger(void)
6546 +{
6547 + if (notify_work_ready)
6548 + swork_queue(&notify_work);
6549 +}
6550 +
6551 +#else
6552 +void mce_work_trigger(void)
6553 +{
6554 + __mce_work_trigger(NULL);
6555 +}
6556 +static inline int mce_notify_work_init(void) { return 0; }
6557 +#endif
6559 static ssize_t
6560 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
6562 @@ -356,7 +387,7 @@ static __init int dev_mcelog_init_device(void)
6567 + mce_notify_work_init();
6568 mce_register_decode_chain(&dev_mcelog_nb);
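The swork machinery used here has a fixed shape: the atomic (here: machine-check) context only queues an event, and the handler runs later in a preemptible kernel thread where schedule_work() and other sleeping calls are safe on RT. A minimal sketch of the pattern, with ev/ev_handler as placeholder names:

    static struct swork_event ev;

    static void ev_handler(struct swork_event *sev)
    {
        /* runs in task context; sleeping locks are fine here */
    }

    static int ev_init(void)
    {
        int err = swork_get();        /* bring up the swork thread */

        if (err)
            return err;
        INIT_SWORK(&ev, ev_handler);
        return 0;                     /* later: swork_queue(&ev); */
    }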
6571 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
6572 index 98e4e4dc4a3b..5cce2ee3b9f6 100644
6573 --- a/arch/x86/kernel/cpu/mcheck/mce.c
6574 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
6576 #include <linux/debugfs.h>
6577 #include <linux/irq_work.h>
6578 #include <linux/export.h>
6579 +#include <linux/jiffies.h>
6580 #include <linux/jump_label.h>
6582 #include <asm/intel-family.h>
6583 @@ -1365,7 +1366,7 @@ int memory_failure(unsigned long pfn, int vector, int flags)
6584 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
6586 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
6587 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
6588 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
6590 static unsigned long mce_adjust_timer_default(unsigned long interval)
6592 @@ -1374,27 +1375,19 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
6594 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
6596 -static void __start_timer(struct timer_list *t, unsigned long interval)
6597 +static void __start_timer(struct hrtimer *t, unsigned long iv)
6599 - unsigned long when = jiffies + interval;
6600 - unsigned long flags;
6602 - local_irq_save(flags);
6604 - if (!timer_pending(t) || time_before(when, t->expires))
6605 - mod_timer(t, round_jiffies(when));
6609 - local_irq_restore(flags);
6610 + hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
6611 + 0, HRTIMER_MODE_REL_PINNED);
6614 -static void mce_timer_fn(unsigned long data)
6615 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
6617 - struct timer_list *t = this_cpu_ptr(&mce_timer);
6618 - int cpu = smp_processor_id();
6621 - WARN_ON(cpu != data);
6623 iv = __this_cpu_read(mce_next_interval);
6625 if (mce_available(this_cpu_ptr(&cpu_info))) {
6626 @@ -1417,7 +1410,11 @@ static void mce_timer_fn(unsigned long data)
6629 __this_cpu_write(mce_next_interval, iv);
6630 - __start_timer(t, iv);
6632 + return HRTIMER_NORESTART;
6634 + hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(iv)));
6635 + return HRTIMER_RESTART;
6639 @@ -1425,7 +1422,7 @@ static void mce_timer_fn(unsigned long data)
6641 void mce_timer_kick(unsigned long interval)
6643 - struct timer_list *t = this_cpu_ptr(&mce_timer);
6644 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
6645 unsigned long iv = __this_cpu_read(mce_next_interval);
6647 __start_timer(t, interval);
6648 @@ -1440,7 +1437,7 @@ static void mce_timer_delete_all(void)
6651 for_each_online_cpu(cpu)
6652 - del_timer_sync(&per_cpu(mce_timer, cpu));
6653 + hrtimer_cancel(&per_cpu(mce_timer, cpu));
6657 @@ -1769,7 +1766,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
6661 -static void mce_start_timer(struct timer_list *t)
6662 +static void mce_start_timer(struct hrtimer *t)
6664 unsigned long iv = check_interval * HZ;
6666 @@ -1782,18 +1779,19 @@ static void mce_start_timer(struct timer_list *t)
6668 static void __mcheck_cpu_setup_timer(void)
6670 - struct timer_list *t = this_cpu_ptr(&mce_timer);
6671 - unsigned int cpu = smp_processor_id();
6672 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
6674 - setup_pinned_timer(t, mce_timer_fn, cpu);
6675 + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6676 + t->function = mce_timer_fn;
6679 static void __mcheck_cpu_init_timer(void)
6681 - struct timer_list *t = this_cpu_ptr(&mce_timer);
6682 - unsigned int cpu = smp_processor_id();
6683 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
6685 + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6686 + t->function = mce_timer_fn;
6688 - setup_pinned_timer(t, mce_timer_fn, cpu);
6692 @@ -2309,7 +2307,7 @@ static int mce_cpu_dead(unsigned int cpu)
6694 static int mce_cpu_online(unsigned int cpu)
6696 - struct timer_list *t = this_cpu_ptr(&mce_timer);
6697 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
6700 mce_device_create(cpu);
6701 @@ -2326,10 +2324,10 @@ static int mce_cpu_online(unsigned int cpu)
6703 static int mce_cpu_pre_down(unsigned int cpu)
6705 - struct timer_list *t = this_cpu_ptr(&mce_timer);
6706 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
6709 - del_timer_sync(t);
6710 + hrtimer_cancel(t);
6711 mce_threshold_remove_device(cpu);
6712 mce_device_remove(cpu);
6714 diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
6715 index 2ea85b32421a..6914dc569d1e 100644
6716 --- a/arch/x86/kernel/fpu/core.c
6717 +++ b/arch/x86/kernel/fpu/core.c
6718 @@ -138,6 +138,18 @@ void kernel_fpu_end(void)
6720 EXPORT_SYMBOL_GPL(kernel_fpu_end);
6722 +void kernel_fpu_resched(void)
6723 +{
6724 + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
6725 +
6726 + if (should_resched(PREEMPT_OFFSET)) {
6727 + kernel_fpu_end();
6728 + cond_resched();
6729 + kernel_fpu_begin();
6730 + }
6731 +}
6732 +EXPORT_SYMBOL_GPL(kernel_fpu_resched);
6735 * Save the FPU state (mark it for reload if necessary):
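kernel_fpu_resched() gives long FPU-assisted loops a reschedule point without permanently leaving the section: if should_resched() says a reschedule is due, it closes the FPU section, yields, and reopens it; otherwise it is a no-op. Typical use, sketched with process_chunk() as a placeholder SIMD helper:

    kernel_fpu_begin();
    while (nbytes >= CHUNK) {
        kernel_fpu_resched();     /* may end + cond_resched() + begin */
        process_chunk(p);         /* placeholder SIMD work */
        p += CHUNK;
        nbytes -= CHUNK;
    }
    kernel_fpu_end();

This is the helper the camellia, serpent and twofish glue code above relies on.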
6737 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
6738 index 95600a99ae93..9192d76085ba 100644
6739 --- a/arch/x86/kernel/irq_32.c
6740 +++ b/arch/x86/kernel/irq_32.c
6741 @@ -130,6 +130,7 @@ void irq_ctx_init(int cpu)
6742 cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
6745 +#ifndef CONFIG_PREEMPT_RT_FULL
6746 void do_softirq_own_stack(void)
6748 struct irq_stack *irqstk;
6749 @@ -146,6 +147,7 @@ void do_softirq_own_stack(void)
6751 call_on_stack(__do_softirq, isp);
6755 bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
6757 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
6758 index 5224c6099184..9b2b1f0409c5 100644
6759 --- a/arch/x86/kernel/process_32.c
6760 +++ b/arch/x86/kernel/process_32.c
6762 #include <linux/io.h>
6763 #include <linux/kdebug.h>
6764 #include <linux/syscalls.h>
6765 +#include <linux/highmem.h>
6767 #include <asm/pgtable.h>
6768 #include <asm/ldt.h>
6769 @@ -198,6 +199,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
6771 EXPORT_SYMBOL_GPL(start_thread);
6773 +#ifdef CONFIG_PREEMPT_RT_FULL
6774 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
6779 + * Clear @prev's kmap_atomic mappings
6781 + for (i = 0; i < prev_p->kmap_idx; i++) {
6782 + int idx = i + KM_TYPE_NR * smp_processor_id();
6783 + pte_t *ptep = kmap_pte - idx;
6785 + kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
6788 + * Restore @next_p's kmap_atomic mappings
6790 + for (i = 0; i < next_p->kmap_idx; i++) {
6791 + int idx = i + KM_TYPE_NR * smp_processor_id();
6793 + if (!pte_none(next_p->kmap_pte[i]))
6794 + set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
6799 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
6804 * switch_to(x,y) should switch tasks from x to y.
6805 @@ -273,6 +303,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
6806 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
6807 __switch_to_xtra(prev_p, next_p, tss);
6809 + switch_kmaps(prev_p, next_p);
6812 * Leave lazy mode, flushing any hypercalls made here.
6813 * This must be done before restoring TLS segments so
6814 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
6815 index 13dfb55b84db..dd66f629d1d0 100644
6816 --- a/arch/x86/kvm/lapic.c
6817 +++ b/arch/x86/kvm/lapic.c
6818 @@ -2136,7 +2136,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
6821 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
6822 - HRTIMER_MODE_ABS_PINNED);
6823 + HRTIMER_MODE_ABS_PINNED_HARD);
6824 apic->lapic_timer.timer.function = apic_timer_fn;
6827 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
6828 index 3856828ee1dc..407658146ae1 100644
6829 --- a/arch/x86/kvm/x86.c
6830 +++ b/arch/x86/kvm/x86.c
6831 @@ -6287,6 +6287,13 @@ int kvm_arch_init(void *opaque)
6835 +#ifdef CONFIG_PREEMPT_RT_FULL
6836 + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
6837 + printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
6838 + return -EOPNOTSUPP;
6842 r = kvm_mmu_module_init();
6844 goto out_free_percpu;
6845 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
6846 index 6d18b70ed5a9..f752724c22e8 100644
6847 --- a/arch/x86/mm/highmem_32.c
6848 +++ b/arch/x86/mm/highmem_32.c
6849 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
6851 void *kmap_atomic_prot(struct page *page, pgprot_t prot)
6853 + pte_t pte = mk_pte(page, prot);
6854 unsigned long vaddr;
6857 - preempt_disable();
6858 + preempt_disable_nort();
6859 pagefault_disable();
6861 if (!PageHighMem(page))
6862 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
6863 idx = type + KM_TYPE_NR*smp_processor_id();
6864 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
6865 BUG_ON(!pte_none(*(kmap_pte-idx)));
6866 - set_pte(kmap_pte-idx, mk_pte(page, prot));
6867 +#ifdef CONFIG_PREEMPT_RT_FULL
6868 + current->kmap_pte[type] = pte;
6870 + set_pte(kmap_pte-idx, pte);
6871 arch_flush_lazy_mmu_mode();
6873 return (void *)vaddr;
6874 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
6875 * is a bad idea also, in case the page changes cacheability
6876 * attributes or becomes a protected page in a hypervisor.
6878 +#ifdef CONFIG_PREEMPT_RT_FULL
6879 + current->kmap_pte[type] = __pte(0);
6881 kpte_clear_flush(kmap_pte-idx, vaddr);
6882 kmap_atomic_idx_pop();
6883 arch_flush_lazy_mmu_mode();
6884 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
6889 + preempt_enable_nort();
6891 EXPORT_SYMBOL(__kunmap_atomic);
6893 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
6894 index ada98b39b8ad..585f6829653b 100644
6895 --- a/arch/x86/mm/iomap_32.c
6896 +++ b/arch/x86/mm/iomap_32.c
6897 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
6899 void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
6901 + pte_t pte = pfn_pte(pfn, prot);
6902 unsigned long vaddr;
6905 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
6906 type = kmap_atomic_idx_push();
6907 idx = type + KM_TYPE_NR * smp_processor_id();
6908 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
6909 - set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
6910 + WARN_ON(!pte_none(*(kmap_pte - idx)));
6912 +#ifdef CONFIG_PREEMPT_RT_FULL
6913 + current->kmap_pte[type] = pte;
6915 + set_pte(kmap_pte - idx, pte);
6916 arch_flush_lazy_mmu_mode();
6918 return (void *)vaddr;
6919 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
6920 * is a bad idea also, in case the page changes cacheability
6921 * attributes or becomes a protected page in a hypervisor.
6923 +#ifdef CONFIG_PREEMPT_RT_FULL
6924 + current->kmap_pte[type] = __pte(0);
6926 kpte_clear_flush(kmap_pte-idx, vaddr);
6927 kmap_atomic_idx_pop();
6929 diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h
6930 index bb1fe6c1816e..8a22f1e7b6c9 100644
6931 --- a/arch/xtensa/include/asm/spinlock_types.h
6932 +++ b/arch/xtensa/include/asm/spinlock_types.h
6934 #ifndef __ASM_SPINLOCK_TYPES_H
6935 #define __ASM_SPINLOCK_TYPES_H
6937 -#ifndef __LINUX_SPINLOCK_TYPES_H
6938 -# error "please don't include this file directly"
6939 -#endif
6941 typedef struct {
6942 volatile unsigned int slock;
6944 diff --git a/block/blk-core.c b/block/blk-core.c
6945 index 6aa2bc4e9652..f005077ae291 100644
6946 --- a/block/blk-core.c
6947 +++ b/block/blk-core.c
6948 @@ -116,6 +116,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
6950 INIT_LIST_HEAD(&rq->queuelist);
6951 INIT_LIST_HEAD(&rq->timeout_list);
6952 +#ifdef CONFIG_PREEMPT_RT_FULL
6953 + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
6957 rq->__sector = (sector_t) -1;
6958 @@ -280,7 +283,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
6959 void blk_start_queue(struct request_queue *q)
6961 lockdep_assert_held(q->queue_lock);
6962 - WARN_ON(!in_interrupt() && !irqs_disabled());
6963 + WARN_ON_NONRT(!in_interrupt() && !irqs_disabled());
6964 WARN_ON_ONCE(q->mq_ops);
6966 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
6967 @@ -812,12 +815,21 @@ void blk_queue_exit(struct request_queue *q)
6968 percpu_ref_put(&q->q_usage_counter);
6971 +static void blk_queue_usage_counter_release_swork(struct swork_event *sev)
6973 + struct request_queue *q =
6974 + container_of(sev, struct request_queue, mq_pcpu_wake);
6976 + wake_up_all(&q->mq_freeze_wq);
6979 static void blk_queue_usage_counter_release(struct percpu_ref *ref)
6981 struct request_queue *q =
6982 container_of(ref, struct request_queue, q_usage_counter);
6984 - wake_up_all(&q->mq_freeze_wq);
6985 + if (wq_has_sleeper(&q->mq_freeze_wq))
6986 + swork_queue(&q->mq_pcpu_wake);
6989 static void blk_rq_timed_out_timer(unsigned long data)
6990 @@ -894,6 +906,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
6991 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
6993 init_waitqueue_head(&q->mq_freeze_wq);
6994 + INIT_SWORK(&q->mq_pcpu_wake, blk_queue_usage_counter_release_swork);
6997 * Init percpu_ref in atomic mode so that it's faster to shutdown.
6998 @@ -3313,7 +3326,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
6999 blk_run_queue_async(q);
7002 - spin_unlock(q->queue_lock);
7003 + spin_unlock_irq(q->queue_lock);
7006 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
7007 @@ -3361,7 +3374,6 @@ EXPORT_SYMBOL(blk_check_plugged);
7008 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7010 struct request_queue *q;
7011 - unsigned long flags;
7015 @@ -3381,11 +3393,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7020 - * Save and disable interrupts here, to avoid doing it for every
7021 - * queue lock we have to take.
7023 - local_irq_save(flags);
7024 while (!list_empty(&list)) {
7025 rq = list_entry_rq(list.next);
7026 list_del_init(&rq->queuelist);
7027 @@ -3398,7 +3405,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7028 queue_unplugged(q, depth, from_schedule);
7031 - spin_lock(q->queue_lock);
7032 + spin_lock_irq(q->queue_lock);
7036 @@ -3425,8 +3432,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7039 queue_unplugged(q, depth, from_schedule);
7041 - local_irq_restore(flags);
7044 void blk_finish_plug(struct blk_plug *plug)
7045 @@ -3638,6 +3643,8 @@ int __init blk_dev_init(void)
7046 if (!kblockd_workqueue)
7047 panic("Failed to create kblockd\n");
7049 + BUG_ON(swork_get());
7051 request_cachep = kmem_cache_create("blkdev_requests",
7052 sizeof(struct request), 0, SLAB_PANIC, NULL);
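On RT, wake_up_all() takes a sleeping wait-queue lock, so it must not run from the atomic percpu_ref release path. In effect the new release path only queues an swork item, and only when a waiter actually exists:

    static void blk_queue_usage_counter_release(struct percpu_ref *ref)
    {
        struct request_queue *q =
            container_of(ref, struct request_queue, q_usage_counter);

        if (wq_has_sleeper(&q->mq_freeze_wq))    /* skip needless queueing */
            swork_queue(&q->mq_pcpu_wake);       /* wake from task context */
    }

The BUG_ON(swork_get()) in blk_dev_init() ensures the swork thread exists before any queue can be frozen.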
7054 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
7055 index f23311e4b201..ca9ea624f159 100644
7056 --- a/block/blk-ioc.c
7057 +++ b/block/blk-ioc.c
7059 #include <linux/blkdev.h>
7060 #include <linux/slab.h>
7061 #include <linux/sched/task.h>
7062 +#include <linux/delay.h>
7066 @@ -118,7 +119,7 @@ static void ioc_release_fn(struct work_struct *work)
7067 spin_unlock(q->queue_lock);
7069 spin_unlock_irqrestore(&ioc->lock, flags);
7070 - cpu_relax();
7071 + cpu_chill();
7072 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
7075 @@ -202,7 +203,7 @@ void put_io_context_active(struct io_context *ioc)
7076 spin_unlock(icq->q->queue_lock);
7078 spin_unlock_irqrestore(&ioc->lock, flags);
7079 - cpu_relax();
7080 + cpu_chill();
7081 goto retry;
7084 diff --git a/block/blk-mq.c b/block/blk-mq.c
7085 index eac444804736..a6314b82273e 100644
7086 --- a/block/blk-mq.c
7087 +++ b/block/blk-mq.c
7088 @@ -339,6 +339,9 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
7089 /* tag was already set */
7092 +#ifdef CONFIG_PREEMPT_RT_FULL
7093 + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
7095 INIT_LIST_HEAD(&rq->timeout_list);
7098 @@ -533,12 +536,24 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
7100 EXPORT_SYMBOL(blk_mq_end_request);
7102 +#ifdef CONFIG_PREEMPT_RT_FULL
7104 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
7105 +{
7106 + struct request *rq = container_of(work, struct request, work);
7107 +
7108 + rq->q->softirq_done_fn(rq);
7109 +}
7110 +
7111 +#else
7113 static void __blk_mq_complete_request_remote(void *data)
7115 struct request *rq = data;
7117 rq->q->softirq_done_fn(rq);
7118 }
7119 +#endif
7121 static void __blk_mq_complete_request(struct request *rq)
7123 @@ -558,19 +573,27 @@ static void __blk_mq_complete_request(struct request *rq)
7128 + cpu = get_cpu_light();
7129 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
7130 shared = cpus_share_cache(cpu, ctx->cpu);
7132 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
7133 +#ifdef CONFIG_PREEMPT_RT_FULL
7134 + /*
7135 + * We could force QUEUE_FLAG_SAME_FORCE, then we would not get in
7136 + * here. But we could try to invoke it on the CPU like this.
7137 + */
7138 + schedule_work_on(ctx->cpu, &rq->work);
7139 +#else
7140 rq->csd.func = __blk_mq_complete_request_remote;
7141 rq->csd.info = rq;
7142 rq->csd.flags = 0;
7143 smp_call_function_single_async(ctx->cpu, &rq->csd);
7144 +#endif
7145 } else {
7146 rq->q->softirq_done_fn(rq);
7147 }
7148 - put_cpu();
7149 + put_cpu_light();
7150 }
7153 @@ -1238,14 +1261,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
7156 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
7157 - int cpu = get_cpu();
7158 + int cpu = get_cpu_light();
7159 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
7160 __blk_mq_run_hw_queue(hctx);
7161 - put_cpu();
7162 + put_cpu_light();
7163 return;
7164 }
7166 - put_cpu();
7167 + put_cpu_light();
7168 }
7170 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
7171 @@ -2863,10 +2886,9 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
7174 mode = HRTIMER_MODE_REL;
7175 - hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
7176 + hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current);
7177 hrtimer_set_expires(&hs.timer, kt);
7179 - hrtimer_init_sleeper(&hs, current);
7181 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
7183 diff --git a/block/blk-mq.h b/block/blk-mq.h
7184 index 877237e09083..d944750bade0 100644
7185 --- a/block/blk-mq.h
7186 +++ b/block/blk-mq.h
7187 @@ -98,12 +98,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
7189 static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
7191 - return __blk_mq_get_ctx(q, get_cpu());
7192 + return __blk_mq_get_ctx(q, get_cpu_light());
7195 static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
7196 {
7197 - put_cpu();
7198 + put_cpu_light();
7199 }
7201 struct blk_mq_alloc_data {
7202 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
7203 index 01e2b353a2b9..e8c0d4945f5a 100644
7204 --- a/block/blk-softirq.c
7205 +++ b/block/blk-softirq.c
7206 @@ -53,6 +53,7 @@ static void trigger_softirq(void *data)
7207 raise_softirq_irqoff(BLOCK_SOFTIRQ);
7209 local_irq_restore(flags);
7210 + preempt_check_resched_rt();
7214 @@ -91,6 +92,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
7215 this_cpu_ptr(&blk_cpu_done));
7216 raise_softirq_irqoff(BLOCK_SOFTIRQ);
7218 + preempt_check_resched_rt();
7222 @@ -143,6 +145,7 @@ void __blk_complete_request(struct request *req)
7225 local_irq_restore(flags);
7226 + preempt_check_resched_rt();
7230 diff --git a/block/bounce.c b/block/bounce.c
7231 index 1d05c422c932..0101ffefddc4 100644
7232 --- a/block/bounce.c
7233 +++ b/block/bounce.c
7234 @@ -66,11 +66,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
7235 unsigned long flags;
7238 - local_irq_save(flags);
7239 + local_irq_save_nort(flags);
7240 vto = kmap_atomic(to->bv_page);
7241 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
7243 - local_irq_restore(flags);
7244 + local_irq_restore_nort(flags);
7247 #else /* CONFIG_HIGHMEM */
7248 diff --git a/crypto/algapi.c b/crypto/algapi.c
7249 index 50eb828db767..7bce92a6599a 100644
7250 --- a/crypto/algapi.c
7251 +++ b/crypto/algapi.c
7252 @@ -731,13 +731,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
7254 int crypto_register_notifier(struct notifier_block *nb)
7256 - return blocking_notifier_chain_register(&crypto_chain, nb);
7257 + return srcu_notifier_chain_register(&crypto_chain, nb);
7259 EXPORT_SYMBOL_GPL(crypto_register_notifier);
7261 int crypto_unregister_notifier(struct notifier_block *nb)
7263 - return blocking_notifier_chain_unregister(&crypto_chain, nb);
7264 + return srcu_notifier_chain_unregister(&crypto_chain, nb);
7266 EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
7268 diff --git a/crypto/api.c b/crypto/api.c
7269 index e485aed11ad0..089e648d2fa9 100644
7272 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
7273 DECLARE_RWSEM(crypto_alg_sem);
7274 EXPORT_SYMBOL_GPL(crypto_alg_sem);
7276 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
7277 +SRCU_NOTIFIER_HEAD(crypto_chain);
7278 EXPORT_SYMBOL_GPL(crypto_chain);
7280 static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
7281 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
7285 - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
7286 + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
7287 if (ok == NOTIFY_DONE) {
7288 request_module("cryptomgr");
7289 - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
7290 + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
7294 diff --git a/crypto/cryptd.c b/crypto/cryptd.c
7295 index 248f6ba41688..54b7985c8caa 100644
7296 --- a/crypto/cryptd.c
7297 +++ b/crypto/cryptd.c
7299 struct cryptd_cpu_queue {
7300 struct crypto_queue queue;
7301 struct work_struct work;
7302 + spinlock_t qlock;
7303 };
7305 struct cryptd_queue {
7306 @@ -115,6 +116,7 @@ static int cryptd_init_queue(struct cryptd_queue *queue,
7307 cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu);
7308 crypto_init_queue(&cpu_queue->queue, max_cpu_qlen);
7309 INIT_WORK(&cpu_queue->work, cryptd_queue_worker);
7310 + spin_lock_init(&cpu_queue->qlock);
7314 @@ -139,8 +141,10 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue,
7319 - cpu_queue = this_cpu_ptr(queue->cpu_queue);
7320 + cpu_queue = raw_cpu_ptr(queue->cpu_queue);
7321 + spin_lock_bh(&cpu_queue->qlock);
7322 + cpu = smp_processor_id();
7324 err = crypto_enqueue_request(&cpu_queue->queue, request);
7326 refcnt = crypto_tfm_ctx(request->tfm);
7327 @@ -157,7 +161,7 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue,
7332 + spin_unlock_bh(&cpu_queue->qlock);
7336 @@ -173,16 +177,11 @@ static void cryptd_queue_worker(struct work_struct *work)
7337 cpu_queue = container_of(work, struct cryptd_cpu_queue, work);
7339 * Only handle one request at a time to avoid hogging crypto workqueue.
7340 - * preempt_disable/enable is used to prevent being preempted by
7341 - * cryptd_enqueue_request(). local_bh_disable/enable is used to prevent
7342 - * cryptd_enqueue_request() being accessed from software interrupts.
7344 - local_bh_disable();
7345 - preempt_disable();
7346 + spin_lock_bh(&cpu_queue->qlock);
7347 backlog = crypto_get_backlog(&cpu_queue->queue);
7348 req = crypto_dequeue_request(&cpu_queue->queue);
7350 - local_bh_enable();
7351 + spin_unlock_bh(&cpu_queue->qlock);
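The cryptd rework swaps implicit exclusion (preempt_disable() plus local_bh_disable()) for an explicit per-CPU spinlock. raw_cpu_ptr() may now pick a queue while migration is still possible, which is harmless: whichever CPU's queue is chosen, qlock serializes enqueuers against the worker. Sketched, both sides reduce to:

    /* enqueue side */
    cpu_queue = raw_cpu_ptr(queue->cpu_queue);
    spin_lock_bh(&cpu_queue->qlock);
    err = crypto_enqueue_request(&cpu_queue->queue, request);
    spin_unlock_bh(&cpu_queue->qlock);

    /* worker side, one request at a time */
    spin_lock_bh(&cpu_queue->qlock);
    backlog = crypto_get_backlog(&cpu_queue->queue);
    req = crypto_dequeue_request(&cpu_queue->queue);
    spin_unlock_bh(&cpu_queue->qlock);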
7355 diff --git a/crypto/internal.h b/crypto/internal.h
7356 index f07320423191..333d985088fe 100644
7357 --- a/crypto/internal.h
7358 +++ b/crypto/internal.h
7359 @@ -47,7 +47,7 @@ struct crypto_larval {
7361 extern struct list_head crypto_alg_list;
7362 extern struct rw_semaphore crypto_alg_sem;
7363 -extern struct blocking_notifier_head crypto_chain;
7364 +extern struct srcu_notifier_head crypto_chain;
7366 #ifdef CONFIG_PROC_FS
7367 void __init crypto_init_proc(void);
7368 @@ -143,7 +143,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
7370 static inline void crypto_notify(unsigned long val, void *v)
7372 - blocking_notifier_call_chain(&crypto_chain, val, v);
7373 + srcu_notifier_call_chain(&crypto_chain, val, v);
7376 #endif /* _CRYPTO_INTERNAL_H */
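Switching the crypto notifier chain from a blocking head to an SRCU head means notification no longer takes the chain's rw-semaphore (a sleeping rtmutex-based lock on RT); callers only enter an SRCU read-side section, while registration remains sleepable. A minimal usage sketch, with my_chain/my_cb as placeholder names:

    static SRCU_NOTIFIER_HEAD(my_chain);

    static int my_cb(struct notifier_block *nb, unsigned long val, void *v)
    {
        return NOTIFY_DONE;              /* placeholder callback */
    }

    static struct notifier_block my_nb = { .notifier_call = my_cb };

    static void example(void)
    {
        srcu_notifier_chain_register(&my_chain, &my_nb);
        srcu_notifier_call_chain(&my_chain, 0, NULL);
        srcu_notifier_chain_unregister(&my_chain, &my_nb);
    }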
7377 diff --git a/crypto/scompress.c b/crypto/scompress.c
7378 index 2075e2c4e7df..c6b4e265c6bf 100644
7379 --- a/crypto/scompress.c
7380 +++ b/crypto/scompress.c
7382 #include <linux/cryptouser.h>
7383 #include <net/netlink.h>
7384 #include <linux/scatterlist.h>
7385 +#include <linux/locallock.h>
7386 #include <crypto/scatterwalk.h>
7387 #include <crypto/internal/acompress.h>
7388 #include <crypto/internal/scompress.h>
7389 @@ -34,6 +35,7 @@ static void * __percpu *scomp_src_scratches;
7390 static void * __percpu *scomp_dst_scratches;
7391 static int scomp_scratch_users;
7392 static DEFINE_MUTEX(scomp_lock);
7393 +static DEFINE_LOCAL_IRQ_LOCK(scomp_scratches_lock);
7396 static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg)
7397 @@ -193,7 +195,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
7398 void **tfm_ctx = acomp_tfm_ctx(tfm);
7399 struct crypto_scomp *scomp = *tfm_ctx;
7400 void **ctx = acomp_request_ctx(req);
7401 - const int cpu = get_cpu();
7402 + const int cpu = local_lock_cpu(scomp_scratches_lock);
7403 u8 *scratch_src = *per_cpu_ptr(scomp_src_scratches, cpu);
7404 u8 *scratch_dst = *per_cpu_ptr(scomp_dst_scratches, cpu);
7406 @@ -228,7 +230,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
7411 + local_unlock_cpu(scomp_scratches_lock);
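DEFINE_LOCAL_IRQ_LOCK gives per-CPU data a named lock: on non-RT kernels local_lock_cpu() behaves essentially like get_cpu(), while on RT it takes a per-CPU spinlock instead, so the section stays preemptible yet still works on one CPU's data consistently. Usage sketch, with scratches as a placeholder per-CPU pointer:

    static DEFINE_LOCAL_IRQ_LOCK(my_scratches_lock);
    static void * __percpu *scratches;             /* placeholder storage */

    static void use_scratch(void)
    {
        int cpu = local_lock_cpu(my_scratches_lock);   /* lock + CPU id */
        u8 *buf = *per_cpu_ptr(scratches, cpu);        /* placeholder buffer */

        /* ... fill buf; may be preempted on RT, but buf stays ours ... */
        local_unlock_cpu(my_scratches_lock);
    }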
7415 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
7416 index 95eed442703f..50bc5b61d899 100644
7417 --- a/drivers/acpi/acpica/acglobal.h
7418 +++ b/drivers/acpi/acpica/acglobal.h
7419 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
7422 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
7423 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
7424 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
7425 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
7427 /* Mutex for _OSI support */
7428 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
7429 index acb417b58bbb..ea49e08c263f 100644
7430 --- a/drivers/acpi/acpica/hwregs.c
7431 +++ b/drivers/acpi/acpica/hwregs.c
7432 @@ -428,14 +428,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
7433 ACPI_BITMASK_ALL_FIXED_STATUS,
7434 ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
7436 - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7437 + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7439 /* Clear the fixed events in PM1 A/B */
7441 status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
7442 ACPI_BITMASK_ALL_FIXED_STATUS);
7444 - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7445 + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7447 if (ACPI_FAILURE(status)) {
7449 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
7450 index 34684ae89981..fb84983e1839 100644
7451 --- a/drivers/acpi/acpica/hwxface.c
7452 +++ b/drivers/acpi/acpica/hwxface.c
7453 @@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
7454 return_ACPI_STATUS(AE_BAD_PARAMETER);
7457 - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7458 + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7461 * At this point, we know that the parent register is one of the
7462 @@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
7466 - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7467 + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7468 return_ACPI_STATUS(status);
7471 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
7472 index 586354788018..3a3c2a86437f 100644
7473 --- a/drivers/acpi/acpica/utmutex.c
7474 +++ b/drivers/acpi/acpica/utmutex.c
7475 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
7476 return_ACPI_STATUS (status);
7479 - status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
7480 + status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
7481 if (ACPI_FAILURE (status)) {
7482 return_ACPI_STATUS (status);
7484 @@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void)
7485 /* Delete the spinlocks */
7487 acpi_os_delete_lock(acpi_gbl_gpe_lock);
7488 - acpi_os_delete_lock(acpi_gbl_hardware_lock);
7489 + acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
7490 acpi_os_delete_lock(acpi_gbl_reference_count_lock);
7492 /* Delete the reader/writer lock */
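Background for this ACPI change and the events_lock conversion a little further down: under PREEMPT_RT a spinlock_t is substituted by a sleeping rt_mutex, so locks taken from truly atomic contexts (low-level ACPI register access, the suspend path) must become raw_spinlock_t, which keeps spinning, IRQs-off semantics on both kernel flavours. The idiom, sketched with illustrative names:

	static DEFINE_RAW_SPINLOCK(hw_lock);	/* never turns into a sleeping lock */
	static void __iomem *hw_reg;		/* assumed ioremap()ed register */

	static void hw_set_bits(u32 mask)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&hw_lock, flags);
		writel(readl(hw_reg) | mask, hw_reg);
		raw_spin_unlock_irqrestore(&hw_lock, flags);
	}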
7493 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
7494 index cc2f2e35f4c2..0f0bc86e02df 100644
7495 --- a/drivers/ata/libata-sff.c
7496 +++ b/drivers/ata/libata-sff.c
7497 @@ -679,9 +679,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_queued_cmd *qc, unsigned char *b
7498 unsigned long flags;
7499 unsigned int consumed;
7501 - local_irq_save(flags);
7502 + local_irq_save_nort(flags);
7503 consumed = ata_sff_data_xfer32(qc, buf, buflen, rw);
7504 - local_irq_restore(flags);
7505 + local_irq_restore_nort(flags);
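ata_sff_data_xfer_noirq() is the first of many users of the _nort ("no RT") helpers in this series; the IDE, gameport and ipoib hunks below follow the same scheme. On !RT they are the ordinary IRQ-off primitives; on RT they collapse to (almost) nothing, because the surrounding code only runs in preemptible context there. Roughly, from memory of the RT tree's interrupt.h additions, a sketch rather than the verbatim header:

	#ifdef CONFIG_PREEMPT_RT_FULL
	# define local_irq_disable_nort()	do { } while (0)
	# define local_irq_save_nort(flags)	local_save_flags(flags)
	# define local_irq_restore_nort(flags)	(void)(flags)
	#else
	# define local_irq_disable_nort()	local_irq_disable()
	# define local_irq_save_nort(flags)	local_irq_save(flags)
	# define local_irq_restore_nort(flags)	local_irq_restore(flags)
	#endif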
7509 diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
7510 index cdd6f256da59..2269d379c92f 100644
7511 --- a/drivers/base/power/wakeup.c
7512 +++ b/drivers/base/power/wakeup.c
7513 @@ -52,7 +52,7 @@ static void split_counters(unsigned int *cnt, unsigned int *inpr)
7514 /* A preserved old value of the events counter. */
7515 static unsigned int saved_count;
7517 -static DEFINE_SPINLOCK(events_lock);
7518 +static DEFINE_RAW_SPINLOCK(events_lock);
7520 static void pm_wakeup_timer_fn(unsigned long data);
7522 @@ -180,9 +180,9 @@ void wakeup_source_add(struct wakeup_source *ws)
7524 ws->last_time = ktime_get();
7526 - spin_lock_irqsave(&events_lock, flags);
7527 + raw_spin_lock_irqsave(&events_lock, flags);
7528 list_add_rcu(&ws->entry, &wakeup_sources);
7529 - spin_unlock_irqrestore(&events_lock, flags);
7530 + raw_spin_unlock_irqrestore(&events_lock, flags);
7532 EXPORT_SYMBOL_GPL(wakeup_source_add);
7534 @@ -197,9 +197,9 @@ void wakeup_source_remove(struct wakeup_source *ws)
7538 - spin_lock_irqsave(&events_lock, flags);
7539 + raw_spin_lock_irqsave(&events_lock, flags);
7540 list_del_rcu(&ws->entry);
7541 - spin_unlock_irqrestore(&events_lock, flags);
7542 + raw_spin_unlock_irqrestore(&events_lock, flags);
7543 synchronize_srcu(&wakeup_srcu);
7545 EXPORT_SYMBOL_GPL(wakeup_source_remove);
7546 @@ -844,7 +844,7 @@ bool pm_wakeup_pending(void)
7547 unsigned long flags;
7550 - spin_lock_irqsave(&events_lock, flags);
7551 + raw_spin_lock_irqsave(&events_lock, flags);
7552 if (events_check_enabled) {
7553 unsigned int cnt, inpr;
7555 @@ -852,7 +852,7 @@ bool pm_wakeup_pending(void)
7556 ret = (cnt != saved_count || inpr > 0);
7557 events_check_enabled = !ret;
7559 - spin_unlock_irqrestore(&events_lock, flags);
7560 + raw_spin_unlock_irqrestore(&events_lock, flags);
7563 pr_info("PM: Wakeup pending, aborting suspend\n");
7564 @@ -941,13 +941,13 @@ bool pm_save_wakeup_count(unsigned int count)
7565 unsigned long flags;
7567 events_check_enabled = false;
7568 - spin_lock_irqsave(&events_lock, flags);
7569 + raw_spin_lock_irqsave(&events_lock, flags);
7570 split_counters(&cnt, &inpr);
7571 if (cnt == count && inpr == 0) {
7572 saved_count = count;
7573 events_check_enabled = true;
7575 - spin_unlock_irqrestore(&events_lock, flags);
7576 + raw_spin_unlock_irqrestore(&events_lock, flags);
7577 return events_check_enabled;
7580 diff --git a/drivers/block/brd.c b/drivers/block/brd.c
7581 index 2d7178f7754e..c1cf87718c2e 100644
7582 --- a/drivers/block/brd.c
7583 +++ b/drivers/block/brd.c
7584 @@ -60,7 +60,6 @@ struct brd_device {
7586 * Look up and return a brd's page for a given sector.
7588 -static DEFINE_MUTEX(brd_mutex);
7589 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
7592 diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
7593 index 5b8992beffec..40345483a022 100644
7594 --- a/drivers/block/zram/zcomp.c
7595 +++ b/drivers/block/zram/zcomp.c
7596 @@ -116,12 +116,20 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
7598 struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
7600 - return *get_cpu_ptr(comp->stream);
7601 + struct zcomp_strm *zstrm;
7603 + zstrm = *get_local_ptr(comp->stream);
7604 + spin_lock(&zstrm->zcomp_lock);
7605 + return zstrm;
7608 void zcomp_stream_put(struct zcomp *comp)
7610 - put_cpu_ptr(comp->stream);
7611 + struct zcomp_strm *zstrm;
7613 + zstrm = *this_cpu_ptr(comp->stream);
7614 + spin_unlock(&zstrm->zcomp_lock);
7615 + put_local_ptr(zstrm);
7618 int zcomp_compress(struct zcomp_strm *zstrm,
7619 @@ -171,6 +179,7 @@ int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7620 pr_err("Can't allocate a compression stream\n");
7623 + spin_lock_init(&zstrm->zcomp_lock);
7624 *per_cpu_ptr(comp->stream, cpu) = zstrm;
7627 diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
7628 index 41c1002a7d7d..d424eafcbf8e 100644
7629 --- a/drivers/block/zram/zcomp.h
7630 +++ b/drivers/block/zram/zcomp.h
7631 @@ -14,6 +14,7 @@ struct zcomp_strm {
7632 /* compression/decompression buffer */
7634 struct crypto_comp *tfm;
7635 + spinlock_t zcomp_lock;
7638 /* dynamic per-device compression frontend */
7639 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
7640 index 1e2648e4c286..c5d61209eb05 100644
7641 --- a/drivers/block/zram/zram_drv.c
7642 +++ b/drivers/block/zram/zram_drv.c
7643 @@ -761,6 +761,30 @@ static DEVICE_ATTR_RO(io_stat);
7644 static DEVICE_ATTR_RO(mm_stat);
7645 static DEVICE_ATTR_RO(debug_stat);
7647 +#ifdef CONFIG_PREEMPT_RT_BASE
7648 +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages)
7652 + for (index = 0; index < num_pages; index++)
7653 + spin_lock_init(&zram->table[index].lock);
7656 +static void zram_slot_lock(struct zram *zram, u32 index)
7658 + spin_lock(&zram->table[index].lock);
7659 + __set_bit(ZRAM_ACCESS, &zram->table[index].value);
7662 +static void zram_slot_unlock(struct zram *zram, u32 index)
7664 + __clear_bit(ZRAM_ACCESS, &zram->table[index].value);
7665 + spin_unlock(&zram->table[index].lock);
7666 +}
7668 +#else
7669 +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }
7671 static void zram_slot_lock(struct zram *zram, u32 index)
7673 bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
7674 @@ -770,6 +794,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index)
7676 bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
7677 }
7678 +#endif
7680 static void zram_meta_free(struct zram *zram, u64 disksize)
7682 @@ -799,6 +824,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
7686 + zram_meta_init_table_locks(zram, num_pages);
7690 @@ -850,6 +876,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
7691 unsigned long handle;
7694 + struct zcomp_strm *zstrm;
7696 if (zram_wb_enabled(zram)) {
7697 zram_slot_lock(zram, index);
7698 @@ -884,6 +911,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
7700 size = zram_get_obj_size(zram, index);
7702 + zstrm = zcomp_stream_get(zram->comp);
7703 src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
7704 if (size == PAGE_SIZE) {
7705 dst = kmap_atomic(page);
7706 @@ -891,14 +919,13 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
7710 - struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
7712 dst = kmap_atomic(page);
7713 ret = zcomp_decompress(zstrm, src, size, dst);
7715 - zcomp_stream_put(zram->comp);
7717 zs_unmap_object(zram->mem_pool, handle);
7718 + zcomp_stream_put(zram->comp);
7719 zram_slot_unlock(zram, index);
7721 /* Should NEVER happen. Return bio error if it does. */
7722 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
7723 index 31762db861e3..a417c96b8f3f 100644
7724 --- a/drivers/block/zram/zram_drv.h
7725 +++ b/drivers/block/zram/zram_drv.h
7726 @@ -77,6 +77,9 @@ struct zram_table_entry {
7727 unsigned long element;
7729 unsigned long value;
7730 +#ifdef CONFIG_PREEMPT_RT_BASE
7731 + spinlock_t lock;
7732 +#endif
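Taken together, the zcomp/zram hunks replace implicit exclusion (preemption disabled by get_cpu_ptr(), a bit spinlock in the table entry) with explicit spinlocks that RT can turn into sleeping locks. The caller-visible result, sketched with the APIs from this patch:

	static int compress_one(struct zcomp *comp, const void *src,
				unsigned int *comp_len)
	{
		/* zcomp_stream_get() now locks the per-CPU stream, so a
		 * preempted writer's stream cannot be reused under it. */
		struct zcomp_strm *zstrm = zcomp_stream_get(comp);
		int ret = zcomp_compress(zstrm, src, comp_len);

		zcomp_stream_put(comp);
		return ret;
	}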
7736 diff --git a/drivers/char/random.c b/drivers/char/random.c
7737 index ea4dbfa30657..c72a7f0b4494 100644
7738 --- a/drivers/char/random.c
7739 +++ b/drivers/char/random.c
7741 #include <linux/syscalls.h>
7742 #include <linux/completion.h>
7743 #include <linux/uuid.h>
7744 +#include <linux/locallock.h>
7745 #include <crypto/chacha20.h>
7747 #include <asm/processor.h>
7748 @@ -856,7 +857,7 @@ static int crng_fast_load(const char *cp, size_t len)
7749 invalidate_batched_entropy();
7751 wake_up_interruptible(&crng_init_wait);
7752 - pr_notice("random: fast init done\n");
7753 + /* pr_notice("random: fast init done\n"); */
7757 @@ -941,17 +942,21 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
7759 process_random_ready_list();
7760 wake_up_interruptible(&crng_init_wait);
7761 - pr_notice("random: crng init done\n");
7762 + /* pr_notice("random: crng init done\n"); */
7763 if (unseeded_warning.missed) {
7765 pr_notice("random: %d get_random_xx warning(s) missed "
7766 "due to ratelimiting\n",
7767 unseeded_warning.missed);
7769 unseeded_warning.missed = 0;
7771 if (urandom_warning.missed) {
7773 pr_notice("random: %d urandom warning(s) missed "
7774 "due to ratelimiting\n",
7775 urandom_warning.missed);
7777 urandom_warning.missed = 0;
7780 @@ -1122,8 +1127,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
7782 long delta, delta2, delta3;
7784 - preempt_disable();
7786 sample.jiffies = jiffies;
7787 sample.cycles = random_get_entropy();
7789 @@ -1164,7 +1167,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
7791 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
7796 void add_input_randomness(unsigned int type, unsigned int code,
7797 @@ -1221,28 +1223,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
7801 -void add_interrupt_randomness(int irq, int irq_flags)
7802 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
7804 struct entropy_store *r;
7805 struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
7806 - struct pt_regs *regs = get_irq_regs();
7807 unsigned long now = jiffies;
7808 cycles_t cycles = random_get_entropy();
7809 __u32 c_high, j_high;
7815 - cycles = get_reg(fast_pool, regs);
7816 + cycles = get_reg(fast_pool, NULL);
7817 c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
7818 j_high = (sizeof(now) > 4) ? now >> 32 : 0;
7819 fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
7820 fast_pool->pool[1] ^= now ^ c_high;
7821 - ip = regs ? instruction_pointer(regs) : _RET_IP_;
7824 fast_pool->pool[2] ^= ip;
7825 fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
7826 - get_reg(fast_pool, regs);
7827 + get_reg(fast_pool, NULL);
7829 fast_mix(fast_pool);
7830 add_interrupt_bench(cycles);
7831 @@ -2200,6 +2201,7 @@ static rwlock_t batched_entropy_reset_lock = __RW_LOCK_UNLOCKED(batched_entropy_
7832 * at any point prior.
7834 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64);
7835 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u64_lock);
7836 u64 get_random_u64(void)
7839 @@ -2220,7 +2222,7 @@ u64 get_random_u64(void)
7840 warn_unseeded_randomness(&previous);
7842 use_lock = READ_ONCE(crng_init) < 2;
7843 - batch = &get_cpu_var(batched_entropy_u64);
7844 + batch = &get_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7846 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7847 if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) {
7848 @@ -2230,12 +2232,13 @@ u64 get_random_u64(void)
7849 ret = batch->entropy_u64[batch->position++];
7851 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7852 - put_cpu_var(batched_entropy_u64);
7853 + put_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7856 EXPORT_SYMBOL(get_random_u64);
7858 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32);
7859 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u32_lock);
7860 u32 get_random_u32(void)
7863 @@ -2250,7 +2253,7 @@ u32 get_random_u32(void)
7864 warn_unseeded_randomness(&previous);
7866 use_lock = READ_ONCE(crng_init) < 2;
7867 - batch = &get_cpu_var(batched_entropy_u32);
7868 + batch = &get_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7870 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7871 if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) {
7872 @@ -2260,7 +2263,7 @@ u32 get_random_u32(void)
7873 ret = batch->entropy_u32[batch->position++];
7875 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7876 - put_cpu_var(batched_entropy_u32);
7877 + put_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7880 EXPORT_SYMBOL(get_random_u32);
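get_locked_var()/put_locked_var() are the variable-flavoured locallock accessors: on !RT they reduce to get_cpu_var()/put_cpu_var(), on RT they take the named per-CPU lock so a task preempting the batch consumer on the same CPU cannot see a half-consumed batch. A sketch of the pattern with illustrative names:

	struct batch {
		u64 entry[16];
		unsigned int pos;
	};
	static DEFINE_PER_CPU(struct batch, batch_store);
	static DEFINE_LOCAL_IRQ_LOCK(batch_lock);

	static u64 batch_pop(void)
	{
		struct batch *b = &get_locked_var(batch_lock, batch_store);
		u64 v = b->entry[b->pos++ % ARRAY_SIZE(b->entry)];

		put_locked_var(batch_lock, batch_store);
		return v;
	}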
7881 diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
7882 index 50b59a69dc33..cbdb0a6c5337 100644
7883 --- a/drivers/char/tpm/tpm_tis.c
7884 +++ b/drivers/char/tpm/tpm_tis.c
7885 @@ -52,6 +52,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da
7886 return container_of(data, struct tpm_tis_tcg_phy, priv);
7889 +#ifdef CONFIG_PREEMPT_RT_FULL
7890 +/*
7891 + * Flushes previous write operations to chip so that subsequent
7892 + * ioread*()s won't stall a CPU.
7893 + */
7894 +static inline void tpm_tis_flush(void __iomem *iobase)
7895 +{
7896 + ioread8(iobase + TPM_ACCESS(0));
7897 +}
7898 +#else
7899 +#define tpm_tis_flush(iobase) do { } while (0)
7900 +#endif
7902 +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
7904 + iowrite8(b, iobase + addr);
7905 + tpm_tis_flush(iobase);
7908 +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
7910 + iowrite32(b, iobase + addr);
7911 + tpm_tis_flush(iobase);
7914 static bool interrupts = true;
7915 module_param(interrupts, bool, 0444);
7916 MODULE_PARM_DESC(interrupts, "Enable interrupts");
7917 @@ -149,7 +174,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len,
7918 struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7921 - iowrite8(*value++, phy->iobase + addr);
7922 + tpm_tis_iowrite8(*value++, phy->iobase, addr);
7926 @@ -176,7 +201,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value)
7928 struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7930 - iowrite32(value, phy->iobase + addr);
7931 + tpm_tis_iowrite32(value, phy->iobase, addr);
7935 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
7936 index 9de47d4d2d9e..05f4b88bb955 100644
7937 --- a/drivers/clocksource/tcb_clksrc.c
7938 +++ b/drivers/clocksource/tcb_clksrc.c
7940 * this 32 bit free-running counter. the second channel is not used.
7942 * - The third channel may be used to provide a 16-bit clockevent
7943 - * source, used in either periodic or oneshot mode. This runs
7944 - * at 32 KiHZ, and can handle delays of up to two seconds.
7945 + * source, used in either periodic or oneshot mode.
7947 * A boot clocksource and clockevent source are also currently needed,
7948 * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
7949 @@ -126,6 +125,8 @@ static struct clocksource clksrc = {
7950 struct tc_clkevt_device {
7951 struct clock_event_device clkevt;
7952 struct clk *clk;
7953 + bool clk_enabled;
7954 + u32 freq;
7955 void __iomem *regs;
7956 };
7958 @@ -134,15 +135,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
7959 return container_of(clkevt, struct tc_clkevt_device, clkevt);
7962 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
7963 - * because using one of the divided clocks would usually mean the
7964 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
7966 - * A divided clock could be good for high resolution timers, since
7967 - * 30.5 usec resolution can seem "low".
7969 static u32 timer_clock;
7971 +static void tc_clk_disable(struct clock_event_device *d)
7973 + struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7975 + clk_disable(tcd->clk);
7976 + tcd->clk_enabled = false;
7979 +static void tc_clk_enable(struct clock_event_device *d)
7981 + struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7983 + if (tcd->clk_enabled)
7984 + return;
7985 + clk_enable(tcd->clk);
7986 + tcd->clk_enabled = true;
7989 static int tc_shutdown(struct clock_event_device *d)
7991 struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7992 @@ -150,8 +162,14 @@ static int tc_shutdown(struct clock_event_device *d)
7994 writel(0xff, regs + ATMEL_TC_REG(2, IDR));
7995 writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
7996 + return 0;
7997 +}
7999 +static int tc_shutdown_clk_off(struct clock_event_device *d)
8000 +{
8001 + tc_shutdown(d);
8002 if (!clockevent_state_detached(d))
8003 - clk_disable(tcd->clk);
8004 + tc_clk_disable(d);
8008 @@ -164,9 +182,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
8009 if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
8012 - clk_enable(tcd->clk);
8015 - /* slow clock, count up to RC, then irq and stop */
8016 + /* count up to RC, then irq and stop */
8017 writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
8018 ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
8019 writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8020 @@ -186,12 +204,12 @@ static int tc_set_periodic(struct clock_event_device *d)
8021 /* By not making the gentime core emulate periodic mode on top
8022 * of oneshot, we get lower overhead and improved accuracy.
8024 - clk_enable(tcd->clk);
8027 - /* slow clock, count up to RC, then irq and restart */
8028 + /* count up to RC, then irq and restart */
8029 writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
8030 regs + ATMEL_TC_REG(2, CMR));
8031 - writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8032 + writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8034 /* Enable clock and interrupts on RC compare */
8035 writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8036 @@ -218,9 +236,13 @@ static struct tc_clkevt_device clkevt = {
8037 .features = CLOCK_EVT_FEAT_PERIODIC |
8038 CLOCK_EVT_FEAT_ONESHOT,
8039 /* Should be lower than at91rm9200's system timer */
8040 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8041 .rating = 125,
8042 +#else
8043 + .rating = 200,
8044 +#endif
8045 .set_next_event = tc_next_event,
8046 - .set_state_shutdown = tc_shutdown,
8047 + .set_state_shutdown = tc_shutdown_clk_off,
8048 .set_state_periodic = tc_set_periodic,
8049 .set_state_oneshot = tc_set_oneshot,
8051 @@ -240,8 +262,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
8055 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8056 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
8058 + unsigned divisor = atmel_tc_divisors[divisor_idx];
8060 struct clk *t2_clk = tc->clk[2];
8061 int irq = tc->irq[2];
8062 @@ -262,7 +285,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8063 clkevt.regs = tc->regs;
8064 clkevt.clk = t2_clk;
8066 - timer_clock = clk32k_divisor_idx;
8067 + timer_clock = divisor_idx;
8069 + clkevt.freq = 32768;
8071 + clkevt.freq = clk_get_rate(t2_clk) / divisor;
8073 clkevt.clkevt.cpumask = cpumask_of(0);
8075 @@ -273,7 +300,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8079 - clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
8080 + clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
8084 @@ -410,7 +437,11 @@ static int __init tcb_clksrc_init(void)
8085 goto err_disable_t1;
8087 /* channel 2: periodic and oneshot timer support */
8088 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8089 ret = setup_clkevents(tc, clk32k_divisor_idx);
8090 +#else
8091 + ret = setup_clkevents(tc, best_divisor_idx);
8092 +#endif
8093 if (ret)
8094 goto err_unregister_clksrc;
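The RC reload value programmed by tc_set_periodic() is ticks-per-jiffy with round-to-nearest; the change is only that it is now derived from the clock actually selected instead of the hard-coded 32768 Hz slow clock. For instance (helper name illustrative):

	/* (freq + HZ / 2) / HZ, as written to RC above */
	static unsigned int tc_rc_value(unsigned int freq, unsigned int hz)
	{
		return (freq + hz / 2) / hz;
	}

	/* tc_rc_value(32768, 100) == 328; tc_rc_value(5000000, 100) == 50000 */

A faster divided clock buys much finer one-shot resolution from the 16-bit counter, at the cost of a shorter maximum delay.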
8096 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
8097 index 2fab18fae4fc..98460c1bdec0 100644
8098 --- a/drivers/clocksource/timer-atmel-pit.c
8099 +++ b/drivers/clocksource/timer-atmel-pit.c
8100 @@ -46,6 +46,7 @@ struct pit_data {
8104 + bool irq_requested;
8108 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
8110 /* disable irq, leaving the clocksource active */
8111 pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
8112 + if (data->irq_requested) {
8113 + free_irq(data->irq, data);
8114 + data->irq_requested = false;
8119 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
8121 * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16)
8123 static int pit_clkevt_set_periodic(struct clock_event_device *dev)
8125 struct pit_data *data = clkevt_to_pit_data(dev);
8128 + ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8129 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8130 + "at91_tick", data);
8132 + panic(pr_fmt("Unable to setup IRQ\n"));
8134 + data->irq_requested = true;
8136 /* update clocksource counter */
8137 data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
8138 @@ -233,16 +248,6 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node)
8142 - /* Set up irq handler */
8143 - ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8144 - IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8145 - "at91_tick", data);
8147 - pr_err("Unable to setup IRQ\n");
8148 - clocksource_unregister(&data->clksrc);
8152 /* Set up and register clockevents */
8153 data->clkevt.name = "pit";
8154 data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
8155 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
8156 index d2e660f475af..c63b96cfc23e 100644
8157 --- a/drivers/clocksource/timer-atmel-st.c
8158 +++ b/drivers/clocksource/timer-atmel-st.c
8159 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
8160 last_crtr = read_CRTR();
8163 +static int atmel_st_irq;
8165 static int clkevt32k_shutdown(struct clock_event_device *evt)
8167 clkdev32k_disable_and_flush_irq();
8169 regmap_write(regmap_st, AT91_ST_IER, irqmask);
8170 + free_irq(atmel_st_irq, regmap_st);
8174 static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8178 clkdev32k_disable_and_flush_irq();
8180 + ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8181 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8182 + "at91_tick", regmap_st);
8184 + panic(pr_fmt("Unable to setup IRQ\n"));
8187 * ALM for oneshot irqs, set by next_event()
8188 * before 32 seconds have passed.
8189 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8191 static int clkevt32k_set_periodic(struct clock_event_device *dev)
8195 clkdev32k_disable_and_flush_irq();
8197 + ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8198 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8199 + "at91_tick", regmap_st);
8201 + panic(pr_fmt("Unable to setup IRQ\n"));
8203 /* PIT for periodic irqs; fixed rate of 1/HZ */
8204 irqmask = AT91_ST_PITS;
8205 regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
8206 @@ -198,7 +217,7 @@ static int __init atmel_st_timer_init(struct device_node *node)
8209 unsigned int sclk_rate, val;
8213 regmap_st = syscon_node_to_regmap(node);
8214 if (IS_ERR(regmap_st)) {
8215 @@ -212,21 +231,12 @@ static int __init atmel_st_timer_init(struct device_node *node)
8216 regmap_read(regmap_st, AT91_ST_SR, &val);
8218 /* Get the interrupts property */
8219 - irq = irq_of_parse_and_map(node, 0);
8221 + atmel_st_irq = irq_of_parse_and_map(node, 0);
8222 + if (!atmel_st_irq) {
8223 pr_err("Unable to get IRQ from DT\n");
8227 - /* Make IRQs happen for the system timer */
8228 - ret = request_irq(irq, at91rm9200_timer_interrupt,
8229 - IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8230 - "at91_tick", regmap_st);
8232 - pr_err("Unable to setup IRQ\n");
8236 sclk = of_clk_get(node, 0);
8238 pr_err("Unable to get slow clock\n");
8239 diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
8240 index a782ce87715c..19d265948526 100644
8241 --- a/drivers/connector/cn_proc.c
8242 +++ b/drivers/connector/cn_proc.c
8244 #include <linux/pid_namespace.h>
8246 #include <linux/cn_proc.h>
8247 +#include <linux/locallock.h>
8250 * Size of a cn_msg followed by a proc_event structure. Since the
8251 @@ -54,10 +55,11 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
8253 /* proc_event_counts is used as the sequence number of the netlink message */
8254 static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
8255 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
8257 static inline void send_msg(struct cn_msg *msg)
8259 - preempt_disable();
8260 + local_lock(send_msg_lock);
8262 msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
8263 ((struct proc_event *)msg->data)->cpu = smp_processor_id();
8264 @@ -70,7 +72,7 @@ static inline void send_msg(struct cn_msg *msg)
8266 cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
8269 + local_unlock(send_msg_lock);
8272 void proc_fork_connector(struct task_struct *task)
8273 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
8274 index 35f71825b7f3..bb4a6160d0f7 100644
8275 --- a/drivers/cpufreq/Kconfig.x86
8276 +++ b/drivers/cpufreq/Kconfig.x86
8277 @@ -125,7 +125,7 @@ config X86_POWERNOW_K7_ACPI
8279 config X86_POWERNOW_K8
8280 tristate "AMD Opteron/Athlon64 PowerNow!"
8281 - depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
8282 + depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
8284 This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
8285 Support for K10 and newer processors is now in acpi-cpufreq.
8286 diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
8287 index c3eefa126e3b..47093745a53c 100644
8288 --- a/drivers/firmware/efi/efi.c
8289 +++ b/drivers/firmware/efi/efi.c
8290 @@ -74,7 +74,7 @@ static unsigned long *efi_tables[] = {
8291 &efi.mem_attr_table,
8294 -static bool disable_runtime;
8295 +static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT_BASE);
8296 static int __init setup_noefi(char *arg)
8298 disable_runtime = true;
8299 @@ -100,6 +100,9 @@ static int __init parse_efi_cmdline(char *str)
8300 if (parse_option_str(str, "noruntime"))
8301 disable_runtime = true;
8303 + if (parse_option_str(str, "runtime"))
8304 + disable_runtime = false;
8308 early_param("efi", parse_efi_cmdline);
8309 diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c
8310 index c597ce277a04..c1108d3921f8 100644
8311 --- a/drivers/gpu/drm/i915/i915_gem_timeline.c
8312 +++ b/drivers/gpu/drm/i915/i915_gem_timeline.c
8313 @@ -33,11 +33,8 @@ static void __intel_timeline_init(struct intel_timeline *tl,
8315 tl->fence_context = context;
8316 tl->common = parent;
8317 -#ifdef CONFIG_DEBUG_SPINLOCK
8318 - __raw_spin_lock_init(&tl->lock.rlock, lockname, lockclass);
8319 -#else
8320 spin_lock_init(&tl->lock);
8321 -#endif
8322 + lockdep_set_class_and_name(&tl->lock, lockclass, lockname);
8323 init_request_active(&tl->last_request, NULL);
8324 INIT_LIST_HEAD(&tl->requests);
8325 i915_syncmap_init(&tl->sync);
8326 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
8327 index 20a471ad0ad2..5d34d48a8b7b 100644
8328 --- a/drivers/gpu/drm/i915/i915_irq.c
8329 +++ b/drivers/gpu/drm/i915/i915_irq.c
8330 @@ -867,6 +867,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8331 spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
8333 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8334 + preempt_disable_rt();
8336 /* Get optional system timestamp before query. */
8338 @@ -918,6 +919,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8339 *etime = ktime_get();
8341 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8342 + preempt_enable_rt();
8344 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
8346 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
8347 index 41e31a454604..7e0cadf51b31 100644
8348 --- a/drivers/gpu/drm/i915/intel_sprite.c
8349 +++ b/drivers/gpu/drm/i915/intel_sprite.c
8351 #include <drm/drm_rect.h>
8352 #include <drm/drm_atomic.h>
8353 #include <drm/drm_plane_helper.h>
8354 +#include <linux/locallock.h>
8355 #include "intel_drv.h"
8356 #include "intel_frontbuffer.h"
8357 #include <drm/i915_drm.h>
8358 @@ -67,7 +68,7 @@ int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
8361 #define VBLANK_EVASION_TIME_US 100
8363 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
8365 * intel_pipe_update_start() - start update of a set of display registers
8366 * @crtc: the crtc of which the registers are going to be updated
8367 @@ -102,7 +103,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8368 VBLANK_EVASION_TIME_US);
8369 max = vblank_start - 1;
8371 - local_irq_disable();
8372 + local_lock_irq(pipe_update_lock);
8374 if (min <= 0 || max <= 0)
8376 @@ -132,11 +133,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8380 - local_irq_enable();
8381 + local_unlock_irq(pipe_update_lock);
8383 timeout = schedule_timeout(timeout);
8385 - local_irq_disable();
8386 + local_lock_irq(pipe_update_lock);
8389 finish_wait(wq, &wait);
8390 @@ -201,7 +202,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc)
8391 crtc->base.state->event = NULL;
8394 - local_irq_enable();
8395 + local_unlock_irq(pipe_update_lock);
8397 if (intel_vgpu_active(dev_priv))
8399 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
8400 index ddfe91efa61e..3157bcf6428f 100644
8401 --- a/drivers/gpu/drm/radeon/radeon_display.c
8402 +++ b/drivers/gpu/drm/radeon/radeon_display.c
8403 @@ -1839,6 +1839,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8404 struct radeon_device *rdev = dev->dev_private;
8406 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8407 + preempt_disable_rt();
8409 /* Get optional system timestamp before query. */
8411 @@ -1931,6 +1932,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8412 *etime = ktime_get();
8414 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8415 + preempt_enable_rt();
8417 /* Decode into vertical and horizontal scanout position. */
8418 *vpos = position & 0x1fff;
8419 diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
8420 index 49569f8fe038..a3608cd52805 100644
8421 --- a/drivers/hv/hyperv_vmbus.h
8422 +++ b/drivers/hv/hyperv_vmbus.h
8424 #include <linux/atomic.h>
8425 #include <linux/hyperv.h>
8426 #include <linux/interrupt.h>
8427 +#include <linux/irq.h>
8430 * Timeout for services such as KVP and fcopy.
8431 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
8432 index 2cd134dd94d2..cedf225d4182 100644
8433 --- a/drivers/hv/vmbus_drv.c
8434 +++ b/drivers/hv/vmbus_drv.c
8435 @@ -966,6 +966,8 @@ static void vmbus_isr(void)
8436 void *page_addr = hv_cpu->synic_event_page;
8437 struct hv_message *msg;
8438 union hv_synic_event_flags *event;
8439 + struct pt_regs *regs = get_irq_regs();
8440 + u64 ip = regs ? instruction_pointer(regs) : 0;
8441 bool handled = false;
8443 if (unlikely(page_addr == NULL))
8444 @@ -1009,7 +1011,7 @@ static void vmbus_isr(void)
8445 tasklet_schedule(&hv_cpu->msg_dpc);
8448 - add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
8449 + add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
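vmbus_isr() shows why add_interrupt_randomness() grew the __u64 ip argument: on RT the entropy accounting can run outside hard-IRQ context, where get_irq_regs() no longer describes the interrupted task, so callers sample the instruction pointer while still in the hard interrupt and pass it down. A sketch (the wrapper is illustrative; the call matches the new signature):

	static void note_interrupt_entropy(int irq, int irq_flags)
	{
		/* still in hard-irq context here */
		struct pt_regs *regs = get_irq_regs();
		__u64 ip = regs ? instruction_pointer(regs) : 0;

		add_interrupt_randomness(irq, irq_flags, ip);
	}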
8453 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
8454 index 36f76e28a0bf..394f142f90c7 100644
8455 --- a/drivers/ide/alim15x3.c
8456 +++ b/drivers/ide/alim15x3.c
8457 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8459 isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
8461 - local_irq_save(flags);
8462 + local_irq_save_nort(flags);
8464 if (m5229_revision < 0xC2) {
8466 @@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8469 pci_dev_put(isa_dev);
8470 - local_irq_restore(flags);
8471 + local_irq_restore_nort(flags);
8475 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
8476 index 4b5dc0162e67..590cc7d64622 100644
8477 --- a/drivers/ide/hpt366.c
8478 +++ b/drivers/ide/hpt366.c
8479 @@ -1236,7 +1236,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8481 dma_old = inb(base + 2);
8483 - local_irq_save(flags);
8484 + local_irq_save_nort(flags);
8487 pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
8488 @@ -1247,7 +1247,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8489 if (dma_new != dma_old)
8490 outb(dma_new, base + 2);
8492 - local_irq_restore(flags);
8493 + local_irq_restore_nort(flags);
8495 printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n",
8496 hwif->name, base, base + 7);
8497 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
8498 index 19763977568c..4169433faab5 100644
8499 --- a/drivers/ide/ide-io-std.c
8500 +++ b/drivers/ide/ide-io-std.c
8501 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8502 unsigned long uninitialized_var(flags);
8504 if ((io_32bit & 2) && !mmio) {
8505 - local_irq_save(flags);
8506 + local_irq_save_nort(flags);
8507 ata_vlb_sync(io_ports->nsect_addr);
8510 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8511 insl(data_addr, buf, words);
8513 if ((io_32bit & 2) && !mmio)
8514 - local_irq_restore(flags);
8515 + local_irq_restore_nort(flags);
8517 if (((len + 1) & 3) < 2)
8519 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8520 unsigned long uninitialized_var(flags);
8522 if ((io_32bit & 2) && !mmio) {
8523 - local_irq_save(flags);
8524 + local_irq_save_nort(flags);
8525 ata_vlb_sync(io_ports->nsect_addr);
8528 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8529 outsl(data_addr, buf, words);
8531 if ((io_32bit & 2) && !mmio)
8532 - local_irq_restore(flags);
8533 + local_irq_restore_nort(flags);
8535 if (((len + 1) & 3) < 2)
8537 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
8538 index 3a234701d92c..420e4e645856 100644
8539 --- a/drivers/ide/ide-io.c
8540 +++ b/drivers/ide/ide-io.c
8541 @@ -660,7 +660,7 @@ void ide_timer_expiry (unsigned long data)
8542 /* disable_irq_nosync ?? */
8543 disable_irq(hwif->irq);
8544 /* local CPU only, as if we were handling an interrupt */
8545 - local_irq_disable();
8546 + local_irq_disable_nort();
8547 if (hwif->polling) {
8548 startstop = handler(drive);
8549 } else if (drive_is_ready(drive)) {
8550 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
8551 index 210a0887dd29..7bf05b6147e8 100644
8552 --- a/drivers/ide/ide-iops.c
8553 +++ b/drivers/ide/ide-iops.c
8554 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
8555 if ((stat & ATA_BUSY) == 0)
8558 - local_irq_restore(flags);
8559 + local_irq_restore_nort(flags);
8564 - local_irq_restore(flags);
8565 + local_irq_restore_nort(flags);
8568 * Allow status to settle, then read it again.
8569 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
8570 index eaf39e5db08b..be4c941eaa83 100644
8571 --- a/drivers/ide/ide-probe.c
8572 +++ b/drivers/ide/ide-probe.c
8573 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
8576 /* local CPU only; some systems need this */
8577 - local_irq_save(flags);
8578 + local_irq_save_nort(flags);
8579 /* read 512 bytes of id info */
8580 hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
8581 - local_irq_restore(flags);
8582 + local_irq_restore_nort(flags);
8584 drive->dev_flags |= IDE_DFLAG_ID_READ;
8586 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
8587 index 4efe4c6e956c..7eae3aa1def7 100644
8588 --- a/drivers/ide/ide-taskfile.c
8589 +++ b/drivers/ide/ide-taskfile.c
8590 @@ -251,7 +251,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8592 page_is_high = PageHighMem(page);
8594 - local_irq_save(flags);
8595 + local_irq_save_nort(flags);
8597 buf = kmap_atomic(page) + offset;
8599 @@ -272,7 +272,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8603 - local_irq_restore(flags);
8604 + local_irq_restore_nort(flags);
8608 @@ -415,7 +415,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
8611 if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
8612 - local_irq_disable();
8613 + local_irq_disable_nort();
8615 ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
8617 diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
8618 index b197e925fe36..95ac319c8e69 100644
8619 --- a/drivers/infiniband/hw/hfi1/affinity.c
8620 +++ b/drivers/infiniband/hw/hfi1/affinity.c
8621 @@ -593,7 +593,7 @@ int hfi1_get_proc_affinity(int node)
8622 struct hfi1_affinity_node *entry;
8623 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
8624 const struct cpumask *node_mask,
8625 - *proc_mask = &current->cpus_allowed;
8626 + *proc_mask = current->cpus_ptr;
8627 struct hfi1_affinity_node_list *affinity = &node_affinity;
8628 struct cpu_mask_set *set = &affinity->proc;
8630 @@ -601,7 +601,7 @@ int hfi1_get_proc_affinity(int node)
8631 * check whether process/context affinity has already
8634 - if (cpumask_weight(proc_mask) == 1) {
8635 + if (current->nr_cpus_allowed == 1) {
8636 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
8637 current->pid, current->comm,
8638 cpumask_pr_args(proc_mask));
8639 @@ -612,7 +612,7 @@ int hfi1_get_proc_affinity(int node)
8640 cpu = cpumask_first(proc_mask);
8641 cpumask_set_cpu(cpu, &set->used);
8643 - } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
8644 + } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
8645 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
8646 current->pid, current->comm,
8647 cpumask_pr_args(proc_mask));
8648 diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
8649 index 6781bcdb10b3..d069ad261572 100644
8650 --- a/drivers/infiniband/hw/hfi1/sdma.c
8651 +++ b/drivers/infiniband/hw/hfi1/sdma.c
8652 @@ -856,14 +856,13 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
8654 struct sdma_rht_node *rht_node;
8655 struct sdma_engine *sde = NULL;
8656 - const struct cpumask *current_mask = &current->cpus_allowed;
8657 unsigned long cpu_id;
8660 * To ensure that always the same sdma engine(s) will be
8661 * selected make sure the process is pinned to this CPU only.
8663 - if (cpumask_weight(current_mask) != 1)
8664 + if (current->nr_cpus_allowed != 1)
8667 cpu_id = smp_processor_id();
8668 diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
8669 index 40efc9151ec4..12924aad90cc 100644
8670 --- a/drivers/infiniband/hw/qib/qib_file_ops.c
8671 +++ b/drivers/infiniband/hw/qib/qib_file_ops.c
8672 @@ -1167,7 +1167,7 @@ static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt)
8673 static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
8675 struct qib_filedata *fd = fp->private_data;
8676 - const unsigned int weight = cpumask_weight(&current->cpus_allowed);
8677 + const unsigned int weight = current->nr_cpus_allowed;
8678 const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
8681 @@ -1648,9 +1648,8 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
8682 ret = find_free_ctxt(i_minor - 1, fp, uinfo);
8685 - const unsigned int cpu = cpumask_first(&current->cpus_allowed);
8686 - const unsigned int weight =
8687 - cpumask_weight(&current->cpus_allowed);
8688 + const unsigned int cpu = cpumask_first(current->cpus_ptr);
8689 + const unsigned int weight = current->nr_cpus_allowed;
8691 if (weight == 1 && !test_bit(cpu, qib_cpulist))
8692 if (!find_hca(cpu, &unit) && unit >= 0)
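The hfi1/qib hunks are mechanical conversions to the accessors this series introduces for task affinity: current->cpus_ptr for the mask itself and current->nr_cpus_allowed for its weight, which lets RT's migrate_disable() temporarily repoint the mask without breaking readers. The reader side, sketched:

	/* -1 unless the caller is pinned to exactly one CPU */
	static int callers_pinned_cpu(void)
	{
		if (current->nr_cpus_allowed != 1)
			return -1;
		return cpumask_first(current->cpus_ptr);
	}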
8693 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8694 index 9b3f47ae2016..8327b598d909 100644
8695 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8696 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8697 @@ -898,7 +898,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
8699 ipoib_dbg_mcast(priv, "restarting multicast task\n");
8701 - local_irq_save(flags);
8702 + local_irq_save_nort(flags);
8703 netif_addr_lock(dev);
8704 spin_lock(&priv->lock);
8706 @@ -980,7 +980,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
8708 spin_unlock(&priv->lock);
8709 netif_addr_unlock(dev);
8710 - local_irq_restore(flags);
8711 + local_irq_restore_nort(flags);
8713 ipoib_mcast_remove_list(&remove_list);
8715 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
8716 index cedc665364cd..4a4fdef151aa 100644
8717 --- a/drivers/input/gameport/gameport.c
8718 +++ b/drivers/input/gameport/gameport.c
8719 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
8722 for (i = 0; i < 50; i++) {
8723 - local_irq_save(flags);
8724 + local_irq_save_nort(flags);
8725 t1 = ktime_get_ns();
8726 for (t = 0; t < 50; t++)
8727 gameport_read(gameport);
8728 t2 = ktime_get_ns();
8729 t3 = ktime_get_ns();
8730 - local_irq_restore(flags);
8731 + local_irq_restore_nort(flags);
8733 t = (t2 - t1) - (t3 - t2);
8735 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
8738 for(i = 0; i < 50; i++) {
8739 - local_irq_save(flags);
8740 + local_irq_save_nort(flags);
8742 for (t = 0; t < 50; t++) gameport_read(gameport);
8745 - local_irq_restore(flags);
8746 + local_irq_restore_nort(flags);
8748 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
8750 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
8753 for(i = 0; i < 50; i++) {
8754 - local_irq_save(flags);
8755 + local_irq_save_nort(flags);
8757 for (t = 0; t < 50; t++) gameport_read(gameport);
8759 - local_irq_restore(flags);
8760 + local_irq_restore_nort(flags);
8762 if (t2 - t1 < tx) tx = t2 - t1;
8764 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
8765 index efa6cd2500b9..7d9d41f803d1 100644
8766 --- a/drivers/iommu/amd_iommu.c
8767 +++ b/drivers/iommu/amd_iommu.c
8770 #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38))
8772 -static DEFINE_RWLOCK(amd_iommu_devtable_lock);
8773 +static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
8774 +static DEFINE_SPINLOCK(pd_bitmap_lock);
8775 +static DEFINE_SPINLOCK(iommu_table_lock);
8777 /* List of all available dev_data structures */
8778 -static LIST_HEAD(dev_data_list);
8779 -static DEFINE_SPINLOCK(dev_data_list_lock);
8780 +static LLIST_HEAD(dev_data_list);
8782 LIST_HEAD(ioapic_map);
8783 LIST_HEAD(hpet_map);
8784 @@ -204,40 +205,33 @@ static struct dma_ops_domain* to_dma_ops_domain(struct protection_domain *domain
8785 static struct iommu_dev_data *alloc_dev_data(u16 devid)
8787 struct iommu_dev_data *dev_data;
8788 - unsigned long flags;
8790 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
8794 dev_data->devid = devid;
8796 - spin_lock_irqsave(&dev_data_list_lock, flags);
8797 - list_add_tail(&dev_data->dev_data_list, &dev_data_list);
8798 - spin_unlock_irqrestore(&dev_data_list_lock, flags);
8800 ratelimit_default_init(&dev_data->rs);
8802 + llist_add(&dev_data->dev_data_list, &dev_data_list);
8806 static struct iommu_dev_data *search_dev_data(u16 devid)
8808 struct iommu_dev_data *dev_data;
8809 - unsigned long flags;
8810 + struct llist_node *node;
8812 + if (llist_empty(&dev_data_list))
8813 + return NULL;
8815 - spin_lock_irqsave(&dev_data_list_lock, flags);
8816 - list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
8817 + node = dev_data_list.first;
8818 + llist_for_each_entry(dev_data, node, dev_data_list) {
8819 if (dev_data->devid == devid)
8827 - spin_unlock_irqrestore(&dev_data_list_lock, flags);
8833 static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
8834 @@ -1062,9 +1056,9 @@ static int iommu_queue_command_sync(struct amd_iommu *iommu,
8835 unsigned long flags;
8838 - spin_lock_irqsave(&iommu->lock, flags);
8839 + raw_spin_lock_irqsave(&iommu->lock, flags);
8840 ret = __iommu_queue_command_sync(iommu, cmd, sync);
8841 - spin_unlock_irqrestore(&iommu->lock, flags);
8842 + raw_spin_unlock_irqrestore(&iommu->lock, flags);
8846 @@ -1090,7 +1084,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
8848 build_completion_wait(&cmd, (u64)&iommu->cmd_sem);
8850 - spin_lock_irqsave(&iommu->lock, flags);
8851 + raw_spin_lock_irqsave(&iommu->lock, flags);
8855 @@ -1101,7 +1095,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
8856 ret = wait_on_sem(&iommu->cmd_sem);
8859 - spin_unlock_irqrestore(&iommu->lock, flags);
8860 + raw_spin_unlock_irqrestore(&iommu->lock, flags);
8864 @@ -1610,29 +1604,26 @@ static void del_domain_from_list(struct protection_domain *domain)
8866 static u16 domain_id_alloc(void)
8868 - unsigned long flags;
8871 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8872 + spin_lock(&pd_bitmap_lock);
8873 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
8875 if (id > 0 && id < MAX_DOMAIN_ID)
8876 __set_bit(id, amd_iommu_pd_alloc_bitmap);
8879 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8880 + spin_unlock(&pd_bitmap_lock);
8885 static void domain_id_free(int id)
8887 - unsigned long flags;
8889 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8890 + spin_lock(&pd_bitmap_lock);
8891 if (id > 0 && id < MAX_DOMAIN_ID)
8892 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
8893 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8894 + spin_unlock(&pd_bitmap_lock);
8897 #define DEFINE_FREE_PT_FN(LVL, FN) \
8898 @@ -1952,10 +1943,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
8902 - * Must be called with IRQs disabled. Warn here to detect early
8904 + * Must be called with IRQs disabled on a non-RT kernel. Warn here to
8905 + * detect early when it's not.
8907 - WARN_ON(!irqs_disabled());
8908 + WARN_ON_NONRT(!irqs_disabled());
8911 spin_lock(&domain->lock);
8912 @@ -2101,9 +2092,9 @@ static int attach_device(struct device *dev,
8916 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8917 + spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8918 ret = __attach_device(dev_data, domain);
8919 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8920 + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8923 * We might boot into a crash-kernel here. The crashed kernel
8924 @@ -2123,10 +2114,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
8925 struct protection_domain *domain;
8928 - * Must be called with IRQs disabled. Warn here to detect early
8930 + * Must be called with IRQs disabled on a non-RT kernel. Warn here to
8931 + * detect early when it's not.
8933 - WARN_ON(!irqs_disabled());
8934 + WARN_ON_NONRT(!irqs_disabled());
8936 if (WARN_ON(!dev_data->domain))
8938 @@ -2153,9 +2144,9 @@ static void detach_device(struct device *dev)
8939 domain = dev_data->domain;
8941 /* lock device table */
8942 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8943 + spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8944 __detach_device(dev_data);
8945 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8946 + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8948 if (!dev_is_pci(dev))
8950 @@ -2819,7 +2810,7 @@ static void cleanup_domain(struct protection_domain *domain)
8951 struct iommu_dev_data *entry;
8952 unsigned long flags;
8954 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8955 + spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8957 while (!list_empty(&domain->dev_list)) {
8958 entry = list_first_entry(&domain->dev_list,
8959 @@ -2827,7 +2818,7 @@ static void cleanup_domain(struct protection_domain *domain)
8960 __detach_device(entry);
8963 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8964 + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8967 static void protection_domain_free(struct protection_domain *domain)
8968 @@ -3594,14 +3585,62 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
8969 amd_iommu_dev_table[devid].data[2] = dte;
8972 -static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
8973 +static struct irq_remap_table *get_irq_table(u16 devid)
8975 + struct irq_remap_table *table;
8977 + if (WARN_ONCE(!amd_iommu_rlookup_table[devid],
8978 + "%s: no iommu for devid %x\n", __func__, devid))
8981 + table = irq_lookup_table[devid];
8982 + if (WARN_ONCE(!table, "%s: no table for devid %x\n", __func__, devid))
8983 + return NULL;
8985 + return table;
8988 +static struct irq_remap_table *__alloc_irq_table(void)
8990 + struct irq_remap_table *table;
8992 + table = kzalloc(sizeof(*table), GFP_KERNEL);
8993 + if (!table)
8994 + return NULL;
8996 + table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
8997 + if (!table->table) {
8998 + kfree(table);
8999 + return NULL;
9000 + }
9001 + raw_spin_lock_init(&table->lock);
9003 + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
9004 + memset(table->table, 0,
9005 + MAX_IRQS_PER_TABLE * sizeof(u32));
9006 + else
9007 + memset(table->table, 0,
9008 + (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
9010 + return table;
9012 +static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
9013 + struct irq_remap_table *table)
9015 + irq_lookup_table[devid] = table;
9016 + set_dte_irq_entry(devid, table);
9017 + iommu_flush_dte(iommu, devid);
9020 +static struct irq_remap_table *alloc_irq_table(u16 devid)
9022 struct irq_remap_table *table = NULL;
9023 + struct irq_remap_table *new_table = NULL;
9024 struct amd_iommu *iommu;
9025 unsigned long flags;
9028 - write_lock_irqsave(&amd_iommu_devtable_lock, flags);
9029 + spin_lock_irqsave(&iommu_table_lock, flags);
9031 iommu = amd_iommu_rlookup_table[devid];
9033 @@ -3614,60 +3653,45 @@ static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
9034 alias = amd_iommu_alias_table[devid];
9035 table = irq_lookup_table[alias];
9037 - irq_lookup_table[devid] = table;
9038 - set_dte_irq_entry(devid, table);
9039 - iommu_flush_dte(iommu, devid);
9041 + set_remap_table_entry(iommu, devid, table);
9044 + spin_unlock_irqrestore(&iommu_table_lock, flags);
9046 /* Nothing there yet, allocate new irq remapping table */
9047 - table = kzalloc(sizeof(*table), GFP_ATOMIC);
9051 - /* Initialize table spin-lock */
9052 - spin_lock_init(&table->lock);
9053 + new_table = __alloc_irq_table();
9058 - /* Keep the first 32 indexes free for IOAPIC interrupts */
9059 - table->min_index = 32;
9060 + spin_lock_irqsave(&iommu_table_lock, flags);
9062 - table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC);
9063 - if (!table->table) {
9066 + table = irq_lookup_table[devid];
9071 - if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
9072 - memset(table->table, 0,
9073 - MAX_IRQS_PER_TABLE * sizeof(u32));
9075 - memset(table->table, 0,
9076 - (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
9081 - for (i = 0; i < 32; ++i)
9082 - iommu->irte_ops->set_allocated(table, i);
9083 + table = irq_lookup_table[alias];
9085 + set_remap_table_entry(iommu, devid, table);
9089 - irq_lookup_table[devid] = table;
9090 - set_dte_irq_entry(devid, table);
9091 - iommu_flush_dte(iommu, devid);
9092 - if (devid != alias) {
9093 - irq_lookup_table[alias] = table;
9094 - set_dte_irq_entry(alias, table);
9095 - iommu_flush_dte(iommu, alias);
9097 + table = new_table;
9101 + set_remap_table_entry(iommu, devid, table);
9102 + if (devid != alias)
9103 + set_remap_table_entry(iommu, alias, table);
9106 iommu_completion_wait(iommu);
9109 - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
9110 + spin_unlock_irqrestore(&iommu_table_lock, flags);
9113 + kmem_cache_free(amd_iommu_irq_cache, new_table->table);
9119 @@ -3681,11 +3705,11 @@ static int alloc_irq_index(u16 devid, int count)
9123 - table = get_irq_table(devid, false);
9124 + table = alloc_irq_table(devid);
9128 - spin_lock_irqsave(&table->lock, flags);
9129 + raw_spin_lock_irqsave(&table->lock, flags);
9131 /* Scan table for free entries */
9132 for (c = 0, index = table->min_index;
9133 @@ -3708,7 +3732,7 @@ static int alloc_irq_index(u16 devid, int count)
9137 - spin_unlock_irqrestore(&table->lock, flags);
9138 + raw_spin_unlock_irqrestore(&table->lock, flags);
9142 @@ -3725,11 +3749,11 @@ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
9146 - table = get_irq_table(devid, false);
9147 + table = get_irq_table(devid);
9151 - spin_lock_irqsave(&table->lock, flags);
9152 + raw_spin_lock_irqsave(&table->lock, flags);
9154 entry = (struct irte_ga *)table->table;
9155 entry = &entry[index];
9156 @@ -3740,7 +3764,7 @@ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
9160 - spin_unlock_irqrestore(&table->lock, flags);
9161 + raw_spin_unlock_irqrestore(&table->lock, flags);
9163 iommu_flush_irt(iommu, devid);
9164 iommu_completion_wait(iommu);
9165 @@ -3758,13 +3782,13 @@ static int modify_irte(u16 devid, int index, union irte *irte)
9169 - table = get_irq_table(devid, false);
9170 + table = get_irq_table(devid);
9174 - spin_lock_irqsave(&table->lock, flags);
9175 + raw_spin_lock_irqsave(&table->lock, flags);
9176 table->table[index] = irte->val;
9177 - spin_unlock_irqrestore(&table->lock, flags);
9178 + raw_spin_unlock_irqrestore(&table->lock, flags);
9180 iommu_flush_irt(iommu, devid);
9181 iommu_completion_wait(iommu);
9182 @@ -3782,13 +3806,13 @@ static void free_irte(u16 devid, int index)
9186 - table = get_irq_table(devid, false);
9187 + table = get_irq_table(devid);
9191 - spin_lock_irqsave(&table->lock, flags);
9192 + raw_spin_lock_irqsave(&table->lock, flags);
9193 iommu->irte_ops->clear_allocated(table, index);
9194 - spin_unlock_irqrestore(&table->lock, flags);
9195 + raw_spin_unlock_irqrestore(&table->lock, flags);
9197 iommu_flush_irt(iommu, devid);
9198 iommu_completion_wait(iommu);
9199 @@ -3869,10 +3893,8 @@ static void irte_ga_set_affinity(void *entry, u16 devid, u16 index,
9200 u8 vector, u32 dest_apicid)
9202 struct irte_ga *irte = (struct irte_ga *) entry;
9203 - struct iommu_dev_data *dev_data = search_dev_data(devid);
9205 - if (!dev_data || !dev_data->use_vapic ||
9206 - !irte->lo.fields_remap.guest_mode) {
9207 + if (!irte->lo.fields_remap.guest_mode) {
9208 irte->hi.fields.vector = vector;
9209 irte->lo.fields_remap.destination = dest_apicid;
9210 modify_irte_ga(devid, index, irte, NULL);
9211 @@ -4078,7 +4100,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
9212 struct amd_ir_data *data = NULL;
9213 struct irq_cfg *cfg;
9220 @@ -4102,10 +4124,26 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
9223 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
9224 - if (get_irq_table(devid, true))
9225 + struct irq_remap_table *table;
9226 + struct amd_iommu *iommu;
9228 + table = alloc_irq_table(devid);
9230 + if (!table->min_index) {
9231 + /*
9232 + * Keep the first 32 indexes free for IOAPIC
9233 + * interrupts.
9234 + */
9235 + table->min_index = 32;
9236 + iommu = amd_iommu_rlookup_table[devid];
9237 + for (i = 0; i < 32; ++i)
9238 + iommu->irte_ops->set_allocated(table, i);
9240 + WARN_ON(table->min_index != 32);
9241 index = info->ioapic_pin;
9248 index = alloc_irq_index(devid, nr_irqs);
9250 @@ -4349,7 +4387,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
9252 unsigned long flags;
9253 struct amd_iommu *iommu;
9254 - struct irq_remap_table *irt;
9255 + struct irq_remap_table *table;
9256 struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
9257 int devid = ir_data->irq_2_irte.devid;
9258 struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
9259 @@ -4363,11 +4401,11 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
9263 - irt = get_irq_table(devid, false);
9265 + table = get_irq_table(devid);
9269 - spin_lock_irqsave(&irt->lock, flags);
9270 + raw_spin_lock_irqsave(&table->lock, flags);
9272 if (ref->lo.fields_vapic.guest_mode) {
9274 @@ -4376,7 +4414,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
9278 - spin_unlock_irqrestore(&irt->lock, flags);
9279 + raw_spin_unlock_irqrestore(&table->lock, flags);
9281 iommu_flush_irt(iommu, devid);
9282 iommu_completion_wait(iommu);
9283 diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
9284 index 6fe2d0346073..e3cd81b32a33 100644
9285 --- a/drivers/iommu/amd_iommu_init.c
9286 +++ b/drivers/iommu/amd_iommu_init.c
9287 @@ -1474,7 +1474,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
9291 - spin_lock_init(&iommu->lock);
9292 + raw_spin_lock_init(&iommu->lock);
9294 /* Add IOMMU to internal data structures */
9295 list_add_tail(&iommu->list, &amd_iommu_list);
9296 diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
9297 index f6b24c7d8b70..16b1404da58c 100644
9298 --- a/drivers/iommu/amd_iommu_types.h
9299 +++ b/drivers/iommu/amd_iommu_types.h
9300 @@ -406,7 +406,7 @@ extern bool amd_iommu_iotlb_sup;
9301 #define IRQ_TABLE_ALIGNMENT 128
9303 struct irq_remap_table {
9305 + raw_spinlock_t lock;
9309 @@ -488,7 +488,7 @@ struct amd_iommu {
9312 /* locks the accesses to the hardware */
9314 + raw_spinlock_t lock;
9316 /* Pointer to PCI device of this IOMMU */
9317 struct pci_dev *dev;
9318 @@ -625,7 +625,7 @@ struct devid_map {
9320 struct iommu_dev_data {
9321 struct list_head list; /* For domain->dev_list */
9322 - struct list_head dev_data_list; /* For global dev_data_list */
9323 + struct llist_node dev_data_list; /* For global dev_data_list */
9324 struct protection_domain *domain; /* Domain the device is bound to */
9325 u16 devid; /* PCI Device ID */
9326 u16 alias; /* Alias Device ID */
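
The dev_data_list change above swaps a lock-protected list_head for a
lock-free llist. A hedged sketch of how such a list is used (names are
illustrative; the real add/lookup helpers live in amd_iommu.c):

	#include <linux/llist.h>
	#include <linux/types.h>

	struct dev_data_example {
		struct llist_node node;
		u16 devid;
	};

	static LLIST_HEAD(dev_data_head);

	static void add_dev_data(struct dev_data_example *d)
	{
		/* lock-free push; safe from any context, no spinlock */
		llist_add(&d->node, &dev_data_head);
	}

	static struct dev_data_example *find_dev_data(u16 devid)
	{
		struct dev_data_example *d;

		/* entries are only ever added, so an unlocked walk is safe */
		llist_for_each_entry(d, dev_data_head.first, node)
			if (d->devid == devid)
				return d;
		return NULL;
	}
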
9327 diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
9328 index 33edfa794ae9..b30900025c62 100644
9329 --- a/drivers/iommu/iova.c
9330 +++ b/drivers/iommu/iova.c
9331 @@ -570,7 +570,7 @@ void queue_iova(struct iova_domain *iovad,
9332 unsigned long pfn, unsigned long pages,
9335 - struct iova_fq *fq = get_cpu_ptr(iovad->fq);
9336 + struct iova_fq *fq = raw_cpu_ptr(iovad->fq);
9337 unsigned long flags;
9340 @@ -600,8 +600,6 @@ void queue_iova(struct iova_domain *iovad,
9341 if (atomic_cmpxchg(&iovad->fq_timer_on, 0, 1) == 0)
9342 mod_timer(&iovad->fq_timer,
9343 jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
9345 - put_cpu_ptr(iovad->fq);
9347 EXPORT_SYMBOL_GPL(queue_iova);
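
queue_iova() above stops using get_cpu_ptr()/put_cpu_ptr(), which disable
preemption across the whole section - not allowed on RT, where the flush-queue
lock may sleep. raw_cpu_ptr() takes no preemption reference; the per-queue
lock alone provides the exclusion. The pattern, sketched with illustrative
names:

	#include <linux/percpu.h>
	#include <linux/spinlock.h>

	struct fq_example {
		spinlock_t lock;	/* serializes all queue access */
		unsigned long entries[32];
		unsigned int head;
	};

	static DEFINE_PER_CPU(struct fq_example, fq_ex);

	static void queue_entry(unsigned long pfn)
	{
		/* no preempt_disable: migrating after this line is harmless
		 * because the lock, not CPU affinity, protects the queue */
		struct fq_example *f = raw_cpu_ptr(&fq_ex);
		unsigned long flags;

		spin_lock_irqsave(&f->lock, flags);
		f->entries[f->head++ & 31] = pfn;
		spin_unlock_irqrestore(&f->lock, flags);
	}
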
9349 diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
9350 index 2ea39a83737f..a3e23d0fc4af 100644
9351 --- a/drivers/irqchip/irq-gic-v3-its.c
9352 +++ b/drivers/irqchip/irq-gic-v3-its.c
9353 @@ -148,7 +148,7 @@ static struct {
9356 static LIST_HEAD(its_nodes);
9357 -static DEFINE_SPINLOCK(its_lock);
9358 +static DEFINE_RAW_SPINLOCK(its_lock);
9359 static struct rdists *gic_rdists;
9360 static struct irq_domain *its_parent;
9362 @@ -165,6 +165,7 @@ static DEFINE_RAW_SPINLOCK(vmovp_lock);
9363 static DEFINE_IDA(its_vpeid_ida);
9365 #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist))
9366 +#define gic_data_rdist_cpu(cpu) (per_cpu_ptr(gic_rdists->rdist, cpu))
9367 #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base)
9368 #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K)
9370 @@ -1432,7 +1433,7 @@ static void its_free_prop_table(struct page *prop_page)
9371 get_order(LPI_PROPBASE_SZ));
9374 -static int __init its_alloc_lpi_tables(void)
9375 +static int __init its_alloc_lpi_prop_table(void)
9379 @@ -1758,30 +1759,47 @@ static void its_free_pending_table(struct page *pt)
9380 get_order(max_t(u32, LPI_PENDBASE_SZ, SZ_64K)));
9383 -static void its_cpu_init_lpis(void)
9384 +static int __init allocate_lpi_tables(void)
9386 - void __iomem *rbase = gic_data_rdist_rd_base();
9387 - struct page *pend_page;
9391 - /* If we didn't allocate the pending table yet, do it now */
9392 - pend_page = gic_data_rdist()->pend_page;
9394 - phys_addr_t paddr;
9395 + err = its_alloc_lpi_prop_table();
9400 + * We allocate all the pending tables anyway, as we may have a
9401 + * mix of RDs that have had LPIs enabled, and some that
9402 + * don't. We'll free the unused ones as each CPU comes online.
9404 + for_each_possible_cpu(cpu) {
9405 + struct page *pend_page;
9407 pend_page = its_allocate_pending_table(GFP_NOWAIT);
9409 - pr_err("Failed to allocate PENDBASE for CPU%d\n",
9410 - smp_processor_id());
9412 + pr_err("Failed to allocate PENDBASE for CPU%d\n", cpu);
9416 - paddr = page_to_phys(pend_page);
9417 - pr_info("CPU%d: using LPI pending table @%pa\n",
9418 - smp_processor_id(), &paddr);
9419 - gic_data_rdist()->pend_page = pend_page;
9420 + gic_data_rdist_cpu(cpu)->pend_page = pend_page;
9426 +static void its_cpu_init_lpis(void)
9428 + void __iomem *rbase = gic_data_rdist_rd_base();
9429 + struct page *pend_page;
9430 + phys_addr_t paddr;
9433 + if (gic_data_rdist()->lpi_enabled)
9436 + pend_page = gic_data_rdist()->pend_page;
9437 + paddr = page_to_phys(pend_page);
9440 val = readl_relaxed(rbase + GICR_CTLR);
9441 val &= ~GICR_CTLR_ENABLE_LPIS;
9442 @@ -1843,6 +1861,10 @@ static void its_cpu_init_lpis(void)
9444 /* Make sure the GIC has seen the above */
9446 + gic_data_rdist()->lpi_enabled = true;
9447 + pr_info("GICv3: CPU%d: using LPI pending table @%pa\n",
9448 + smp_processor_id(),
9452 static void its_cpu_init_collection(void)
9453 @@ -1850,7 +1872,7 @@ static void its_cpu_init_collection(void)
9454 struct its_node *its;
9457 - spin_lock(&its_lock);
9458 + raw_spin_lock(&its_lock);
9459 cpu = smp_processor_id();
9461 list_for_each_entry(its, &its_nodes, entry) {
9462 @@ -1892,7 +1914,7 @@ static void its_cpu_init_collection(void)
9463 its_send_invall(its, &its->collections[cpu]);
9466 - spin_unlock(&its_lock);
9467 + raw_spin_unlock(&its_lock);
9470 static struct its_device *its_find_device(struct its_node *its, u32 dev_id)
9471 @@ -3041,9 +3063,9 @@ static int __init its_probe_one(struct resource *res,
9473 goto out_free_tables;
9475 - spin_lock(&its_lock);
9476 + raw_spin_lock(&its_lock);
9477 list_add(&its->entry, &its_nodes);
9478 - spin_unlock(&its_lock);
9479 + raw_spin_unlock(&its_lock);
9483 @@ -3278,7 +3300,8 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists,
9486 gic_rdists = rdists;
9487 - err = its_alloc_lpi_tables();
9489 + err = allocate_lpi_tables();
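
The ITS rework above moves pending-table allocation out of the CPU-online path
(which had to use GFP_NOWAIT) into an __init function that pre-allocates for
every possible CPU with a sleeping allocation. The shape of that pattern, with
illustrative names:

	#include <linux/gfp.h>
	#include <linux/percpu.h>

	static DEFINE_PER_CPU(struct page *, pend_page_ex);

	static int __init preallocate_pending_tables(void)
	{
		int cpu;

		for_each_possible_cpu(cpu) {
			/* sleeping allocation is fine here: early boot,
			 * process context, no atomic constraints */
			struct page *p = alloc_pages(GFP_KERNEL, 1);

			if (!p)
				return -ENOMEM;
			per_cpu(pend_page_ex, cpu) = p;
		}
		return 0;
	}
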
9493 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
9494 index 3f9ddb9fafa7..09da5b6b44a1 100644
9495 --- a/drivers/leds/trigger/Kconfig
9496 +++ b/drivers/leds/trigger/Kconfig
9497 @@ -69,7 +69,7 @@ config LEDS_TRIGGER_BACKLIGHT
9499 config LEDS_TRIGGER_CPU
9500 bool "LED CPU Trigger"
9501 - depends on LEDS_TRIGGERS
9502 + depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
9504 This allows LEDs to be controlled by active CPUs. This shows
9505 the active CPUs across an array of LEDs so you can see which
9506 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
9507 index 4d200883c505..98b64ed5cb81 100644
9508 --- a/drivers/md/bcache/Kconfig
9509 +++ b/drivers/md/bcache/Kconfig
9513 tristate "Block device as cache"
9514 + depends on !PREEMPT_RT_FULL
9516 Allows a block device to be used as cache for other devices; uses
9517 a btree for indexing and the layout is optimized for SSDs.
9518 diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
9519 index eadfcfd106ff..8824aeda85cf 100644
9520 --- a/drivers/md/dm-rq.c
9521 +++ b/drivers/md/dm-rq.c
9522 @@ -671,7 +671,7 @@ static void dm_old_request_fn(struct request_queue *q)
9523 /* Establish tio->ti before queuing work (map_tio_request) */
9525 kthread_queue_work(&md->kworker, &tio->work);
9526 - BUG_ON(!irqs_disabled());
9527 + BUG_ON_NONRT(!irqs_disabled());
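
BUG_ON_NONRT() used above is an RT-patch helper: the irqs_disabled() assertion
is only meaningful when the queue lock actually disables interrupts, which it
does not on RT. Presumably it is defined along these lines (a sketch, not the
patch set's literal definition):

	#ifdef CONFIG_PREEMPT_RT_BASE
	# define BUG_ON_NONRT(c)	do { } while (0)
	#else
	# define BUG_ON_NONRT(c)	BUG_ON(c)
	#endif
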
9531 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
9532 index dbf51b4c21b3..5cfccaf87687 100644
9533 --- a/drivers/md/raid5.c
9534 +++ b/drivers/md/raid5.c
9535 @@ -410,7 +410,7 @@ void raid5_release_stripe(struct stripe_head *sh)
9536 md_wakeup_thread(conf->mddev->thread);
9539 - local_irq_save(flags);
9540 + local_irq_save_nort(flags);
9541 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
9542 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
9543 INIT_LIST_HEAD(&list);
9544 @@ -419,7 +419,7 @@ void raid5_release_stripe(struct stripe_head *sh)
9545 spin_unlock(&conf->device_lock);
9546 release_inactive_stripe_list(conf, &list, hash);
9548 - local_irq_restore(flags);
9549 + local_irq_restore_nort(flags);
9552 static inline void remove_hash(struct stripe_head *sh)
9553 @@ -2067,8 +2067,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9554 struct raid5_percpu *percpu;
9558 + cpu = get_cpu_light();
9559 percpu = per_cpu_ptr(conf->percpu, cpu);
9560 + spin_lock(&percpu->lock);
9561 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
9562 ops_run_biofill(sh);
9564 @@ -2127,7 +2128,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9565 if (test_and_clear_bit(R5_Overlap, &dev->flags))
9566 wake_up(&sh->raid_conf->wait_for_overlap);
9569 + spin_unlock(&percpu->lock);
9573 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
9574 @@ -6781,6 +6783,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
9578 + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
9582 @@ -6791,7 +6794,6 @@ static int raid5_alloc_percpu(struct r5conf *conf)
9583 conf->percpu = alloc_percpu(struct raid5_percpu);
9587 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
9589 conf->scribble_disks = max(conf->raid_disks,
9590 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
9591 index 2e6123825095..37a6021418a2 100644
9592 --- a/drivers/md/raid5.h
9593 +++ b/drivers/md/raid5.h
9594 @@ -624,6 +624,7 @@ struct r5conf {
9595 int recovery_disabled;
9596 /* per cpu variables */
9597 struct raid5_percpu {
9598 + spinlock_t lock; /* Protection for -RT */
9599 struct page *spare_page; /* Used when checking P/Q in raid6 */
9600 struct flex_array *scribble; /* space for constructing buffer
9601 * lists and performing address
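
The raid5 change pairs two RT idioms: get_cpu_light(), which in the RT tree
pins the task via migrate_disable() instead of disabling preemption, and a
per-CPU spinlock that supplies the real mutual exclusion for the per-CPU
scratch data. A sketch with illustrative types (percpu_example stands in for
struct raid5_percpu; get_cpu_light()/put_cpu_light() are RT-patch helpers):

	struct percpu_example {
		spinlock_t lock;	/* protection for -RT */
		void *scribble;
	};

	static void run_ops_example(struct percpu_example __percpu *pcpu)
	{
		/* migration off, preemption still on under RT */
		int cpu = get_cpu_light();
		struct percpu_example *p = per_cpu_ptr(pcpu, cpu);

		/* may sleep on RT; safe because we only pinned the CPU */
		spin_lock(&p->lock);
		/* ... use p->scribble ... */
		spin_unlock(&p->lock);
		put_cpu_light();
	}
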
9602 diff --git a/drivers/mfd/atmel-smc.c b/drivers/mfd/atmel-smc.c
9603 index 7d77948567d7..0adbd2e796fe 100644
9604 --- a/drivers/mfd/atmel-smc.c
9605 +++ b/drivers/mfd/atmel-smc.c
9609 #include <linux/mfd/syscon/atmel-smc.h>
9610 +#include <linux/string.h>
9613 * atmel_smc_cs_conf_init - initialize a SMC CS conf
9614 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
9615 index 8136dc7e863d..86e83b9629d7 100644
9616 --- a/drivers/misc/Kconfig
9617 +++ b/drivers/misc/Kconfig
9618 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
9620 bool "Atmel AT32/AT91 Timer/Counter Library"
9621 depends on (AVR32 || ARCH_AT91)
9622 + default y if PREEMPT_RT_FULL
9624 Select this if you want a library to allocate the Timer/Counter
9625 blocks found on many Atmel processors. This facilitates using
9626 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
9627 are combined to make a single 32-bit timer.
9629 When GENERIC_CLOCKEVENTS is defined, the third timer channel
9630 - may be used as a clock event device supporting oneshot mode
9631 - (delays of up to two seconds) based on the 32 KiHz clock.
9632 + may be used as a clock event device supporting oneshot mode.
9634 config ATMEL_TCB_CLKSRC_BLOCK
9636 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
9637 TC can be used for other purposes, such as PWM generation and
9640 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
9641 + bool "TC Block uses 32 kHz clock"
9642 + depends on ATMEL_TCB_CLKSRC
9643 + default y if !PREEMPT_RT_FULL
9645 + Select this to use the 32 kHz base clock rate as the TC block clock
9646 + source for clock events.
9650 tristate "Dummy IRQ handler"
9652 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
9653 index f1f54a818489..ce102378df02 100644
9654 --- a/drivers/mmc/host/mmci.c
9655 +++ b/drivers/mmc/host/mmci.c
9656 @@ -1200,15 +1200,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
9657 struct sg_mapping_iter *sg_miter = &host->sg_miter;
9658 struct variant_data *variant = host->variant;
9659 void __iomem *base = host->base;
9660 - unsigned long flags;
9663 status = readl(base + MMCISTATUS);
9665 dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
9667 - local_irq_save(flags);
9670 unsigned int remain, len;
9672 @@ -1248,8 +1245,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
9674 sg_miter_stop(sg_miter);
9676 - local_irq_restore(flags);
9679 * If we have less than the fifo 'half-full' threshold to transfer,
9680 * trigger a PIO interrupt as soon as any data is available.
9681 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
9682 index 402d9090ad29..9bc02563b853 100644
9683 --- a/drivers/net/ethernet/3com/3c59x.c
9684 +++ b/drivers/net/ethernet/3com/3c59x.c
9685 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
9687 struct vortex_private *vp = netdev_priv(dev);
9688 unsigned long flags;
9689 - local_irq_save(flags);
9690 + local_irq_save_nort(flags);
9691 (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
9692 - local_irq_restore(flags);
9693 + local_irq_restore_nort(flags);
9697 @@ -1908,12 +1908,12 @@ static void vortex_tx_timeout(struct net_device *dev)
9698 * Block interrupts because vortex_interrupt does a bare spin_lock()
9700 unsigned long flags;
9701 - local_irq_save(flags);
9702 + local_irq_save_nort(flags);
9703 if (vp->full_bus_master_tx)
9704 boomerang_interrupt(dev->irq, dev);
9706 vortex_interrupt(dev->irq, dev);
9707 - local_irq_restore(flags);
9708 + local_irq_restore_nort(flags);
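
local_irq_save_nort()/local_irq_restore_nort() seen here are RT-patch macros:
on !RT they are the ordinary primitives, while on RT they intentionally leave
interrupts enabled (the handler being invoked runs as a thread there anyway).
Presumably defined roughly as:

	#ifdef CONFIG_PREEMPT_RT_FULL
	# define local_irq_save_nort(flags)	local_save_flags(flags)
	# define local_irq_restore_nort(flags)	(void)(flags)
	#else
	# define local_irq_save_nort(flags)	local_irq_save(flags)
	# define local_irq_restore_nort(flags)	local_irq_restore(flags)
	#endif
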
9712 diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
9713 index 00e6f1d155a6..9c69ab2c5b07 100644
9714 --- a/drivers/net/ethernet/marvell/mvpp2.c
9715 +++ b/drivers/net/ethernet/marvell/mvpp2.c
9716 @@ -831,9 +831,8 @@ struct mvpp2_pcpu_stats {
9717 /* Per-CPU port control */
9718 struct mvpp2_port_pcpu {
9719 struct hrtimer tx_done_timer;
9720 + struct net_device *dev;
9721 bool timer_scheduled;
9722 - /* Tasklet for egress finalization */
9723 - struct tasklet_struct tx_done_tasklet;
9726 struct mvpp2_queue_vector {
9727 @@ -5955,46 +5954,34 @@ static void mvpp2_link_event(struct net_device *dev)
9731 -static void mvpp2_timer_set(struct mvpp2_port_pcpu *port_pcpu)
9735 - if (!port_pcpu->timer_scheduled) {
9736 - port_pcpu->timer_scheduled = true;
9737 - interval = MVPP2_TXDONE_HRTIMER_PERIOD_NS;
9738 - hrtimer_start(&port_pcpu->tx_done_timer, interval,
9739 - HRTIMER_MODE_REL_PINNED);
9743 -static void mvpp2_tx_proc_cb(unsigned long data)
9744 +static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9746 - struct net_device *dev = (struct net_device *)data;
9747 - struct mvpp2_port *port = netdev_priv(dev);
9748 - struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9749 + struct net_device *dev;
9750 + struct mvpp2_port *port;
9751 + struct mvpp2_port_pcpu *port_pcpu;
9752 unsigned int tx_todo, cause;
9754 + port_pcpu = container_of(timer, struct mvpp2_port_pcpu, tx_done_timer);
9755 + dev = port_pcpu->dev;
9757 if (!netif_running(dev))
9759 + return HRTIMER_NORESTART;
9761 port_pcpu->timer_scheduled = false;
9762 + port = netdev_priv(dev);
9764 /* Process all the Tx queues */
9765 cause = (1 << port->ntxqs) - 1;
9766 tx_todo = mvpp2_tx_done(port, cause, smp_processor_id());
9768 /* Set the timer in case not all the packets were processed */
9770 - mvpp2_timer_set(port_pcpu);
9773 -static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9775 - struct mvpp2_port_pcpu *port_pcpu = container_of(timer,
9776 - struct mvpp2_port_pcpu,
9779 - tasklet_schedule(&port_pcpu->tx_done_tasklet);
9780 + if (tx_todo && !port_pcpu->timer_scheduled) {
9781 + port_pcpu->timer_scheduled = true;
9782 + hrtimer_forward_now(&port_pcpu->tx_done_timer,
9783 + MVPP2_TXDONE_HRTIMER_PERIOD_NS);
9785 + return HRTIMER_RESTART;
9787 return HRTIMER_NORESTART;
9790 @@ -6484,7 +6471,12 @@ static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev)
9791 txq_pcpu->count > 0) {
9792 struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9794 - mvpp2_timer_set(port_pcpu);
9795 + if (!port_pcpu->timer_scheduled) {
9796 + port_pcpu->timer_scheduled = true;
9797 + hrtimer_start(&port_pcpu->tx_done_timer,
9798 + MVPP2_TXDONE_HRTIMER_PERIOD_NS,
9799 + HRTIMER_MODE_REL_PINNED_SOFT);
9803 return NETDEV_TX_OK;
9804 @@ -6875,7 +6867,6 @@ static int mvpp2_stop(struct net_device *dev)
9806 hrtimer_cancel(&port_pcpu->tx_done_timer);
9807 port_pcpu->timer_scheduled = false;
9808 - tasklet_kill(&port_pcpu->tx_done_tasklet);
9811 mvpp2_cleanup_rxqs(port);
9812 @@ -7648,13 +7639,10 @@ static int mvpp2_port_probe(struct platform_device *pdev,
9813 port_pcpu = per_cpu_ptr(port->pcpu, cpu);
9815 hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
9816 - HRTIMER_MODE_REL_PINNED);
9817 + HRTIMER_MODE_REL_PINNED_SOFT);
9818 port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
9819 port_pcpu->timer_scheduled = false;
9821 - tasklet_init(&port_pcpu->tx_done_tasklet,
9823 - (unsigned long)dev);
9824 + port_pcpu->dev = dev;
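
The mvpp2 conversion above is the standard tasklet-to-hrtimer migration: a
timer armed in a _SOFT mode expires in softirq context, so the callback can
absorb the tasklet's work and re-arm itself by returning HRTIMER_RESTART. A
condensed sketch (port_pcpu_example and tx_work_pending() are illustrative):

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	#define PERIOD_NS	1000000UL

	struct port_pcpu_example {
		struct hrtimer timer;
		bool scheduled;
	};

	static bool tx_work_pending(struct port_pcpu_example *pp);

	static enum hrtimer_restart tx_done_cb(struct hrtimer *t)
	{
		struct port_pcpu_example *pp =
			container_of(t, struct port_pcpu_example, timer);

		pp->scheduled = false;
		if (!tx_work_pending(pp))
			return HRTIMER_NORESTART;

		/* still work left: re-arm instead of scheduling a tasklet */
		pp->scheduled = true;
		hrtimer_forward_now(t, ns_to_ktime(PERIOD_NS));
		return HRTIMER_RESTART;
	}

	static void init_tx_timer(struct port_pcpu_example *pp)
	{
		hrtimer_init(&pp->timer, CLOCK_MONOTONIC,
			     HRTIMER_MODE_REL_PINNED_SOFT);
		pp->timer.function = tx_done_cb;
	}
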
9828 diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9829 index 56f6e3b71f48..a50350d01a80 100644
9830 --- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9831 +++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9832 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
9833 while (!ctx->done.done && msecs--)
9836 - wait_event_interruptible(ctx->done.wait,
9837 + swait_event_interruptible(ctx->done.wait,
9841 diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
9842 index aafa7aa18fbd..388f6d71ba71 100644
9843 --- a/drivers/net/wireless/mac80211_hwsim.c
9844 +++ b/drivers/net/wireless/mac80211_hwsim.c
9845 @@ -537,7 +537,7 @@ struct mac80211_hwsim_data {
9846 unsigned int rx_filter;
9847 bool started, idle, scanning;
9849 - struct tasklet_hrtimer beacon_timer;
9850 + struct hrtimer beacon_timer;
9852 PS_DISABLED, PS_ENABLED, PS_AUTO_POLL, PS_MANUAL_POLL
9854 @@ -1423,7 +1423,7 @@ static void mac80211_hwsim_stop(struct ieee80211_hw *hw)
9856 struct mac80211_hwsim_data *data = hw->priv;
9857 data->started = false;
9858 - tasklet_hrtimer_cancel(&data->beacon_timer);
9859 + hrtimer_cancel(&data->beacon_timer);
9860 wiphy_debug(hw->wiphy, "%s\n", __func__);
9863 @@ -1546,14 +1546,12 @@ static enum hrtimer_restart
9864 mac80211_hwsim_beacon(struct hrtimer *timer)
9866 struct mac80211_hwsim_data *data =
9867 - container_of(timer, struct mac80211_hwsim_data,
9868 - beacon_timer.timer);
9869 + container_of(timer, struct mac80211_hwsim_data, beacon_timer);
9870 struct ieee80211_hw *hw = data->hw;
9871 u64 bcn_int = data->beacon_int;
9876 + return HRTIMER_NORESTART;
9878 ieee80211_iterate_active_interfaces_atomic(
9879 hw, IEEE80211_IFACE_ITER_NORMAL,
9880 @@ -1565,11 +1563,9 @@ mac80211_hwsim_beacon(struct hrtimer *timer)
9881 data->bcn_delta = 0;
9884 - next_bcn = ktime_add(hrtimer_get_expires(timer),
9885 - ns_to_ktime(bcn_int * 1000));
9886 - tasklet_hrtimer_start(&data->beacon_timer, next_bcn, HRTIMER_MODE_ABS);
9888 - return HRTIMER_NORESTART;
9889 + hrtimer_forward(&data->beacon_timer, hrtimer_get_expires(timer),
9890 + ns_to_ktime(bcn_int * NSEC_PER_USEC));
9891 + return HRTIMER_RESTART;
9894 static const char * const hwsim_chanwidths[] = {
9895 @@ -1643,15 +1639,15 @@ static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed)
9896 mutex_unlock(&data->mutex);
9898 if (!data->started || !data->beacon_int)
9899 - tasklet_hrtimer_cancel(&data->beacon_timer);
9900 - else if (!hrtimer_is_queued(&data->beacon_timer.timer)) {
9901 + hrtimer_cancel(&data->beacon_timer);
9902 + else if (!hrtimer_is_queued(&data->beacon_timer)) {
9903 u64 tsf = mac80211_hwsim_get_tsf(hw, NULL);
9904 u32 bcn_int = data->beacon_int;
9905 u64 until_tbtt = bcn_int - do_div(tsf, bcn_int);
9907 - tasklet_hrtimer_start(&data->beacon_timer,
9908 - ns_to_ktime(until_tbtt * 1000),
9909 - HRTIMER_MODE_REL);
9910 + hrtimer_start(&data->beacon_timer,
9911 + ns_to_ktime(until_tbtt * 1000),
9912 + HRTIMER_MODE_REL_SOFT);
9916 @@ -1714,7 +1710,7 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
9917 info->enable_beacon, info->beacon_int);
9918 vp->bcn_en = info->enable_beacon;
9919 if (data->started &&
9920 - !hrtimer_is_queued(&data->beacon_timer.timer) &&
9921 + !hrtimer_is_queued(&data->beacon_timer) &&
9922 info->enable_beacon) {
9923 u64 tsf, until_tbtt;
9925 @@ -1722,9 +1718,9 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
9926 tsf = mac80211_hwsim_get_tsf(hw, vif);
9927 bcn_int = data->beacon_int;
9928 until_tbtt = bcn_int - do_div(tsf, bcn_int);
9929 - tasklet_hrtimer_start(&data->beacon_timer,
9930 - ns_to_ktime(until_tbtt * 1000),
9931 - HRTIMER_MODE_REL);
9932 + hrtimer_start(&data->beacon_timer,
9933 + ns_to_ktime(until_tbtt * 1000),
9934 + HRTIMER_MODE_REL_SOFT);
9935 } else if (!info->enable_beacon) {
9936 unsigned int count = 0;
9937 ieee80211_iterate_active_interfaces_atomic(
9938 @@ -1733,7 +1729,7 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
9939 wiphy_debug(hw->wiphy, " beaconing vifs remaining: %u",
9942 - tasklet_hrtimer_cancel(&data->beacon_timer);
9943 + hrtimer_cancel(&data->beacon_timer);
9944 data->beacon_int = 0;
9947 @@ -2722,9 +2718,9 @@ static int mac80211_hwsim_new_radio(struct genl_info *info,
9949 data, &hwsim_simulate_radar);
9951 - tasklet_hrtimer_init(&data->beacon_timer,
9952 - mac80211_hwsim_beacon,
9953 - CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
9954 + hrtimer_init(&data->beacon_timer, CLOCK_MONOTONIC,
9955 + HRTIMER_MODE_ABS_SOFT);
9956 + data->beacon_timer.function = mac80211_hwsim_beacon;
9958 spin_lock_bh(&hwsim_radio_lock);
9959 list_add_tail(&data->list, &hwsim_radios);
9960 diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
9961 index 620f5b995a12..7fd1548a2905 100644
9962 --- a/drivers/pci/switch/switchtec.c
9963 +++ b/drivers/pci/switch/switchtec.c
9964 @@ -308,10 +308,11 @@ struct switchtec_user {
9966 enum mrpc_state state;
9968 - struct completion comp;
9969 + wait_queue_head_t cmd_comp;
9971 struct list_head list;
9977 @@ -333,7 +334,7 @@ static struct switchtec_user *stuser_create(struct switchtec_dev *stdev)
9978 stuser->stdev = stdev;
9979 kref_init(&stuser->kref);
9980 INIT_LIST_HEAD(&stuser->list);
9981 - init_completion(&stuser->comp);
9982 + init_waitqueue_head(&stuser->cmd_comp);
9983 stuser->event_cnt = atomic_read(&stdev->event_cnt);
9985 dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
9986 @@ -416,7 +417,7 @@ static int mrpc_queue_cmd(struct switchtec_user *stuser)
9987 kref_get(&stuser->kref);
9988 stuser->read_len = sizeof(stuser->data);
9989 stuser_set_state(stuser, MRPC_QUEUED);
9990 - init_completion(&stuser->comp);
9991 + stuser->cmd_done = false;
9992 list_add_tail(&stuser->list, &stdev->mrpc_queue);
9994 mrpc_cmd_submit(stdev);
9995 @@ -453,7 +454,8 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev)
9999 - complete_all(&stuser->comp);
10000 + stuser->cmd_done = true;
10001 + wake_up_interruptible(&stuser->cmd_comp);
10002 list_del_init(&stuser->list);
10003 stuser_put(stuser);
10004 stdev->mrpc_busy = 0;
10005 @@ -723,10 +725,11 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
10006 mutex_unlock(&stdev->mrpc_mutex);
10008 if (filp->f_flags & O_NONBLOCK) {
10009 - if (!try_wait_for_completion(&stuser->comp))
10010 + if (!READ_ONCE(stuser->cmd_done))
10013 - rc = wait_for_completion_interruptible(&stuser->comp);
10014 + rc = wait_event_interruptible(stuser->cmd_comp,
10015 + stuser->cmd_done);
10019 @@ -774,7 +777,7 @@ static unsigned int switchtec_dev_poll(struct file *filp, poll_table *wait)
10020 struct switchtec_dev *stdev = stuser->stdev;
10023 - poll_wait(filp, &stuser->comp.wait, wait);
10024 + poll_wait(filp, &stuser->cmd_comp, wait);
10025 poll_wait(filp, &stdev->event_wq, wait);
10027 if (lock_mutex_and_test_alive(stdev))
10028 @@ -782,7 +785,7 @@ static unsigned int switchtec_dev_poll(struct file *filp, poll_table *wait)
10030 mutex_unlock(&stdev->mrpc_mutex);
10032 - if (try_wait_for_completion(&stuser->comp))
10033 + if (READ_ONCE(stuser->cmd_done))
10034 ret |= POLLIN | POLLRDNORM;
10036 if (stuser->event_cnt != atomic_read(&stdev->event_cnt))
10037 @@ -1259,7 +1262,8 @@ static void stdev_kill(struct switchtec_dev *stdev)
10039 /* Wake up and kill any users waiting on an MRPC request */
10040 list_for_each_entry_safe(stuser, tmpuser, &stdev->mrpc_queue, list) {
10041 - complete_all(&stuser->comp);
10042 + stuser->cmd_done = true;
10043 + wake_up_interruptible(&stuser->cmd_comp);
10044 list_del_init(&stuser->list);
10045 stuser_put(stuser);
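
The switchtec rework replaces a struct completion with an explicit done flag
plus a regular waitqueue, because poll_wait() needs a wait_queue_head_t and,
with RT's swait-based completions, the completion's internal wait head no
longer qualifies. The resulting pattern, sketched with illustrative names:

	#include <linux/wait.h>

	struct user_ctx_example {
		wait_queue_head_t cmd_comp;
		bool cmd_done;
	};

	static void ctx_init(struct user_ctx_example *u)
	{
		init_waitqueue_head(&u->cmd_comp);
		u->cmd_done = false;
	}

	static void ctx_complete(struct user_ctx_example *u)
	{
		u->cmd_done = true;	/* publish before waking */
		wake_up_interruptible(&u->cmd_comp);
	}

	static int ctx_wait(struct user_ctx_example *u)
	{
		/* poll() can use &u->cmd_comp directly via poll_wait() */
		return wait_event_interruptible(u->cmd_comp,
						READ_ONCE(u->cmd_done));
	}
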
10047 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
10048 index 85f9a3eba387..08ea05ddcd82 100644
10049 --- a/drivers/scsi/fcoe/fcoe.c
10050 +++ b/drivers/scsi/fcoe/fcoe.c
10051 @@ -1464,11 +1464,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev,
10052 static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
10054 struct fcoe_percpu_s *fps;
10056 + int rc, cpu = get_cpu_light();
10058 - fps = &get_cpu_var(fcoe_percpu);
10059 + fps = &per_cpu(fcoe_percpu, cpu);
10060 rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
10061 - put_cpu_var(fcoe_percpu);
10066 @@ -1655,11 +1655,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
10070 - stats = per_cpu_ptr(lport->stats, get_cpu());
10071 + stats = per_cpu_ptr(lport->stats, get_cpu_light());
10072 stats->InvalidCRCCount++;
10073 if (stats->InvalidCRCCount < 5)
10074 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
10080 @@ -1702,7 +1702,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10082 hp = (struct fcoe_hdr *) skb_network_header(skb);
10084 - stats = per_cpu_ptr(lport->stats, get_cpu());
10085 + stats = per_cpu_ptr(lport->stats, get_cpu_light());
10086 if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
10087 if (stats->ErrorFrames < 5)
10088 printk(KERN_WARNING "fcoe: FCoE version "
10089 @@ -1734,13 +1734,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10092 if (!fcoe_filter_frames(lport, fp)) {
10095 fc_exch_recv(lport, fp);
10099 stats->ErrorFrames++;
10105 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
10106 index 03019e07abb9..9ec11316bfe6 100644
10107 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
10108 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
10109 @@ -835,7 +835,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10111 INIT_LIST_HEAD(&del_list);
10113 - stats = per_cpu_ptr(fip->lp->stats, get_cpu());
10114 + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
10116 list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
10117 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
10118 @@ -871,7 +871,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10119 sel_time = fcf->time;
10125 list_for_each_entry_safe(fcf, next, &del_list, list) {
10126 /* Removes fcf from current list */
10127 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
10128 index 42bcf7f3a0f9..2ce045d6860c 100644
10129 --- a/drivers/scsi/libfc/fc_exch.c
10130 +++ b/drivers/scsi/libfc/fc_exch.c
10131 @@ -833,10 +833,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
10133 memset(ep, 0, sizeof(*ep));
10136 + cpu = get_cpu_light();
10137 pool = per_cpu_ptr(mp->pool, cpu);
10138 spin_lock_bh(&pool->lock);
10142 /* peek cache of free slot */
10143 if (pool->left != FC_XID_UNKNOWN) {
10144 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
10145 index 70be4425ae0b..a23ef685deac 100644
10146 --- a/drivers/scsi/libsas/sas_ata.c
10147 +++ b/drivers/scsi/libsas/sas_ata.c
10148 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10149 /* TODO: audit callers to ensure they are ready for qc_issue to
10150 * unconditionally re-enable interrupts
10152 - local_irq_save(flags);
10153 + local_irq_save_nort(flags);
10154 spin_unlock(ap->lock);
10156 /* If the device fell off, no sense in issuing commands */
10157 @@ -252,7 +252,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10160 spin_lock(ap->lock);
10161 - local_irq_restore(flags);
10162 + local_irq_restore_nort(flags);
10166 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
10167 index 3f5a0f0f8b62..c75783143dc1 100644
10168 --- a/drivers/scsi/qla2xxx/qla_inline.h
10169 +++ b/drivers/scsi/qla2xxx/qla_inline.h
10170 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
10172 unsigned long flags;
10173 struct qla_hw_data *ha = rsp->hw;
10174 - local_irq_save(flags);
10175 + local_irq_save_nort(flags);
10176 if (IS_P3P_TYPE(ha))
10177 qla82xx_poll(0, rsp);
10179 ha->isp_ops->intr_handler(0, rsp);
10180 - local_irq_restore(flags);
10181 + local_irq_restore_nort(flags);
10184 static inline uint8_t *
10185 diff --git a/drivers/staging/greybus/audio_manager.c b/drivers/staging/greybus/audio_manager.c
10186 index aa6508b44fab..045696ce85c7 100644
10187 --- a/drivers/staging/greybus/audio_manager.c
10188 +++ b/drivers/staging/greybus/audio_manager.c
10190 #include <linux/sysfs.h>
10191 #include <linux/module.h>
10192 #include <linux/init.h>
10193 -#include <linux/rwlock.h>
10194 +#include <linux/spinlock.h>
10195 #include <linux/idr.h>
10197 #include "audio_manager.h"
10198 diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c
10199 index 9c7bc1ca341a..3d35dad1de2c 100644
10200 --- a/drivers/target/target_core_tmr.c
10201 +++ b/drivers/target/target_core_tmr.c
10202 @@ -114,8 +114,6 @@ static bool __target_check_io_state(struct se_cmd *se_cmd,
10204 struct se_session *sess = se_cmd->se_sess;
10206 - assert_spin_locked(&sess->sess_cmd_lock);
10207 - WARN_ON_ONCE(!irqs_disabled());
10209 * If command already reached CMD_T_COMPLETE state within
10210 * target_complete_cmd() or CMD_T_FABRIC_STOP due to shutdown,
10211 diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
10212 index 0d0be7d8b9d6..f652e58e2988 100644
10213 --- a/drivers/target/target_core_transport.c
10214 +++ b/drivers/target/target_core_transport.c
10215 @@ -2967,9 +2967,6 @@ __transport_wait_for_tasks(struct se_cmd *cmd, bool fabric_stop,
10216 __acquires(&cmd->t_state_lock)
10219 - assert_spin_locked(&cmd->t_state_lock);
10220 - WARN_ON_ONCE(!irqs_disabled());
10223 cmd->transport_state |= CMD_T_FABRIC_STOP;
10225 @@ -3239,9 +3236,6 @@ static int __transport_check_aborted_status(struct se_cmd *cmd, int send_status)
10229 - assert_spin_locked(&cmd->t_state_lock);
10230 - WARN_ON_ONCE(!irqs_disabled());
10232 if (!(cmd->transport_state & CMD_T_ABORTED))
10235 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
10236 index d93eee2f101b..0287333b1f3c 100644
10237 --- a/drivers/thermal/x86_pkg_temp_thermal.c
10238 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
10240 #include <linux/pm.h>
10241 #include <linux/thermal.h>
10242 #include <linux/debugfs.h>
10243 +#include <linux/swork.h>
10244 #include <asm/cpu_device_id.h>
10245 #include <asm/mce.h>
10247 @@ -329,7 +330,7 @@ static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
10248 schedule_delayed_work_on(cpu, work, ms);
10251 -static int pkg_thermal_notify(u64 msr_val)
10252 +static void pkg_thermal_notify_work(struct swork_event *event)
10254 int cpu = smp_processor_id();
10255 struct pkg_device *pkgdev;
10256 @@ -348,9 +349,47 @@ static int pkg_thermal_notify(u64 msr_val)
10259 spin_unlock_irqrestore(&pkg_temp_lock, flags);
10262 +#ifdef CONFIG_PREEMPT_RT_FULL
10263 +static struct swork_event notify_work;
10265 +static int pkg_thermal_notify_work_init(void)
10269 + err = swork_get();
10273 + INIT_SWORK(&notify_work, pkg_thermal_notify_work);
10277 +static void pkg_thermal_notify_work_cleanup(void)
10282 +static int pkg_thermal_notify(u64 msr_val)
10284 + swork_queue(&notify_work);
10288 +#else /* !CONFIG_PREEMPT_RT_FULL */
10290 +static int pkg_thermal_notify_work_init(void) { return 0; }
10292 +static void pkg_thermal_notify_work_cleanup(void) { }
10294 +static int pkg_thermal_notify(u64 msr_val)
10296 + pkg_thermal_notify_work(NULL);
10299 +#endif /* CONFIG_PREEMPT_RT_FULL */
10301 static int pkg_temp_thermal_device_add(unsigned int cpu)
10303 int pkgid = topology_logical_package_id(cpu);
10304 @@ -515,10 +554,15 @@ static int __init pkg_temp_thermal_init(void)
10305 if (!x86_match_cpu(pkg_temp_thermal_ids))
10308 + if (!pkg_thermal_notify_work_init())
10311 max_packages = topology_max_packages();
10312 packages = kzalloc(max_packages * sizeof(struct pkg_device *), GFP_KERNEL);
10320 ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
10321 pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
10322 @@ -536,6 +580,7 @@ static int __init pkg_temp_thermal_init(void)
10326 + pkg_thermal_notify_work_cleanup();
10330 @@ -549,6 +594,7 @@ static void __exit pkg_temp_thermal_exit(void)
10331 cpuhp_remove_state(pkg_thermal_hp_state);
10332 debugfs_remove_recursive(debugfs);
10334 + pkg_thermal_notify_work_cleanup();
10336 module_exit(pkg_temp_thermal_exit)
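
The thermal notifier runs from the thermal interrupt, where RT cannot take the
sleeping pkg_temp_lock, so the work is bounced through swork, the RT patch
set's "simple work" facility. Its usage, mirroring the calls in the hunk above
(the API itself is RT-patch specific; swork_put() is assumed as the
counterpart to swork_get()):

	#include <linux/swork.h>

	static struct swork_event ev;

	static void ev_fn(struct swork_event *sev)
	{
		/* runs in a dedicated kernel thread: sleeping locks are OK */
	}

	static int __init ev_setup(void)
	{
		int err = swork_get();	/* take a reference on the worker */

		if (err)
			return err;
		INIT_SWORK(&ev, ev_fn);
		return 0;
	}

	static void notify_from_irq(void)
	{
		swork_queue(&ev);	/* safe from hard interrupt context */
	}

	static void __exit ev_cleanup(void)
	{
		swork_put();		/* drop the worker reference */
	}
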
10338 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
10339 index d29b512a7d9f..bc8cbb995b29 100644
10340 --- a/drivers/tty/serial/8250/8250_core.c
10341 +++ b/drivers/tty/serial/8250/8250_core.c
10342 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
10344 static unsigned int skip_txen_test; /* force skip of txen test at init time */
10346 -#define PASS_LIMIT 512
9348 + * On -rt we can have more delays, and legitimately
10349 + * so - so don't drop work spuriously and spam the
10352 +#ifdef CONFIG_PREEMPT_RT_FULL
10353 +# define PASS_LIMIT 1000000
10355 +# define PASS_LIMIT 512
10358 #include <asm/serial.h>
10360 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
10361 index ecf3d631bc09..6e029f34f37f 100644
10362 --- a/drivers/tty/serial/8250/8250_port.c
10363 +++ b/drivers/tty/serial/8250/8250_port.c
10365 #include <linux/nmi.h>
10366 #include <linux/mutex.h>
10367 #include <linux/slab.h>
10368 +#include <linux/kdb.h>
10369 #include <linux/uaccess.h>
10370 #include <linux/pm_runtime.h>
10371 #include <linux/ktime.h>
10372 @@ -3224,9 +3225,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
10374 serial8250_rpm_get(up);
10377 + if (port->sysrq || oops_in_progress)
10379 - else if (oops_in_progress)
10380 + else if (in_kdb_printk())
10381 locked = spin_trylock_irqsave(&port->lock, flags);
10383 spin_lock_irqsave(&port->lock, flags);
10384 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
10385 index c9f701aca677..81d6b15fb80a 100644
10386 --- a/drivers/tty/serial/amba-pl011.c
10387 +++ b/drivers/tty/serial/amba-pl011.c
10388 @@ -2236,13 +2236,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
10390 clk_enable(uap->clk);
10392 - local_irq_save(flags);
10394 + * local_irq_save(flags);
10396 + * This local_irq_save() is nonsense. If we come in via sysrq
10397 + * handling then interrupts are already disabled. Aside from
10398 + * that, the port.sysrq check is racy on SMP regardless.
10400 if (uap->port.sysrq)
10402 else if (oops_in_progress)
10403 - locked = spin_trylock(&uap->port.lock);
10404 + locked = spin_trylock_irqsave(&uap->port.lock, flags);
10406 - spin_lock(&uap->port.lock);
10407 + spin_lock_irqsave(&uap->port.lock, flags);
10410 * First save the CR then disable the interrupts
10411 @@ -2268,8 +2274,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
10412 pl011_write(old_cr, uap, REG_CR);
10415 - spin_unlock(&uap->port.lock);
10416 - local_irq_restore(flags);
10417 + spin_unlock_irqrestore(&uap->port.lock, flags);
10419 clk_disable(uap->clk);
10421 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
10422 index 26a22b100df1..69117e355bcd 100644
10423 --- a/drivers/tty/serial/omap-serial.c
10424 +++ b/drivers/tty/serial/omap-serial.c
10425 @@ -1311,13 +1311,10 @@ serial_omap_console_write(struct console *co, const char *s,
10427 pm_runtime_get_sync(up->dev);
10429 - local_irq_save(flags);
10430 - if (up->port.sysrq)
10432 - else if (oops_in_progress)
10433 - locked = spin_trylock(&up->port.lock);
10434 + if (up->port.sysrq || oops_in_progress)
10435 + locked = spin_trylock_irqsave(&up->port.lock, flags);
10437 - spin_lock(&up->port.lock);
10438 + spin_lock_irqsave(&up->port.lock, flags);
10441 * First save the IER then disable the interrupts
10442 @@ -1346,8 +1343,7 @@ serial_omap_console_write(struct console *co, const char *s,
10443 pm_runtime_mark_last_busy(up->dev);
10444 pm_runtime_put_autosuspend(up->dev);
10446 - spin_unlock(&up->port.lock);
10447 - local_irq_restore(flags);
10448 + spin_unlock_irqrestore(&up->port.lock, flags);
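
The pl011 and omap console hunks converge on one locking pattern for console
writes: take port->lock with spin_lock_irqsave() directly, and fall back to a
trylock only when the lock may already be held by this CPU (sysrq handling,
oops in progress). In outline:

	#include <linux/serial_core.h>

	static void console_write_example(struct uart_port *port,
					  const char *s, unsigned int n)
	{
		unsigned long flags;
		int locked = 1;

		if (port->sysrq || oops_in_progress)
			/* may already hold the lock: don't deadlock */
			locked = spin_trylock_irqsave(&port->lock, flags);
		else
			spin_lock_irqsave(&port->lock, flags);

		/* ... emit the characters ... */

		if (locked)
			spin_unlock_irqrestore(&port->lock, flags);
	}
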
10452 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
10453 index d0b2e0ed9bab..91f4f2bd55b0 100644
10454 --- a/drivers/usb/core/hcd.c
10455 +++ b/drivers/usb/core/hcd.c
10456 @@ -1775,9 +1775,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
10457 * and no one may trigger the above deadlock situation when
10458 * running complete() in tasklet.
10460 - local_irq_save(flags);
10461 + local_irq_save_nort(flags);
10462 urb->complete(urb);
10463 - local_irq_restore(flags);
10464 + local_irq_restore_nort(flags);
10466 usb_anchor_resume_wakeups(anchor);
10467 atomic_dec(&urb->use_count);
10468 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
10469 index 17467545391b..42ec6f2db6a9 100644
10470 --- a/drivers/usb/gadget/function/f_fs.c
10471 +++ b/drivers/usb/gadget/function/f_fs.c
10472 @@ -1623,7 +1623,7 @@ static void ffs_data_put(struct ffs_data *ffs)
10473 pr_info("%s(): freeing\n", __func__);
10474 ffs_data_clear(ffs);
10475 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
10476 - waitqueue_active(&ffs->ep0req_completion.wait) ||
10477 + swait_active(&ffs->ep0req_completion.wait) ||
10478 waitqueue_active(&ffs->wait));
10479 destroy_workqueue(ffs->io_completion_wq);
10480 kfree(ffs->dev_name);
10481 diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c
10482 index 45b334ceaf2e..5f24e6d3b6eb 100644
10483 --- a/drivers/usb/gadget/function/f_ncm.c
10484 +++ b/drivers/usb/gadget/function/f_ncm.c
10485 @@ -77,9 +77,7 @@ struct f_ncm {
10486 struct sk_buff *skb_tx_ndp;
10487 u16 ndp_dgram_count;
10488 bool timer_force_tx;
10489 - struct tasklet_struct tx_tasklet;
10490 struct hrtimer task_timer;
10492 bool timer_stopping;
10495 @@ -1108,7 +1106,7 @@ static struct sk_buff *ncm_wrap_ntb(struct gether *port,
10497 /* Delay the timer. */
10498 hrtimer_start(&ncm->task_timer, TX_TIMEOUT_NSECS,
10499 - HRTIMER_MODE_REL);
10500 + HRTIMER_MODE_REL_SOFT);
10502 /* Add the datagram position entries */
10503 ntb_ndp = skb_put_zero(ncm->skb_tx_ndp, dgram_idx_len);
10504 @@ -1152,17 +1150,15 @@ static struct sk_buff *ncm_wrap_ntb(struct gether *port,
10508 - * This transmits the NTB if there are frames waiting.
10509 + * The transmit should only be run if no skb data has been sent
10510 + * for a certain duration.
10512 -static void ncm_tx_tasklet(unsigned long data)
10513 +static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10515 - struct f_ncm *ncm = (void *)data;
10517 - if (ncm->timer_stopping)
10519 + struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10521 /* Only send if data is available. */
10522 - if (ncm->skb_tx_data) {
10523 + if (!ncm->timer_stopping && ncm->skb_tx_data) {
10524 ncm->timer_force_tx = true;
10526 /* XXX This allowance of a NULL skb argument to ndo_start_xmit
10527 @@ -1175,16 +1171,6 @@ static void ncm_tx_tasklet(unsigned long data)
10529 ncm->timer_force_tx = false;
10534 - * The transmit should only be run if no skb data has been sent
10535 - * for a certain duration.
10537 -static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10539 - struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10540 - tasklet_schedule(&ncm->tx_tasklet);
10541 return HRTIMER_NORESTART;
10544 @@ -1517,8 +1503,7 @@ static int ncm_bind(struct usb_configuration *c, struct usb_function *f)
10545 ncm->port.open = ncm_open;
10546 ncm->port.close = ncm_close;
10548 - tasklet_init(&ncm->tx_tasklet, ncm_tx_tasklet, (unsigned long) ncm);
10549 - hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
10550 + hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
10551 ncm->task_timer.function = ncm_tx_timeout;
10553 DBG(cdev, "CDC Network: %s speed IN/%s OUT/%s NOTIFY/%s\n",
10554 @@ -1627,7 +1612,6 @@ static void ncm_unbind(struct usb_configuration *c, struct usb_function *f)
10555 DBG(c->cdev, "ncm unbind\n");
10557 hrtimer_cancel(&ncm->task_timer);
10558 - tasklet_kill(&ncm->tx_tasklet);
10560 ncm_string_defs[0].id = 0;
10561 usb_free_all_descriptors(f);
10562 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
10563 index 5c28bee327e1..ed49dba4704d 100644
10564 --- a/drivers/usb/gadget/legacy/inode.c
10565 +++ b/drivers/usb/gadget/legacy/inode.c
10566 @@ -347,7 +347,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
10567 spin_unlock_irq (&epdata->dev->lock);
10569 if (likely (value == 0)) {
10570 - value = wait_event_interruptible (done.wait, done.done);
10571 + value = swait_event_interruptible (done.wait, done.done);
10573 spin_lock_irq (&epdata->dev->lock);
10574 if (likely (epdata->ep != NULL)) {
10575 @@ -356,7 +356,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
10576 usb_ep_dequeue (epdata->ep, epdata->req);
10577 spin_unlock_irq (&epdata->dev->lock);
10579 - wait_event (done.wait, done.done);
10580 + swait_event (done.wait, done.done);
10581 if (epdata->status == -ECONNRESET)
10582 epdata->status = -EINTR;
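
The swait_event()/swait_active() substitutions in orinoco, f_fs and the gadget
inode code above all follow from one RT-patch change: struct completion is
rebuilt on a simple waitqueue, so its ->wait member becomes a
struct swait_queue_head. The shape below is an assumption about the RT tree,
not quoted from it:

	struct completion {
		unsigned int done;
		struct swait_queue_head wait;	/* was wait_queue_head_t */
	};

Any code that open-codes a wait on that member, as ep_io() does above, must
therefore use the swait_* primitives.
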
10584 diff --git a/fs/aio.c b/fs/aio.c
10585 index 3a749c3a92e3..24c6ceadaae6 100644
10589 #include <linux/ramfs.h>
10590 #include <linux/percpu-refcount.h>
10591 #include <linux/mount.h>
10592 +#include <linux/swork.h>
10594 #include <asm/kmap_types.h>
10595 #include <linux/uaccess.h>
10596 @@ -117,6 +118,7 @@ struct kioctx {
10598 struct rcu_head free_rcu;
10599 struct work_struct free_work; /* see free_ioctx() */
10600 + struct swork_event free_swork; /* see free_ioctx() */
10603 * signals when all in-flight requests are done
10604 @@ -259,6 +261,7 @@ static int __init aio_setup(void)
10605 .mount = aio_mount,
10606 .kill_sb = kill_anon_super,
10608 + BUG_ON(swork_get());
10609 aio_mnt = kern_mount(&aio_fs);
10610 if (IS_ERR(aio_mnt))
10611 panic("Failed to create aio fs mount.");
10612 @@ -633,9 +636,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
10613 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
10614 * now it's safe to cancel any that need to be.
10616 -static void free_ioctx_users(struct percpu_ref *ref)
10617 +static void free_ioctx_users_work(struct swork_event *sev)
10619 - struct kioctx *ctx = container_of(ref, struct kioctx, users);
10620 + struct kioctx *ctx = container_of(sev, struct kioctx, free_swork);
10621 struct aio_kiocb *req;
10623 spin_lock_irq(&ctx->ctx_lock);
10624 @@ -653,6 +656,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
10625 percpu_ref_put(&ctx->reqs);
10628 +static void free_ioctx_users(struct percpu_ref *ref)
10630 + struct kioctx *ctx = container_of(ref, struct kioctx, users);
10632 + INIT_SWORK(&ctx->free_swork, free_ioctx_users_work);
10633 + swork_queue(&ctx->free_swork);
10636 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
10638 unsigned i, new_nr;
10639 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
10640 index ce696d6c4641..b120fbd41483 100644
10641 --- a/fs/autofs4/autofs_i.h
10642 +++ b/fs/autofs4/autofs_i.h
10644 #include <linux/sched.h>
10645 #include <linux/mount.h>
10646 #include <linux/namei.h>
10647 +#include <linux/delay.h>
10648 #include <linux/uaccess.h>
10649 #include <linux/mutex.h>
10650 #include <linux/spinlock.h>
10651 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
10652 index 57725d4a8c59..62220508bace 100644
10653 --- a/fs/autofs4/expire.c
10654 +++ b/fs/autofs4/expire.c
10655 @@ -148,7 +148,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
10656 parent = p->d_parent;
10657 if (!spin_trylock(&parent->d_lock)) {
10658 spin_unlock(&p->d_lock);
10663 spin_unlock(&p->d_lock);
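
The elided lines in this trylock-retry loop (and the matching ones in
fs/dcache.c below) are, in the RT tree, a switch from cpu_relax() to
cpu_chill(); the <linux/delay.h> includes added above exist to pull in its
declaration. Spinning on a trylock can livelock on RT when the lock holder has
been preempted, so the chill helper sleeps instead, roughly along these lines
(a sketch, not the patch's literal body):

	#ifdef CONFIG_PREEMPT_RT_FULL
	void cpu_chill(void)
	{
		/* sleep ~1ms so the preempted lock holder can run */
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(msecs_to_jiffies(1));
	}
	#else
	# define cpu_chill()	cpu_relax()
	#endif
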
10664 diff --git a/fs/buffer.c b/fs/buffer.c
10665 index b96f3b98a6ef..4ca5f222537a 100644
10668 @@ -302,8 +302,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
10669 * decide that the page is now completely done.
10671 first = page_buffers(page);
10672 - local_irq_save(flags);
10673 - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10674 + flags = bh_uptodate_lock_irqsave(first);
10675 clear_buffer_async_read(bh);
10678 @@ -316,8 +315,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
10680 tmp = tmp->b_this_page;
10681 } while (tmp != bh);
10682 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10683 - local_irq_restore(flags);
10684 + bh_uptodate_unlock_irqrestore(first, flags);
10687 * If none of the buffers had errors and they are all
10688 @@ -329,9 +327,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
10692 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10693 - local_irq_restore(flags);
10695 + bh_uptodate_unlock_irqrestore(first, flags);
10699 @@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
10702 first = page_buffers(page);
10703 - local_irq_save(flags);
10704 - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10705 + flags = bh_uptodate_lock_irqsave(first);
10707 clear_buffer_async_write(bh);
10709 @@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
10711 tmp = tmp->b_this_page;
10713 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10714 - local_irq_restore(flags);
10715 + bh_uptodate_unlock_irqrestore(first, flags);
10716 end_page_writeback(page);
10720 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10721 - local_irq_restore(flags);
10723 + bh_uptodate_unlock_irqrestore(first, flags);
10725 EXPORT_SYMBOL(end_buffer_async_write);
10727 @@ -3417,6 +3409,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
10728 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
10730 INIT_LIST_HEAD(&ret->b_assoc_buffers);
10731 + buffer_head_init_locks(ret);
10733 __this_cpu_inc(bh_accounting.nr);
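
bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore() used in the buffer
and ext4 hunks are RT-patch helpers in buffer_head.h: on !RT they keep the old
irq-off bit-spinlock, on RT they take a real spinlock embedded in the
buffer_head, which is what buffer_head_init_locks() above initializes.
Approximately:

	static inline unsigned long
	bh_uptodate_lock_irqsave(struct buffer_head *bh)
	{
		unsigned long flags;

	#ifndef CONFIG_PREEMPT_RT_BASE
		local_irq_save(flags);
		bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
	#else
		spin_lock_irqsave(&bh->b_uptodate_lock, flags);
	#endif
		return flags;
	}

	static inline void
	bh_uptodate_unlock_irqrestore(struct buffer_head *bh,
				      unsigned long flags)
	{
	#ifndef CONFIG_PREEMPT_RT_BASE
		bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
		local_irq_restore(flags);
	#else
		spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
	#endif
	}
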
10735 diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
10736 index ef24b4527459..3ce6331a1101 100644
10737 --- a/fs/cifs/readdir.c
10738 +++ b/fs/cifs/readdir.c
10739 @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
10740 struct inode *inode;
10741 struct super_block *sb = parent->d_sb;
10742 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
10743 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10744 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10746 cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
10748 diff --git a/fs/dcache.c b/fs/dcache.c
10749 index 28b2e770bb69..b08506ef464a 100644
10753 #include <linux/mm.h>
10754 #include <linux/fs.h>
10755 #include <linux/fsnotify.h>
10756 +#include <linux/delay.h>
10757 #include <linux/slab.h>
10758 #include <linux/init.h>
10759 #include <linux/hash.h>
10760 @@ -808,6 +809,8 @@ static inline bool fast_dput(struct dentry *dentry)
10762 void dput(struct dentry *dentry)
10764 + struct dentry *parent;
10766 if (unlikely(!dentry))
10769 @@ -844,9 +847,18 @@ void dput(struct dentry *dentry)
10773 - dentry = dentry_kill(dentry);
10776 + parent = dentry_kill(dentry);
10780 + if (parent == dentry) {
10781 + /* the task with the highest priority won't schedule */
10782 + r = cond_resched();
10791 @@ -2414,7 +2426,7 @@ void d_delete(struct dentry * dentry)
10792 if (dentry->d_lockref.count == 1) {
10793 if (!spin_trylock(&inode->i_lock)) {
10794 spin_unlock(&dentry->d_lock);
10799 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
10800 @@ -2459,9 +2471,10 @@ EXPORT_SYMBOL(d_rehash);
10801 static inline unsigned start_dir_add(struct inode *dir)
10804 + preempt_disable_rt();
10806 - unsigned n = dir->i_dir_seq;
10807 - if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
10808 + unsigned n = dir->__i_dir_seq;
10809 + if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n)
10813 @@ -2469,26 +2482,30 @@ static inline unsigned start_dir_add(struct inode *dir)
10815 static inline void end_dir_add(struct inode *dir, unsigned n)
10817 - smp_store_release(&dir->i_dir_seq, n + 2);
10818 + smp_store_release(&dir->__i_dir_seq, n + 2);
10819 + preempt_enable_rt();
10822 static void d_wait_lookup(struct dentry *dentry)
10824 - if (d_in_lookup(dentry)) {
10825 - DECLARE_WAITQUEUE(wait, current);
10826 - add_wait_queue(dentry->d_wait, &wait);
10828 - set_current_state(TASK_UNINTERRUPTIBLE);
10829 - spin_unlock(&dentry->d_lock);
10831 - spin_lock(&dentry->d_lock);
10832 - } while (d_in_lookup(dentry));
10834 + struct swait_queue __wait;
10836 + if (!d_in_lookup(dentry))
10839 + INIT_LIST_HEAD(&__wait.task_list);
10841 + prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
10842 + spin_unlock(&dentry->d_lock);
10844 + spin_lock(&dentry->d_lock);
10845 + } while (d_in_lookup(dentry));
10846 + finish_swait(dentry->d_wait, &__wait);
10849 struct dentry *d_alloc_parallel(struct dentry *parent,
10850 const struct qstr *name,
10851 - wait_queue_head_t *wq)
10852 + struct swait_queue_head *wq)
10854 unsigned int hash = name->hash;
10855 struct hlist_bl_head *b = in_lookup_hash(parent, hash);
10856 @@ -2502,7 +2519,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
10860 - seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
10861 + seq = smp_load_acquire(&parent->d_inode->__i_dir_seq);
10862 r_seq = read_seqbegin(&rename_lock);
10863 dentry = __d_lookup_rcu(parent, name, &d_seq);
10864 if (unlikely(dentry)) {
10865 @@ -2530,7 +2547,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
10869 - if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
10870 + if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) {
10871 hlist_bl_unlock(b);
10874 @@ -2603,7 +2620,7 @@ void __d_lookup_done(struct dentry *dentry)
10876 dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
10877 __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
10878 - wake_up_all(dentry->d_wait);
10879 + swake_up_all(dentry->d_wait);
10880 dentry->d_wait = NULL;
10881 hlist_bl_unlock(b);
10882 INIT_HLIST_NODE(&dentry->d_u.d_alias);
10883 @@ -3638,6 +3655,8 @@ __setup("dhash_entries=", set_dhash_entries);
10885 static void __init dcache_init_early(void)
10887 + unsigned int loop;
10889 /* If hashes are distributed across NUMA nodes, defer
10890 * hash allocation until vmalloc space is available.
10892 @@ -3654,10 +3673,14 @@ static void __init dcache_init_early(void)
10897 + for (loop = 0; loop < (1U << d_hash_shift); loop++)
10898 + INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10901 static void __init dcache_init(void)
10903 + unsigned int loop;
10905 * A constructor could be added for stable state like the lists,
10906 * but it is probably not worth it because of the cache nature
10907 @@ -3680,6 +3703,10 @@ static void __init dcache_init(void)
10912 + for (loop = 0; loop < (1U << d_hash_shift); loop++)
10913 + INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10917 /* SLAB cache for __getname() consumers */
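
d_wait_lookup() above open-codes its wait because the swait API has no helper
that drops a spinlock around schedule(). The general shape of such a loop
(condition and lock parameters are illustrative):

	#include <linux/sched.h>
	#include <linux/swait.h>

	static void wait_until(struct swait_queue_head *wq, spinlock_t *lock,
			       bool (*done)(void *), void *arg)
	{
		struct swait_queue wait;

		INIT_LIST_HEAD(&wait.task_list);
		do {
			prepare_to_swait(wq, &wait, TASK_UNINTERRUPTIBLE);
			spin_unlock(lock);
			schedule();
			spin_lock(lock);
		} while (!done(arg));
		finish_swait(wq, &wait);
	}
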
10918 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
10919 index 2fabd19cdeea..b768c32631eb 100644
10920 --- a/fs/eventpoll.c
10921 +++ b/fs/eventpoll.c
10922 @@ -587,12 +587,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
10924 static void ep_poll_safewake(wait_queue_head_t *wq)
10926 - int this_cpu = get_cpu();
10927 + int this_cpu = get_cpu_light();
10929 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
10930 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
10936 static void ep_remove_wait_queue(struct eppoll_entry *pwq)
10937 diff --git a/fs/exec.c b/fs/exec.c
10938 index 0da4d748b4e6..609aee4dbfa9 100644
10941 @@ -1024,12 +1024,14 @@ static int exec_mmap(struct mm_struct *mm)
10945 + preempt_disable_rt();
10946 active_mm = tsk->active_mm;
10948 tsk->active_mm = mm;
10949 activate_mm(active_mm, mm);
10950 tsk->mm->vmacache_seqnum = 0;
10951 vmacache_flush(tsk);
10952 + preempt_enable_rt();
10955 up_read(&old_mm->mmap_sem);
10956 diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
10957 index db7590178dfc..d76364124443 100644
10958 --- a/fs/ext4/page-io.c
10959 +++ b/fs/ext4/page-io.c
10960 @@ -95,8 +95,7 @@ static void ext4_finish_bio(struct bio *bio)
10961 * We check all buffers in the page under BH_Uptodate_Lock
10962 * to avoid races with other end io clearing async_write flags
10964 - local_irq_save(flags);
10965 - bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
10966 + flags = bh_uptodate_lock_irqsave(head);
10968 if (bh_offset(bh) < bio_start ||
10969 bh_offset(bh) + bh->b_size > bio_end) {
10970 @@ -108,8 +107,7 @@ static void ext4_finish_bio(struct bio *bio)
10971 if (bio->bi_status)
10972 buffer_io_error(bh);
10973 } while ((bh = bh->b_this_page) != head);
10974 - bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
10975 - local_irq_restore(flags);
10976 + bh_uptodate_unlock_irqrestore(head, flags);
10978 #ifdef CONFIG_EXT4_FS_ENCRYPTION
10980 diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
10981 index 29868c35c19a..76d354eee035 100644
10982 --- a/fs/fuse/dir.c
10983 +++ b/fs/fuse/dir.c
10984 @@ -1188,7 +1188,7 @@ static int fuse_direntplus_link(struct file *file,
10985 struct inode *dir = d_inode(parent);
10986 struct fuse_conn *fc;
10987 struct inode *inode;
10988 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10989 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10993 diff --git a/fs/inode.c b/fs/inode.c
10994 index cfc36d11bcb3..b77ce179798a 100644
10997 @@ -154,7 +154,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
10998 inode->i_bdev = NULL;
10999 inode->i_cdev = NULL;
11000 inode->i_link = NULL;
11001 - inode->i_dir_seq = 0;
11002 + inode->__i_dir_seq = 0;
11004 inode->dirtied_when = 0;
11006 diff --git a/fs/libfs.c b/fs/libfs.c
11007 index 3aabe553fc45..b5d63bf1ad8e 100644
11010 @@ -90,7 +90,7 @@ static struct dentry *next_positive(struct dentry *parent,
11011 struct list_head *from,
11014 - unsigned *seq = &parent->d_inode->i_dir_seq, n;
11015 + unsigned *seq = &parent->d_inode->__i_dir_seq, n;
11016 struct dentry *res;
11017 struct list_head *p;
11019 @@ -123,8 +123,9 @@ static struct dentry *next_positive(struct dentry *parent,
11020 static void move_cursor(struct dentry *cursor, struct list_head *after)
11022 struct dentry *parent = cursor->d_parent;
11023 - unsigned n, *seq = &parent->d_inode->i_dir_seq;
11024 + unsigned n, *seq = &parent->d_inode->__i_dir_seq;
11025 spin_lock(&parent->d_lock);
11026 + preempt_disable_rt();
11029 if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
11030 @@ -137,6 +138,7 @@ static void move_cursor(struct dentry *cursor, struct list_head *after)
11032 list_add_tail(&cursor->d_child, &parent->d_subdirs);
11033 smp_store_release(seq, n + 2);
11034 + preempt_enable_rt();
11035 spin_unlock(&parent->d_lock);
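
__i_dir_seq is an open-coded sequence counter: writers make it odd, readers
retry while it is odd. On RT a reader spinning on an odd value could preempt
the writer forever, hence the preempt_disable_rt()/preempt_enable_rt() bracket
(RT-patch helpers that compile away on !RT). The write side, in outline:

	static unsigned begin_dir_update(unsigned *seq)
	{
		unsigned n;

		preempt_disable_rt();	/* writer must not be preempted (RT) */
		for (;;) {
			n = *seq;
			if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
				break;	/* seq now odd: update in flight */
			cpu_relax();
		}
		return n;
	}

	static void end_dir_update(unsigned *seq, unsigned n)
	{
		smp_store_release(seq, n + 2);	/* even again: publish */
		preempt_enable_rt();
	}
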
diff --git a/fs/locks.c b/fs/locks.c
index 665e3ce9ab47..47b66bfc4fa3 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -945,7 +945,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	if (request->fl_flags & FL_ACCESS)
 		goto find_conflict;
@@ -986,7 +986,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 	locks_free_lock(new_fl);
 	locks_dispose_list(&dispose);
@@ -1023,7 +1023,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
 	new_fl2 = locks_alloc_lock();
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
	 * New lock request. Walk all POSIX locks and look for conflicts. If
@@ -1195,7 +1195,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
	 * Free any unused locks.
@@ -1470,7 +1470,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	time_out_leases(inode, &dispose);
@@ -1522,13 +1522,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 	locks_insert_block(fl, new_fl);
 	trace_break_lease_block(inode, new_fl);
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
						 !new_fl->fl_next, break_time);
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	trace_break_lease_unblock(inode, new_fl);
 	locks_delete_block(new_fl);
@@ -1545,7 +1545,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
 	locks_free_lock(new_fl);
@@ -1619,7 +1619,7 @@ int fcntl_getlease(struct file *filp)
 	ctx = smp_load_acquire(&inode->i_flctx);
 	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
-		percpu_down_read_preempt_disable(&file_rwsem);
+		percpu_down_read(&file_rwsem);
 		spin_lock(&ctx->flc_lock);
 		time_out_leases(inode, &dispose);
 		list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
@@ -1629,7 +1629,7 @@ int fcntl_getlease(struct file *filp)
 		spin_unlock(&ctx->flc_lock);
-		percpu_up_read_preempt_enable(&file_rwsem);
+		percpu_up_read(&file_rwsem);
 		locks_dispose_list(&dispose);
@@ -1704,7 +1704,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	time_out_leases(inode, &dispose);
 	error = check_conflicting_open(dentry, arg, lease->fl_flags);
@@ -1775,7 +1775,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 		lease->fl_lmops->lm_setup(lease, priv);
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
 	inode_unlock(inode);
@@ -1798,7 +1798,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
 		if (fl->fl_file == filp &&
@@ -1811,7 +1811,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
 		error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
@@ -2542,13 +2542,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
 	if (list_empty(&ctx->flc_lease))
-	percpu_down_read_preempt_disable(&file_rwsem);
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
 		if (filp == fl->fl_file)
 			lease_modify(fl, F_UNLCK, &dispose);
 	spin_unlock(&ctx->flc_lock);
-	percpu_up_read_preempt_enable(&file_rwsem);
+	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
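
A note on why the preempt-disabling variants have to go: they keep the whole
reader section non-preemptible, and on PREEMPT_RT the spin_lock() nested
inside is a sleeping lock. A minimal sketch of the resulting nesting, using
hypothetical my_rwsem/my_list_lock names rather than anything from fs/locks.c:

    #include <linux/percpu-rwsem.h>
    #include <linux/spinlock.h>

    static DEFINE_STATIC_PERCPU_RWSEM(my_rwsem);
    static DEFINE_SPINLOCK(my_list_lock);  /* a sleeping "spinlock" on RT */

    static void my_walk(void)
    {
        percpu_down_read(&my_rwsem);  /* plain reader: stays preemptible */
        spin_lock(&my_list_lock);     /* now legal to nest, even on RT */
        /* ... walk the list protected by both locks ... */
        spin_unlock(&my_list_lock);
        percpu_up_read(&my_rwsem);
    }
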
diff --git a/fs/namei.c b/fs/namei.c
index 0b46b858cd42..f5c6c2ec44ce 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1627,7 +1627,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
 	struct dentry *dentry = ERR_PTR(-ENOENT), *old;
 	struct inode *inode = dir->d_inode;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);

 	inode_lock_shared(inode);
 	/* Don't go there if it's already dead */
@@ -3100,7 +3100,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
 	struct dentry *dentry;
 	int error, create_error = 0;
 	umode_t mode = op->mode;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);

 	if (unlikely(IS_DEADDIR(dir_inode)))
diff --git a/fs/namespace.c b/fs/namespace.c
index 9dc146e7b5e0..85bfe5e55adf 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
 #include <linux/mnt_namespace.h>
 #include <linux/user_namespace.h>
 #include <linux/namei.h>
+#include <linux/delay.h>
 #include <linux/security.h>
 #include <linux/cred.h>
 #include <linux/idr.h>
@@ -353,8 +354,11 @@ int __mnt_want_write(struct vfsmount *m)
	 * incremented count after it has set MNT_WRITE_HOLD.
	 */
 	smp_mb();
-	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
-		cpu_relax();
+	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
+		preempt_enable();
+		cpu_chill();
+		preempt_disable();
+	}
	/*
	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
	 * be set to match its requirements. So we must not load that until
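
This loop is why <linux/delay.h> is now included: on RT the writer holding
MNT_WRITE_HOLD may itself be preempted, so the waiter must re-enable
preemption and back off instead of spinning. A minimal sketch of the idiom,
with a hypothetical wait_for_flag_clear() helper (cpu_chill() exists only
with this patch series; on !RT it falls back to cpu_relax()):

    #include <linux/compiler.h>   /* READ_ONCE() */
    #include <linux/delay.h>      /* cpu_chill() on RT */
    #include <linux/preempt.h>

    /* Called with preemption disabled; briefly re-enables it to wait. */
    static void wait_for_flag_clear(const unsigned long *flags,
                                    unsigned long bit)
    {
        while (READ_ONCE(*flags) & bit) {
            preempt_enable();
            cpu_chill();          /* sleeps briefly on RT, no busy spin */
            preempt_disable();
        }
    }
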
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 606dd3871f66..fa41eb75b4d8 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -150,11 +150,11 @@ static int nfs_delegation_claim_opens(struct inode *inode,
 		/* Block nfs4_proc_unlck */
 		mutex_lock(&sp->so_delegreturn_mutex);
-		seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+		seq = read_seqbegin(&sp->so_reclaim_seqlock);
 		err = nfs4_open_delegation_recall(ctx, state, stateid, type);
 		if (!err)
 			err = nfs_delegation_claim_locks(ctx, state, stateid);
-		if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+		if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
 			err = -EAGAIN;
 		mutex_unlock(&sp->so_delegreturn_mutex);
 		put_nfs_open_context(ctx);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index bf2c43635062..f43f5da4a8c3 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -452,7 +452,7 @@ static
 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 {
 	struct qstr filename = QSTR_INIT(entry->name, entry->len);
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 	struct dentry *dentry;
 	struct dentry *alias;
 	struct inode *dir = d_inode(parent);
@@ -1443,7 +1443,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
		    struct file *file, unsigned open_flags,
		    umode_t mode, int *opened)
 {
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+	DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 	struct nfs_open_context *ctx;
 	struct dentry *res;
 	struct iattr attr = { .ia_valid = ATTR_OPEN };
@@ -1763,7 +1763,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 	trace_nfs_rmdir_enter(dir, dentry);
 	if (d_really_is_positive(dentry)) {
+#ifdef CONFIG_PREEMPT_RT_BASE
+		down(&NFS_I(d_inode(dentry))->rmdir_sem);
+#else
 		down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
+#endif
 		error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
 		/* Ensure the VFS deletes this inode */
@@ -1773,7 +1777,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 			nfs_dentry_handle_enoent(dentry);
 		}
+#ifdef CONFIG_PREEMPT_RT_BASE
+		up(&NFS_I(d_inode(dentry))->rmdir_sem);
+#else
 		up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
+#endif
 	} else
 		error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
 	trace_nfs_rmdir_exit(dir, dentry, error);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 134d9f560240..ff64167f9811 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2014,7 +2014,11 @@ static void init_once(void *foo)
 	atomic_long_set(&nfsi->nrequests, 0);
 	atomic_long_set(&nfsi->commit_info.ncommit, 0);
 	atomic_set(&nfsi->commit_info.rpcs_out, 0);
+#ifdef CONFIG_PREEMPT_RT_BASE
+	sema_init(&nfsi->rmdir_sem, 1);
+#else
 	init_rwsem(&nfsi->rmdir_sem);
+#endif
 	mutex_init(&nfsi->commit_mutex);
 	nfs4_init_once(nfsi);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a73144b3cb8c..0c403d280b96 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -112,7 +112,7 @@ struct nfs4_state_owner {
 	unsigned long	     so_flags;
 	struct list_head     so_states;
 	struct nfs_seqid_counter so_seqid;
-	seqcount_t	     so_reclaim_seqcount;
+	seqlock_t	     so_reclaim_seqlock;
 	struct mutex	     so_delegreturn_mutex;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a3b67d3b1dfb..4ce6ec109c2b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2700,7 +2700,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
-	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+	seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);

 	ret = _nfs4_proc_open(opendata);
@@ -2738,7 +2738,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	if (d_inode(dentry) == state->inode) {
 		nfs_inode_attach_open_context(ctx);
-		if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+		if (read_seqretry(&sp->so_reclaim_seqlock, seq))
 			nfs4_schedule_stateid_recovery(server, state);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e1d88bca815e..c51bcc176026 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -494,7 +494,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
 	nfs4_init_seqid_counter(&sp->so_seqid);
 	atomic_set(&sp->so_count, 1);
 	INIT_LIST_HEAD(&sp->so_lru);
-	seqcount_init(&sp->so_reclaim_seqcount);
+	seqlock_init(&sp->so_reclaim_seqlock);
 	mutex_init(&sp->so_delegreturn_mutex);
@@ -1521,8 +1521,12 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
	 * recovering after a network partition or a reboot from a
	 * server that doesn't support a grace period.
	 */
+#ifdef CONFIG_PREEMPT_RT_FULL
+	write_seqlock(&sp->so_reclaim_seqlock);
+#else
+	write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
+#endif
 	spin_lock(&sp->so_lock);
-	raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
 	list_for_each_entry(state, &sp->so_states, open_states) {
 		if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
@@ -1591,14 +1595,20 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
 		spin_lock(&sp->so_lock);
-	raw_write_seqcount_end(&sp->so_reclaim_seqcount);
 	spin_unlock(&sp->so_lock);
+#ifdef CONFIG_PREEMPT_RT_FULL
+	write_sequnlock(&sp->so_reclaim_seqlock);
+#else
+	write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
+#endif
 	nfs4_put_open_state(state);
-	spin_lock(&sp->so_lock);
-	raw_write_seqcount_end(&sp->so_reclaim_seqcount);
-	spin_unlock(&sp->so_lock);
+#ifdef CONFIG_PREEMPT_RT_FULL
+	write_sequnlock(&sp->so_reclaim_seqlock);
+#else
+	write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
+#endif
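
The point of the seqcount-to-seqlock switch is that a seqlock_t embeds a
spinlock, so the write side is serialized by a lock that may sleep on RT,
while a raw seqcount writer would have to keep preemption disabled. An
illustrative reader/writer pair (hypothetical state_lock/state_a/state_b
names, not NFS code):

    #include <linux/seqlock.h>

    static DEFINE_SEQLOCK(state_lock);
    static u64 state_a, state_b;

    static void writer_update(u64 a, u64 b)
    {
        write_seqlock(&state_lock);   /* takes the embedded spinlock */
        state_a = a;
        state_b = b;
        write_sequnlock(&state_lock);
    }

    static u64 reader_snapshot(void)
    {
        unsigned int seq;
        u64 sum;

        do {
            seq = read_seqbegin(&state_lock);
            sum = state_a + state_b;      /* consistent pair of values */
        } while (read_seqretry(&state_lock, seq));
        return sum;
    }
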
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 630b4a3c1a93..0dc1d3e6a62f 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
 #include <linux/sched.h>
-#include <linux/wait.h>
+#include <linux/swait.h>
 #include <linux/namei.h>
 #include <linux/fsnotify.h>

@@ -52,6 +52,29 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 		rpc_restart_call_prepare(task);

+#ifdef CONFIG_PREEMPT_RT_BASE
+static void nfs_down_anon(struct semaphore *sema)
+{
+	down(sema);
+}
+
+static void nfs_up_anon(struct semaphore *sema)
+{
+	up(sema);
+}
+
+#else
+static void nfs_down_anon(struct rw_semaphore *rwsem)
+{
+	down_read_non_owner(rwsem);
+}
+
+static void nfs_up_anon(struct rw_semaphore *rwsem)
+{
+	up_read_non_owner(rwsem);
+}
+#endif
+
 /**
 * nfs_async_unlink_release - Release the sillydelete data.
 * @task: rpc_task of the sillydelete
@@ -65,7 +88,7 @@ static void nfs_async_unlink_release(void *calldata)
 	struct dentry *dentry = data->dentry;
 	struct super_block *sb = dentry->d_sb;

-	up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
+	nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
 	d_lookup_done(dentry);
 	nfs_free_unlinkdata(data);
@@ -118,10 +141,10 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
 	struct inode *dir = d_inode(dentry->d_parent);
 	struct dentry *alias;

-	down_read_non_owner(&NFS_I(dir)->rmdir_sem);
+	nfs_down_anon(&NFS_I(dir)->rmdir_sem);
 	alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
 	if (IS_ERR(alias)) {
-		up_read_non_owner(&NFS_I(dir)->rmdir_sem);
+		nfs_up_anon(&NFS_I(dir)->rmdir_sem);
 	}
 	if (!d_in_lookup(alias)) {
@@ -143,7 +166,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
 		spin_unlock(&alias->d_lock);
-		up_read_non_owner(&NFS_I(dir)->rmdir_sem);
+		nfs_up_anon(&NFS_I(dir)->rmdir_sem);
		/*
		 * If we'd displaced old cached devname, free it.  At that
		 * point dentry is definitely not a root, so we won't need
@@ -183,7 +206,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
 		goto out_free_name;
 	data->res.dir_attr = &data->dir_attr;
-	init_waitqueue_head(&data->wq);
+	init_swait_queue_head(&data->wq);

 	spin_lock(&dentry->d_lock);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cc91856b5e2d..a982d7c3ad91 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -93,13 +93,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 			if (file_ofs < init_size)
 				ofs = init_size - file_ofs;
-			local_irq_save(flags);
+			local_irq_save_nort(flags);
 			kaddr = kmap_atomic(page);
 			memset(kaddr + bh_offset(bh) + ofs, 0,
					bh->b_size - ofs);
 			flush_dcache_page(page);
 			kunmap_atomic(kaddr);
-			local_irq_restore(flags);
+			local_irq_restore_nort(flags);
 		clear_buffer_uptodate(bh);
@@ -108,8 +108,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
				"0x%llx.", (unsigned long long)bh->b_blocknr);
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	flags = bh_uptodate_lock_irqsave(first);
 	clear_buffer_async_read(bh);
@@ -124,8 +123,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	bh_uptodate_unlock_irqrestore(first, flags);
	/*
	 * If none of the buffers had errors then we can set the page uptodate,
	 * but we first have to perform the post read mst fixups, if the
@@ -146,13 +144,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		recs = PAGE_SIZE / rec_size;
 		/* Should have been verified before we got here... */
 		BUG_ON(!recs);
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		kaddr = kmap_atomic(page);
 		for (i = 0; i < recs; i++)
 			post_read_mst_fixup((NTFS_RECORD*)(kaddr +
					i * rec_size), rec_size);
 		kunmap_atomic(kaddr);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 		flush_dcache_page(page);
 		if (likely(page_uptodate && !PageError(page)))
 			SetPageUptodate(page);
@@ -160,9 +158,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
-	return;
+	bh_uptodate_unlock_irqrestore(first, flags);
 }
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 4ac811e1a26c..9dcb40690cde 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -386,9 +386,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_printf(m, "Cpus_allowed:\t%*pb\n",
-		   cpumask_pr_args(&task->cpus_allowed));
+		   cpumask_pr_args(task->cpus_ptr));
 	seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-		   cpumask_pr_args(&task->cpus_allowed));
+		   cpumask_pr_args(task->cpus_ptr));
 }

 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9063738ff1f0..4085e56e261c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1900,7 +1900,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
 	child = d_hash_and_lookup(dir, &qname);
 	if (!child) {
-		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+		DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 		child = d_alloc_parallel(dir, &qname, &wq);
 		if (IS_ERR(child))
 			goto end_instantiate;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 82ac5f682b73..c35714621a38 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -679,7 +679,7 @@ static bool proc_sys_fill_cache(struct file *file,
 	child = d_lookup(dir, &qname);
 	if (!child) {
-		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+		DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
 		child = d_alloc_parallel(dir, &qname, &wq);
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
index 23a9c28ad8ea..6a73c4fa88e7 100644
--- a/fs/squashfs/decompressor_multi_percpu.c
+++ b/fs/squashfs/decompressor_multi_percpu.c
 #include <linux/slab.h>
 #include <linux/percpu.h>
 #include <linux/buffer_head.h>
+#include <linux/locallock.h>

 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
@@ -25,6 +26,8 @@ struct squashfs_stream {

+static DEFINE_LOCAL_IRQ_LOCK(stream_lock);
+
 void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
@@ -79,10 +82,15 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
 	struct squashfs_stream __percpu *percpu =
			(struct squashfs_stream __percpu *) msblk->stream;
-	struct squashfs_stream *stream = get_cpu_ptr(percpu);
-	int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
-		offset, length, output);
-	put_cpu_ptr(stream);
+	struct squashfs_stream *stream;
+	int res;
+
+	stream = get_locked_ptr(stream_lock, percpu);
+
+	res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
+		offset, length, output);
+
+	put_locked_ptr(stream_lock, stream);

 		ERROR("%s decompression failed, data probably corrupt\n",
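
For readers unfamiliar with the local-lock idiom used here: get_cpu_ptr()
pins the task by disabling preemption, which RT avoids by taking a per-CPU
sleeping lock instead. A minimal sketch under that assumption (it relies on
the RT tree's <linux/locallock.h>; my_stats/my_percpu/my_lock are
hypothetical names):

    #include <linux/locallock.h>  /* provided by the RT patch series */
    #include <linux/percpu.h>

    struct my_stats {
        unsigned long hits;
    };

    static DEFINE_PER_CPU(struct my_stats, my_percpu);
    static DEFINE_LOCAL_IRQ_LOCK(my_lock);

    static void my_account_hit(void)
    {
        struct my_stats *s;

        local_lock(my_lock);           /* per-CPU lock, sleeps on RT */
        s = this_cpu_ptr(&my_percpu);  /* safe: we cannot migrate now */
        s->hits++;
        local_unlock(my_lock);
    }
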
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 040612ec9598..b3d9d435926c 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -471,7 +471,10 @@ static int do_timerfd_settime(int ufd, int flags,
 		spin_unlock_irq(&ctx->wqh.lock);
-		cpu_relax();
+		if (isalarm(ctx))
+			hrtimer_wait_for_timer(&ctx->t.alarm.timer);
+		else
+			hrtimer_wait_for_timer(&ctx->t.tmr);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index b0cccf8a81a8..eaa4383defec 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -120,8 +120,7 @@ xfs_finish_page_writeback(
 	ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
 	ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);

-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+	flags = bh_uptodate_lock_irqsave(head);
 		if (off >= bvec->bv_offset &&
		    off < bvec->bv_offset + bvec->bv_len) {
@@ -143,8 +142,7 @@ xfs_finish_page_writeback(
 	} while ((bh = bh->b_this_page) != head);
-	bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
-	local_irq_restore(flags);
+	bh_uptodate_unlock_irqrestore(head, flags);

 		end_page_writeback(bvec->bv_page);
diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index 1b473efd9eb6..89ee5e1dac48 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -134,6 +134,7 @@
 #define acpi_cache_t				struct kmem_cache
 #define acpi_spinlock				spinlock_t *
+#define acpi_raw_spinlock			raw_spinlock_t *
 #define acpi_cpu_flags				unsigned long

 /* Use native linux version of acpi_os_allocate_zeroed */
@@ -152,6 +153,20 @@
 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock

+#define acpi_os_create_raw_lock(__handle)			\
+({								\
+	raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));	\
+								\
+	if (lock) {						\
+		*(__handle) = lock;				\
+		raw_spin_lock_init(*(__handle));		\
+	}							\
+	lock ? AE_OK : AE_NO_MEMORY;				\
+})
+
+#define acpi_os_delete_raw_lock(__handle)	kfree(__handle)
+
 /*
 * OSL interfaces used by debugger/disassembler
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index ae1a33aa8955..c6d04eca8345 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -234,6 +234,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 # define WARN_ON_SMP(x)			({0;})

+#ifdef CONFIG_PREEMPT_RT_BASE
+# define BUG_ON_RT(c)			BUG_ON(c)
+# define BUG_ON_NONRT(c)		do { } while (0)
+# define WARN_ON_RT(condition)		WARN_ON(condition)
+# define WARN_ON_NONRT(condition)	do { } while (0)
+# define WARN_ON_ONCE_NONRT(condition)	do { } while (0)
+#else
+# define BUG_ON_RT(c)			do { } while (0)
+# define BUG_ON_NONRT(c)		BUG_ON(c)
+# define WARN_ON_RT(condition)		do { } while (0)
+# define WARN_ON_NONRT(condition)	WARN_ON(condition)
+# define WARN_ON_ONCE_NONRT(condition)	WARN_ON_ONCE(condition)
+#endif
+
 #endif /* __ASSEMBLY__ */
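
Usage sketch for the _NONRT variants added above: they let code assert an
invariant that only holds in the !RT configuration, and compile to nothing
on RT where the same path runs preemptible (my_fastpath() is a hypothetical
example, not kernel code):

    #include <linux/bug.h>
    #include <linux/irqflags.h>

    static void my_fastpath(void)
    {
        /* On !RT this path runs with IRQs off; on PREEMPT_RT_BASE the
         * check expands to nothing because the assumption no longer holds. */
        WARN_ON_NONRT(!irqs_disabled());
    }
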
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 994cbb0f7ffc..0d4b7e3489a9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -226,7 +226,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
 	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
 }

+void __blk_mq_complete_request_remote_work(struct work_struct *work);
 int blk_mq_request_started(struct request *rq);
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4d4af0e94059..cbf9d5730dd3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
+#include <linux/swork.h>

 struct scsi_ioctl_command;
@@ -134,6 +135,9 @@ typedef __u32 __bitwise req_flags_t;
 	struct list_head queuelist;
+#ifdef CONFIG_PREEMPT_RT_FULL
+	struct work_struct work;
+#endif
 	union {
 		struct __call_single_data csd;
@@ -596,6 +600,7 @@ struct request_queue {
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
+	struct swork_event	mq_pcpu_wake;
 	struct percpu_ref	q_usage_counter;
 	struct list_head	all_q_node;
diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index a19519f4241d..40dd5ef9c154 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
 #include <linux/preempt.h>

+#ifdef CONFIG_PREEMPT_RT_FULL
+
+extern void __local_bh_disable(void);
+extern void _local_bh_enable(void);
+extern void __local_bh_enable(void);
+
+static inline void local_bh_disable(void)
+{
+	__local_bh_disable();
+}
+
+static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
+{
+	__local_bh_disable();
+}
+
+static inline void local_bh_enable(void)
+{
+	__local_bh_enable();
+}
+
+static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
+{
+	__local_bh_enable();
+}
+
+static inline void local_bh_enable_ip(unsigned long ip)
+{
+	__local_bh_enable();
+}
+
+#else
+
 #ifdef CONFIG_TRACE_IRQFLAGS
 extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
@@ -31,5 +64,6 @@ static inline void local_bh_enable(void)
 	__local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
 }
+#endif

 #endif /* _LINUX_BH_H */
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index afa37f807f12..48505fade7e1 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -76,8 +76,50 @@ struct buffer_head {
 	struct address_space *b_assoc_map;	/* mapping this buffer is
						   associated with */
 	atomic_t b_count;		/* users using this buffer_head */
+#ifdef CONFIG_PREEMPT_RT_BASE
+	spinlock_t b_uptodate_lock;
+#if IS_ENABLED(CONFIG_JBD2)
+	spinlock_t b_state_lock;
+	spinlock_t b_journal_head_lock;
+#endif
+#endif
 };

+static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
+{
+	unsigned long flags;
+
+#ifndef CONFIG_PREEMPT_RT_BASE
+	local_irq_save(flags);
+	bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
+#else
+	spin_lock_irqsave(&bh->b_uptodate_lock, flags);
+#endif
+	return flags;
+}
+
+static inline void
+bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
+{
+#ifndef CONFIG_PREEMPT_RT_BASE
+	bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
+	local_irq_restore(flags);
+#else
+	spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
+#endif
+}
+
+static inline void buffer_head_init_locks(struct buffer_head *bh)
+{
+#ifdef CONFIG_PREEMPT_RT_BASE
+	spin_lock_init(&bh->b_uptodate_lock);
+#if IS_ENABLED(CONFIG_JBD2)
+	spin_lock_init(&bh->b_state_lock);
+	spin_lock_init(&bh->b_journal_head_lock);
+#endif
+#endif
+}
+
 /*
 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
 * and buffer_foo() functions.
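
These helpers are what the ntfs and xfs hunks earlier in this patch call
into. A sketch of a typical end_io-style caller (my_end_buffer_async() is
hypothetical): the same pair of calls acts as a bit-spinlock with IRQs off
on !RT and as a regular spinlock_t on RT:

    #include <linux/buffer_head.h>

    static void my_end_buffer_async(struct buffer_head *bh, int uptodate)
    {
        struct buffer_head *first = page_buffers(bh->b_page);
        unsigned long flags;

        flags = bh_uptodate_lock_irqsave(first);
        if (uptodate)
            set_buffer_uptodate(bh);
        else
            clear_buffer_uptodate(bh);
        clear_buffer_async_read(bh);
        bh_uptodate_unlock_irqrestore(first, flags);
    }
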
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e7905d9353e8..4ecf7875e04f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
 #include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
 #include <linux/bpf-cgroup.h>
+#include <linux/swork.h>

 #ifdef CONFIG_CGROUPS

@@ -152,6 +153,7 @@ struct cgroup_subsys_state {
	/* percpu_ref killing and RCU release */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
+	struct swork_event destroy_swork;

	/*
	 * PI: the parent css.  Placed here for cache proximity to following
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 7828451e161a..f5838b10cf84 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
 * See kernel/sched/completion.c for details.
 */

-#include <linux/wait.h>
+#include <linux/swait.h>
 #ifdef CONFIG_LOCKDEP_COMPLETIONS
 #include <linux/lockdep.h>

 struct completion {
 	unsigned int done;
-	wait_queue_head_t wait;
+	struct swait_queue_head wait;
 #ifdef CONFIG_LOCKDEP_COMPLETIONS
 	struct lockdep_map_cross map;
@@ -67,11 +67,11 @@ static inline void complete_release_commit(struct completion *x) {}
 #ifdef CONFIG_LOCKDEP_COMPLETIONS
 #define COMPLETION_INITIALIZER(work) \
-	{ 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
+	{ 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
	STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
 #else
 #define COMPLETION_INITIALIZER(work) \
-	{ 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
+	{ 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }

 #define COMPLETION_INITIALIZER_ONSTACK(work) \
@@ -117,7 +117,7 @@ static inline void complete_release_commit(struct completion *x) {}
 static inline void __init_completion(struct completion *x)
 {
 	x->done = 0;
-	init_waitqueue_head(&x->wait);
+	init_swait_queue_head(&x->wait);
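
Nothing changes for completion users; the wait queue behind the API simply
becomes a simple (raw-lock based) swait queue. The unchanged usage pattern,
with hypothetical producer/consumer names:

    #include <linux/completion.h>

    static DECLARE_COMPLETION(setup_done);

    static int consumer_thread(void *arg)
    {
        wait_for_completion(&setup_done);  /* sleeps on the swait queue */
        /* ... proceed now that setup has finished ... */
        return 0;
    }

    static void producer_finish_setup(void)
    {
        complete(&setup_done);             /* wakes one waiter */
    }
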
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 2a378d261914..b418d3c5159d 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -120,6 +120,8 @@ extern void cpu_hotplug_disable(void);
 extern void cpu_hotplug_enable(void);
 void clear_tasks_mm_cpumask(int cpu);
 int cpu_down(unsigned int cpu);
+extern void pin_current_cpu(void);
+extern void unpin_current_cpu(void);

 #else /* CONFIG_HOTPLUG_CPU */

@@ -130,6 +132,9 @@ static inline void cpus_read_unlock(void) { }
 static inline void lockdep_assert_cpus_held(void) { }
 static inline void cpu_hotplug_disable(void) { }
 static inline void cpu_hotplug_enable(void) { }
+static inline void pin_current_cpu(void) { }
+static inline void unpin_current_cpu(void) { }
+
 #endif	/* !CONFIG_HOTPLUG_CPU */

 /* Wrappers which go away once all code is converted */
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 006f4ccda5f5..d413993f7f17 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -107,7 +107,7 @@ struct dentry {
 		struct list_head d_lru;		/* LRU list */
-		wait_queue_head_t *d_wait;	/* in-lookup ones only */
+		struct swait_queue_head *d_wait;	/* in-lookup ones only */
 	struct list_head d_child;	/* child of parent list */
 	struct list_head d_subdirs;	/* our children */
@@ -238,7 +238,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
 extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
 extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
-					wait_queue_head_t *);
+					struct swait_queue_head *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
diff --git a/include/linux/delay.h b/include/linux/delay.h
index b78bab4395d8..7c4bc414a504 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -64,4 +64,10 @@ static inline void ssleep(unsigned int seconds)
 	msleep(seconds * 1000);
 }

+#ifdef CONFIG_PREEMPT_RT_FULL
+extern void cpu_chill(void);
+#else
+# define cpu_chill()	cpu_relax()
+#endif
+
 #endif /* defined(_LINUX_DELAY_H) */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc613f20e5a6..b806e2116f5c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -655,7 +655,7 @@ struct inode {
 		struct block_device	*i_bdev;
 		struct cdev		*i_cdev;
-		unsigned		i_dir_seq;
+		unsigned		__i_dir_seq;

 	__u32			i_generation;
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 776f90f3a1cd..5f0bd7a3e6a7 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
+#include <linux/sched.h>

 #include <asm/cacheflush.h>

@@ -66,7 +67,7 @@ static inline void kunmap(struct page *page)
 static inline void *kmap_atomic(struct page *page)
 {
-	preempt_disable();
+	preempt_disable_nort();
 	pagefault_disable();
 	return page_address(page);
@@ -75,7 +76,7 @@ static inline void *kmap_atomic(struct page *page)
 static inline void __kunmap_atomic(void *addr)
 {
 	pagefault_enable();
-	preempt_enable();
+	preempt_enable_nort();
 }

 #define kmap_atomic_pfn(pfn)	kmap_atomic(pfn_to_page(pfn))
@@ -87,32 +88,51 @@ static inline void __kunmap_atomic(void *addr)
 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)

+#ifndef CONFIG_PREEMPT_RT_FULL
 DECLARE_PER_CPU(int, __kmap_atomic_idx);
+#endif

 static inline int kmap_atomic_idx_push(void)
 {
+#ifndef CONFIG_PREEMPT_RT_FULL
 	int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;

-#ifdef CONFIG_DEBUG_HIGHMEM
+# ifdef CONFIG_DEBUG_HIGHMEM
 	WARN_ON_ONCE(in_irq() && !irqs_disabled());
 	BUG_ON(idx >= KM_TYPE_NR);
-#endif
+# endif
 	return idx;
+#else
+	current->kmap_idx++;
+	BUG_ON(current->kmap_idx > KM_TYPE_NR);
+	return current->kmap_idx - 1;
+#endif
 }

 static inline int kmap_atomic_idx(void)
 {
+#ifndef CONFIG_PREEMPT_RT_FULL
 	return __this_cpu_read(__kmap_atomic_idx) - 1;
+#else
+	return current->kmap_idx - 1;
+#endif
 }

 static inline void kmap_atomic_idx_pop(void)
 {
-#ifdef CONFIG_DEBUG_HIGHMEM
+#ifndef CONFIG_PREEMPT_RT_FULL
+# ifdef CONFIG_DEBUG_HIGHMEM
 	int idx = __this_cpu_dec_return(__kmap_atomic_idx);

 	BUG_ON(idx < 0);
-#else
+# else
 	__this_cpu_dec(__kmap_atomic_idx);
+# endif
+#else
+	current->kmap_idx--;
+# ifdef CONFIG_DEBUG_HIGHMEM
+	BUG_ON(current->kmap_idx < 0);
+# endif
+#endif
 }
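
A behavioural sketch of what moving the index into task_struct buys: nested
kmap_atomic() sections still pair up in LIFO order, but on RT the nesting
depth travels with the task (current->kmap_idx), so the section may be
preempted and even migrated (copy_two_pages() is a hypothetical example):

    #include <linux/highmem.h>
    #include <linux/string.h>

    static void copy_two_pages(struct page *dst, struct page *src)
    {
        void *d = kmap_atomic(dst);
        void *s = kmap_atomic(src);  /* pushes a second mapping index */

        memcpy(d, s, PAGE_SIZE);
        kunmap_atomic(s);            /* pops strictly in LIFO order */
        kunmap_atomic(d);
    }
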
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 012c37fdb688..3bd606859b0a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -22,19 +22,42 @@
 #include <linux/percpu.h>
 #include <linux/timer.h>
 #include <linux/timerqueue.h>
+#include <linux/wait.h>

 struct hrtimer_clock_base;
 struct hrtimer_cpu_base;

 /*
 * Mode arguments of xxx_hrtimer functions:
+ *
+ * HRTIMER_MODE_ABS		- Time value is absolute
+ * HRTIMER_MODE_REL		- Time value is relative to now
+ * HRTIMER_MODE_PINNED		- Timer is bound to CPU (is only considered
+ *				  when starting the timer)
+ * HRTIMER_MODE_SOFT		- Timer callback function will be executed in
+ *				  soft irq context
 */
 enum hrtimer_mode {
-	HRTIMER_MODE_ABS = 0x0,		/* Time value is absolute */
-	HRTIMER_MODE_REL = 0x1,		/* Time value is relative to now */
-	HRTIMER_MODE_PINNED = 0x02,	/* Timer is bound to CPU */
-	HRTIMER_MODE_ABS_PINNED = 0x02,
-	HRTIMER_MODE_REL_PINNED = 0x03,
+	HRTIMER_MODE_ABS	= 0x00,
+	HRTIMER_MODE_REL	= 0x01,
+	HRTIMER_MODE_PINNED	= 0x02,
+	HRTIMER_MODE_SOFT	= 0x04,
+	HRTIMER_MODE_HARD	= 0x08,
+
+	HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
+	HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
+
+	HRTIMER_MODE_ABS_SOFT	= HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT,
+	HRTIMER_MODE_REL_SOFT	= HRTIMER_MODE_REL | HRTIMER_MODE_SOFT,
+
+	HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,
+	HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,
+
+	HRTIMER_MODE_ABS_HARD	= HRTIMER_MODE_ABS | HRTIMER_MODE_HARD,
+	HRTIMER_MODE_REL_HARD	= HRTIMER_MODE_REL | HRTIMER_MODE_HARD,
+
+	HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD,
+	HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
 };

@@ -87,6 +110,7 @@ enum hrtimer_restart {
 * @base:	pointer to the timer base (per cpu and per clock)
 * @state:	state information (See bit values above)
 * @is_rel:	Set if the timer was armed relative
+ * @is_soft:	Set if hrtimer will be expired in soft interrupt context.
 *
 * The hrtimer structure must be initialized by hrtimer_init()
@@ -97,6 +121,7 @@ struct hrtimer {
 	struct hrtimer_clock_base	*base;
 	u8				state;
 	u8				is_rel;
+	u8				is_soft;
 };

@@ -112,9 +137,9 @@ struct hrtimer_sleeper {
 #ifdef CONFIG_64BIT
-# define HRTIMER_CLOCK_BASE_ALIGN	64
+# define __hrtimer_clock_base_align	____cacheline_aligned
 #else
-# define HRTIMER_CLOCK_BASE_ALIGN	32
+# define __hrtimer_clock_base_align
 #endif
@@ -123,48 +148,57 @@ struct hrtimer_sleeper {
 * @index:		clock type index for per_cpu support when moving a
 *			timer to a base on another cpu.
 * @clockid:		clock id for per_cpu support
+ * @seq:		seqcount around __run_hrtimer
+ * @running:		pointer to the currently running hrtimer
 * @active:		red black tree root node for the active timers
 * @get_time:		function to retrieve the current time of the clock
 * @offset:		offset of this clock to the monotonic base
 */
 struct hrtimer_clock_base {
 	struct hrtimer_cpu_base	*cpu_base;
-	int			index;
+	unsigned int		index;
 	clockid_t		clockid;
+	seqcount_t		seq;
+	struct hrtimer		*running;
 	struct timerqueue_head	active;
 	ktime_t			(*get_time)(void);
 	ktime_t			offset;
-} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
+} __hrtimer_clock_base_align;

 enum hrtimer_base_type {
 	HRTIMER_BASE_MONOTONIC,
 	HRTIMER_BASE_REALTIME,
 	HRTIMER_BASE_BOOTTIME,
 	HRTIMER_BASE_TAI,
+	HRTIMER_BASE_MONOTONIC_SOFT,
+	HRTIMER_BASE_REALTIME_SOFT,
+	HRTIMER_BASE_BOOTTIME_SOFT,
+	HRTIMER_BASE_TAI_SOFT,
 	HRTIMER_MAX_CLOCK_BASES,
 };

 /*
 * struct hrtimer_cpu_base - the per cpu clock bases
 * @lock:		lock protecting the base and associated clock bases
- * @seq:		seqcount around __run_hrtimer
- * @running:		pointer to the currently running hrtimer
 * @active_bases:	Bitfield to mark bases with active timers
 * @clock_was_set_seq:	Sequence counter of clock was set events
- * @migration_enabled:	The migration of hrtimers to other cpus is enabled
- * @nohz_active:	The nohz functionality is enabled
- * @expires_next:	absolute time of the next event which was scheduled
- *			via clock_set_next_event()
- * @next_timer:		Pointer to the first expiring timer
- * @in_hrtirq:		hrtimer_interrupt() is currently executing
 * @hres_active:	State of high resolution mode
+ * @in_hrtirq:		hrtimer_interrupt() is currently executing
 * @hang_detected:	The last hrtimer interrupt detected a hang
+ * @softirq_activated:	displays, if the softirq is raised - update of softirq
+ *			related settings is not required then.
 * @nr_events:		Total number of hrtimer interrupt events
 * @nr_retries:		Total number of hrtimer interrupt retries
 * @nr_hangs:		Total number of hrtimer interrupt hangs
 * @max_hang_time:	Maximum time spent in hrtimer_interrupt
+ * @expires_next:	absolute time of the next event, is required for remote
+ *			hrtimer enqueue; it is the total first expiry time (hard
+ *			and soft hrtimer are taken into account)
+ * @next_timer:		Pointer to the first expiring timer
+ * @softirq_expires_next: Time to check, if soft queues needs also to be expired
+ * @softirq_next_timer: Pointer to the first expiring softirq based timer
 * @clock_base:		array of clock bases for this cpu
 *
 * Note: next_timer is just an optimization for __remove_hrtimer().
@@ -173,31 +207,31 @@ enum hrtimer_base_type {
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
-	seqcount_t			seq;
-	struct hrtimer			*running;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
-	bool				migration_enabled;
-	bool				nohz_active;
+	unsigned int			hres_active : 1,
+					in_hrtirq : 1,
+					hang_detected : 1,
+					softirq_activated : 1;
 #ifdef CONFIG_HIGH_RES_TIMERS
-	unsigned int			in_hrtirq	: 1,
-					hres_active	: 1,
-					hang_detected	: 1;
-	ktime_t				expires_next;
-	struct hrtimer			*next_timer;
 	unsigned int			nr_events;
-	unsigned int			nr_retries;
-	unsigned int			nr_hangs;
+	unsigned short			nr_retries;
+	unsigned short			nr_hangs;
 	unsigned int			max_hang_time;
 #endif
+	ktime_t				expires_next;
+	struct hrtimer			*next_timer;
+	ktime_t				softirq_expires_next;
+#ifdef CONFIG_PREEMPT_RT_BASE
+	wait_queue_head_t		wait;
+#endif
+	struct hrtimer			*softirq_next_timer;
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
 } ____cacheline_aligned;

 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
-	BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN);
 	timer->node.expires = time;
 	timer->_softexpires = time;
@@ -266,16 +300,17 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
 	return timer->base->get_time();
 }

+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+{
+	return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
+		timer->base->cpu_base->hres_active : 0;
+}
+
 #ifdef CONFIG_HIGH_RES_TIMERS
 struct clock_event_device;

 extern void hrtimer_interrupt(struct clock_event_device *dev);

-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
-{
-	return timer->base->cpu_base->hres_active;
-}
-
 /*
 * The resolution of the clocks. The resolution value is returned in
 * the clock_getres() system call to give application programmers an
@@ -298,11 +333,6 @@ extern unsigned int hrtimer_resolution;

 #define hrtimer_resolution	(unsigned int)LOW_RES_NSEC

-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
-{
-	return 0;
-}
-
 static inline void clock_was_set_delayed(void) { }

@@ -344,10 +374,17 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 /* Initialize timers: */
 extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
			 enum hrtimer_mode mode);
+extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
+				 enum hrtimer_mode mode,
+				 struct task_struct *task);

 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
				  enum hrtimer_mode mode);
+extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
+					  clockid_t clock_id,
+					  enum hrtimer_mode mode,
+					  struct task_struct *task);

 extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
@@ -357,6 +394,15 @@ static inline void hrtimer_init_on_stack(struct hrtimer *timer,
 	hrtimer_init(timer, which_clock, mode);
 }
+
+static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
+						 clockid_t clock_id,
+						 enum hrtimer_mode mode,
+						 struct task_struct *task)
+{
+	hrtimer_init_sleeper(sl, clock_id, mode, task);
+}
+
 static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }

@@ -365,11 +411,12 @@ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
				   u64 range_ns, const enum hrtimer_mode mode);

 /**
- * hrtimer_start - (re)start an hrtimer on the current CPU
+ * hrtimer_start - (re)start an hrtimer
 * @timer:	the timer to be added
 * @tim:	expiry time
- * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
- *		relative (HRTIMER_MODE_REL)
+ * @mode:	timer mode: absolute (HRTIMER_MODE_ABS) or
+ *		relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
+ *		softirq based mode is considered for debug purpose only!
 */
 static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
				 const enum hrtimer_mode mode)
@@ -396,6 +443,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
 	hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }

+/* Softirq preemption could deadlock timer removal */
+#ifdef CONFIG_PREEMPT_RT_BASE
+  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
+#else
+# define hrtimer_wait_for_timer(timer)	do { cpu_relax(); } while (0)
+#endif
+
 /* Query timers: */
 extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);

@@ -420,9 +474,9 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
 * Helper function to check, whether the timer is running the callback
 * function
 */
-static inline int hrtimer_callback_running(struct hrtimer *timer)
+static inline int hrtimer_callback_running(const struct hrtimer *timer)
 {
-	return timer->base->cpu_base->running == timer;
+	return timer->base->running == timer;
 }

 /* Forward a hrtimer so it expires after now: */
@@ -458,15 +512,12 @@ extern long hrtimer_nanosleep(const struct timespec64 *rqtp,
			      const enum hrtimer_mode mode,
			      const clockid_t clockid);

-extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
-				 struct task_struct *tsk);
-
 extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
				    const enum hrtimer_mode mode);
 extern int schedule_hrtimeout_range_clock(ktime_t *expires,
					  u64 delta,
					  const enum hrtimer_mode mode,
-					  int clock);
+					  clockid_t clock_id);
 extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);

 /* Soft interrupt function to run the hrtimer queues: */
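
A sketch of how the new mode bits are meant to be used (my_timer/my_cb are
hypothetical; HRTIMER_MODE_REL_SOFT assumes a tree carrying the soft-expiry
support added above): the expiry context is chosen once, at init time, by
ORing HRTIMER_MODE_SOFT or HRTIMER_MODE_HARD into the mode argument:

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static struct hrtimer my_timer;

    static enum hrtimer_restart my_cb(struct hrtimer *t)
    {
        /* Runs in softirq context because of HRTIMER_MODE_REL_SOFT. */
        return HRTIMER_NORESTART;
    }

    static void my_timer_arm(void)
    {
        hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
        my_timer.function = my_cb;
        hrtimer_start(&my_timer, ms_to_ktime(10), HRTIMER_MODE_REL_SOFT);
    }
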
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 7c3a365f7e12..a922d984d9b6 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -167,10 +167,7 @@ static inline bool idr_is_empty(const struct idr *idr)
 * Each idr_preload() should be matched with an invocation of this
 * function.  See idr_preload() for details.
 */
-static inline void idr_preload_end(void)
-{
-	preempt_enable();
-}
+void idr_preload_end(void);

 /**
 * idr_find - return pointer for given id
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 8062e6cc607c..ee3ff961b84c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -163,6 +163,12 @@ extern struct cred init_cred;
 # define INIT_PERF_EVENTS(tsk)

+#if defined(CONFIG_POSIX_TIMERS) && defined(CONFIG_PREEMPT_RT_BASE)
+# define INIT_TIMER_LIST		.posix_timer_list = NULL,
+#else
+# define INIT_TIMER_LIST
+#endif
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 # define INIT_VTIME(tsk)						\
	.vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount),		\
@@ -234,7 +240,8 @@ extern struct cred init_cred;
	.static_prio	= MAX_PRIO-20,					\
	.normal_prio	= MAX_PRIO-20,					\
	.policy		= SCHED_NORMAL,					\
-	.cpus_allowed	= CPU_MASK_ALL,					\
+	.cpus_ptr	= &tsk.cpus_mask,				\
+	.cpus_mask	= CPU_MASK_ALL,					\
	.nr_cpus_allowed= NR_CPUS,					\
	.active_mm	= &init_mm,					\
@@ -276,6 +283,7 @@ extern struct cred init_cred;
	INIT_CPU_TIMERS(tsk)						\
	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),	\
	.timer_slack_ns = 50000, /* 50 usec default slack */		\
+	INIT_TIMER_LIST							\
		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 69c238210325..0f25fa19b2d8 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
 #include <linux/hrtimer.h>
 #include <linux/kref.h>
 #include <linux/workqueue.h>
+#include <linux/swork.h>

 #include <linux/atomic.h>
 #include <asm/ptrace.h>
 *                interrupt handler after suspending interrupts. For system
 *                wakeup devices users need to implement wakeup detection in
 *                their interrupt handlers.
+ * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
 */
 #define IRQF_SHARED		0x00000080
 #define IRQF_PROBE_SHARED	0x00000100
 #define IRQF_NO_THREAD		0x00010000
 #define IRQF_EARLY_RESUME	0x00020000
 #define IRQF_COND_SUSPEND	0x00040000
+#define IRQF_NO_SOFTIRQ_CALL	0x00080000

 #define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)

@@ -207,7 +210,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
 #ifdef CONFIG_LOCKDEP
 # define local_irq_enable_in_hardirq()	do { } while (0)
 #else
-# define local_irq_enable_in_hardirq()	local_irq_enable()
+# define local_irq_enable_in_hardirq()	local_irq_enable_nort()
 #endif

 extern void disable_irq_nosync(unsigned int irq);
@@ -227,6 +230,7 @@ extern void resume_device_irqs(void);
 * struct irq_affinity_notify - context for notification of IRQ affinity changes
 * @irq:		Interrupt to which notification applies
 * @kref:		Reference count, for internal use
+ * @swork:		Swork item, for internal use
 * @work:		Work item, for internal use
 * @notify:		Function to be called on change.  This will be
 *			called in process context.
@@ -238,7 +242,11 @@ extern void resume_device_irqs(void);
 struct irq_affinity_notify {
+#ifdef CONFIG_PREEMPT_RT_BASE
+	struct swork_event swork;
+#else
 	struct work_struct work;
+#endif
 	void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
 	void (*release)(struct kref *ref);
@@ -429,9 +437,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,

 #ifdef CONFIG_IRQ_FORCED_THREADING
+# ifndef CONFIG_PREEMPT_RT_BASE
 extern bool force_irqthreads;
+# else
+#  define force_irqthreads	(true)
+# endif
 #else
-#define force_irqthreads	(0)
+#define force_irqthreads	(false)
 #endif

 #ifndef __ARCH_SET_SOFTIRQ_PENDING
@@ -488,9 +500,10 @@ struct softirq_action
	void	(*action)(struct softirq_action *);

+#ifndef CONFIG_PREEMPT_RT_FULL
 asmlinkage void do_softirq(void);
 asmlinkage void __do_softirq(void);
+static inline void thread_do_softirq(void) { do_softirq(); }
 #ifdef __ARCH_HAS_DO_SOFTIRQ
 void do_softirq_own_stack(void);
 #else
@@ -499,13 +512,25 @@ static inline void do_softirq_own_stack(void)
+#else
+extern void thread_do_softirq(void);
+#endif

 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
 extern void softirq_init(void);
 extern void __raise_softirq_irqoff(unsigned int nr);
+#ifdef CONFIG_PREEMPT_RT_FULL
+extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
+#else
+static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
+{
+	__raise_softirq_irqoff(nr);
+}
+#endif

 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
+extern void softirq_check_pending_idle(void);

 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
@@ -527,8 +552,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
     to be executed on some cpu at least once after this.
   * If the tasklet is already scheduled, but its execution is still not
     started, it will be executed only once.
-  * If this tasklet is already running on another CPU (or schedule is called
-    from tasklet itself), it is rescheduled for later.
+  * If this tasklet is already running on another CPU, it is rescheduled
+    for later.
+  * Schedule must not be called from the tasklet itself (a lockup occurs)
   * Tasklet is strictly serialized wrt itself, but not
     wrt another tasklets. If client needs some intertask synchronization,
     he makes it with spinlocks.
@@ -553,27 +579,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
	TASKLET_STATE_SCHED,	/* Tasklet is scheduled for execution */
-	TASKLET_STATE_RUN	/* Tasklet is running (SMP only) */
+	TASKLET_STATE_RUN,	/* Tasklet is running (SMP only) */
+	TASKLET_STATE_PENDING	/* Tasklet is pending */
 };

+#define TASKLET_STATEF_SCHED	(1 << TASKLET_STATE_SCHED)
+#define TASKLET_STATEF_RUN	(1 << TASKLET_STATE_RUN)
+#define TASKLET_STATEF_PENDING	(1 << TASKLET_STATE_PENDING)
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
 static inline int tasklet_trylock(struct tasklet_struct *t)
 {
	return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
 }

+static inline int tasklet_tryunlock(struct tasklet_struct *t)
+{
+	return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
+}
+
 static inline void tasklet_unlock(struct tasklet_struct *t)
 {
	smp_mb__before_atomic();
	clear_bit(TASKLET_STATE_RUN, &(t)->state);
 }

-static inline void tasklet_unlock_wait(struct tasklet_struct *t)
-{
-	while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
-}
+extern void tasklet_unlock_wait(struct tasklet_struct *t);
+
 #else
 #define tasklet_trylock(t) 1
+#define tasklet_tryunlock(t)	1
 #define tasklet_unlock_wait(t) do { } while (0)
 #define tasklet_unlock(t) do { } while (0)
 #endif
@@ -607,41 +642,17 @@ static inline void tasklet_disable(struct tasklet_struct *t)

-static inline void tasklet_enable(struct tasklet_struct *t)
-{
-	smp_mb__before_atomic();
-	atomic_dec(&t->count);
-}
-
+extern void tasklet_enable(struct tasklet_struct *t);
 extern void tasklet_kill(struct tasklet_struct *t);
 extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
 extern void tasklet_init(struct tasklet_struct *t,
			 void (*func)(unsigned long), unsigned long data);

-struct tasklet_hrtimer {
-	struct hrtimer		timer;
-	struct tasklet_struct	tasklet;
-	enum hrtimer_restart	(*function)(struct hrtimer *);
-};
-
-extern void
-tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
-		     enum hrtimer_restart (*function)(struct hrtimer *),
-		     clockid_t which_clock, enum hrtimer_mode mode);
-
-static inline
-void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
-			   const enum hrtimer_mode mode)
-{
-	hrtimer_start(&ttimer->timer, time, mode);
-}
-
-static inline
-void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
-{
-	hrtimer_cancel(&ttimer->timer);
-	tasklet_kill(&ttimer->tasklet);
-}
+#ifdef CONFIG_PREEMPT_RT_FULL
+extern void softirq_early_init(void);
+#else
+static inline void softirq_early_init(void) { }
+#endif

 /*
 * Autoprobing for irqs:
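
A usage sketch reflecting the tasklet rules restated above (my_tasklet,
my_func and my_irq_handler are hypothetical): schedule the tasklet from
interrupt context, and never call tasklet_schedule() from inside the
handler itself:

    #include <linux/interrupt.h>

    static void my_func(unsigned long data)
    {
        /* Strictly serialized against itself; may run on any CPU.
         * Must not reschedule itself from here (a lockup occurs). */
    }

    static DECLARE_TASKLET(my_tasklet, my_func, 0);

    static irqreturn_t my_irq_handler(int irq, void *dev_id)
    {
        tasklet_schedule(&my_tasklet);  /* runs once even if re-raised */
        return IRQ_HANDLED;
    }
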
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 0d53626405bf..ddd23c6e2e55 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -74,6 +74,7 @@ enum irqchip_irq_state;
 * IRQ_IS_POLLED		- Always polled by another interrupt. Exclude
 *				  it from the spurious interrupt detection
 *				  mechanism and from core side polling.
+ * IRQ_NO_SOFTIRQ_CALL		- No softirq processing in the irq thread context (RT)
 * IRQ_DISABLE_UNLAZY		- Disable lazy irq disable
 */
@@ -101,13 +102,14 @@ enum {
	IRQ_PER_CPU_DEVID	= (1 << 17),
	IRQ_IS_POLLED		= (1 << 18),
	IRQ_DISABLE_UNLAZY	= (1 << 19),
+	IRQ_NO_SOFTIRQ_CALL	= (1 << 20),
 };

 #define IRQF_MODIFY_MASK	\
	(IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
	 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
	 IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
-	 IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
+	 IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)

 #define IRQ_NO_BALANCING_MASK	(IRQ_PER_CPU | IRQ_NO_BALANCING)
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 9270d73ea682..1e66fac6f1d2 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
 #define IRQ_WORK_BUSY		2UL
 #define IRQ_WORK_FLAGS		3UL
 #define IRQ_WORK_LAZY		4UL /* Doesn't want IPI, wait for tick */
+#define IRQ_WORK_HARD_IRQ	8UL /* Run hard IRQ context, even on RT */

	unsigned long flags;
@@ -52,4 +53,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
 static inline void irq_work_run(void) { }

+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
+void irq_work_tick_soft(void);
+#else
+static inline void irq_work_tick_soft(void) { }
+#endif
+
 #endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index bacb499c512c..688f2565294c 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -568,6 +568,7 @@ struct rdists {
 		void __iomem	*rd_base;
 		struct page	*pend_page;
 		phys_addr_t	phys_base;
+		bool		lpi_enabled;

 	struct page		*prop_page;
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index b6084898d330..d334476cdca6 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -70,6 +70,7 @@ struct irq_desc {
 	unsigned int		irqs_unhandled;
 	atomic_t		threads_handled;
 	int			threads_handled_last;
+	u64			random_ip;
 	raw_spinlock_t		lock;
 	struct cpumask		*percpu_enabled;
 	const struct cpumask	*percpu_affinity;
12762 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
12763 index 46cb57d5eb13..2e023bfe45af 100644
12764 --- a/include/linux/irqflags.h
12765 +++ b/include/linux/irqflags.h
12766 @@ -34,16 +34,6 @@ do { \
12767 current->hardirq_context--; \
12768 crossrelease_hist_end(XHLOCK_HARD); \
12770 -# define lockdep_softirq_enter() \
12772 - current->softirq_context++; \
12773 - crossrelease_hist_start(XHLOCK_SOFT); \
12775 -# define lockdep_softirq_exit() \
12777 - current->softirq_context--; \
12778 - crossrelease_hist_end(XHLOCK_SOFT); \
12780 # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
12782 # define trace_hardirqs_on() do { } while (0)
12783 @@ -56,9 +46,23 @@ do { \
12784 # define trace_softirqs_enabled(p) 0
12785 # define trace_hardirq_enter() do { } while (0)
12786 # define trace_hardirq_exit() do { } while (0)
12787 +# define INIT_TRACE_IRQFLAGS
12790 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
12791 +# define lockdep_softirq_enter() \
12793 + current->softirq_context++; \
12794 + crossrelease_hist_start(XHLOCK_SOFT); \
12796 +# define lockdep_softirq_exit() \
12798 + current->softirq_context--; \
12799 + crossrelease_hist_end(XHLOCK_SOFT); \
12802 # define lockdep_softirq_enter() do { } while (0)
12803 # define lockdep_softirq_exit() do { } while (0)
12804 -# define INIT_TRACE_IRQFLAGS
12807 #if defined(CONFIG_IRQSOFF_TRACER) || \
12808 @@ -165,4 +169,23 @@ do { \
12810 #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
12813 + * local_irq* variants depending on RT/!RT
12815 +#ifdef CONFIG_PREEMPT_RT_FULL
12816 +# define local_irq_disable_nort() do { } while (0)
12817 +# define local_irq_enable_nort() do { } while (0)
12818 +# define local_irq_save_nort(flags) local_save_flags(flags)
12819 +# define local_irq_restore_nort(flags) (void)(flags)
12820 +# define local_irq_disable_rt() local_irq_disable()
12821 +# define local_irq_enable_rt() local_irq_enable()
12823 +# define local_irq_disable_nort() local_irq_disable()
12824 +# define local_irq_enable_nort() local_irq_enable()
12825 +# define local_irq_save_nort(flags) local_irq_save(flags)
12826 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
12827 +# define local_irq_disable_rt() do { } while (0)
12828 +# define local_irq_enable_rt() do { } while (0)
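
These _nort/_rt variants let code shared between both configurations say which sections truly need hard interrupt disabling. A hedged sketch of the intended use; the function and the protected state are illustrative:

    static void my_update(void)
    {
            unsigned long flags;

            local_irq_save_nort(flags);     /* IRQs off on !RT; on RT only
                                             * the flags are captured and
                                             * the section stays preemptible */
            /* ... state that mainline protects by disabling IRQs ... */
            local_irq_restore_nort(flags);
    }
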
12832 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
12833 index 29290bfb94a8..32379bfab9f0 100644
12834 --- a/include/linux/jbd2.h
12835 +++ b/include/linux/jbd2.h
12836 @@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
12838 static inline void jbd_lock_bh_state(struct buffer_head *bh)
12840 +#ifndef CONFIG_PREEMPT_RT_BASE
12841 bit_spin_lock(BH_State, &bh->b_state);
12843 + spin_lock(&bh->b_state_lock);
12847 static inline int jbd_trylock_bh_state(struct buffer_head *bh)
12849 +#ifndef CONFIG_PREEMPT_RT_BASE
12850 return bit_spin_trylock(BH_State, &bh->b_state);
12852 + return spin_trylock(&bh->b_state_lock);
12856 static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
12858 +#ifndef CONFIG_PREEMPT_RT_BASE
12859 return bit_spin_is_locked(BH_State, &bh->b_state);
12861 + return spin_is_locked(&bh->b_state_lock);
12865 static inline void jbd_unlock_bh_state(struct buffer_head *bh)
12867 +#ifndef CONFIG_PREEMPT_RT_BASE
12868 bit_spin_unlock(BH_State, &bh->b_state);
12870 + spin_unlock(&bh->b_state_lock);
12874 static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
12876 +#ifndef CONFIG_PREEMPT_RT_BASE
12877 bit_spin_lock(BH_JournalHead, &bh->b_state);
12879 + spin_lock(&bh->b_journal_head_lock);
12883 static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
12885 +#ifndef CONFIG_PREEMPT_RT_BASE
12886 bit_spin_unlock(BH_JournalHead, &bh->b_state);
12888 + spin_unlock(&bh->b_journal_head_lock);
12892 #define J_ASSERT(assert) BUG_ON(!(assert))
12893 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
12894 index 68bd88223417..e033b25b0b72 100644
12895 --- a/include/linux/kdb.h
12896 +++ b/include/linux/kdb.h
12897 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
12898 extern __printf(1, 2) int kdb_printf(const char *, ...);
12899 typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
12901 +#define in_kdb_printk() (kdb_trap_printk)
12902 extern void kdb_init(int level);
12904 /* Access to kdb specific polling devices */
12905 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
12906 extern int kdb_unregister(char *);
12907 #else /* ! CONFIG_KGDB_KDB */
12908 static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
12909 +#define in_kdb_printk() (0)
12910 static inline void kdb_init(int level) {}
12911 static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
12912 char *help, short minlen) { return 0; }
12913 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
12914 index 4b484ab9e163..74feebf9d82c 100644
12915 --- a/include/linux/kernel.h
12916 +++ b/include/linux/kernel.h
12917 @@ -225,6 +225,9 @@ extern int _cond_resched(void);
12919 # define might_sleep() \
12920 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12922 +# define might_sleep_no_state_check() \
12923 + do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12924 # define sched_annotate_sleep() (current->task_state_change = 0)
12926 static inline void ___might_sleep(const char *file, int line,
12927 @@ -232,6 +235,7 @@ extern int _cond_resched(void);
12928 static inline void __might_sleep(const char *file, int line,
12929 int preempt_offset) { }
12930 # define might_sleep() do { might_resched(); } while (0)
12931 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
12932 # define sched_annotate_sleep() do { } while (0)
12935 @@ -531,6 +535,7 @@ extern enum system_states {
12942 #define TAINT_PROPRIETARY_MODULE 0
12943 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
12944 index 3fc2cc57ba1b..0b5de7d9ffcf 100644
12945 --- a/include/linux/list_bl.h
12946 +++ b/include/linux/list_bl.h
12948 #define _LINUX_LIST_BL_H
12950 #include <linux/list.h>
12951 +#include <linux/spinlock.h>
12952 #include <linux/bit_spinlock.h>
12955 @@ -33,13 +34,24 @@
12957 struct hlist_bl_head {
12958 struct hlist_bl_node *first;
12959 +#ifdef CONFIG_PREEMPT_RT_BASE
12960 + raw_spinlock_t lock;
12964 struct hlist_bl_node {
12965 struct hlist_bl_node *next, **pprev;
12967 -#define INIT_HLIST_BL_HEAD(ptr) \
12968 - ((ptr)->first = NULL)
12970 +#ifdef CONFIG_PREEMPT_RT_BASE
12971 +#define INIT_HLIST_BL_HEAD(h) \
12973 + (h)->first = NULL; \
12974 + raw_spin_lock_init(&(h)->lock); \
12977 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
12980 static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
12982 @@ -119,12 +131,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
12984 static inline void hlist_bl_lock(struct hlist_bl_head *b)
12986 +#ifndef CONFIG_PREEMPT_RT_BASE
12987 bit_spin_lock(0, (unsigned long *)b);
12989 + raw_spin_lock(&b->lock);
12990 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12991 + __set_bit(0, (unsigned long *)b);
12996 static inline void hlist_bl_unlock(struct hlist_bl_head *b)
12998 +#ifndef CONFIG_PREEMPT_RT_BASE
12999 __bit_spin_unlock(0, (unsigned long *)b);
13001 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
13002 + __clear_bit(0, (unsigned long *)b);
13004 + raw_spin_unlock(&b->lock);
13008 static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
13009 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
13010 new file mode 100644
13011 index 000000000000..921eab83cd34
13013 +++ b/include/linux/locallock.h
13015 +#ifndef _LINUX_LOCALLOCK_H
13016 +#define _LINUX_LOCALLOCK_H
13018 +#include <linux/percpu.h>
13019 +#include <linux/spinlock.h>
13021 +#ifdef CONFIG_PREEMPT_RT_BASE
13023 +#ifdef CONFIG_DEBUG_SPINLOCK
13024 +# define LL_WARN(cond) WARN_ON(cond)
13026 +# define LL_WARN(cond) do { } while (0)
13030 + * per cpu lock based substitute for local_irq_*()
13032 +struct local_irq_lock {
13034 + struct task_struct *owner;
13036 + unsigned long flags;
13039 +#define DEFINE_LOCAL_IRQ_LOCK(lvar) \
13040 + DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \
13041 + .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
13043 +#define DECLARE_LOCAL_IRQ_LOCK(lvar) \
13044 + DECLARE_PER_CPU(struct local_irq_lock, lvar)
13046 +#define local_irq_lock_init(lvar) \
13049 + for_each_possible_cpu(__cpu) \
13050 + spin_lock_init(&per_cpu(lvar, __cpu).lock); \
13053 +static inline void __local_lock(struct local_irq_lock *lv)
13055 + if (lv->owner != current) {
13056 + spin_lock(&lv->lock);
13057 + LL_WARN(lv->owner);
13058 + LL_WARN(lv->nestcnt);
13059 + lv->owner = current;
13064 +#define local_lock(lvar) \
13065 + do { __local_lock(&get_local_var(lvar)); } while (0)
13067 +#define local_lock_on(lvar, cpu) \
13068 + do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
13070 +static inline int __local_trylock(struct local_irq_lock *lv)
13072 + if (lv->owner != current && spin_trylock(&lv->lock)) {
13073 + LL_WARN(lv->owner);
13074 + LL_WARN(lv->nestcnt);
13075 + lv->owner = current;
13078 + } else if (lv->owner == current) {
13085 +#define local_trylock(lvar) \
13088 + __locked = __local_trylock(&get_local_var(lvar)); \
13090 + put_local_var(lvar); \
13094 +static inline void __local_unlock(struct local_irq_lock *lv)
13096 + LL_WARN(lv->nestcnt == 0);
13097 + LL_WARN(lv->owner != current);
13098 + if (--lv->nestcnt)
13101 + lv->owner = NULL;
13102 + spin_unlock(&lv->lock);
13105 +#define local_unlock(lvar) \
13107 + __local_unlock(this_cpu_ptr(&lvar)); \
13108 + put_local_var(lvar); \
13111 +#define local_unlock_on(lvar, cpu) \
13112 + do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
13114 +static inline void __local_lock_irq(struct local_irq_lock *lv)
13116 + spin_lock_irqsave(&lv->lock, lv->flags);
13117 + LL_WARN(lv->owner);
13118 + LL_WARN(lv->nestcnt);
13119 + lv->owner = current;
13123 +#define local_lock_irq(lvar) \
13124 + do { __local_lock_irq(&get_local_var(lvar)); } while (0)
13126 +#define local_lock_irq_on(lvar, cpu) \
13127 + do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
13129 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
13131 + LL_WARN(!lv->nestcnt);
13132 + LL_WARN(lv->owner != current);
13133 + lv->owner = NULL;
13135 + spin_unlock_irq(&lv->lock);
13138 +#define local_unlock_irq(lvar) \
13140 + __local_unlock_irq(this_cpu_ptr(&lvar)); \
13141 + put_local_var(lvar); \
13144 +#define local_unlock_irq_on(lvar, cpu) \
13146 + __local_unlock_irq(&per_cpu(lvar, cpu)); \
13149 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
13151 + if (lv->owner != current) {
13152 + __local_lock_irq(lv);
13160 +#define local_lock_irqsave(lvar, _flags) \
13162 + if (__local_lock_irqsave(&get_local_var(lvar))) \
13163 + put_local_var(lvar); \
13164 + _flags = __this_cpu_read(lvar.flags); \
13167 +#define local_lock_irqsave_on(lvar, _flags, cpu) \
13169 + __local_lock_irqsave(&per_cpu(lvar, cpu)); \
13170 + _flags = per_cpu(lvar, cpu).flags; \
13173 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
13174 + unsigned long flags)
13176 + LL_WARN(!lv->nestcnt);
13177 + LL_WARN(lv->owner != current);
13178 + if (--lv->nestcnt)
13181 + lv->owner = NULL;
13182 + spin_unlock_irqrestore(&lv->lock, lv->flags);
13186 +#define local_unlock_irqrestore(lvar, flags) \
13188 + if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
13189 + put_local_var(lvar); \
13192 +#define local_unlock_irqrestore_on(lvar, flags, cpu) \
13194 + __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \
13197 +#define local_spin_trylock_irq(lvar, lock) \
13200 + local_lock_irq(lvar); \
13201 + __locked = spin_trylock(lock); \
13203 + local_unlock_irq(lvar); \
13207 +#define local_spin_lock_irq(lvar, lock) \
13209 + local_lock_irq(lvar); \
13210 + spin_lock(lock); \
13213 +#define local_spin_unlock_irq(lvar, lock) \
13215 + spin_unlock(lock); \
13216 + local_unlock_irq(lvar); \
13219 +#define local_spin_lock_irqsave(lvar, lock, flags) \
13221 + local_lock_irqsave(lvar, flags); \
13222 + spin_lock(lock); \
13225 +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
13227 + spin_unlock(lock); \
13228 + local_unlock_irqrestore(lvar, flags); \
13231 +#define get_locked_var(lvar, var) \
13233 + local_lock(lvar); \
13234 + this_cpu_ptr(&var); \
13237 +#define put_locked_var(lvar, var) local_unlock(lvar)
13239 +#define get_locked_ptr(lvar, var) \
13241 + local_lock(lvar); \
13242 + this_cpu_ptr(var); \
13245 +#define put_locked_ptr(lvar, var) local_unlock(lvar)
13247 +#define local_lock_cpu(lvar) \
13249 + local_lock(lvar); \
13250 + smp_processor_id(); \
13253 +#define local_unlock_cpu(lvar) local_unlock(lvar)
13255 +#else /* PREEMPT_RT_BASE */
13257 +#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar
13258 +#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar
13260 +static inline void local_irq_lock_init(int lvar) { }
13262 +#define local_trylock(lvar) \
13264 + preempt_disable(); \
13268 +#define local_lock(lvar) preempt_disable()
13269 +#define local_unlock(lvar) preempt_enable()
13270 +#define local_lock_irq(lvar) local_irq_disable()
13271 +#define local_lock_irq_on(lvar, cpu) local_irq_disable()
13272 +#define local_unlock_irq(lvar) local_irq_enable()
13273 +#define local_unlock_irq_on(lvar, cpu) local_irq_enable()
13274 +#define local_lock_irqsave(lvar, flags) local_irq_save(flags)
13275 +#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags)
13277 +#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock)
13278 +#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock)
13279 +#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock)
13280 +#define local_spin_lock_irqsave(lvar, lock, flags) \
13281 + spin_lock_irqsave(lock, flags)
13282 +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
13283 + spin_unlock_irqrestore(lock, flags)
13285 +#define get_locked_var(lvar, var) get_cpu_var(var)
13286 +#define put_locked_var(lvar, var) put_cpu_var(var)
13287 +#define get_locked_ptr(lvar, var) get_cpu_ptr(var)
13288 +#define put_locked_ptr(lvar, var) put_cpu_ptr(var)
13290 +#define local_lock_cpu(lvar) get_cpu()
13291 +#define local_unlock_cpu(lvar) put_cpu()
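
Taken together, locallock.h gives per-CPU data a named, sleeping lock on RT while compiling down to plain preempt/IRQ disabling on mainline. A minimal usage sketch under the definitions above; the statistics structure is made up:

    struct my_stats { unsigned long events; };
    static DEFINE_PER_CPU(struct my_stats, my_stats_pcpu);
    static DEFINE_LOCAL_IRQ_LOCK(my_stats_lock);

    static void my_stats_inc(void)
    {
            struct my_stats *s;

            /* RT: migrate_disable() + per-CPU spinlock; !RT: get_cpu_var() */
            s = &get_locked_var(my_stats_lock, my_stats_pcpu);
            s->events++;
            put_locked_var(my_stats_lock, my_stats_pcpu);
    }
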
13296 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
13297 index e41ef532c4ce..63317710311e 100644
13298 --- a/include/linux/mm_types.h
13299 +++ b/include/linux/mm_types.h
13301 #include <linux/completion.h>
13302 #include <linux/cpumask.h>
13303 #include <linux/uprobes.h>
13304 +#include <linux/rcupdate.h>
13305 #include <linux/page-flags-layout.h>
13306 #include <linux/workqueue.h>
13308 @@ -496,6 +497,9 @@ struct mm_struct {
13309 bool tlb_flush_batched;
13311 struct uprobes_state uprobes_state;
13312 +#ifdef CONFIG_PREEMPT_RT_BASE
13313 + struct rcu_head delayed_drop;
13315 #ifdef CONFIG_HUGETLB_PAGE
13316 atomic_long_t hugetlb_usage;
13318 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
13319 index 153274f78402..dbb52857b25b 100644
13320 --- a/include/linux/mutex.h
13321 +++ b/include/linux/mutex.h
13324 struct ww_acquire_ctx;
13326 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13327 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13328 + , .dep_map = { .name = #lockname }
13330 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13333 +#ifdef CONFIG_PREEMPT_RT_FULL
13334 +# include <linux/mutex_rt.h>
13338 * Simple, straightforward mutexes with strict semantics:
13340 @@ -114,13 +125,6 @@ do { \
13341 __mutex_init((mutex), #mutex, &__key); \
13344 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
13345 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13346 - , .dep_map = { .name = #lockname }
13348 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13351 #define __MUTEX_INITIALIZER(lockname) \
13352 { .owner = ATOMIC_LONG_INIT(0) \
13353 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
13354 @@ -228,4 +232,6 @@ mutex_trylock_recursive(struct mutex *lock)
13355 return mutex_trylock(lock);
13358 +#endif /* !PREEMPT_RT_FULL */
13360 #endif /* __LINUX_MUTEX_H */
13361 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
13362 new file mode 100644
13363 index 000000000000..3fcb5edb1d2b
13365 +++ b/include/linux/mutex_rt.h
13367 +#ifndef __LINUX_MUTEX_RT_H
13368 +#define __LINUX_MUTEX_RT_H
13370 +#ifndef __LINUX_MUTEX_H
13371 +#error "Please include mutex.h"
13374 +#include <linux/rtmutex.h>
13376 +/* FIXME: Just for __lockfunc */
13377 +#include <linux/spinlock.h>
13380 + struct rt_mutex lock;
13381 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13382 + struct lockdep_map dep_map;
13386 +#define __MUTEX_INITIALIZER(mutexname) \
13388 + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \
13389 + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \
13392 +#define DEFINE_MUTEX(mutexname) \
13393 + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
13395 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
13396 +extern void __lockfunc _mutex_lock(struct mutex *lock);
13397 +extern void __lockfunc _mutex_lock_io(struct mutex *lock);
13398 +extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass);
13399 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
13400 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
13401 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
13402 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
13403 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
13404 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
13405 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
13406 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
13408 +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock)
13409 +#define mutex_lock(l) _mutex_lock(l)
13410 +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l)
13411 +#define mutex_lock_killable(l) _mutex_lock_killable(l)
13412 +#define mutex_trylock(l) _mutex_trylock(l)
13413 +#define mutex_unlock(l) _mutex_unlock(l)
13414 +#define mutex_lock_io(l) _mutex_lock_io(l)
13416 +#define __mutex_owner(l) ((l)->lock.owner)
13418 +#ifdef CONFIG_DEBUG_MUTEXES
13419 +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock)
13421 +static inline void mutex_destroy(struct mutex *lock) {}
13424 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13425 +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s)
13426 +# define mutex_lock_interruptible_nested(l, s) \
13427 + _mutex_lock_interruptible_nested(l, s)
13428 +# define mutex_lock_killable_nested(l, s) \
13429 + _mutex_lock_killable_nested(l, s)
13430 +# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s)
13432 +# define mutex_lock_nest_lock(lock, nest_lock) \
13434 + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
13435 + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \
13439 +# define mutex_lock_nested(l, s) _mutex_lock(l)
13440 +# define mutex_lock_interruptible_nested(l, s) \
13441 + _mutex_lock_interruptible(l)
13442 +# define mutex_lock_killable_nested(l, s) \
13443 + _mutex_lock_killable(l)
13444 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
13445 +# define mutex_lock_io_nested(l, s) _mutex_lock_io(l)
13448 +# define mutex_init(mutex) \
13450 + static struct lock_class_key __key; \
13452 + rt_mutex_init(&(mutex)->lock); \
13453 + __mutex_do_init((mutex), #mutex, &__key); \
13456 +# define __mutex_init(mutex, name, key) \
13458 + rt_mutex_init(&(mutex)->lock); \
13459 + __mutex_do_init((mutex), name, key); \
13463 + * These values are chosen such that FAIL and SUCCESS match the
13464 + * values of the regular mutex_trylock().
13466 +enum mutex_trylock_recursive_enum {
13467 + MUTEX_TRYLOCK_FAILED = 0,
13468 + MUTEX_TRYLOCK_SUCCESS = 1,
13469 + MUTEX_TRYLOCK_RECURSIVE,
13472 + * mutex_trylock_recursive - trylock variant that allows recursive locking
13473 + * @lock: mutex to be locked
13475 + * This function should not be used, _ever_. It is purely for hysterical GEM
13476 + * raisins, and once those are gone this will be removed.
13479 + * MUTEX_TRYLOCK_FAILED - trylock failed,
13480 + * MUTEX_TRYLOCK_SUCCESS - lock acquired,
13481 + * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock.
13483 +int __rt_mutex_owner_current(struct rt_mutex *lock);
13485 +static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum
13486 +mutex_trylock_recursive(struct mutex *lock)
13488 + if (unlikely(__rt_mutex_owner_current(&lock->lock)))
13489 + return MUTEX_TRYLOCK_RECURSIVE;
13491 + return mutex_trylock(lock);
13494 +extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
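
Since every mutex_* entry point is remapped above, existing callers need no source changes; on RT a mutex is a priority-inheriting rtmutex underneath. A hedged sketch; the lock name is illustrative:

    static DEFINE_MUTEX(my_lock);       /* rtmutex-backed on RT */

    static void my_critical(void)
    {
            mutex_lock(&my_lock);       /* may sleep; boosts the owner on RT */
            /* ... critical section ... */
            mutex_unlock(&my_lock);
    }
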
13497 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
13498 index a516dbe5869f..3ceccf72757e 100644
13499 --- a/include/linux/netdevice.h
13500 +++ b/include/linux/netdevice.h
13501 @@ -409,7 +409,19 @@ typedef enum rx_handler_result rx_handler_result_t;
13502 typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
13504 void __napi_schedule(struct napi_struct *n);
13507 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
13508 + * run as threads, and they can also be preempted (without PREEMPT_RT,
13509 + * interrupt threads cannot be preempted). This means that a call to
13510 + * __napi_schedule_irqoff() from an interrupt handler can itself be
13511 + * preempted and can corrupt the napi->poll_list.
13513 +#ifdef CONFIG_PREEMPT_RT_FULL
13514 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
13516 void __napi_schedule_irqoff(struct napi_struct *n);
13519 static inline bool napi_disable_pending(struct napi_struct *n)
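
Drivers keep calling the same helper; the remap silently falls back to the list-safe variant on RT. A hedged sketch of a typical handler, with the private structure invented for illustration:

    struct my_nic_priv { struct napi_struct napi; };

    static irqreturn_t my_nic_irq(int irq, void *data)
    {
            struct my_nic_priv *priv = data;

            /* expands to plain __napi_schedule() on RT, so poll_list is
             * protected even if this (threaded) handler is preempted */
            __napi_schedule_irqoff(&priv->napi);
            return IRQ_HANDLED;
    }
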
13521 @@ -571,7 +583,11 @@ struct netdev_queue {
13522 * write-mostly part
13524 spinlock_t _xmit_lock ____cacheline_aligned_in_smp;
13525 +#ifdef CONFIG_PREEMPT_RT_FULL
13526 + struct task_struct *xmit_lock_owner;
13528 int xmit_lock_owner;
13531 * Time (in jiffies) of last Tx
13533 @@ -2440,14 +2456,53 @@ void netdev_freemem(struct net_device *dev);
13534 void synchronize_net(void);
13535 int init_dummy_netdev(struct net_device *dev);
13537 -DECLARE_PER_CPU(int, xmit_recursion);
13538 #define XMIT_RECURSION_LIMIT 10
13539 +#ifdef CONFIG_PREEMPT_RT_FULL
13540 +static inline int dev_recursion_level(void)
13542 + return current->xmit_recursion;
13545 +static inline int xmit_rec_read(void)
13547 + return current->xmit_recursion;
13550 +static inline void xmit_rec_inc(void)
13552 + current->xmit_recursion++;
13555 +static inline void xmit_rec_dec(void)
13557 + current->xmit_recursion--;
13562 +DECLARE_PER_CPU(int, xmit_recursion);
13564 static inline int dev_recursion_level(void)
13566 return this_cpu_read(xmit_recursion);
13569 +static inline int xmit_rec_read(void)
13571 + return __this_cpu_read(xmit_recursion);
13574 +static inline void xmit_rec_inc(void)
13576 + __this_cpu_inc(xmit_recursion);
13579 +static inline void xmit_rec_dec(void)
13581 + __this_cpu_dec(xmit_recursion);
13585 struct net_device *dev_get_by_index(struct net *net, int ifindex);
13586 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
13587 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
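
The helpers above keep the transmit recursion count in the task on RT, where the xmit path is preemptible and can migrate, and per CPU otherwise. A hedged sketch of the guard pattern they serve; the wrapper function is invented:

    static int my_forward(struct sk_buff *skb)
    {
            int ret;

            if (xmit_rec_read() > XMIT_RECURSION_LIMIT)
                    return -ENETDOWN;       /* refuse to nest any deeper */

            xmit_rec_inc();
            ret = dev_queue_xmit(skb);      /* may re-enter the xmit path */
            xmit_rec_dec();
            return ret;
    }
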
13588 @@ -2799,6 +2854,7 @@ struct softnet_data {
13589 unsigned int dropped;
13590 struct sk_buff_head input_pkt_queue;
13591 struct napi_struct backlog;
13592 + struct sk_buff_head tofree_queue;
13596 @@ -3522,10 +3578,48 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
13597 return (1 << debug_value) - 1;
13600 +#ifdef CONFIG_PREEMPT_RT_FULL
13601 +static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13603 + txq->xmit_lock_owner = current;
13606 +static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13608 + txq->xmit_lock_owner = NULL;
13611 +static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13613 + if (txq->xmit_lock_owner != NULL)
13620 +static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13622 + txq->xmit_lock_owner = cpu;
13625 +static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13627 + txq->xmit_lock_owner = -1;
13630 +static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13632 + if (txq->xmit_lock_owner != -1)
13638 static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
13640 spin_lock(&txq->_xmit_lock);
13641 - txq->xmit_lock_owner = cpu;
13642 + netdev_queue_set_owner(txq, cpu);
13645 static inline bool __netif_tx_acquire(struct netdev_queue *txq)
13646 @@ -3542,32 +3636,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq)
13647 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
13649 spin_lock_bh(&txq->_xmit_lock);
13650 - txq->xmit_lock_owner = smp_processor_id();
13651 + netdev_queue_set_owner(txq, smp_processor_id());
13654 static inline bool __netif_tx_trylock(struct netdev_queue *txq)
13656 bool ok = spin_trylock(&txq->_xmit_lock);
13658 - txq->xmit_lock_owner = smp_processor_id();
13659 + netdev_queue_set_owner(txq, smp_processor_id());
13663 static inline void __netif_tx_unlock(struct netdev_queue *txq)
13665 - txq->xmit_lock_owner = -1;
13666 + netdev_queue_clear_owner(txq);
13667 spin_unlock(&txq->_xmit_lock);
13670 static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
13672 - txq->xmit_lock_owner = -1;
13673 + netdev_queue_clear_owner(txq);
13674 spin_unlock_bh(&txq->_xmit_lock);
13677 static inline void txq_trans_update(struct netdev_queue *txq)
13679 - if (txq->xmit_lock_owner != -1)
13680 + if (netdev_queue_has_owner(txq))
13681 txq->trans_start = jiffies;
13684 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
13685 index 54f346a45cd0..79723e76af66 100644
13686 --- a/include/linux/netfilter/x_tables.h
13687 +++ b/include/linux/netfilter/x_tables.h
13689 #include <linux/netdevice.h>
13690 #include <linux/static_key.h>
13691 #include <linux/netfilter.h>
13692 +#include <linux/locallock.h>
13693 #include <uapi/linux/netfilter/x_tables.h>
13695 /* Test a struct->invflags and a boolean for inequality */
13696 @@ -341,6 +342,8 @@ void xt_free_table_info(struct xt_table_info *info);
13698 DECLARE_PER_CPU(seqcount_t, xt_recseq);
13700 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
13702 /* xt_tee_enabled - true if x_tables needs to handle reentrancy
13704 * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
13705 @@ -361,6 +364,9 @@ static inline unsigned int xt_write_recseq_begin(void)
13707 unsigned int addend;
13709 + /* RT protection */
13710 + local_lock(xt_write_lock);
13713 * Low order bit of sequence is set if we already
13714 * called xt_write_recseq_begin().
13715 @@ -391,6 +397,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
13716 /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
13718 __this_cpu_add(xt_recseq.sequence, addend);
13719 + local_unlock(xt_write_lock);
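
With the local lock in place, the write-side sequence count stays consistent on RT, where the counter-update path is preemptible. The calling convention is unchanged; a hedged sketch:

    static void my_counter_update(void)
    {
            unsigned int addend;

            addend = xt_write_recseq_begin();   /* takes xt_write_lock on RT */
            /* ... update per-CPU packet/byte counters ... */
            xt_write_recseq_end(addend);        /* drops xt_write_lock on RT */
    }
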
13723 diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
13724 index f0015f801a78..c38288622819 100644
13725 --- a/include/linux/nfs_fs.h
13726 +++ b/include/linux/nfs_fs.h
13727 @@ -162,7 +162,11 @@ struct nfs_inode {
13729 /* Readers: in-flight sillydelete RPC calls */
13730 /* Writers: rmdir */
13731 +#ifdef CONFIG_PREEMPT_RT_BASE
13732 + struct semaphore rmdir_sem;
13734 struct rw_semaphore rmdir_sem;
13736 struct mutex commit_mutex;
13738 #if IS_ENABLED(CONFIG_NFS_V4)
13739 diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
13740 index 6959968dc36a..802e849b57ac 100644
13741 --- a/include/linux/nfs_xdr.h
13742 +++ b/include/linux/nfs_xdr.h
13743 @@ -1530,7 +1530,7 @@ struct nfs_unlinkdata {
13744 struct nfs_removeargs args;
13745 struct nfs_removeres res;
13746 struct dentry *dentry;
13747 - wait_queue_head_t wq;
13748 + struct swait_queue_head wq;
13749 struct rpc_cred *cred;
13750 struct nfs_fattr dir_attr;
13752 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
13753 index 6d731110e0db..e758627da14d 100644
13754 --- a/include/linux/notifier.h
13755 +++ b/include/linux/notifier.h
13758 * Alan Cox <Alan.Cox@linux.org>
13762 #ifndef _LINUX_NOTIFIER_H
13763 #define _LINUX_NOTIFIER_H
13764 #include <linux/errno.h>
13766 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
13767 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
13768 * SRCU notifier chains should be used when the chain will be called very
13769 - * often but notifier_blocks will seldom be removed. Also, SRCU notifier
13770 - * chains are slightly more difficult to use because they require special
13771 - * runtime initialization.
13772 + * often but notifier_blocks will seldom be removed.
13775 struct notifier_block;
13776 @@ -91,7 +89,7 @@ struct srcu_notifier_head {
13777 (name)->head = NULL; \
13780 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
13781 +/* srcu_notifier_heads must be cleaned up dynamically */
13782 extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13783 #define srcu_cleanup_notifier_head(name) \
13784 cleanup_srcu_struct(&(name)->srcu);
13785 @@ -104,7 +102,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13787 #define RAW_NOTIFIER_INIT(name) { \
13789 -/* srcu_notifier_heads cannot be initialized statically */
13791 +#define SRCU_NOTIFIER_INIT(name, pcpu) \
13793 + .mutex = __MUTEX_INITIALIZER(name.mutex), \
13795 + .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \
13798 #define ATOMIC_NOTIFIER_HEAD(name) \
13799 struct atomic_notifier_head name = \
13800 @@ -116,6 +120,26 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13801 struct raw_notifier_head name = \
13802 RAW_NOTIFIER_INIT(name)
13804 +#ifdef CONFIG_TREE_SRCU
13805 +#define _SRCU_NOTIFIER_HEAD(name, mod) \
13806 + static DEFINE_PER_CPU(struct srcu_data, \
13807 + name##_head_srcu_data); \
13808 + mod struct srcu_notifier_head name = \
13809 + SRCU_NOTIFIER_INIT(name, name##_head_srcu_data)
13812 +#define _SRCU_NOTIFIER_HEAD(name, mod) \
13813 + mod struct srcu_notifier_head name = \
13814 + SRCU_NOTIFIER_INIT(name, name)
13818 +#define SRCU_NOTIFIER_HEAD(name) \
13819 + _SRCU_NOTIFIER_HEAD(name, )
13821 +#define SRCU_NOTIFIER_HEAD_STATIC(name) \
13822 + _SRCU_NOTIFIER_HEAD(name, static)
13826 extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
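
SRCU_NOTIFIER_HEAD_STATIC() removes the last reason SRCU chains needed runtime initialization. A hedged sketch, with the chain, block, and callback names invented:

    SRCU_NOTIFIER_HEAD_STATIC(my_chain);  /* no srcu_init_notifier_head() */

    static int my_cb(struct notifier_block *nb, unsigned long ev, void *data)
    {
            return NOTIFY_OK;
    }

    static struct notifier_block my_nb = { .notifier_call = my_cb };

    /* at init time: srcu_notifier_chain_register(&my_chain, &my_nb); */
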
13827 @@ -185,12 +209,12 @@ static inline int notifier_to_errno(int ret)
13830 * Declared notifiers so far. I can imagine quite a few more chains
13831 - * over time (eg laptop power reset chains, reboot chain (to clean
13832 + * over time (eg laptop power reset chains, reboot chain (to clean
13833 * device units up), device [un]mount chain, module load/unload chain,
13834 - * low memory chain, screenblank chain (for plug in modular screenblankers)
13835 + * low memory chain, screenblank chain (for plug in modular screenblankers)
13836 * VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
13840 /* CPU notfiers are defined in include/linux/cpu.h. */
13842 /* netdevice notifiers are defined in include/linux/netdevice.h */
13843 diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
13844 index 79b99d653e03..fb44e237316d 100644
13845 --- a/include/linux/percpu-rwsem.h
13846 +++ b/include/linux/percpu-rwsem.h
13847 @@ -29,7 +29,7 @@ static struct percpu_rw_semaphore name = { \
13848 extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
13849 extern void __percpu_up_read(struct percpu_rw_semaphore *);
13851 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
13852 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13856 @@ -47,16 +47,10 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *
13857 __this_cpu_inc(*sem->read_count);
13858 if (unlikely(!rcu_sync_is_idle(&sem->rss)))
13859 __percpu_down_read(sem, false); /* Unconditional memory barrier */
13862 - * The barrier() prevents the compiler from
13863 + * The preempt_enable() prevents the compiler from
13864 * bleeding the critical section out.
13868 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13870 - percpu_down_read_preempt_disable(sem);
13874 @@ -83,13 +77,9 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
13878 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
13879 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
13882 - * The barrier() prevents the compiler from
13883 - * bleeding the critical section out.
13886 + preempt_disable();
13888 * Same as in percpu_down_read().
13890 @@ -102,12 +92,6 @@ static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem
13891 rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
13894 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
13896 - preempt_disable();
13897 - percpu_up_read_preempt_enable(sem);
13900 extern void percpu_down_write(struct percpu_rw_semaphore *);
13901 extern void percpu_up_write(struct percpu_rw_semaphore *);
13903 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
13904 index 296bbe49d5d1..4414796e3941 100644
13905 --- a/include/linux/percpu.h
13906 +++ b/include/linux/percpu.h
13908 #define PERCPU_MODULE_RESERVE 0
13911 +#ifdef CONFIG_PREEMPT_RT_FULL
13913 +#define get_local_var(var) (*({ \
13914 + migrate_disable(); \
13915 + this_cpu_ptr(&var); }))
13917 +#define put_local_var(var) do { \
13919 + migrate_enable(); \
13922 +# define get_local_ptr(var) ({ \
13923 + migrate_disable(); \
13924 + this_cpu_ptr(var); })
13926 +# define put_local_ptr(var) do { \
13928 + migrate_enable(); \
13933 +#define get_local_var(var) get_cpu_var(var)
13934 +#define put_local_var(var) put_cpu_var(var)
13935 +#define get_local_ptr(var) get_cpu_ptr(var)
13936 +#define put_local_ptr(var) put_cpu_ptr(var)
13940 /* minimum unit size, also is the maximum supported allocation size */
13941 #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
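
get_local_var()/put_local_var() are the preemptible RT counterparts of get_cpu_var()/put_cpu_var(): they pin the task to its CPU with migrate_disable() instead of disabling preemption. A hedged sketch; the buffer type is invented:

    struct my_buf { char data[64]; };
    static DEFINE_PER_CPU(struct my_buf, my_bufs);

    static struct my_buf *my_buf_get(void)
    {
            /* CPU is stable, but the task stays preemptible on RT, so
             * serialization against other tasks on this CPU must come
             * from a lock (exactly what locallock.h layers on top) */
            return &get_local_var(my_bufs);
    }

    /* matching release: put_local_var(my_bufs); */
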
13943 diff --git a/include/linux/pid.h b/include/linux/pid.h
13944 index dfd684ce0787..bc954a99aa70 100644
13945 --- a/include/linux/pid.h
13946 +++ b/include/linux/pid.h
13948 #define _LINUX_PID_H
13950 #include <linux/rculist.h>
13951 +#include <linux/atomic.h>
13955 diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
13956 index 437a539898ae..de5c49b0dccf 100644
13957 --- a/include/linux/posix-timers.h
13958 +++ b/include/linux/posix-timers.h
13959 @@ -101,8 +101,8 @@ struct k_itimer {
13961 struct alarm alarmtimer;
13963 - struct rcu_head rcu;
13965 + struct rcu_head rcu;
13968 void run_posix_cpu_timers(struct task_struct *task);
13969 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
13970 index 5bd3f151da78..6728662a81e8 100644
13971 --- a/include/linux/preempt.h
13972 +++ b/include/linux/preempt.h
13974 #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
13975 #define NMI_OFFSET (1UL << NMI_SHIFT)
13977 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13978 +#ifndef CONFIG_PREEMPT_RT_FULL
13979 +# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13981 +# define SOFTIRQ_DISABLE_OFFSET (0)
13984 /* We use the MSB mostly because its available */
13985 #define PREEMPT_NEED_RESCHED 0x80000000
13987 #include <asm/preempt.h>
13989 #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
13990 -#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
13991 #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
13993 +#ifndef CONFIG_PREEMPT_RT_FULL
13994 +# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
13995 +# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
13997 +# define softirq_count() (0UL)
13998 +extern int in_serving_softirq(void);
14002 * Are we doing bottom half or hardware interrupt processing?
14003 @@ -101,7 +111,6 @@
14004 #define in_irq() (hardirq_count())
14005 #define in_softirq() (softirq_count())
14006 #define in_interrupt() (irq_count())
14007 -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
14008 #define in_nmi() (preempt_count() & NMI_MASK)
14009 #define in_task() (!(preempt_count() & \
14010 (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
14011 @@ -118,7 +127,11 @@
14013 * The preempt_count offset after spin_lock()
14015 +#if !defined(CONFIG_PREEMPT_RT_FULL)
14016 #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET
14018 +#define PREEMPT_LOCK_OFFSET 0
14022 * The preempt_count offset needed for things like:
14023 @@ -167,6 +180,20 @@ extern void preempt_count_sub(int val);
14024 #define preempt_count_inc() preempt_count_add(1)
14025 #define preempt_count_dec() preempt_count_sub(1)
14027 +#ifdef CONFIG_PREEMPT_LAZY
14028 +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
14029 +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
14030 +#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
14031 +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
14032 +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
14034 +#define add_preempt_lazy_count(val) do { } while (0)
14035 +#define sub_preempt_lazy_count(val) do { } while (0)
14036 +#define inc_preempt_lazy_count() do { } while (0)
14037 +#define dec_preempt_lazy_count() do { } while (0)
14038 +#define preempt_lazy_count() (0)
14041 #ifdef CONFIG_PREEMPT_COUNT
14043 #define preempt_disable() \
14044 @@ -175,16 +202,53 @@ do { \
14048 +#define preempt_lazy_disable() \
14050 + inc_preempt_lazy_count(); \
14054 #define sched_preempt_enable_no_resched() \
14057 preempt_count_dec(); \
14060 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
14061 +#ifdef CONFIG_PREEMPT_RT_BASE
14062 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
14063 +# define preempt_check_resched_rt() preempt_check_resched()
14065 +# define preempt_enable_no_resched() preempt_enable()
14066 +# define preempt_check_resched_rt() barrier()
14069 #define preemptible() (preempt_count() == 0 && !irqs_disabled())
14071 +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14073 +extern void migrate_disable(void);
14074 +extern void migrate_enable(void);
14076 +int __migrate_disabled(struct task_struct *p);
14078 +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14080 +extern void migrate_disable(void);
14081 +extern void migrate_enable(void);
14082 +static inline int __migrate_disabled(struct task_struct *p)
14088 +#define migrate_disable() preempt_disable()
14089 +#define migrate_enable() preempt_enable()
14090 +static inline int __migrate_disabled(struct task_struct *p)
14096 #ifdef CONFIG_PREEMPT
14097 #define preempt_enable() \
14099 @@ -206,6 +270,13 @@ do { \
14100 __preempt_schedule(); \
14103 +#define preempt_lazy_enable() \
14105 + dec_preempt_lazy_count(); \
14107 + preempt_check_resched(); \
14110 #else /* !CONFIG_PREEMPT */
14111 #define preempt_enable() \
14113 @@ -213,6 +284,12 @@ do { \
14114 preempt_count_dec(); \
14117 +#define preempt_lazy_enable() \
14119 + dec_preempt_lazy_count(); \
14123 #define preempt_enable_notrace() \
14126 @@ -251,8 +328,16 @@ do { \
14127 #define preempt_disable_notrace() barrier()
14128 #define preempt_enable_no_resched_notrace() barrier()
14129 #define preempt_enable_notrace() barrier()
14130 +#define preempt_check_resched_rt() barrier()
14131 #define preemptible() 0
14133 +#define migrate_disable() barrier()
14134 +#define migrate_enable() barrier()
14136 +static inline int __migrate_disabled(struct task_struct *p)
14140 #endif /* CONFIG_PREEMPT_COUNT */
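
migrate_disable()/migrate_enable() are the core of this hunk: on RT they pin the current task to its CPU while leaving it fully preemptible; on !RT (and with SMP off) they decay to preempt_disable()/preempt_enable() or plain barriers. A hedged sketch:

    static void my_percpu_work(void)
    {
            migrate_disable();      /* this task will not change CPUs */
            /* ... per-CPU work; may sleep or be preempted on RT ... */
            migrate_enable();
    }
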
14143 @@ -271,10 +356,22 @@ do { \
14145 #define preempt_fold_need_resched() \
14147 - if (tif_need_resched()) \
14148 + if (tif_need_resched_now()) \
14149 set_preempt_need_resched(); \
14152 +#ifdef CONFIG_PREEMPT_RT_FULL
14153 +# define preempt_disable_rt() preempt_disable()
14154 +# define preempt_enable_rt() preempt_enable()
14155 +# define preempt_disable_nort() barrier()
14156 +# define preempt_enable_nort() barrier()
14158 +# define preempt_disable_rt() barrier()
14159 +# define preempt_enable_rt() barrier()
14160 +# define preempt_disable_nort() preempt_disable()
14161 +# define preempt_enable_nort() preempt_enable()
14164 #ifdef CONFIG_PREEMPT_NOTIFIERS
14166 struct preempt_notifier;
14167 diff --git a/include/linux/printk.h b/include/linux/printk.h
14168 index 6106befed756..1dba9cb7b91b 100644
14169 --- a/include/linux/printk.h
14170 +++ b/include/linux/printk.h
14171 @@ -142,9 +142,11 @@ struct va_format {
14172 #ifdef CONFIG_EARLY_PRINTK
14173 extern asmlinkage __printf(1, 2)
14174 void early_printk(const char *fmt, ...);
14175 +extern void printk_kill(void);
14177 static inline __printf(1, 2) __cold
14178 void early_printk(const char *s, ...) { }
14179 +static inline void printk_kill(void) { }
14182 #ifdef CONFIG_PRINTK_NMI
14183 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
14184 index 567ebb5eaab0..9da7ea957399 100644
14185 --- a/include/linux/radix-tree.h
14186 +++ b/include/linux/radix-tree.h
14187 @@ -328,6 +328,8 @@ unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *,
14188 int radix_tree_preload(gfp_t gfp_mask);
14189 int radix_tree_maybe_preload(gfp_t gfp_mask);
14190 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
14191 +void radix_tree_preload_end(void);
14193 void radix_tree_init(void);
14194 void *radix_tree_tag_set(struct radix_tree_root *,
14195 unsigned long index, unsigned int tag);
14196 @@ -347,11 +349,6 @@ unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
14197 unsigned int max_items, unsigned int tag);
14198 int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);
14200 -static inline void radix_tree_preload_end(void)
14202 - preempt_enable();
14205 int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
14206 int radix_tree_split(struct radix_tree_root *, unsigned long index,
14207 unsigned new_order);
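
radix_tree_preload_end() moves out of line because on RT the preload section is no longer a bare preempt-disabled region. The calling pattern is unchanged; a hedged sketch:

    static int my_insert(struct radix_tree_root *tree, spinlock_t *lock,
                         unsigned long index, void *item)
    {
            int err = radix_tree_preload(GFP_KERNEL);

            if (err)
                    return err;             /* could not reserve nodes */

            spin_lock(lock);
            err = radix_tree_insert(tree, index, item);
            spin_unlock(lock);

            radix_tree_preload_end();       /* ends the preload section */
            return err;
    }
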
14208 diff --git a/include/linux/random.h b/include/linux/random.h
14209 index 4024f7d9c77d..462d752a739b 100644
14210 --- a/include/linux/random.h
14211 +++ b/include/linux/random.h
14212 @@ -32,7 +32,7 @@ static inline void add_latent_entropy(void) {}
14214 extern void add_input_randomness(unsigned int type, unsigned int code,
14215 unsigned int value) __latent_entropy;
14216 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
14217 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
14219 extern void get_random_bytes(void *buf, int nbytes);
14220 extern int wait_for_random_bytes(void);
14221 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
14222 index d574361943ea..0a9f442409b9 100644
14223 --- a/include/linux/rbtree.h
14224 +++ b/include/linux/rbtree.h
14227 #include <linux/kernel.h>
14228 #include <linux/stddef.h>
14229 -#include <linux/rcupdate.h>
14230 +#include <linux/rcu_assign_pointer.h>
14233 unsigned long __rb_parent_color;
14234 diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
14235 index 6bfd2b581f75..af8a61be2d8d 100644
14236 --- a/include/linux/rbtree_augmented.h
14237 +++ b/include/linux/rbtree_augmented.h
14240 #include <linux/compiler.h>
14241 #include <linux/rbtree.h>
14242 +#include <linux/rcupdate.h>
14245 * Please note - only struct rb_augment_callbacks and the prototypes for
14246 diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h
14247 index ece43e882b56..7d012faa509a 100644
14248 --- a/include/linux/rbtree_latch.h
14249 +++ b/include/linux/rbtree_latch.h
14252 #include <linux/rbtree.h>
14253 #include <linux/seqlock.h>
14254 +#include <linux/rcupdate.h>
14256 struct latch_tree_node {
14257 struct rb_node node[2];
14258 diff --git a/include/linux/rcu_assign_pointer.h b/include/linux/rcu_assign_pointer.h
14259 new file mode 100644
14260 index 000000000000..7066962a4379
14262 +++ b/include/linux/rcu_assign_pointer.h
14264 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
14265 +#define __LINUX_RCU_ASSIGN_POINTER_H__
14266 +#include <linux/compiler.h>
14267 +#include <asm/barrier.h>
14270 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
14271 + * @v: The value to statically initialize with.
14273 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
14276 + * rcu_assign_pointer() - assign to RCU-protected pointer
14277 + * @p: pointer to assign to
14278 + * @v: value to assign (publish)
14280 + * Assigns the specified value to the specified RCU-protected
14281 + * pointer, ensuring that any concurrent RCU readers will see
14282 + * any prior initialization.
14284 + * Inserts memory barriers on architectures that require them
14285 + * (which is most of them), and also prevents the compiler from
14286 + * reordering the code that initializes the structure after the pointer
14287 + * assignment. More importantly, this call documents which pointers
14288 + * will be dereferenced by RCU read-side code.
14290 + * In some special cases, you may use RCU_INIT_POINTER() instead
14291 + * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
14292 + * to the fact that it does not constrain either the CPU or the compiler.
14293 + * That said, using RCU_INIT_POINTER() when you should have used
14294 + * rcu_assign_pointer() is a very bad thing that results in
14295 + * impossible-to-diagnose memory corruption. So please be careful.
14296 + * See the RCU_INIT_POINTER() comment header for details.
14298 + * Note that rcu_assign_pointer() evaluates each of its arguments only
14299 + * once, appearances notwithstanding. One of the "extra" evaluations
14300 + * is in typeof() and the other visible only to sparse (__CHECKER__),
14301 + * neither of which actually execute the argument. As with most cpp
14302 + * macros, this execute-arguments-only-once property is important, so
14303 + * please be careful when making changes to rcu_assign_pointer() and the
14304 + * other macros that it invokes.
14306 +#define rcu_assign_pointer(p, v) \
14308 + uintptr_t _r_a_p__v = (uintptr_t)(v); \
14310 + if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
14311 + WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
14313 + smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
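
The macro moves verbatim into its own header so low-level headers (rbtree.h above) can publish pointers without pulling in all of rcupdate.h. The canonical publish pattern it documents, as a sketch:

    struct foo { int a; };
    static struct foo __rcu *gp;

    static int my_publish(void)
    {
            struct foo *p = kmalloc(sizeof(*p), GFP_KERNEL);

            if (!p)
                    return -ENOMEM;
            p->a = 1;                   /* initialize first ... */
            rcu_assign_pointer(gp, p);  /* ... then publish; readers use
                                         * rcu_dereference(gp) under
                                         * rcu_read_lock() */
            return 0;
    }
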
14318 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
14319 index a6ddc42f87a5..70996e134818 100644
14320 --- a/include/linux/rcupdate.h
14321 +++ b/include/linux/rcupdate.h
14323 #include <linux/lockdep.h>
14324 #include <asm/processor.h>
14325 #include <linux/cpumask.h>
14326 +#include <linux/rcu_assign_pointer.h>
14328 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
14329 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
14330 @@ -55,7 +56,11 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
14331 #define call_rcu call_rcu_sched
14332 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14334 +#ifdef CONFIG_PREEMPT_RT_FULL
14335 +#define call_rcu_bh call_rcu
14337 void call_rcu_bh(struct rcu_head *head, rcu_callback_t func);
14339 void call_rcu_sched(struct rcu_head *head, rcu_callback_t func);
14340 void synchronize_sched(void);
14341 void rcu_barrier_tasks(void);
14342 @@ -74,6 +79,11 @@ void synchronize_rcu(void);
14343 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
14345 #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
14346 +#ifndef CONFIG_PREEMPT_RT_FULL
14347 +#define sched_rcu_preempt_depth() rcu_preempt_depth()
14349 +static inline int sched_rcu_preempt_depth(void) { return 0; }
14352 #else /* #ifdef CONFIG_PREEMPT_RCU */
14354 @@ -99,6 +109,8 @@ static inline int rcu_preempt_depth(void)
14358 +#define sched_rcu_preempt_depth() rcu_preempt_depth()
14360 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14362 /* Internal to kernel */
14363 @@ -255,7 +267,14 @@ extern struct lockdep_map rcu_sched_lock_map;
14364 extern struct lockdep_map rcu_callback_map;
14365 int debug_lockdep_rcu_enabled(void);
14366 int rcu_read_lock_held(void);
14367 +#ifdef CONFIG_PREEMPT_RT_FULL
14368 +static inline int rcu_read_lock_bh_held(void)
14370 + return rcu_read_lock_held();
14373 int rcu_read_lock_bh_held(void);
14375 int rcu_read_lock_sched_held(void);
14377 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
14378 @@ -364,54 +383,6 @@ static inline void rcu_preempt_sleep_check(void) { }
14379 ((typeof(*p) __force __kernel *)(________p1)); \
14383 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
14384 - * @v: The value to statically initialize with.
14386 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
14389 - * rcu_assign_pointer() - assign to RCU-protected pointer
14390 - * @p: pointer to assign to
14391 - * @v: value to assign (publish)
14393 - * Assigns the specified value to the specified RCU-protected
14394 - * pointer, ensuring that any concurrent RCU readers will see
14395 - * any prior initialization.
14397 - * Inserts memory barriers on architectures that require them
14398 - * (which is most of them), and also prevents the compiler from
14399 - * reordering the code that initializes the structure after the pointer
14400 - * assignment. More importantly, this call documents which pointers
14401 - * will be dereferenced by RCU read-side code.
14403 - * In some special cases, you may use RCU_INIT_POINTER() instead
14404 - * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
14405 - * to the fact that it does not constrain either the CPU or the compiler.
14406 - * That said, using RCU_INIT_POINTER() when you should have used
14407 - * rcu_assign_pointer() is a very bad thing that results in
14408 - * impossible-to-diagnose memory corruption. So please be careful.
14409 - * See the RCU_INIT_POINTER() comment header for details.
14411 - * Note that rcu_assign_pointer() evaluates each of its arguments only
14412 - * once, appearances notwithstanding. One of the "extra" evaluations
14413 - * is in typeof() and the other visible only to sparse (__CHECKER__),
14414 - * neither of which actually execute the argument. As with most cpp
14415 - * macros, this execute-arguments-only-once property is important, so
14416 - * please be careful when making changes to rcu_assign_pointer() and the
14417 - * other macros that it invokes.
14419 -#define rcu_assign_pointer(p, v) \
14421 - uintptr_t _r_a_p__v = (uintptr_t)(v); \
14423 - if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
14424 - WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
14426 - smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
14431 * rcu_swap_protected() - swap an RCU and a regular pointer
14432 * @rcu_ptr: RCU pointer
14433 @@ -707,10 +678,14 @@ static inline void rcu_read_unlock(void)
14434 static inline void rcu_read_lock_bh(void)
14436 local_bh_disable();
14437 +#ifdef CONFIG_PREEMPT_RT_FULL
14441 rcu_lock_acquire(&rcu_bh_lock_map);
14442 RCU_LOCKDEP_WARN(!rcu_is_watching(),
14443 "rcu_read_lock_bh() used illegally while idle");
14448 @@ -720,10 +695,14 @@ static inline void rcu_read_lock_bh(void)
14450 static inline void rcu_read_unlock_bh(void)
14452 +#ifdef CONFIG_PREEMPT_RT_FULL
14453 + rcu_read_unlock();
14455 RCU_LOCKDEP_WARN(!rcu_is_watching(),
14456 "rcu_read_unlock_bh() used illegally while idle");
14457 rcu_lock_release(&rcu_bh_lock_map);
14463 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
14464 index 37d6fd3b7ff8..a082fde7d6bc 100644
14465 --- a/include/linux/rcutree.h
14466 +++ b/include/linux/rcutree.h
14467 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
14468 rcu_note_context_switch(false);
14471 +#ifdef CONFIG_PREEMPT_RT_FULL
14472 +# define synchronize_rcu_bh synchronize_rcu
14474 void synchronize_rcu_bh(void);
14476 void synchronize_sched_expedited(void);
14477 void synchronize_rcu_expedited(void);
14479 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
14482 void rcu_barrier(void);
14483 +#ifdef CONFIG_PREEMPT_RT_FULL
14484 +# define rcu_barrier_bh rcu_barrier
14486 void rcu_barrier_bh(void);
14488 void rcu_barrier_sched(void);
14489 unsigned long get_state_synchronize_rcu(void);
14490 void cond_synchronize_rcu(unsigned long oldstate);
14491 diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
14492 index 5caa062a02b2..abce5f5325e1 100644
14493 --- a/include/linux/ring_buffer.h
14494 +++ b/include/linux/ring_buffer.h
14495 @@ -34,10 +34,12 @@ struct ring_buffer_event {
14496 * array[0] = time delta (28 .. 59)
14499 - * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock
14500 - * array[0] = tv_nsec
14501 - * array[1..2] = tv_sec
14502 - * size = 16 bytes
14503 + * @RINGBUF_TYPE_TIME_STAMP: Absolute timestamp
14504 + * Same format as TIME_EXTEND except that the
14505 + * value is an absolute timestamp, not a delta.
14506 + * event.time_delta contains bottom 27 bits
14507 + * array[0] = top (28 .. 59) bits
14510 * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
14512 @@ -54,12 +56,12 @@ enum ring_buffer_type {
14513 RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
14514 RINGBUF_TYPE_PADDING,
14515 RINGBUF_TYPE_TIME_EXTEND,
14516 - /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
14517 RINGBUF_TYPE_TIME_STAMP,
14520 unsigned ring_buffer_event_length(struct ring_buffer_event *event);
14521 void *ring_buffer_event_data(struct ring_buffer_event *event);
14522 +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event);
14525 * ring_buffer_discard_commit will remove an event that has not
14526 @@ -115,6 +117,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
14527 int ring_buffer_write(struct ring_buffer *buffer,
14528 unsigned long length, void *data);
14530 +void ring_buffer_nest_start(struct ring_buffer *buffer);
14531 +void ring_buffer_nest_end(struct ring_buffer *buffer);
14533 struct ring_buffer_event *
14534 ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
14535 unsigned long *lost_events);
14536 @@ -179,6 +184,8 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
14538 void ring_buffer_set_clock(struct ring_buffer *buffer,
14539 u64 (*clock)(void));
14540 +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs);
14541 +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer);
14543 size_t ring_buffer_page_len(void *page);
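
ring_buffer_nest_start()/end() allow one trace event to be written from within the write of another without tripping the recursion protection. A hedged sketch of the expected calling pattern:

    static void my_nested_write(struct ring_buffer *buffer, unsigned long len)
    {
            struct ring_buffer_event *event;

            ring_buffer_nest_start(buffer);     /* announce one nested write */
            event = ring_buffer_lock_reserve(buffer, len);
            if (event) {
                    /* ... fill ring_buffer_event_data(event) ... */
                    ring_buffer_unlock_commit(buffer, event);
            }
            ring_buffer_nest_end(buffer);
    }
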
14545 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
14546 index 6fd615a0eea9..138bd1e183e0 100644
14547 --- a/include/linux/rtmutex.h
14548 +++ b/include/linux/rtmutex.h
14549 @@ -14,11 +14,15 @@
14550 #define __LINUX_RT_MUTEX_H
14552 #include <linux/linkage.h>
14553 +#include <linux/spinlock_types_raw.h>
14554 #include <linux/rbtree.h>
14555 -#include <linux/spinlock_types.h>
14557 extern int max_lock_depth; /* for sysctl */
14559 +#ifdef CONFIG_DEBUG_MUTEXES
14560 +#include <linux/debug_locks.h>
14564 * The rt_mutex structure
14566 @@ -31,8 +35,8 @@ struct rt_mutex {
14567 raw_spinlock_t wait_lock;
14568 struct rb_root_cached waiters;
14569 struct task_struct *owner;
14570 -#ifdef CONFIG_DEBUG_RT_MUTEXES
14572 +#ifdef CONFIG_DEBUG_RT_MUTEXES
14573 const char *name, *file;
14576 @@ -82,16 +86,23 @@ do { \
14577 #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14580 -#define __RT_MUTEX_INITIALIZER(mutexname) \
14581 - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14582 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14583 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14584 , .waiters = RB_ROOT_CACHED \
14586 __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
14587 - __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
14588 + __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14590 +#define __RT_MUTEX_INITIALIZER(mutexname) \
14591 + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
14593 #define DEFINE_RT_MUTEX(mutexname) \
14594 struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
14596 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
14597 + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14598 + , .save_state = 1 }
14601 * rt_mutex_is_locked - is the mutex locked
14602 * @lock: the mutex to be queried
14603 @@ -115,6 +126,7 @@ extern void rt_mutex_lock(struct rt_mutex *lock);
14606 extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
14607 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
14608 extern int rt_mutex_timed_lock(struct rt_mutex *lock,
14609 struct hrtimer_sleeper *timeout);
14611 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
14612 new file mode 100644
14613 index 000000000000..a9c4c2ac4d1f
14615 +++ b/include/linux/rwlock_rt.h
14617 +#ifndef __LINUX_RWLOCK_RT_H
14618 +#define __LINUX_RWLOCK_RT_H
14620 +#ifndef __LINUX_SPINLOCK_H
14621 +#error Do not include directly. Use spinlock.h
14624 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
14625 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
14626 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
14627 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
14628 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
14629 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
14630 +extern int __lockfunc rt_read_can_lock(rwlock_t *rwlock);
14631 +extern int __lockfunc rt_write_can_lock(rwlock_t *rwlock);
14632 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
14634 +#define read_can_lock(rwlock) rt_read_can_lock(rwlock)
14635 +#define write_can_lock(rwlock) rt_write_can_lock(rwlock)
14637 +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock))
14638 +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock))
14640 +static inline int __write_trylock_rt_irqsave(rwlock_t *lock, unsigned long *flags)
14642 + /* XXX ARCH_IRQ_ENABLED */
14644 + return rt_write_trylock(lock);
14647 +#define write_trylock_irqsave(lock, flags) \
14648 + __cond_lock(lock, __write_trylock_rt_irqsave(lock, &(flags)))
14650 +#define read_lock_irqsave(lock, flags) \
14652 + typecheck(unsigned long, flags); \
14653 + rt_read_lock(lock); \
14657 +#define write_lock_irqsave(lock, flags) \
14659 + typecheck(unsigned long, flags); \
14660 + rt_write_lock(lock); \
14664 +#define read_lock(lock) rt_read_lock(lock)
14666 +#define read_lock_bh(lock) \
14668 + local_bh_disable(); \
14669 + rt_read_lock(lock); \
14672 +#define read_lock_irq(lock) read_lock(lock)
14674 +#define write_lock(lock) rt_write_lock(lock)
14676 +#define write_lock_bh(lock) \
14678 + local_bh_disable(); \
14679 + rt_write_lock(lock); \
14682 +#define write_lock_irq(lock) write_lock(lock)
14684 +#define read_unlock(lock) rt_read_unlock(lock)
14686 +#define read_unlock_bh(lock) \
14688 + rt_read_unlock(lock); \
14689 + local_bh_enable(); \
14692 +#define read_unlock_irq(lock) read_unlock(lock)
14694 +#define write_unlock(lock) rt_write_unlock(lock)
14696 +#define write_unlock_bh(lock) \
14698 + rt_write_unlock(lock); \
14699 + local_bh_enable(); \
14702 +#define write_unlock_irq(lock) write_unlock(lock)
14704 +#define read_unlock_irqrestore(lock, flags) \
14706 + typecheck(unsigned long, flags); \
14708 + rt_read_unlock(lock); \
14711 +#define write_unlock_irqrestore(lock, flags) \
14713 + typecheck(unsigned long, flags); \
14715 + rt_write_unlock(lock); \
14718 +#define rwlock_init(rwl) \
14720 + static struct lock_class_key __key; \
14722 + __rt_rwlock_init(rwl, #rwl, &__key); \
14726 + * Internal functions made global for CPU pinning
14728 +void __read_rt_lock(struct rt_rw_lock *lock);
14729 +int __read_rt_trylock(struct rt_rw_lock *lock);
14730 +void __write_rt_lock(struct rt_rw_lock *lock);
14731 +int __write_rt_trylock(struct rt_rw_lock *lock);
14732 +void __read_rt_unlock(struct rt_rw_lock *lock);
14733 +void __write_rt_unlock(struct rt_rw_lock *lock);
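
Callers of the generic rwlock API need no source changes for this substitution; with the mapping above, the _irqsave variants only typecheck the flags and take the sleeping lock, so interrupts stay enabled on RT. A minimal caller sketch (hypothetical lock and counter, not part of the patch):

    static DEFINE_RWLOCK(stats_lock);
    static unsigned long nr_events;

    void record_event(void)
    {
            unsigned long flags;

            /* Maps to rt_write_lock() on RT; IRQs are not disabled. */
            write_lock_irqsave(&stats_lock, flags);
            nr_events++;
            write_unlock_irqrestore(&stats_lock, flags);
    }

    unsigned long read_events(void)
    {
            unsigned long n;

            read_lock(&stats_lock);         /* rt_read_lock() on RT */
            n = nr_events;
            read_unlock(&stats_lock);
            return n;
    }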
14736 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
14737 index cc0072e93e36..5317cd957292 100644
14738 --- a/include/linux/rwlock_types.h
14739 +++ b/include/linux/rwlock_types.h
14741 #ifndef __LINUX_RWLOCK_TYPES_H
14742 #define __LINUX_RWLOCK_TYPES_H
14744 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
14745 +# error "Do not include directly, include spinlock_types.h"
14749 * include/linux/rwlock_types.h - generic rwlock type definitions
14751 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
14752 new file mode 100644
14753 index 000000000000..546a1f8f1274
14755 +++ b/include/linux/rwlock_types_rt.h
14757 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
14758 +#define __LINUX_RWLOCK_TYPES_RT_H
14760 +#ifndef __LINUX_SPINLOCK_TYPES_H
14761 +#error "Do not include directly. Include spinlock_types.h instead"
14764 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14765 +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
14767 +# define RW_DEP_MAP_INIT(lockname)
14770 +typedef struct rt_rw_lock rwlock_t;
14772 +#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name)
14774 +#define DEFINE_RWLOCK(name) \
14775 + rwlock_t name = __RW_LOCK_UNLOCKED(name)
14778 + * A reader-biased implementation, primarily for CPU pinning.
14780 + * Can be selected as a general replacement for the single-reader RT rwlock
14783 +struct rt_rw_lock {
14784 + struct rt_mutex rtmutex;
14785 + atomic_t readers;
14786 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14787 + struct lockdep_map dep_map;
14791 +#define READER_BIAS (1U << 31)
14792 +#define WRITER_BIAS (1U << 30)
14794 +#define __RWLOCK_RT_INITIALIZER(name) \
14796 + .readers = ATOMIC_INIT(READER_BIAS), \
14797 + .rtmutex = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.rtmutex), \
14798 + RW_DEP_MAP_INIT(name) \
14801 +void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
14802 + struct lock_class_key *key);
14804 +#define rwlock_biased_rt_init(rwlock) \
14806 + static struct lock_class_key __key; \
14808 + __rwlock_biased_rt_init((rwlock), #rwlock, &__key); \
14812 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
14813 index c427ffaa4904..513df11a364e 100644
14814 --- a/include/linux/rwsem.h
14815 +++ b/include/linux/rwsem.h
14817 #include <linux/osq_lock.h>
14820 +#ifdef CONFIG_PREEMPT_RT_FULL
14821 +#include <linux/rwsem_rt.h>
14822 +#else /* PREEMPT_RT_FULL */
14824 struct rw_semaphore;
14826 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
14827 @@ -114,6 +118,13 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
14828 return !list_empty(&sem->wait_list);
14831 +#endif /* !PREEMPT_RT_FULL */
14834 + * The functions below are the same for all rwsem implementations including
14835 + * the RT-specific variant.
14841 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
14842 new file mode 100644
14843 index 000000000000..2ffbf093ae92
14845 +++ b/include/linux/rwsem_rt.h
14847 +#ifndef _LINUX_RWSEM_RT_H
14848 +#define _LINUX_RWSEM_RT_H
14850 +#ifndef _LINUX_RWSEM_H
14851 +#error "Include rwsem.h"
14854 +#include <linux/rtmutex.h>
14855 +#include <linux/swait.h>
14857 +#define READER_BIAS (1U << 31)
14858 +#define WRITER_BIAS (1U << 30)
14860 +struct rw_semaphore {
14861 + atomic_t readers;
14862 + struct rt_mutex rtmutex;
14863 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14864 + struct lockdep_map dep_map;
14868 +#define __RWSEM_INITIALIZER(name) \
14870 + .readers = ATOMIC_INIT(READER_BIAS), \
14871 + .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex), \
14872 + RW_DEP_MAP_INIT(name) \
14875 +#define DECLARE_RWSEM(lockname) \
14876 + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
14878 +extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name,
14879 + struct lock_class_key *key);
14881 +#define __init_rwsem(sem, name, key) \
14883 + rt_mutex_init(&(sem)->rtmutex); \
14884 + __rwsem_init((sem), (name), (key)); \
14887 +#define init_rwsem(sem) \
14889 + static struct lock_class_key __key; \
14891 + __init_rwsem((sem), #sem, &__key); \
14894 +static inline int rwsem_is_locked(struct rw_semaphore *sem)
14896 + return atomic_read(&sem->readers) != READER_BIAS;
14899 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
14901 + return atomic_read(&sem->readers) > 0;
14904 +extern void __down_read(struct rw_semaphore *sem);
14905 +extern int __down_read_trylock(struct rw_semaphore *sem);
14906 +extern void __down_write(struct rw_semaphore *sem);
14907 +extern int __must_check __down_write_killable(struct rw_semaphore *sem);
14908 +extern int __down_write_trylock(struct rw_semaphore *sem);
14909 +extern void __up_read(struct rw_semaphore *sem);
14910 +extern void __up_write(struct rw_semaphore *sem);
14911 +extern void __downgrade_write(struct rw_semaphore *sem);
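
The ->readers accounting follows from the helpers above: the counter starts at READER_BIAS, and rwsem_is_locked() is simply "the counter left its idle value". A hedged stand-alone sketch of that arithmetic (plain single-threaded C, no atomics, assuming the reader fast path increments ->readers as the accompanying rwsem-rt implementation does; the writer slow path, which removes the bias, is not shown):

    #include <stdio.h>

    #define READER_BIAS     (1U << 31)

    struct rwsem_sketch { unsigned int readers; };

    static int sketch_is_locked(struct rwsem_sketch *sem)
    {
            return sem->readers != READER_BIAS;
    }

    int main(void)
    {
            struct rwsem_sketch sem = { .readers = READER_BIAS };

            printf("idle:       %d\n", sketch_is_locked(&sem));  /* 0 */
            sem.readers++;                  /* reader fast path */
            printf("one reader: %d\n", sketch_is_locked(&sem));  /* 1 */
            sem.readers--;                  /* read unlock */
            printf("idle again: %d\n", sketch_is_locked(&sem));  /* 0 */
            return 0;
    }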
14914 diff --git a/include/linux/sched.h b/include/linux/sched.h
14915 index e04919aa8201..a6ffb552be01 100644
14916 --- a/include/linux/sched.h
14917 +++ b/include/linux/sched.h
14919 #include <linux/signal_types.h>
14920 #include <linux/mm_types_task.h>
14921 #include <linux/task_io_accounting.h>
14922 +#include <asm/kmap_types.h>
14924 /* task_struct member predeclarations (sorted alphabetically): */
14925 struct audit_context;
14926 @@ -93,7 +94,6 @@ struct task_group;
14928 /* Convenience macros for the sake of wake_up(): */
14929 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
14930 -#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
14932 /* get_task_state(): */
14933 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
14934 @@ -101,12 +101,8 @@ struct task_group;
14935 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
14938 -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
14940 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
14942 -#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
14944 #define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
14945 (task->flags & PF_FROZEN) == 0 && \
14946 (task->state & TASK_NOLOAD) == 0)
14947 @@ -134,6 +130,11 @@ struct task_group;
14948 smp_store_mb(current->state, (state_value)); \
14951 +#define __set_current_state_no_track(state_value) \
14952 + current->state = (state_value);
14953 +#define set_current_state_no_track(state_value) \
14954 + smp_store_mb(current->state, (state_value));
14956 #define set_special_state(state_value) \
14958 unsigned long flags; /* may shadow */ \
14959 @@ -187,6 +188,9 @@ struct task_group;
14960 #define set_current_state(state_value) \
14961 smp_store_mb(current->state, (state_value))
14963 +#define __set_current_state_no_track(state_value) __set_current_state(state_value)
14964 +#define set_current_state_no_track(state_value) set_current_state(state_value)
14967 * set_special_state() should be used for those states when the blocking task
14968 * can not use the regular condition based wait-loop. In that case we must
14969 @@ -566,6 +570,8 @@ struct task_struct {
14971 /* -1 unrunnable, 0 runnable, >0 stopped: */
14972 volatile long state;
14973 + /* saved state for "spinlock sleepers" */
14974 + volatile long saved_state;
14977 * This begins the randomizable portion of task_struct. Only
14978 @@ -618,7 +624,25 @@ struct task_struct {
14980 unsigned int policy;
14981 int nr_cpus_allowed;
14982 - cpumask_t cpus_allowed;
14983 + const cpumask_t *cpus_ptr;
14984 + cpumask_t cpus_mask;
14985 +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14986 + int migrate_disable;
14987 + int migrate_disable_update;
14988 + int pinned_on_cpu;
14989 +# ifdef CONFIG_SCHED_DEBUG
14990 + int migrate_disable_atomic;
14993 +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14994 +# ifdef CONFIG_SCHED_DEBUG
14995 + int migrate_disable;
14996 + int migrate_disable_atomic;
14999 +#ifdef CONFIG_PREEMPT_RT_FULL
15000 + int sleeping_lock;
15003 #ifdef CONFIG_PREEMPT_RCU
15004 int rcu_read_lock_nesting;
15005 @@ -777,6 +801,9 @@ struct task_struct {
15006 #ifdef CONFIG_POSIX_TIMERS
15007 struct task_cputime cputime_expires;
15008 struct list_head cpu_timers[3];
15009 +#ifdef CONFIG_PREEMPT_RT_BASE
15010 + struct task_struct *posix_timer_list;
15014 /* Process credentials: */
15015 @@ -820,11 +847,17 @@ struct task_struct {
15016 /* Signal handlers: */
15017 struct signal_struct *signal;
15018 struct sighand_struct *sighand;
15019 + struct sigqueue *sigqueue_cache;
15022 sigset_t real_blocked;
15023 /* Restored if set_restore_sigmask() was used: */
15024 sigset_t saved_sigmask;
15025 struct sigpending pending;
15026 +#ifdef CONFIG_PREEMPT_RT_FULL
15027 + /* TODO: move me into ->restart_block ? */
15028 + struct siginfo forced_info;
15030 unsigned long sas_ss_sp;
15031 size_t sas_ss_size;
15032 unsigned int sas_ss_flags;
15033 @@ -849,6 +882,7 @@ struct task_struct {
15034 raw_spinlock_t pi_lock;
15036 struct wake_q_node wake_q;
15037 + struct wake_q_node wake_q_sleeper;
15039 #ifdef CONFIG_RT_MUTEXES
15040 /* PI waiters blocked on a rt_mutex held by this task: */
15041 @@ -1116,8 +1150,22 @@ struct task_struct {
15042 unsigned int sequential_io;
15043 unsigned int sequential_io_avg;
15045 +#ifdef CONFIG_PREEMPT_RT_BASE
15046 + struct rcu_head put_rcu;
15047 + int softirq_nestcnt;
15048 + unsigned int softirqs_raised;
15050 +#ifdef CONFIG_PREEMPT_RT_FULL
15051 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
15053 + pte_t kmap_pte[KM_TYPE_NR];
15056 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
15057 unsigned long task_state_change;
15059 +#ifdef CONFIG_PREEMPT_RT_FULL
15060 + int xmit_recursion;
15062 int pagefault_disabled;
15064 @@ -1332,6 +1380,7 @@ extern struct pid *cad_pid;
15066 * Per process flags
15068 +#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */
15069 #define PF_IDLE 0x00000002 /* I am an IDLE thread */
15070 #define PF_EXITING 0x00000004 /* Getting shut down */
15071 #define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */
15072 @@ -1355,7 +1404,7 @@ extern struct pid *cad_pid;
15073 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
15074 #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
15075 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
15076 -#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
15077 +#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
15078 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
15079 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
15080 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
15081 @@ -1535,6 +1584,7 @@ extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *n
15083 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
15084 extern int wake_up_process(struct task_struct *tsk);
15085 +extern int wake_up_lock_sleeper(struct task_struct *tsk);
15086 extern void wake_up_new_task(struct task_struct *tsk);
15089 @@ -1611,6 +1661,89 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
15090 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
15093 +#ifdef CONFIG_PREEMPT_LAZY
15094 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
15096 + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
15099 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
15101 + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
15104 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
15106 + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
15109 +static inline int need_resched_lazy(void)
15111 + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
15114 +static inline int need_resched_now(void)
15116 + return test_thread_flag(TIF_NEED_RESCHED);
15120 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
15121 +static inline int need_resched_lazy(void) { return 0; }
15123 +static inline int need_resched_now(void)
15125 + return test_thread_flag(TIF_NEED_RESCHED);
15131 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
15133 + if (task->state & (__TASK_STOPPED | __TASK_TRACED))
15135 +#ifdef CONFIG_PREEMPT_RT_FULL
15136 + if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
15142 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
15144 + bool traced_stopped;
15146 +#ifdef CONFIG_PREEMPT_RT_FULL
15147 + unsigned long flags;
15149 + raw_spin_lock_irqsave(&task->pi_lock, flags);
15150 + traced_stopped = __task_is_stopped_or_traced(task);
15151 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15153 + traced_stopped = __task_is_stopped_or_traced(task);
15155 + return traced_stopped;
15158 +static inline bool task_is_traced(struct task_struct *task)
15160 + bool traced = false;
15162 + if (task->state & __TASK_TRACED)
15164 +#ifdef CONFIG_PREEMPT_RT_FULL
15165 + /* in case the task is sleeping on tasklist_lock */
15166 + raw_spin_lock_irq(&task->pi_lock);
15167 + if (task->state & __TASK_TRACED)
15169 + else if (task->saved_state & __TASK_TRACED)
15171 + raw_spin_unlock_irq(&task->pi_lock);
15177 * cond_resched() and cond_resched_lock(): latency reduction via
15178 * explicit rescheduling in places that are safe. The return
15179 @@ -1636,12 +1769,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
15180 __cond_resched_lock(lock); \
15183 +#ifndef CONFIG_PREEMPT_RT_FULL
15184 extern int __cond_resched_softirq(void);
15186 #define cond_resched_softirq() ({ \
15187 ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
15188 __cond_resched_softirq(); \
15191 +# define cond_resched_softirq() cond_resched()
15194 static inline void cond_resched_rcu(void)
15196 @@ -1671,6 +1808,23 @@ static __always_inline bool need_resched(void)
15197 return unlikely(tif_need_resched());
15200 +#ifdef CONFIG_PREEMPT_RT_FULL
15201 +static inline void sleeping_lock_inc(void)
15203 + current->sleeping_lock++;
15206 +static inline void sleeping_lock_dec(void)
15208 + current->sleeping_lock--;
15213 +static inline void sleeping_lock_inc(void) { }
15214 +static inline void sleeping_lock_dec(void) { }
15218 * Wrappers for p->thread_info->cpu access. No-op on UP.
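
The ->saved_state field and the *_no_track helpers added above exist for the "spinlock sleepers" mentioned in the comment: a task blocking on an RT spinlock must not clobber a state it already set (e.g. TASK_INTERRUPTIBLE in a wait loop). A hedged sketch of the save/restore idea, simplified from what the rtmutex slowpath does under ->pi_lock:

    static void sleeper_save_state(void)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&current->pi_lock, flags);
            current->saved_state = current->state;  /* e.g. TASK_INTERRUPTIBLE */
            /* Sleep for the lock without tracking the state change: */
            __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
            raw_spin_unlock_irqrestore(&current->pi_lock, flags);
    }

    static void sleeper_restore_state(void)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&current->pi_lock, flags);
            __set_current_state_no_track(current->saved_state);
            current->saved_state = TASK_RUNNING;
            raw_spin_unlock_irqrestore(&current->pi_lock, flags);
    }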
15220 diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
15221 index 3d49b91b674d..d8f2fa8f500c 100644
15222 --- a/include/linux/sched/mm.h
15223 +++ b/include/linux/sched/mm.h
15224 @@ -43,6 +43,17 @@ static inline void mmdrop(struct mm_struct *mm)
15228 +#ifdef CONFIG_PREEMPT_RT_BASE
15229 +extern void __mmdrop_delayed(struct rcu_head *rhp);
15230 +static inline void mmdrop_delayed(struct mm_struct *mm)
15232 + if (atomic_dec_and_test(&mm->mm_count))
15233 + call_rcu(&mm->delayed_drop, __mmdrop_delayed);
15236 +# define mmdrop_delayed(mm) mmdrop(mm)
15239 static inline void mmdrop_async_fn(struct work_struct *work)
15241 struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
15242 diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
15243 index a74ec619ac51..8e7f741370c5 100644
15244 --- a/include/linux/sched/task.h
15245 +++ b/include/linux/sched/task.h
15246 @@ -88,6 +88,15 @@ extern void sched_exec(void);
15248 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
15250 +#ifdef CONFIG_PREEMPT_RT_BASE
15251 +extern void __put_task_struct_cb(struct rcu_head *rhp);
15253 +static inline void put_task_struct(struct task_struct *t)
15255 + if (atomic_dec_and_test(&t->usage))
15256 + call_rcu(&t->put_rcu, __put_task_struct_cb);
15259 extern void __put_task_struct(struct task_struct *t);
15261 static inline void put_task_struct(struct task_struct *t)
15262 @@ -95,7 +104,7 @@ static inline void put_task_struct(struct task_struct *t)
15263 if (atomic_dec_and_test(&t->usage))
15264 __put_task_struct(t);
15268 struct task_struct *task_rcu_dereference(struct task_struct **ptask);
15270 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
15271 diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
15272 index 10b19a192b2d..ce3ccff3d9d8 100644
15273 --- a/include/linux/sched/wake_q.h
15274 +++ b/include/linux/sched/wake_q.h
15275 @@ -47,8 +47,29 @@ static inline void wake_q_init(struct wake_q_head *head)
15276 head->lastp = &head->first;
15279 -extern void wake_q_add(struct wake_q_head *head,
15280 - struct task_struct *task);
15281 -extern void wake_up_q(struct wake_q_head *head);
15282 +extern void __wake_q_add(struct wake_q_head *head,
15283 + struct task_struct *task, bool sleeper);
15284 +static inline void wake_q_add(struct wake_q_head *head,
15285 + struct task_struct *task)
15287 + __wake_q_add(head, task, false);
15290 +static inline void wake_q_add_sleeper(struct wake_q_head *head,
15291 + struct task_struct *task)
15293 + __wake_q_add(head, task, true);
15296 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
15297 +static inline void wake_up_q(struct wake_q_head *head)
15299 + __wake_up_q(head, false);
15302 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
15304 + __wake_up_q(head, true);
15307 #endif /* _LINUX_SCHED_WAKE_Q_H */
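
A short usage sketch of the split queues (hypothetical tasks; this mirrors how the rtmutex unlock slowpath keeps ordinary wakeups and "lock sleeper" wakeups on separate lists, so the latter can be woken with the saved-state semantics of wake_up_lock_sleeper()):

    static void wake_waiters(struct task_struct *waiter,
                             struct task_struct *lock_sleeper)
    {
            DEFINE_WAKE_Q(wake_q);
            DEFINE_WAKE_Q(wake_sleeper_q);

            wake_q_add(&wake_q, waiter);            /* __wake_q_add(..., false) */
            wake_q_add_sleeper(&wake_sleeper_q, lock_sleeper);  /* ..., true */

            wake_up_q(&wake_q);
            wake_up_q_sleeper(&wake_sleeper_q);
    }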
15308 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
15309 index f189a8a3bbb8..107079a2d7ed 100644
15310 --- a/include/linux/seqlock.h
15311 +++ b/include/linux/seqlock.h
15312 @@ -221,20 +221,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
15313 return __read_seqcount_retry(s, start);
15318 -static inline void raw_write_seqcount_begin(seqcount_t *s)
15319 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
15325 -static inline void raw_write_seqcount_end(seqcount_t *s)
15326 +static inline void raw_write_seqcount_begin(seqcount_t *s)
15328 + preempt_disable_rt();
15329 + __raw_write_seqcount_begin(s);
15332 +static inline void __raw_write_seqcount_end(seqcount_t *s)
15338 +static inline void raw_write_seqcount_end(seqcount_t *s)
15340 + __raw_write_seqcount_end(s);
15341 + preempt_enable_rt();
15345 * raw_write_seqcount_barrier - do a seq write barrier
15346 * @s: pointer to seqcount_t
15347 @@ -429,10 +439,33 @@ typedef struct {
15349 * Read side functions for starting and finalizing a read side section.
15351 +#ifndef CONFIG_PREEMPT_RT_FULL
15352 static inline unsigned read_seqbegin(const seqlock_t *sl)
15354 return read_seqcount_begin(&sl->seqcount);
15358 + * Starvation-safe read side for RT
15360 +static inline unsigned read_seqbegin(seqlock_t *sl)
15365 + ret = ACCESS_ONCE(sl->seqcount.sequence);
15366 + if (unlikely(ret & 1)) {
15368 + * Take the lock and let the writer proceed (i.e. possibly
15369 + * boost it), otherwise we could loop here forever.
15371 + spin_unlock_wait(&sl->lock);
15379 static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
15381 @@ -447,36 +480,45 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
15382 static inline void write_seqlock(seqlock_t *sl)
15384 spin_lock(&sl->lock);
15385 - write_seqcount_begin(&sl->seqcount);
15386 + __raw_write_seqcount_begin(&sl->seqcount);
15389 +static inline int try_write_seqlock(seqlock_t *sl)
15391 + if (spin_trylock(&sl->lock)) {
15392 + __raw_write_seqcount_begin(&sl->seqcount);
15398 static inline void write_sequnlock(seqlock_t *sl)
15400 - write_seqcount_end(&sl->seqcount);
15401 + __raw_write_seqcount_end(&sl->seqcount);
15402 spin_unlock(&sl->lock);
15405 static inline void write_seqlock_bh(seqlock_t *sl)
15407 spin_lock_bh(&sl->lock);
15408 - write_seqcount_begin(&sl->seqcount);
15409 + __raw_write_seqcount_begin(&sl->seqcount);
15412 static inline void write_sequnlock_bh(seqlock_t *sl)
15414 - write_seqcount_end(&sl->seqcount);
15415 + __raw_write_seqcount_end(&sl->seqcount);
15416 spin_unlock_bh(&sl->lock);
15419 static inline void write_seqlock_irq(seqlock_t *sl)
15421 spin_lock_irq(&sl->lock);
15422 - write_seqcount_begin(&sl->seqcount);
15423 + __raw_write_seqcount_begin(&sl->seqcount);
15426 static inline void write_sequnlock_irq(seqlock_t *sl)
15428 - write_seqcount_end(&sl->seqcount);
15429 + __raw_write_seqcount_end(&sl->seqcount);
15430 spin_unlock_irq(&sl->lock);
15433 @@ -485,7 +527,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
15434 unsigned long flags;
15436 spin_lock_irqsave(&sl->lock, flags);
15437 - write_seqcount_begin(&sl->seqcount);
15438 + __raw_write_seqcount_begin(&sl->seqcount);
15442 @@ -495,7 +537,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
15444 write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
15446 - write_seqcount_end(&sl->seqcount);
15447 + __raw_write_seqcount_end(&sl->seqcount);
15448 spin_unlock_irqrestore(&sl->lock, flags);
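
The reader-side idiom is untouched by any of this; only the retry behaviour differs between the two read_seqbegin() variants above (spin on !RT, block on ->lock on RT). A minimal reader sketch with hypothetical data, in the standard kernel pattern:

    static DEFINE_SEQLOCK(state_lock);
    static u64 state_a, state_b;

    static void state_read(u64 *a, u64 *b)
    {
            unsigned int seq;

            do {
                    seq = read_seqbegin(&state_lock);
                    *a = state_a;
                    *b = state_b;
            } while (read_seqretry(&state_lock, seq));
    }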
15451 diff --git a/include/linux/signal.h b/include/linux/signal.h
15452 index 042968dd98f0..a7d20f85cc0e 100644
15453 --- a/include/linux/signal.h
15454 +++ b/include/linux/signal.h
15455 @@ -243,6 +243,7 @@ static inline void init_sigpending(struct sigpending *sig)
15458 extern void flush_sigqueue(struct sigpending *queue);
15459 +extern void flush_task_sigqueue(struct task_struct *tsk);
15461 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
15462 static inline int valid_signal(unsigned long sig)
15463 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
15464 index f64e88444082..07576a062ac0 100644
15465 --- a/include/linux/skbuff.h
15466 +++ b/include/linux/skbuff.h
15467 @@ -287,6 +287,7 @@ struct sk_buff_head {
15471 + raw_spinlock_t raw_lock;
15475 @@ -1672,6 +1673,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
15476 __skb_queue_head_init(list);
15479 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
15481 + raw_spin_lock_init(&list->raw_lock);
15482 + __skb_queue_head_init(list);
15485 static inline void skb_queue_head_init_class(struct sk_buff_head *list,
15486 struct lock_class_key *class)
15488 diff --git a/include/linux/smp.h b/include/linux/smp.h
15489 index 9fb239e12b82..5801e516ba63 100644
15490 --- a/include/linux/smp.h
15491 +++ b/include/linux/smp.h
15492 @@ -202,6 +202,9 @@ static inline int get_boot_cpu_id(void)
15493 #define get_cpu() ({ preempt_disable(); smp_processor_id(); })
15494 #define put_cpu() preempt_enable()
15496 +#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); })
15497 +#define put_cpu_light() migrate_enable()
15500 * Callback to arch code if there's nosmp or maxcpus=0 on the
15501 * boot command line:
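
get_cpu_light() is for code that only needs a stable CPU number while it runs, not mutual exclusion against other tasks on that CPU; shared per-CPU data still needs its own lock. A hedged sketch with a hypothetical per-CPU queue (assume each queue's lock and list are initialised at boot):

    struct pcpu_queue {
            spinlock_t       lock;   /* a sleeping lock on RT */
            struct list_head items;
    };
    static DEFINE_PER_CPU(struct pcpu_queue, pcpu_queues);

    static void queue_item(struct list_head *item)
    {
            struct pcpu_queue *q;
            int cpu;

            cpu = get_cpu_light();  /* migrate_disable(): pinned, preemptible */
            q = &per_cpu(pcpu_queues, cpu);
            spin_lock(&q->lock);
            list_add_tail(item, &q->items);
            spin_unlock(&q->lock);
            put_cpu_light();
    }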
15502 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
15503 index 341e1a12bfc7..7c8f0a985b9e 100644
15504 --- a/include/linux/spinlock.h
15505 +++ b/include/linux/spinlock.h
15506 @@ -286,7 +286,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
15507 #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock))
15509 /* Include rwlock functions */
15510 -#include <linux/rwlock.h>
15511 +#ifdef CONFIG_PREEMPT_RT_FULL
15512 +# include <linux/rwlock_rt.h>
15514 +# include <linux/rwlock.h>
15518 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
15519 @@ -297,6 +301,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
15520 # include <linux/spinlock_api_up.h>
15523 +#ifdef CONFIG_PREEMPT_RT_FULL
15524 +# include <linux/spinlock_rt.h>
15525 +#else /* PREEMPT_RT_FULL */
15528 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
15530 @@ -421,4 +429,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
15531 #define atomic_dec_and_lock(atomic, lock) \
15532 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
15534 +#endif /* !PREEMPT_RT_FULL */
15536 #endif /* __LINUX_SPINLOCK_H */
15537 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
15538 index 42dfab89e740..29d99ae5a8ab 100644
15539 --- a/include/linux/spinlock_api_smp.h
15540 +++ b/include/linux/spinlock_api_smp.h
15541 @@ -187,6 +187,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
15545 -#include <linux/rwlock_api_smp.h>
15546 +#ifndef CONFIG_PREEMPT_RT_FULL
15547 +# include <linux/rwlock_api_smp.h>
15550 #endif /* __LINUX_SPINLOCK_API_SMP_H */
15551 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
15552 new file mode 100644
15553 index 000000000000..c95e1f5145ac
15555 +++ b/include/linux/spinlock_rt.h
15557 +#ifndef __LINUX_SPINLOCK_RT_H
15558 +#define __LINUX_SPINLOCK_RT_H
15560 +#ifndef __LINUX_SPINLOCK_H
15561 +#error Do not include directly. Use spinlock.h
15564 +#include <linux/bug.h>
15567 +__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key);
15569 +#define spin_lock_init(slock) \
15571 + static struct lock_class_key __key; \
15573 + rt_mutex_init(&(slock)->lock); \
15574 + __rt_spin_lock_init(slock, #slock, &__key); \
15577 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
15578 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
15579 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
15580 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
15581 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
15582 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
15583 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
15584 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
15585 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
15588 + * lockdep-less calls, for derived types like rwlock:
15589 + * (for trylock they can use rt_mutex_trylock() directly).
15590 + * Migrate disable handling must be done at the call site.
15592 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
15593 +extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
15594 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
15596 +#define spin_lock(lock) rt_spin_lock(lock)
15598 +#define spin_lock_bh(lock) \
15600 + local_bh_disable(); \
15601 + rt_spin_lock(lock); \
15604 +#define spin_lock_irq(lock) spin_lock(lock)
15606 +#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock))
15608 +#define spin_trylock(lock) \
15611 + __locked = spin_do_trylock(lock); \
15615 +#ifdef CONFIG_LOCKDEP
15616 +# define spin_lock_nested(lock, subclass) \
15618 + rt_spin_lock_nested(lock, subclass); \
15621 +#define spin_lock_bh_nested(lock, subclass) \
15623 + local_bh_disable(); \
15624 + rt_spin_lock_nested(lock, subclass); \
15627 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15629 + typecheck(unsigned long, flags); \
15631 + rt_spin_lock_nested(lock, subclass); \
15634 +# define spin_lock_nested(lock, subclass) spin_lock(lock)
15635 +# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(lock)
15637 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15639 + typecheck(unsigned long, flags); \
15641 + spin_lock(lock); \
15645 +#define spin_lock_irqsave(lock, flags) \
15647 + typecheck(unsigned long, flags); \
15649 + spin_lock(lock); \
15652 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
15654 + unsigned long flags = 0;
15655 +#ifdef CONFIG_TRACE_IRQFLAGS
15656 + flags = rt_spin_lock_trace_flags(lock);
15658 + spin_lock(lock); /* lock_local */
15663 +/* FIXME: we need rt_spin_lock_nest_lock */
15664 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
15666 +#define spin_unlock(lock) rt_spin_unlock(lock)
15668 +#define spin_unlock_bh(lock) \
15670 + rt_spin_unlock(lock); \
15671 + local_bh_enable(); \
15674 +#define spin_unlock_irq(lock) spin_unlock(lock)
15676 +#define spin_unlock_irqrestore(lock, flags) \
15678 + typecheck(unsigned long, flags); \
15680 + spin_unlock(lock); \
15683 +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock))
15684 +#define spin_trylock_irq(lock) spin_trylock(lock)
15686 +#define spin_trylock_irqsave(lock, flags) \
15687 + rt_spin_trylock_irqsave(lock, &(flags))
15689 +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock)
15691 +#ifdef CONFIG_GENERIC_LOCKBREAK
15692 +# define spin_is_contended(lock) ((lock)->break_lock)
15694 +# define spin_is_contended(lock) (((void)(lock), 0))
15697 +static inline int spin_can_lock(spinlock_t *lock)
15699 + return !rt_mutex_is_locked(&lock->lock);
15702 +static inline int spin_is_locked(spinlock_t *lock)
15704 + return rt_mutex_is_locked(&lock->lock);
15707 +static inline void assert_spin_locked(spinlock_t *lock)
15709 + BUG_ON(!spin_is_locked(lock));
15712 +#define atomic_dec_and_lock(atomic, lock) \
15713 + atomic_dec_and_spin_lock(atomic, lock)
15716 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
15717 index 73548eb13a5d..10bac715ea96 100644
15718 --- a/include/linux/spinlock_types.h
15719 +++ b/include/linux/spinlock_types.h
15721 * Released under the General Public License (GPL).
15724 -#if defined(CONFIG_SMP)
15725 -# include <asm/spinlock_types.h>
15727 -# include <linux/spinlock_types_up.h>
15730 -#include <linux/lockdep.h>
15732 -typedef struct raw_spinlock {
15733 - arch_spinlock_t raw_lock;
15734 -#ifdef CONFIG_GENERIC_LOCKBREAK
15735 - unsigned int break_lock;
15737 -#ifdef CONFIG_DEBUG_SPINLOCK
15738 - unsigned int magic, owner_cpu;
15741 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15742 - struct lockdep_map dep_map;
15746 -#define SPINLOCK_MAGIC 0xdead4ead
15748 -#define SPINLOCK_OWNER_INIT ((void *)-1L)
15750 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15751 -# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
15753 -# define SPIN_DEP_MAP_INIT(lockname)
15755 +#include <linux/spinlock_types_raw.h>
15757 -#ifdef CONFIG_DEBUG_SPINLOCK
15758 -# define SPIN_DEBUG_INIT(lockname) \
15759 - .magic = SPINLOCK_MAGIC, \
15760 - .owner_cpu = -1, \
15761 - .owner = SPINLOCK_OWNER_INIT,
15762 +#ifndef CONFIG_PREEMPT_RT_FULL
15763 +# include <linux/spinlock_types_nort.h>
15764 +# include <linux/rwlock_types.h>
15766 -# define SPIN_DEBUG_INIT(lockname)
15767 +# include <linux/rtmutex.h>
15768 +# include <linux/spinlock_types_rt.h>
15769 +# include <linux/rwlock_types_rt.h>
15772 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
15774 - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
15775 - SPIN_DEBUG_INIT(lockname) \
15776 - SPIN_DEP_MAP_INIT(lockname) }
15778 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
15779 - (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15781 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15783 -typedef struct spinlock {
15785 - struct raw_spinlock rlock;
15787 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15788 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15790 - u8 __padding[LOCK_PADSIZE];
15791 - struct lockdep_map dep_map;
15797 -#define __SPIN_LOCK_INITIALIZER(lockname) \
15798 - { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15800 -#define __SPIN_LOCK_UNLOCKED(lockname) \
15801 - (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15803 -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15805 -#include <linux/rwlock_types.h>
15807 #endif /* __LINUX_SPINLOCK_TYPES_H */
15808 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
15809 new file mode 100644
15810 index 000000000000..f1dac1fb1d6a
15812 +++ b/include/linux/spinlock_types_nort.h
15814 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
15815 +#define __LINUX_SPINLOCK_TYPES_NORT_H
15817 +#ifndef __LINUX_SPINLOCK_TYPES_H
15818 +#error "Do not include directly. Include spinlock_types.h instead"
15822 + * The non-RT version maps spinlocks to raw_spinlocks
15824 +typedef struct spinlock {
15826 + struct raw_spinlock rlock;
15828 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15829 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15831 + u8 __padding[LOCK_PADSIZE];
15832 + struct lockdep_map dep_map;
15838 +#define __SPIN_LOCK_INITIALIZER(lockname) \
15839 + { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15841 +#define __SPIN_LOCK_UNLOCKED(lockname) \
15842 + (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15844 +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15847 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
15848 new file mode 100644
15849 index 000000000000..03235b475b77
15851 +++ b/include/linux/spinlock_types_raw.h
15853 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
15854 +#define __LINUX_SPINLOCK_TYPES_RAW_H
15856 +#include <linux/types.h>
15858 +#if defined(CONFIG_SMP)
15859 +# include <asm/spinlock_types.h>
15861 +# include <linux/spinlock_types_up.h>
15864 +#include <linux/lockdep.h>
15866 +typedef struct raw_spinlock {
15867 + arch_spinlock_t raw_lock;
15868 +#ifdef CONFIG_GENERIC_LOCKBREAK
15869 + unsigned int break_lock;
15871 +#ifdef CONFIG_DEBUG_SPINLOCK
15872 + unsigned int magic, owner_cpu;
15875 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15876 + struct lockdep_map dep_map;
15880 +#define SPINLOCK_MAGIC 0xdead4ead
15882 +#define SPINLOCK_OWNER_INIT ((void *)-1L)
15884 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15885 +# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
15887 +# define SPIN_DEP_MAP_INIT(lockname)
15890 +#ifdef CONFIG_DEBUG_SPINLOCK
15891 +# define SPIN_DEBUG_INIT(lockname) \
15892 + .magic = SPINLOCK_MAGIC, \
15893 + .owner_cpu = -1, \
15894 + .owner = SPINLOCK_OWNER_INIT,
15896 +# define SPIN_DEBUG_INIT(lockname)
15899 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
15901 + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
15902 + SPIN_DEBUG_INIT(lockname) \
15903 + SPIN_DEP_MAP_INIT(lockname) }
15905 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
15906 + (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15908 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15911 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
15912 new file mode 100644
15913 index 000000000000..3e3d8c5f7a9a
15915 +++ b/include/linux/spinlock_types_rt.h
15917 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
15918 +#define __LINUX_SPINLOCK_TYPES_RT_H
15920 +#ifndef __LINUX_SPINLOCK_TYPES_H
15921 +#error "Do not include directly. Include spinlock_types.h instead"
15924 +#include <linux/cache.h>
15927 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
15929 +typedef struct spinlock {
15930 + struct rt_mutex lock;
15931 + unsigned int break_lock;
15932 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15933 + struct lockdep_map dep_map;
15937 +#ifdef CONFIG_DEBUG_RT_MUTEXES
15938 +# define __RT_SPIN_INITIALIZER(name) \
15940 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15941 + .save_state = 1, \
15942 + .file = __FILE__, \
15943 + .line = __LINE__ , \
15946 +# define __RT_SPIN_INITIALIZER(name) \
15948 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15949 + .save_state = 1, \
15954 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
15957 +#define __SPIN_LOCK_UNLOCKED(name) \
15958 + { .lock = __RT_SPIN_INITIALIZER(name.lock), \
15959 + SPIN_DEP_MAP_INIT(name) }
15961 +#define DEFINE_SPINLOCK(name) \
15962 + spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
15965 diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h
15966 index c09b6407ae1b..b0243ba07fb7 100644
15967 --- a/include/linux/spinlock_types_up.h
15968 +++ b/include/linux/spinlock_types_up.h
15970 #ifndef __LINUX_SPINLOCK_TYPES_UP_H
15971 #define __LINUX_SPINLOCK_TYPES_UP_H
15973 -#ifndef __LINUX_SPINLOCK_TYPES_H
15974 -# error "please don't include this file directly"
15978 * include/linux/spinlock_types_up.h - spinlock type definitions for UP
15980 diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
15981 index 261471f407a5..f41d2fb09f87 100644
15982 --- a/include/linux/srcutiny.h
15983 +++ b/include/linux/srcutiny.h
15984 @@ -43,7 +43,7 @@ struct srcu_struct {
15986 void srcu_drive_gp(struct work_struct *wp);
15988 -#define __SRCU_STRUCT_INIT(name) \
15989 +#define __SRCU_STRUCT_INIT(name, __ignored) \
15991 .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \
15992 .srcu_cb_tail = &name.srcu_cb_head, \
15993 @@ -56,9 +56,9 @@ void srcu_drive_gp(struct work_struct *wp);
15994 * Tree SRCU, which needs some per-CPU data.
15996 #define DEFINE_SRCU(name) \
15997 - struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15998 + struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
15999 #define DEFINE_STATIC_SRCU(name) \
16000 - static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
16001 + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
16003 void synchronize_srcu(struct srcu_struct *sp);
16005 diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
16006 index a949f4f9e4d7..745d4ca4dd50 100644
16007 --- a/include/linux/srcutree.h
16008 +++ b/include/linux/srcutree.h
16009 @@ -40,7 +40,7 @@ struct srcu_data {
16010 unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */
16012 /* Update-side state. */
16013 - raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp;
16014 + spinlock_t __private lock ____cacheline_internodealigned_in_smp;
16015 struct rcu_segcblist srcu_cblist; /* List of callbacks.*/
16016 unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */
16017 unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */
16018 @@ -58,7 +58,7 @@ struct srcu_data {
16019 * Node in SRCU combining tree, similar in function to rcu_data.
16022 - raw_spinlock_t __private lock;
16023 + spinlock_t __private lock;
16024 unsigned long srcu_have_cbs[4]; /* GP seq for children */
16025 /* having CBs, but only */
16026 /* is > ->srcu_gq_seq. */
16027 @@ -78,7 +78,7 @@ struct srcu_struct {
16028 struct srcu_node *level[RCU_NUM_LVLS + 1];
16029 /* First node at each level. */
16030 struct mutex srcu_cb_mutex; /* Serialize CB preparation. */
16031 - raw_spinlock_t __private lock; /* Protect counters */
16032 + spinlock_t __private lock; /* Protect counters */
16033 struct mutex srcu_gp_mutex; /* Serialize GP work. */
16034 unsigned int srcu_idx; /* Current rdr array element. */
16035 unsigned long srcu_gp_seq; /* Grace-period seq #. */
16036 @@ -104,10 +104,10 @@ struct srcu_struct {
16037 #define SRCU_STATE_SCAN1 1
16038 #define SRCU_STATE_SCAN2 2
16040 -#define __SRCU_STRUCT_INIT(name) \
16041 +#define __SRCU_STRUCT_INIT(name, pcpu_name) \
16043 - .sda = &name##_srcu_data, \
16044 - .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
16045 + .sda = &pcpu_name, \
16046 + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
16047 .srcu_gp_seq_needed = 0 - 1, \
16048 __SRCU_DEP_MAP_INIT(name) \
16050 @@ -133,7 +133,7 @@ struct srcu_struct {
16052 #define __DEFINE_SRCU(name, is_static) \
16053 static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\
16054 - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
16055 + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_data)
16056 #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
16057 #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
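
Hand-expanding the Tree SRCU case shows what the extra argument buys: the per-CPU array's name is threaded through explicitly rather than pasted from 'name' inside __SRCU_STRUCT_INIT(), so Tiny SRCU (which has no per-CPU data) can accept and ignore it. Roughly, DEFINE_SRCU(foo) now becomes:

    static DEFINE_PER_CPU(struct srcu_data, foo_srcu_data);
    struct srcu_struct foo = {
            .sda                 = &foo_srcu_data,
            .lock                = __SPIN_LOCK_UNLOCKED(foo.lock),
            .srcu_gp_seq_needed  = 0 - 1,
            /* plus __SRCU_DEP_MAP_INIT(foo) under lockdep */
    };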
16059 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
16060 index 8544357d92d0..616ea66cd283 100644
16061 --- a/include/linux/suspend.h
16062 +++ b/include/linux/suspend.h
16063 @@ -196,6 +196,12 @@ struct platform_s2idle_ops {
16067 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
16068 +extern bool pm_in_action;
16070 +# define pm_in_action false
16073 #ifdef CONFIG_SUSPEND
16074 extern suspend_state_t mem_sleep_current;
16075 extern suspend_state_t mem_sleep_default;
16076 diff --git a/include/linux/swait.h b/include/linux/swait.h
16077 index c98aaf677466..853f3e61a9f4 100644
16078 --- a/include/linux/swait.h
16079 +++ b/include/linux/swait.h
16081 #include <linux/list.h>
16082 #include <linux/stddef.h>
16083 #include <linux/spinlock.h>
16084 +#include <linux/wait.h>
16085 #include <asm/current.h>
16088 @@ -147,6 +148,7 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
16089 extern void swake_up(struct swait_queue_head *q);
16090 extern void swake_up_all(struct swait_queue_head *q);
16091 extern void swake_up_locked(struct swait_queue_head *q);
16092 +extern void swake_up_all_locked(struct swait_queue_head *q);
16094 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
16095 extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
16096 diff --git a/include/linux/swap.h b/include/linux/swap.h
16097 index f02fb5db8914..6c775168df67 100644
16098 --- a/include/linux/swap.h
16099 +++ b/include/linux/swap.h
16101 #include <linux/fs.h>
16102 #include <linux/atomic.h>
16103 #include <linux/page-flags.h>
16104 +#include <linux/locallock.h>
16105 #include <asm/page.h>
16107 struct notifier_block;
16108 @@ -297,7 +298,8 @@ struct vma_swap_readahead {
16109 void *workingset_eviction(struct address_space *mapping, struct page *page);
16110 bool workingset_refault(void *shadow);
16111 void workingset_activation(struct page *page);
16112 -void workingset_update_node(struct radix_tree_node *node, void *private);
16113 +void __workingset_update_node(struct radix_tree_node *node, void *private);
16114 +DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
16116 /* linux/mm/page_alloc.c */
16117 extern unsigned long totalram_pages;
16118 @@ -310,6 +312,7 @@ extern unsigned long nr_free_pagecache_pages(void);
16121 /* linux/mm/swap.c */
16122 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
16123 extern void lru_cache_add(struct page *);
16124 extern void lru_cache_add_anon(struct page *page);
16125 extern void lru_cache_add_file(struct page *page);
16126 diff --git a/include/linux/swork.h b/include/linux/swork.h
16127 new file mode 100644
16128 index 000000000000..f175fa9a6016
16130 +++ b/include/linux/swork.h
16132 +#ifndef _LINUX_SWORK_H
16133 +#define _LINUX_SWORK_H
16135 +#include <linux/list.h>
16137 +struct swork_event {
16138 + struct list_head item;
16139 + unsigned long flags;
16140 + void (*func)(struct swork_event *);
16143 +static inline void INIT_SWORK(struct swork_event *event,
16144 + void (*func)(struct swork_event *))
16146 + event->flags = 0;
16147 + event->func = func;
16150 +bool swork_queue(struct swork_event *sev);
16152 +int swork_get(void);
16153 +void swork_put(void);
16155 +#endif /* _LINUX_SWORK_H */
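
A hedged usage sketch of this API (the callback and callers are made up): swork_get()/swork_put() reference-count the shared worker thread, and swork_queue() hands it an event to run in that thread's context.

    #include <linux/swork.h>
    #include <linux/printk.h>

    static void demo_fn(struct swork_event *ev)
    {
            pr_info("swork event ran\n");   /* runs in the swork thread */
    }

    static struct swork_event demo_event;

    static int demo_start(void)
    {
            int ret = swork_get();          /* bring the worker up */

            if (ret)
                    return ret;
            INIT_SWORK(&demo_event, demo_fn);
            swork_queue(&demo_event);
            return 0;
    }

    static void demo_stop(void)
    {
            swork_put();                    /* drop our reference */
    }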
16156 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
16157 index cf2862bd134a..fd05d83740df 100644
16158 --- a/include/linux/thread_info.h
16159 +++ b/include/linux/thread_info.h
16160 @@ -86,7 +86,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
16161 #define test_thread_flag(flag) \
16162 test_ti_thread_flag(current_thread_info(), flag)
16164 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
16165 +#ifdef CONFIG_PREEMPT_LAZY
16166 +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \
16167 + test_thread_flag(TIF_NEED_RESCHED_LAZY))
16168 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
16169 +#define tif_need_resched_lazy() (test_thread_flag(TIF_NEED_RESCHED_LAZY))
16172 +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
16173 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
16174 +#define tif_need_resched_lazy() 0
16177 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
16178 static inline int arch_within_stack_frames(const void * const stack,
16179 diff --git a/include/linux/timer.h b/include/linux/timer.h
16180 index e0ea1fe87572..df3085ddf662 100644
16181 --- a/include/linux/timer.h
16182 +++ b/include/linux/timer.h
16183 @@ -213,7 +213,7 @@ extern void add_timer(struct timer_list *timer);
16185 extern int try_to_del_timer_sync(struct timer_list *timer);
16188 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
16189 extern int del_timer_sync(struct timer_list *timer);
16191 # define del_timer_sync(t) del_timer(t)
16192 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
16193 index 2bcb4dc6df1a..edd1e42e8a2f 100644
16194 --- a/include/linux/trace_events.h
16195 +++ b/include/linux/trace_events.h
16196 @@ -62,6 +62,9 @@ struct trace_entry {
16197 unsigned char flags;
16198 unsigned char preempt_count;
16200 + unsigned short migrate_disable;
16201 + unsigned short padding;
16202 + unsigned char preempt_lazy_count;
16205 #define TRACE_EVENT_TYPE_MAX \
16206 @@ -402,11 +405,13 @@ enum event_trigger_type {
16208 extern int filter_match_preds(struct event_filter *filter, void *rec);
16210 -extern enum event_trigger_type event_triggers_call(struct trace_event_file *file,
16212 -extern void event_triggers_post_call(struct trace_event_file *file,
16213 - enum event_trigger_type tt,
16215 +extern enum event_trigger_type
16216 +event_triggers_call(struct trace_event_file *file, void *rec,
16217 + struct ring_buffer_event *event);
16219 +event_triggers_post_call(struct trace_event_file *file,
16220 + enum event_trigger_type tt,
16221 + void *rec, struct ring_buffer_event *event);
16223 bool trace_event_ignore_this_pid(struct trace_event_file *trace_file);
16225 @@ -426,7 +431,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
16227 if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
16228 if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
16229 - event_triggers_call(file, NULL);
16230 + event_triggers_call(file, NULL, NULL);
16231 if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
16233 if (eflags & EVENT_FILE_FL_PID_FILTER)
16234 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
16235 index 251e655d407f..57e8e32ef2b0 100644
16236 --- a/include/linux/uaccess.h
16237 +++ b/include/linux/uaccess.h
16238 @@ -185,6 +185,7 @@ static __always_inline void pagefault_disabled_dec(void)
16240 static inline void pagefault_disable(void)
16242 + migrate_disable();
16243 pagefault_disabled_inc();
16245 * make sure to have issued the store before a pagefault
16246 @@ -201,6 +202,7 @@ static inline void pagefault_enable(void)
16249 pagefault_disabled_dec();
16250 + migrate_enable();
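
With migrate_disable() folded in, the usual pagefault_disable() pattern now also pins the task to its CPU on RT, keeping per-CPU state stable across the inatomic access. A sketch of the standard pattern (hypothetical helper):

    static int peek_user_word(const void __user *uaddr, unsigned long *val)
    {
            int ret;

            pagefault_disable();    /* now also migrate_disable() */
            ret = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
            pagefault_enable();     /* now also migrate_enable() */

            return ret ? -EFAULT : 0;
    }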
16254 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
16255 index 1e0cb72e0598..87ab0996a9b0 100644
16256 --- a/include/linux/vmstat.h
16257 +++ b/include/linux/vmstat.h
16258 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
16260 static inline void __count_vm_event(enum vm_event_item item)
16262 + preempt_disable_rt();
16263 raw_cpu_inc(vm_event_states.event[item]);
16264 + preempt_enable_rt();
16267 static inline void count_vm_event(enum vm_event_item item)
16268 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
16270 static inline void __count_vm_events(enum vm_event_item item, long delta)
16272 + preempt_disable_rt();
16273 raw_cpu_add(vm_event_states.event[item], delta);
16274 + preempt_enable_rt();
16277 static inline void count_vm_events(enum vm_event_item item, long delta)
16278 diff --git a/include/linux/wait.h b/include/linux/wait.h
16279 index 158715445ffb..3451706a3074 100644
16280 --- a/include/linux/wait.h
16281 +++ b/include/linux/wait.h
16284 #include <asm/current.h>
16285 #include <uapi/linux/wait.h>
16286 +#include <linux/atomic.h>
16288 typedef struct wait_queue_entry wait_queue_entry_t;
16290 @@ -486,8 +487,8 @@ do { \
16292 struct hrtimer_sleeper __t; \
16294 - hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \
16295 - hrtimer_init_sleeper(&__t, current); \
16296 + hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, HRTIMER_MODE_REL, \
16298 if ((timeout) != KTIME_MAX) \
16299 hrtimer_start_range_ns(&__t.timer, timeout, \
16300 current->timer_slack_ns, \
16301 diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
16302 index 304f7aa9cc01..00d3813cef26 100644
16303 --- a/include/net/gen_stats.h
16304 +++ b/include/net/gen_stats.h
16306 #include <linux/socket.h>
16307 #include <linux/rtnetlink.h>
16308 #include <linux/pkt_sched.h>
16309 +#include <net/net_seq_lock.h>
16311 struct gnet_stats_basic_cpu {
16312 struct gnet_stats_basic_packed bstats;
16313 @@ -36,11 +37,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
16314 spinlock_t *lock, struct gnet_dump *d,
16317 -int gnet_stats_copy_basic(const seqcount_t *running,
16318 +int gnet_stats_copy_basic(net_seqlock_t *running,
16319 struct gnet_dump *d,
16320 struct gnet_stats_basic_cpu __percpu *cpu,
16321 struct gnet_stats_basic_packed *b);
16322 -void __gnet_stats_copy_basic(const seqcount_t *running,
16323 +void __gnet_stats_copy_basic(net_seqlock_t *running,
16324 struct gnet_stats_basic_packed *bstats,
16325 struct gnet_stats_basic_cpu __percpu *cpu,
16326 struct gnet_stats_basic_packed *b);
16327 @@ -57,13 +58,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
16328 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
16329 struct net_rate_estimator __rcu **rate_est,
16330 spinlock_t *stats_lock,
16331 - seqcount_t *running, struct nlattr *opt);
16332 + net_seqlock_t *running, struct nlattr *opt);
16333 void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
16334 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
16335 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
16336 struct net_rate_estimator __rcu **ptr,
16337 spinlock_t *stats_lock,
16338 - seqcount_t *running, struct nlattr *opt);
16339 + net_seqlock_t *running, struct nlattr *opt);
16340 bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
16341 bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
16342 struct gnet_stats_rate_est64 *sample);
16343 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
16344 index a964366a7ef5..51c854583987 100644
16345 --- a/include/net/neighbour.h
16346 +++ b/include/net/neighbour.h
16347 @@ -450,7 +450,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
16351 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
16352 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
16355 unsigned int hh_len;
16356 @@ -474,7 +474,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb
16358 static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
16360 - const struct hh_cache *hh = &n->hh;
16361 + struct hh_cache *hh = &n->hh;
16363 if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
16364 return neigh_hh_output(hh, skb);
16365 @@ -515,7 +515,7 @@ struct neighbour_cb {
16367 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb)
16369 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
16370 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
16371 const struct net_device *dev)
16374 diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
16375 new file mode 100644
16376 index 000000000000..a7034298a82a
16378 +++ b/include/net/net_seq_lock.h
16380 +#ifndef __NET_NET_SEQ_LOCK_H__
16381 +#define __NET_NET_SEQ_LOCK_H__
16383 +#ifdef CONFIG_PREEMPT_RT_BASE
16384 +# define net_seqlock_t seqlock_t
16385 +# define net_seq_begin(__r) read_seqbegin(__r)
16386 +# define net_seq_retry(__r, __s) read_seqretry(__r, __s)
16389 +# define net_seqlock_t seqcount_t
16390 +# define net_seq_begin(__r) read_seqcount_begin(__r)
16391 +# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s)
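
A sketch of the reader pattern the wrappers keep common across both configurations (hypothetical helper; this is essentially what __gnet_stats_copy_basic() does with a Qdisc's ->running):

    static void read_bstats(net_seqlock_t *running,
                            const struct gnet_stats_basic_packed *b,
                            u64 *bytes, u32 *packets)
    {
            unsigned int seq;

            do {
                    seq = net_seq_begin(running);
                    *bytes = b->bytes;
                    *packets = b->packets;
            } while (net_seq_retry(running, seq));
    }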
16395 diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
16396 index f59acacaa265..6ac7c3659973 100644
16397 --- a/include/net/sch_generic.h
16398 +++ b/include/net/sch_generic.h
16400 #include <linux/percpu.h>
16401 #include <linux/dynamic_queue_limits.h>
16402 #include <linux/list.h>
16403 +#include <net/net_seq_lock.h>
16404 #include <linux/refcount.h>
16405 #include <linux/workqueue.h>
16406 #include <net/gen_stats.h>
16407 @@ -90,7 +91,7 @@ struct Qdisc {
16408 struct sk_buff *gso_skb ____cacheline_aligned_in_smp;
16409 struct qdisc_skb_head q;
16410 struct gnet_stats_basic_packed bstats;
16411 - seqcount_t running;
16412 + net_seqlock_t running;
16413 struct gnet_stats_queue qstats;
16414 unsigned long state;
16415 struct Qdisc *next_sched;
16416 @@ -109,13 +110,22 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
16417 refcount_inc(&qdisc->refcnt);
16420 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
16421 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
16423 +#ifdef CONFIG_PREEMPT_RT_BASE
16424 + return spin_is_locked(&qdisc->running.lock) ? true : false;
16426 return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
16430 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
16432 +#ifdef CONFIG_PREEMPT_RT_BASE
16433 + if (try_write_seqlock(&qdisc->running))
16437 if (qdisc_is_running(qdisc))
16439 /* Variant of write_seqcount_begin() telling lockdep a trylock
16440 @@ -124,11 +134,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
16441 raw_write_seqcount_begin(&qdisc->running);
16442 seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
16447 static inline void qdisc_run_end(struct Qdisc *qdisc)
16449 +#ifdef CONFIG_PREEMPT_RT_BASE
16450 + write_sequnlock(&qdisc->running);
16452 write_seqcount_end(&qdisc->running);
16456 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
16457 @@ -337,7 +352,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
16458 return qdisc_lock(root);
16461 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
16462 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
16464 struct Qdisc *root = qdisc_root_sleeping(qdisc);
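
Callers are unchanged by the seqcount/seqlock switch; qdisc_run_begin() stays a trylock-style guard on both configurations. Roughly what the core's qdisc_run() wrapper does:

    static inline void my_qdisc_run(struct Qdisc *q)
    {
            if (qdisc_run_begin(q)) {       /* try_write_seqlock() on RT */
                    __qdisc_run(q);
                    qdisc_run_end(q);
            }
    }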
16466 diff --git a/include/net/xfrm.h b/include/net/xfrm.h
16467 index db99efb2d1d0..a7b95ffbbf8b 100644
16468 --- a/include/net/xfrm.h
16469 +++ b/include/net/xfrm.h
16470 @@ -217,7 +217,7 @@ struct xfrm_state {
16471 struct xfrm_stats stats;
16473 struct xfrm_lifetime_cur curlft;
16474 - struct tasklet_hrtimer mtimer;
16475 + struct hrtimer mtimer;
16477 struct xfrm_state_offload xso;
16479 diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
16480 index c6f728037c53..a57e4ee989d6 100644
16481 --- a/include/trace/events/timer.h
16482 +++ b/include/trace/events/timer.h
16483 @@ -148,7 +148,11 @@ DEFINE_EVENT(timer_class, timer_cancel,
16484 { HRTIMER_MODE_ABS, "ABS" }, \
16485 { HRTIMER_MODE_REL, "REL" }, \
16486 { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \
16487 - { HRTIMER_MODE_REL_PINNED, "REL|PINNED" })
16488 + { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \
16489 + { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \
16490 + { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \
16491 + { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \
16492 + { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" })
16495 * hrtimer_init - called when the hrtimer is initialized
16496 @@ -186,15 +190,16 @@ TRACE_EVENT(hrtimer_init,
16498 TRACE_EVENT(hrtimer_start,
16500 - TP_PROTO(struct hrtimer *hrtimer),
16501 + TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),
16503 - TP_ARGS(hrtimer),
16504 + TP_ARGS(hrtimer, mode),
16507 __field( void *, hrtimer )
16508 __field( void *, function )
16509 __field( s64, expires )
16510 __field( s64, softexpires )
16511 + __field( enum hrtimer_mode, mode )
16515 @@ -202,12 +207,14 @@ TRACE_EVENT(hrtimer_start,
16516 __entry->function = hrtimer->function;
16517 __entry->expires = hrtimer_get_expires(hrtimer);
16518 __entry->softexpires = hrtimer_get_softexpires(hrtimer);
16519 + __entry->mode = mode;
16522 - TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu",
16523 - __entry->hrtimer, __entry->function,
16524 + TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu "
16525 + "mode=%s", __entry->hrtimer, __entry->function,
16526 (unsigned long long) __entry->expires,
16527 - (unsigned long long) __entry->softexpires)
16528 + (unsigned long long) __entry->softexpires,
16529 + decode_hrtimer_mode(__entry->mode))
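
The symbol table extended above belongs to a decode macro along the following lines, defined near the top of this header: __print_symbolic() translates the recorded enum when the trace buffer is read, so TP_printk() emits "REL|PINNED|SOFT" instead of a raw number. Abbreviated sketch:

    #define decode_hrtimer_mode(mode)                        \
            __print_symbolic(mode,                           \
                    { HRTIMER_MODE_ABS,      "ABS" },        \
                    { HRTIMER_MODE_REL,      "REL" },        \
                    { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" },   \
                    { HRTIMER_MODE_REL_SOFT, "REL|SOFT" })
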
16533 diff --git a/init/Kconfig b/init/Kconfig
16534 index 46075327c165..a7aff2c1a203 100644
16537 @@ -744,6 +744,7 @@ config CFS_BANDWIDTH
16538 config RT_GROUP_SCHED
16539 bool "Group scheduling for SCHED_RR/FIFO"
16540 depends on CGROUP_SCHED
16541 + depends on !PREEMPT_RT_FULL
16544 This feature lets you explicitly allocate real CPU bandwidth
16545 @@ -1533,6 +1534,7 @@ choice
16549 + depends on !PREEMPT_RT_FULL
16550 select HAVE_HARDENED_USERCOPY_ALLOCATOR
16552 The regular slab allocator that is established and known to work
16553 @@ -1553,6 +1555,7 @@ config SLUB
16556 bool "SLOB (Simple Allocator)"
16557 + depends on !PREEMPT_RT_FULL
16559 SLOB replaces the stock allocator with a drastically simpler
16560 allocator. SLOB is generally more space efficient but
16561 @@ -1594,7 +1597,7 @@ config SLAB_FREELIST_HARDENED
16563 config SLUB_CPU_PARTIAL
16565 - depends on SLUB && SMP
16566 + depends on SLUB && SMP && !PREEMPT_RT_FULL
16567 bool "SLUB per cpu partial cache"
16569 Per cpu partial caches accelerate object allocation and freeing
16570 diff --git a/init/Makefile b/init/Makefile
16571 index 1dbb23787290..eabf3f1b14be 100644
16572 --- a/init/Makefile
16573 +++ b/init/Makefile
16574 @@ -36,4 +36,4 @@ silent_chk_compile.h = :
16575 include/generated/compile.h: FORCE
16576 @$($(quiet)chk_compile.h)
16577 $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
16578 - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
16579 + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
16580 diff --git a/init/main.c b/init/main.c
16581 index c4a45145e102..c86f3d3b9a72 100644
16584 @@ -543,6 +543,7 @@ asmlinkage __visible void __init start_kernel(void)
16585 setup_command_line(command_line);
16586 setup_nr_cpu_ids();
16587 setup_per_cpu_areas();
16588 + softirq_early_init();
16589 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
16590 boot_cpu_hotplug_init();
16592 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
16593 index 84d882f3e299..af27c4000812 100644
16594 --- a/kernel/Kconfig.locks
16595 +++ b/kernel/Kconfig.locks
16596 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
16598 config MUTEX_SPIN_ON_OWNER
16600 - depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
16601 + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16603 config RWSEM_SPIN_ON_OWNER
16605 - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
16606 + depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16608 config LOCK_SPIN_ON_OWNER
16610 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
16611 index 3f9c97419f02..11dbe26a8279 100644
16612 --- a/kernel/Kconfig.preempt
16613 +++ b/kernel/Kconfig.preempt
16617 + select PREEMPT_COUNT
16619 +config PREEMPT_RT_BASE
16623 +config HAVE_PREEMPT_LAZY
16626 +config PREEMPT_LAZY
16627 + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
16630 prompt "Preemption Model"
16631 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
16633 Select this if you are building a kernel for a desktop system.
16636 +config PREEMPT__LL
16637 bool "Preemptible Kernel (Low-Latency Desktop)"
16638 - select PREEMPT_COUNT
16640 select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
16642 This option reduces the latency of the kernel by making
16643 @@ -52,6 +65,22 @@ config PREEMPT
16644 embedded system with latency requirements in the milliseconds
16647 +config PREEMPT_RTB
16648 + bool "Preemptible Kernel (Basic RT)"
16649 + select PREEMPT_RT_BASE
16651 + This option is basically the same as (Low-Latency Desktop) but
16652 + enables changes which are preliminary for the full preemptible
16653 + RT kernel.
16655 +config PREEMPT_RT_FULL
16656 + bool "Fully Preemptible Kernel (RT)"
16657 + depends on IRQ_FORCED_THREADING
16658 + select PREEMPT_RT_BASE
16659 + select PREEMPT_RCU
16661 + All and everything: the complete RT substitution set, with sleeping spinlocks, forced interrupt threading and preemptible RCU.
16665 config PREEMPT_COUNT
16666 diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
16667 index 3fc11b8851ac..a04c3aded76b 100644
16668 --- a/kernel/cgroup/cgroup.c
16669 +++ b/kernel/cgroup/cgroup.c
16670 @@ -4515,10 +4515,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
16671 queue_work(cgroup_destroy_wq, &css->destroy_work);
16674 -static void css_release_work_fn(struct work_struct *work)
16675 +static void css_release_work_fn(struct swork_event *sev)
16677 struct cgroup_subsys_state *css =
16678 - container_of(work, struct cgroup_subsys_state, destroy_work);
16679 + container_of(sev, struct cgroup_subsys_state, destroy_swork);
16680 struct cgroup_subsys *ss = css->ss;
16681 struct cgroup *cgrp = css->cgroup;
16683 @@ -4569,8 +4569,8 @@ static void css_release(struct percpu_ref *ref)
16684 struct cgroup_subsys_state *css =
16685 container_of(ref, struct cgroup_subsys_state, refcnt);
16687 - INIT_WORK(&css->destroy_work, css_release_work_fn);
16688 - queue_work(cgroup_destroy_wq, &css->destroy_work);
16689 + INIT_SWORK(&css->destroy_swork, css_release_work_fn);
16690 + swork_queue(&css->destroy_swork);
16693 static void init_and_link_css(struct cgroup_subsys_state *css,
16694 @@ -5276,6 +5276,7 @@ static int __init cgroup_wq_init(void)
16696 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
16697 BUG_ON(!cgroup_destroy_wq);
16698 + BUG_ON(swork_get());
16701 core_initcall(cgroup_wq_init);
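
css_release_work_fn() moves from the generic workqueue to the simple-work (swork) framework added by this series, so the release path runs from a context that is valid on RT. The generic conversion pattern, as a sketch; struct my_obj and my_release() are illustrative, while the API calls mirror the hunk above:

    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include <linux/swork.h>        /* added by this series */

    struct my_obj {
            struct swork_event release_swork;
    };

    static void my_release(struct swork_event *sev)
    {
            struct my_obj *o = container_of(sev, struct my_obj, release_swork);

            /* teardown that may sleep is fine here */
            kfree(o);
    }

    static void my_obj_put(struct my_obj *o)
    {
            INIT_SWORK(&o->release_swork, my_release);
            swork_queue(&o->release_swork);
    }

    static int __init my_subsys_init(void)
    {
            /* bring up the swork helper thread once, as cgroup_wq_init() does */
            return swork_get();
    }
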
16702 diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
16703 index 4657e2924ecb..bda2af78277a 100644
16704 --- a/kernel/cgroup/cpuset.c
16705 +++ b/kernel/cgroup/cpuset.c
16706 @@ -288,7 +288,7 @@ static struct cpuset top_cpuset = {
16709 static DEFINE_MUTEX(cpuset_mutex);
16710 -static DEFINE_SPINLOCK(callback_lock);
16711 +static DEFINE_RAW_SPINLOCK(callback_lock);
16713 static struct workqueue_struct *cpuset_migrate_mm_wq;
16715 @@ -926,9 +926,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
16719 - spin_lock_irq(&callback_lock);
16720 + raw_spin_lock_irq(&callback_lock);
16721 cpumask_copy(cp->effective_cpus, new_cpus);
16722 - spin_unlock_irq(&callback_lock);
16723 + raw_spin_unlock_irq(&callback_lock);
16725 WARN_ON(!is_in_v2_mode() &&
16726 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
16727 @@ -993,9 +993,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
16731 - spin_lock_irq(&callback_lock);
16732 + raw_spin_lock_irq(&callback_lock);
16733 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
16734 - spin_unlock_irq(&callback_lock);
16735 + raw_spin_unlock_irq(&callback_lock);
16737 /* use trialcs->cpus_allowed as a temp variable */
16738 update_cpumasks_hier(cs, trialcs->cpus_allowed);
16739 @@ -1179,9 +1179,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
16743 - spin_lock_irq(&callback_lock);
16744 + raw_spin_lock_irq(&callback_lock);
16745 cp->effective_mems = *new_mems;
16746 - spin_unlock_irq(&callback_lock);
16747 + raw_spin_unlock_irq(&callback_lock);
16749 WARN_ON(!is_in_v2_mode() &&
16750 !nodes_equal(cp->mems_allowed, cp->effective_mems));
16751 @@ -1249,9 +1249,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
16755 - spin_lock_irq(&callback_lock);
16756 + raw_spin_lock_irq(&callback_lock);
16757 cs->mems_allowed = trialcs->mems_allowed;
16758 - spin_unlock_irq(&callback_lock);
16759 + raw_spin_unlock_irq(&callback_lock);
16761 /* use trialcs->mems_allowed as a temp variable */
16762 update_nodemasks_hier(cs, &trialcs->mems_allowed);
16763 @@ -1342,9 +1342,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
16764 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
16765 || (is_spread_page(cs) != is_spread_page(trialcs)));
16767 - spin_lock_irq(&callback_lock);
16768 + raw_spin_lock_irq(&callback_lock);
16769 cs->flags = trialcs->flags;
16770 - spin_unlock_irq(&callback_lock);
16771 + raw_spin_unlock_irq(&callback_lock);
16773 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
16774 rebuild_sched_domains_locked();
16775 @@ -1759,7 +1759,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
16776 cpuset_filetype_t type = seq_cft(sf)->private;
16779 - spin_lock_irq(&callback_lock);
16780 + raw_spin_lock_irq(&callback_lock);
16784 @@ -1778,7 +1778,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
16788 - spin_unlock_irq(&callback_lock);
16789 + raw_spin_unlock_irq(&callback_lock);
16793 @@ -1993,12 +1993,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
16797 - spin_lock_irq(&callback_lock);
16798 + raw_spin_lock_irq(&callback_lock);
16799 if (is_in_v2_mode()) {
16800 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
16801 cs->effective_mems = parent->effective_mems;
16803 - spin_unlock_irq(&callback_lock);
16804 + raw_spin_unlock_irq(&callback_lock);
16806 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
16808 @@ -2025,12 +2025,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
16812 - spin_lock_irq(&callback_lock);
16813 + raw_spin_lock_irq(&callback_lock);
16814 cs->mems_allowed = parent->mems_allowed;
16815 cs->effective_mems = parent->mems_allowed;
16816 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
16817 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
16818 - spin_unlock_irq(&callback_lock);
16819 + raw_spin_unlock_irq(&callback_lock);
16821 mutex_unlock(&cpuset_mutex);
16823 @@ -2069,7 +2069,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
16824 static void cpuset_bind(struct cgroup_subsys_state *root_css)
16826 mutex_lock(&cpuset_mutex);
16827 - spin_lock_irq(&callback_lock);
16828 + raw_spin_lock_irq(&callback_lock);
16830 if (is_in_v2_mode()) {
16831 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
16832 @@ -2080,7 +2080,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
16833 top_cpuset.mems_allowed = top_cpuset.effective_mems;
16836 - spin_unlock_irq(&callback_lock);
16837 + raw_spin_unlock_irq(&callback_lock);
16838 mutex_unlock(&cpuset_mutex);
16841 @@ -2094,7 +2094,7 @@ static void cpuset_fork(struct task_struct *task)
16842 if (task_css_is_root(task, cpuset_cgrp_id))
16845 - set_cpus_allowed_ptr(task, &current->cpus_allowed);
16846 + set_cpus_allowed_ptr(task, current->cpus_ptr);
16847 task->mems_allowed = current->mems_allowed;
16850 @@ -2178,12 +2178,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
16854 - spin_lock_irq(&callback_lock);
16855 + raw_spin_lock_irq(&callback_lock);
16856 cpumask_copy(cs->cpus_allowed, new_cpus);
16857 cpumask_copy(cs->effective_cpus, new_cpus);
16858 cs->mems_allowed = *new_mems;
16859 cs->effective_mems = *new_mems;
16860 - spin_unlock_irq(&callback_lock);
16861 + raw_spin_unlock_irq(&callback_lock);
16864 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
16865 @@ -2220,10 +2220,10 @@ hotplug_update_tasks(struct cpuset *cs,
16866 if (nodes_empty(*new_mems))
16867 *new_mems = parent_cs(cs)->effective_mems;
16869 - spin_lock_irq(&callback_lock);
16870 + raw_spin_lock_irq(&callback_lock);
16871 cpumask_copy(cs->effective_cpus, new_cpus);
16872 cs->effective_mems = *new_mems;
16873 - spin_unlock_irq(&callback_lock);
16874 + raw_spin_unlock_irq(&callback_lock);
16877 update_tasks_cpumask(cs);
16878 @@ -2316,21 +2316,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
16880 /* synchronize cpus_allowed to cpu_active_mask */
16881 if (cpus_updated) {
16882 - spin_lock_irq(&callback_lock);
16883 + raw_spin_lock_irq(&callback_lock);
16885 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
16886 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
16887 - spin_unlock_irq(&callback_lock);
16888 + raw_spin_unlock_irq(&callback_lock);
16889 /* we don't mess with cpumasks of tasks in top_cpuset */
16892 /* synchronize mems_allowed to N_MEMORY */
16893 if (mems_updated) {
16894 - spin_lock_irq(&callback_lock);
16895 + raw_spin_lock_irq(&callback_lock);
16897 top_cpuset.mems_allowed = new_mems;
16898 top_cpuset.effective_mems = new_mems;
16899 - spin_unlock_irq(&callback_lock);
16900 + raw_spin_unlock_irq(&callback_lock);
16901 update_tasks_nodemask(&top_cpuset);
16904 @@ -2429,11 +2429,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
16906 unsigned long flags;
16908 - spin_lock_irqsave(&callback_lock, flags);
16909 + raw_spin_lock_irqsave(&callback_lock, flags);
16911 guarantee_online_cpus(task_cs(tsk), pmask);
16913 - spin_unlock_irqrestore(&callback_lock, flags);
16914 + raw_spin_unlock_irqrestore(&callback_lock, flags);
16917 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
16918 @@ -2481,11 +2481,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
16920 unsigned long flags;
16922 - spin_lock_irqsave(&callback_lock, flags);
16923 + raw_spin_lock_irqsave(&callback_lock, flags);
16925 guarantee_online_mems(task_cs(tsk), &mask);
16927 - spin_unlock_irqrestore(&callback_lock, flags);
16928 + raw_spin_unlock_irqrestore(&callback_lock, flags);
16932 @@ -2577,14 +2577,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
16935 /* Not hardwall and node outside mems_allowed: scan up cpusets */
16936 - spin_lock_irqsave(&callback_lock, flags);
16937 + raw_spin_lock_irqsave(&callback_lock, flags);
16940 cs = nearest_hardwall_ancestor(task_cs(current));
16941 allowed = node_isset(node, cs->mems_allowed);
16944 - spin_unlock_irqrestore(&callback_lock, flags);
16945 + raw_spin_unlock_irqrestore(&callback_lock, flags);
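
callback_lock is taken with interrupts disabled from scheduler and hotplug paths; under PREEMPT_RT_FULL a spinlock_t is a sleeping rtmutex-based lock and must not be acquired there, hence the wholesale conversion to raw_spinlock_t above. A minimal illustration of the distinction (the my_* names are made up):

    #include <linux/spinlock.h>

    static DEFINE_RAW_SPINLOCK(my_atomic_lock);     /* spins on RT, too */
    static DEFINE_SPINLOCK(my_sleeping_lock);       /* rtmutex on RT */

    static void my_atomic_path(void)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&my_atomic_lock, flags);
            /* truly atomic: no preemption, nothing may sleep here */
            raw_spin_unlock_irqrestore(&my_atomic_lock, flags);
    }

    static void my_preemptible_path(void)
    {
            spin_lock(&my_sleeping_lock);
            /* on RT this section is preemptible and may block */
            spin_unlock(&my_sleeping_lock);
    }
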
16949 diff --git a/kernel/cpu.c b/kernel/cpu.c
16950 index f3f389e33343..7d777b62e4eb 100644
16953 @@ -74,6 +74,11 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
16954 .fail = CPUHP_INVALID,
16957 +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PREEMPT_RT_FULL)
16958 +static DEFINE_PER_CPU(struct rt_rw_lock, cpuhp_pin_lock) = \
16959 + __RWLOCK_RT_INITIALIZER(cpuhp_pin_lock);
16962 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
16963 static struct lockdep_map cpuhp_state_up_map =
16964 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
16965 @@ -287,6 +292,55 @@ static int cpu_hotplug_disabled;
16967 #ifdef CONFIG_HOTPLUG_CPU
16970 + * pin_current_cpu - Prevent the current cpu from being unplugged
16972 +void pin_current_cpu(void)
16974 +#ifdef CONFIG_PREEMPT_RT_FULL
16975 + struct rt_rw_lock *cpuhp_pin;
16976 + unsigned int cpu;
16980 + cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
16981 + ret = __read_rt_trylock(cpuhp_pin);
16983 + current->pinned_on_cpu = smp_processor_id();
16986 + cpu = smp_processor_id();
16987 + preempt_lazy_enable();
16988 + preempt_enable();
16990 + __read_rt_lock(cpuhp_pin);
16992 + preempt_disable();
16993 + preempt_lazy_disable();
16994 + if (cpu != smp_processor_id()) {
16995 + __read_rt_unlock(cpuhp_pin);
16998 + current->pinned_on_cpu = cpu;
17003 + * unpin_current_cpu - Allow unplug of current cpu
17005 +void unpin_current_cpu(void)
17007 +#ifdef CONFIG_PREEMPT_RT_FULL
17008 + struct rt_rw_lock *cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
17010 + if (WARN_ON(current->pinned_on_cpu != smp_processor_id()))
17011 + cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, current->pinned_on_cpu);
17013 + current->pinned_on_cpu = -1;
17014 + __read_rt_unlock(cpuhp_pin);
17018 DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
17020 void cpus_read_lock(void)
17021 @@ -843,6 +897,9 @@ static int take_cpu_down(void *_param)
17023 static int takedown_cpu(unsigned int cpu)
17025 +#ifdef CONFIG_PREEMPT_RT_FULL
17026 + struct rt_rw_lock *cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, cpu);
17028 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
17031 @@ -855,11 +912,18 @@ static int takedown_cpu(unsigned int cpu)
17035 +#ifdef CONFIG_PREEMPT_RT_FULL
17036 + __write_rt_lock(cpuhp_pin);
17040 * So now all preempt/rcu users must observe !cpu_active().
17042 err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
17044 +#ifdef CONFIG_PREEMPT_RT_FULL
17045 + __write_rt_unlock(cpuhp_pin);
17047 /* CPU refused to die */
17048 irq_unlock_sparse();
17049 /* Unpark the hotplug thread so we can rollback there */
17050 @@ -878,6 +942,9 @@ static int takedown_cpu(unsigned int cpu)
17051 wait_for_ap_thread(st, false);
17052 BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
17054 +#ifdef CONFIG_PREEMPT_RT_FULL
17055 + __write_rt_unlock(cpuhp_pin);
17057 /* Interrupts are moved away from the dying cpu, reenable alloc/free */
17058 irq_unlock_sparse();
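
pin_current_cpu()/unpin_current_cpu() give RT a sleeping replacement for hotplug protection: pinned tasks hold the per-CPU rt_rw_lock for read, while takedown_cpu() write-locks it around stop_machine. Hedged usage sketch; the declarations are assumed to live in the cpu.h part of this series:

    #include <linux/cpu.h>

    static void my_cpu_bound_work(void)
    {
            pin_current_cpu();      /* blocks a concurrent takedown_cpu() */
            /* the current CPU cannot be unplugged until we unpin */
            unpin_current_cpu();
    }
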
17060 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
17061 index ed5d34925ad0..c0d4c24fc241 100644
17062 --- a/kernel/debug/kdb/kdb_io.c
17063 +++ b/kernel/debug/kdb/kdb_io.c
17064 @@ -854,9 +854,11 @@ int kdb_printf(const char *fmt, ...)
17068 + kdb_trap_printk++;
17070 r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
17072 + kdb_trap_printk--;
17076 diff --git a/kernel/events/core.c b/kernel/events/core.c
17077 index 4dbce29a9313..de3d23bae9bf 100644
17078 --- a/kernel/events/core.c
17079 +++ b/kernel/events/core.c
17080 @@ -1065,7 +1065,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
17081 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
17083 raw_spin_lock_init(&cpuctx->hrtimer_lock);
17084 - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
17085 + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
17086 timer->function = perf_mux_hrtimer_handler;
17089 @@ -8760,7 +8760,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
17090 if (!is_sampling_event(event))
17093 - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17094 + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
17095 hwc->hrtimer.function = perf_swevent_hrtimer;
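
perf's mux and sampling timers must keep expiring from hard interrupt context even on RT, since deferring them to softirq would distort the sampling. The _HARD mode suffix added by this series marks such timers; their handlers must stay atomic-safe. A sketch under those assumptions:

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static enum hrtimer_restart my_hard_expiry(struct hrtimer *t)
    {
            /* hard irq context even on PREEMPT_RT_FULL: no sleeping locks */
            hrtimer_forward_now(t, ms_to_ktime(1));
            return HRTIMER_RESTART;
    }

    static void my_hard_timer_start(struct hrtimer *t)
    {
            hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
            t->function = my_hard_expiry;
            hrtimer_start(t, ms_to_ktime(1), HRTIMER_MODE_REL_HARD);
    }
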
17098 diff --git a/kernel/exit.c b/kernel/exit.c
17099 index e3a08761eb40..26f3b352b37a 100644
17100 --- a/kernel/exit.c
17101 +++ b/kernel/exit.c
17102 @@ -159,7 +159,7 @@ static void __exit_signal(struct task_struct *tsk)
17103 * Do this under ->siglock, we can race with another thread
17104 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
17106 - flush_sigqueue(&tsk->pending);
17107 + flush_task_sigqueue(tsk);
17108 tsk->sighand = NULL;
17109 spin_unlock(&sighand->siglock);
17111 diff --git a/kernel/fork.c b/kernel/fork.c
17112 index 6a219fea4926..bc849ac60aa6 100644
17113 --- a/kernel/fork.c
17114 +++ b/kernel/fork.c
17116 #include <linux/hmm.h>
17117 #include <linux/fs.h>
17118 #include <linux/mm.h>
17119 +#include <linux/kprobes.h>
17120 #include <linux/vmacache.h>
17121 #include <linux/nsproxy.h>
17122 #include <linux/capability.h>
17123 @@ -407,13 +408,24 @@ static inline void put_signal_struct(struct signal_struct *sig)
17124 if (atomic_dec_and_test(&sig->sigcnt))
17125 free_signal_struct(sig);
17128 +#ifdef CONFIG_PREEMPT_RT_BASE
17131 void __put_task_struct(struct task_struct *tsk)
17133 WARN_ON(!tsk->exit_state);
17134 WARN_ON(atomic_read(&tsk->usage));
17135 WARN_ON(tsk == current);
17138 + * Remove function-return probe instances associated with this
17139 + * task and put them back on the free list.
17141 + kprobe_flush_task(tsk);
17143 + /* Task is done with its stack. */
17144 + put_task_stack(tsk);
17147 task_numa_free(tsk);
17148 security_task_free(tsk);
17149 @@ -424,7 +436,18 @@ void __put_task_struct(struct task_struct *tsk)
17150 if (!profile_handoff_task(tsk))
17153 +#ifndef CONFIG_PREEMPT_RT_BASE
17154 EXPORT_SYMBOL_GPL(__put_task_struct);
17156 +void __put_task_struct_cb(struct rcu_head *rhp)
17158 + struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
17160 + __put_task_struct(tsk);
17163 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
17166 void __init __weak arch_task_cache_init(void) { }
17168 @@ -563,7 +586,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
17169 #ifdef CONFIG_CC_STACKPROTECTOR
17170 tsk->stack_canary = get_random_canary();
17173 + if (orig->cpus_ptr == &orig->cpus_mask)
17174 + tsk->cpus_ptr = &tsk->cpus_mask;
17176 * One for us, one for whoever does the "release_task()" (usually
17178 @@ -575,6 +599,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
17179 tsk->splice_pipe = NULL;
17180 tsk->task_frag.page = NULL;
17181 tsk->wake_q.next = NULL;
17182 + tsk->wake_q_sleeper.next = NULL;
17184 account_kernel_stack(tsk, 1);
17186 @@ -915,6 +940,19 @@ void __mmdrop(struct mm_struct *mm)
17188 EXPORT_SYMBOL_GPL(__mmdrop);
17190 +#ifdef CONFIG_PREEMPT_RT_BASE
17192 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
17193 + * want another facility to make this work.
17195 +void __mmdrop_delayed(struct rcu_head *rhp)
17197 + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
17203 static inline void __mmput(struct mm_struct *mm)
17205 VM_BUG_ON(atomic_read(&mm->mm_users));
17206 @@ -1496,6 +1534,9 @@ static void rt_mutex_init_task(struct task_struct *p)
17208 static void posix_cpu_timers_init(struct task_struct *tsk)
17210 +#ifdef CONFIG_PREEMPT_RT_BASE
17211 + tsk->posix_timer_list = NULL;
17213 tsk->cputime_expires.prof_exp = 0;
17214 tsk->cputime_expires.virt_exp = 0;
17215 tsk->cputime_expires.sched_exp = 0;
17216 @@ -1648,6 +1689,7 @@ static __latent_entropy struct task_struct *copy_process(
17217 spin_lock_init(&p->alloc_lock);
17219 init_sigpending(&p->pending);
17220 + p->sigqueue_cache = NULL;
17222 p->utime = p->stime = p->gtime = 0;
17223 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
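
On PREEMPT_RT_BASE the final put of a task_struct may happen from a context that cannot free it directly, so __put_task_struct() gets the RCU wrapper __put_task_struct_cb() above. The caller side lives in a separate sched header hunk; presumably it looks roughly like this sketch (task_struct::put_rcu is added by this series, my_put_task_struct() is illustrative):

    #include <linux/rcupdate.h>
    #include <linux/sched/task.h>

    static inline void my_put_task_struct(struct task_struct *t)
    {
            if (atomic_dec_and_test(&t->usage))
                    call_rcu(&t->put_rcu, __put_task_struct_cb);
    }
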
17224 diff --git a/kernel/futex.c b/kernel/futex.c
17225 index 046cd780d057..2ba7fb04a107 100644
17226 --- a/kernel/futex.c
17227 +++ b/kernel/futex.c
17228 @@ -936,7 +936,9 @@ void exit_pi_state_list(struct task_struct *curr)
17229 if (head->next != next) {
17230 /* retain curr->pi_lock for the loop invariant */
17231 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
17232 + raw_spin_unlock_irq(&curr->pi_lock);
17233 spin_unlock(&hb->lock);
17234 + raw_spin_lock_irq(&curr->pi_lock);
17235 put_pi_state(pi_state);
17238 @@ -1430,6 +1432,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
17239 struct task_struct *new_owner;
17240 bool postunlock = false;
17241 DEFINE_WAKE_Q(wake_q);
17242 + DEFINE_WAKE_Q(wake_sleeper_q);
17245 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
17246 @@ -1491,13 +1494,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
17247 pi_state->owner = new_owner;
17248 raw_spin_unlock(&new_owner->pi_lock);
17250 - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
17252 + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
17253 + &wake_sleeper_q);
17255 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
17258 - rt_mutex_postunlock(&wake_q);
17259 + rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
17263 @@ -2104,6 +2107,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
17264 requeue_pi_wake_futex(this, &key2, hb2);
17267 + } else if (ret == -EAGAIN) {
17269 + * Waiter was woken by timeout or
17270 + * signal and has set pi_blocked_on to
17271 + * PI_WAKEUP_INPROGRESS before we
17272 + * tried to enqueue it on the rtmutex.
17274 + this->pi_state = NULL;
17275 + put_pi_state(pi_state);
17279 * rt_mutex_start_proxy_lock() detected a
17280 @@ -2642,10 +2655,9 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
17284 - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
17285 - CLOCK_REALTIME : CLOCK_MONOTONIC,
17286 - HRTIMER_MODE_ABS);
17287 - hrtimer_init_sleeper(to, current);
17288 + hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
17289 + CLOCK_REALTIME : CLOCK_MONOTONIC,
17290 + HRTIMER_MODE_ABS, current);
17291 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
17292 current->timer_slack_ns);
17294 @@ -2744,9 +2756,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
17298 - hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
17299 - HRTIMER_MODE_ABS);
17300 - hrtimer_init_sleeper(to, current);
17301 + hrtimer_init_sleeper_on_stack(to, CLOCK_REALTIME,
17302 + HRTIMER_MODE_ABS, current);
17303 hrtimer_set_expires(&to->timer, *time);
17306 @@ -2801,7 +2812,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
17310 - rt_mutex_init_waiter(&rt_waiter);
17311 + rt_mutex_init_waiter(&rt_waiter, false);
17314 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
17315 @@ -2816,9 +2827,18 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
17316 * lock handoff sequence.
17318 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
17320 + * The migrate_disable() here disables migration in the in_atomic() fast
17321 + * path which is enabled again in the following spin_unlock(). We have
17322 + * one migrate_disable() pending in the slow-path which is reversed
17323 + * after the raw_spin_unlock_irq() where we leave the atomic context.
17325 + migrate_disable();
17327 spin_unlock(q.lock_ptr);
17328 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
17329 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
17330 + migrate_enable();
17334 @@ -2965,11 +2985,21 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
17337 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
17339 + * Magic trickery for now to make the RT migrate disable
17340 + * logic happy. The following spin_unlock() happens with
17341 + * interrupts disabled so the internal migrate_enable()
17342 + * won't undo the migrate_disable() which was issued when
17343 + * locking hb->lock.
17345 + migrate_disable();
17346 spin_unlock(&hb->lock);
17348 /* drops pi_state->pi_mutex.wait_lock */
17349 ret = wake_futex_pi(uaddr, uval, pi_state);
17351 + migrate_enable();
17353 put_pi_state(pi_state);
17356 @@ -3127,7 +3157,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17357 struct hrtimer_sleeper timeout, *to = NULL;
17358 struct futex_pi_state *pi_state = NULL;
17359 struct rt_mutex_waiter rt_waiter;
17360 - struct futex_hash_bucket *hb;
17361 + struct futex_hash_bucket *hb, *hb2;
17362 union futex_key key2 = FUTEX_KEY_INIT;
17363 struct futex_q q = futex_q_init;
17365 @@ -3143,10 +3173,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17369 - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
17370 - CLOCK_REALTIME : CLOCK_MONOTONIC,
17371 - HRTIMER_MODE_ABS);
17372 - hrtimer_init_sleeper(to, current);
17373 + hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
17374 + CLOCK_REALTIME : CLOCK_MONOTONIC,
17375 + HRTIMER_MODE_ABS, current);
17376 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
17377 current->timer_slack_ns);
17379 @@ -3155,7 +3184,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17380 * The waiter is allocated on our stack, manipulated by the requeue
17381 * code while we sleep on uaddr.
17383 - rt_mutex_init_waiter(&rt_waiter);
17384 + rt_mutex_init_waiter(&rt_waiter, false);
17386 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
17387 if (unlikely(ret != 0))
17388 @@ -3186,20 +3215,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17389 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
17390 futex_wait_queue_me(hb, &q, to);
17392 - spin_lock(&hb->lock);
17393 - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17394 - spin_unlock(&hb->lock);
17396 - goto out_put_keys;
17398 + * On RT we must avoid races with requeue and trying to block
17399 + * on two mutexes (hb->lock and uaddr2's rtmutex) by
17400 + * serializing access to pi_blocked_on with pi_lock.
17402 + raw_spin_lock_irq(&current->pi_lock);
17403 + if (current->pi_blocked_on) {
17405 + * We have been requeued or are in the process of
17406 + * being requeued.
17408 + raw_spin_unlock_irq(&current->pi_lock);
17411 + * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
17412 + * prevents a concurrent requeue from moving us to the
17413 + * uaddr2 rtmutex. After that we can safely acquire
17414 + * (and possibly block on) hb->lock.
17416 + current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
17417 + raw_spin_unlock_irq(&current->pi_lock);
17419 + spin_lock(&hb->lock);
17422 + * Clean up pi_blocked_on. We might leak it otherwise
17423 + * when we succeeded with the hb->lock in the fast
17426 + raw_spin_lock_irq(&current->pi_lock);
17427 + current->pi_blocked_on = NULL;
17428 + raw_spin_unlock_irq(&current->pi_lock);
17430 + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17431 + spin_unlock(&hb->lock);
17433 + goto out_put_keys;
17437 - * In order for us to be here, we know our q.key == key2, and since
17438 - * we took the hb->lock above, we also know that futex_requeue() has
17439 - * completed and we no longer have to concern ourselves with a wakeup
17440 - * race with the atomic proxy lock acquisition by the requeue code. The
17441 - * futex_requeue dropped our key1 reference and incremented our key2
17442 - * reference count.
17443 + * In order to be here, we have either been requeued, are in
17444 + * the process of being requeued, or requeue successfully
17445 + * acquired uaddr2 on our behalf. If pi_blocked_on was
17446 + * non-null above, we may be racing with a requeue. Do not
17447 + * rely on q->lock_ptr to be hb2->lock until after blocking on
17448 + * hb->lock or hb2->lock. The futex_requeue dropped our key1
17449 + * reference and incremented our key2 reference count.
17451 + hb2 = hash_futex(&key2);
17453 /* Check if the requeue code acquired the second futex for us. */
17454 if (!q.rt_waiter) {
17455 @@ -3208,7 +3272,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17456 * did a lock-steal - fix up the PI-state in that case.
17458 if (q.pi_state && (q.pi_state->owner != current)) {
17459 - spin_lock(q.lock_ptr);
17460 + spin_lock(&hb2->lock);
17461 + BUG_ON(&hb2->lock != q.lock_ptr);
17462 ret = fixup_pi_state_owner(uaddr2, &q, current);
17463 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
17464 pi_state = q.pi_state;
17465 @@ -3219,7 +3284,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17466 * the requeue_pi() code acquired for us.
17468 put_pi_state(q.pi_state);
17469 - spin_unlock(q.lock_ptr);
17470 + spin_unlock(&hb2->lock);
17473 struct rt_mutex *pi_mutex;
17474 @@ -3233,7 +3298,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17475 pi_mutex = &q.pi_state->pi_mutex;
17476 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
17478 - spin_lock(q.lock_ptr);
17479 + spin_lock(&hb2->lock);
17480 + BUG_ON(&hb2->lock != q.lock_ptr);
17481 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
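
All three futex timeout paths switch from the hrtimer_init_on_stack() + hrtimer_init_sleeper() pair to the combined hrtimer_init_sleeper_on_stack() helper from this series' hrtimer rework. The resulting pattern, sketched with an illustrative my_wait_abs():

    #include <linux/hrtimer.h>
    #include <linux/sched.h>

    static void my_wait_abs(ktime_t abs_time)
    {
            struct hrtimer_sleeper to;

            hrtimer_init_sleeper_on_stack(&to, CLOCK_MONOTONIC,
                                          HRTIMER_MODE_ABS, current);
            hrtimer_set_expires(&to.timer, abs_time);
            hrtimer_start_expires(&to.timer, HRTIMER_MODE_ABS);
            /* ... schedule() until woken or the timer fires ... */
            hrtimer_cancel(&to.timer);
            destroy_hrtimer_on_stack(&to.timer);
    }
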
17484 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
17485 index 79f987b942b8..d1dbacc29941 100644
17486 --- a/kernel/irq/handle.c
17487 +++ b/kernel/irq/handle.c
17488 @@ -183,10 +183,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
17490 irqreturn_t retval;
17491 unsigned int flags = 0;
17492 + struct pt_regs *regs = get_irq_regs();
17493 + u64 ip = regs ? instruction_pointer(regs) : 0;
17495 retval = __handle_irq_event_percpu(desc, &flags);
17497 - add_interrupt_randomness(desc->irq_data.irq, flags);
17498 +#ifdef CONFIG_PREEMPT_RT_FULL
17499 + desc->random_ip = ip;
17501 + add_interrupt_randomness(desc->irq_data.irq, flags, ip);
17505 note_interrupt(desc, retval);
17506 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
17507 index 069311541577..f82dcca81712 100644
17508 --- a/kernel/irq/manage.c
17509 +++ b/kernel/irq/manage.c
17511 #include "internals.h"
17513 #ifdef CONFIG_IRQ_FORCED_THREADING
17514 +# ifndef CONFIG_PREEMPT_RT_BASE
17515 __read_mostly bool force_irqthreads;
17517 static int __init setup_forced_irqthreads(char *arg)
17518 @@ -32,6 +33,7 @@ static int __init setup_forced_irqthreads(char *arg)
17521 early_param("threadirqs", setup_forced_irqthreads);
17525 static void __synchronize_hardirq(struct irq_desc *desc)
17526 @@ -224,7 +226,12 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
17528 if (desc->affinity_notify) {
17529 kref_get(&desc->affinity_notify->kref);
17531 +#ifdef CONFIG_PREEMPT_RT_BASE
17532 + swork_queue(&desc->affinity_notify->swork);
17534 schedule_work(&desc->affinity_notify->work);
17537 irqd_set(data, IRQD_AFFINITY_SET);
17539 @@ -262,10 +269,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
17541 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
17543 -static void irq_affinity_notify(struct work_struct *work)
17544 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
17546 - struct irq_affinity_notify *notify =
17547 - container_of(work, struct irq_affinity_notify, work);
17548 struct irq_desc *desc = irq_to_desc(notify->irq);
17549 cpumask_var_t cpumask;
17550 unsigned long flags;
17551 @@ -287,6 +292,35 @@ static void irq_affinity_notify(struct work_struct *work)
17552 kref_put(&notify->kref, notify->release);
17555 +#ifdef CONFIG_PREEMPT_RT_BASE
17556 +static void init_helper_thread(void)
17558 + static int init_sworker_once;
17560 + if (init_sworker_once)
17562 + if (WARN_ON(swork_get()))
17564 + init_sworker_once = 1;
17567 +static void irq_affinity_notify(struct swork_event *swork)
17569 + struct irq_affinity_notify *notify =
17570 + container_of(swork, struct irq_affinity_notify, swork);
17571 + _irq_affinity_notify(notify);
17576 +static void irq_affinity_notify(struct work_struct *work)
17578 + struct irq_affinity_notify *notify =
17579 + container_of(work, struct irq_affinity_notify, work);
17580 + _irq_affinity_notify(notify);
17585 * irq_set_affinity_notifier - control notification of IRQ affinity changes
17586 * @irq: Interrupt for which to enable/disable notification
17587 @@ -315,7 +349,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
17590 kref_init(&notify->kref);
17591 +#ifdef CONFIG_PREEMPT_RT_BASE
17592 + INIT_SWORK(&notify->swork, irq_affinity_notify);
17593 + init_helper_thread();
17595 INIT_WORK(&notify->work, irq_affinity_notify);
17599 raw_spin_lock_irqsave(&desc->lock, flags);
17600 @@ -883,7 +922,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
17601 local_bh_disable();
17602 ret = action->thread_fn(action->irq, action->dev_id);
17603 irq_finalize_oneshot(desc, action);
17604 - local_bh_enable();
17606 + * Interrupts which have real time requirements can be set up
17607 + * to avoid softirq processing in the thread handler. This is
17608 + * safe as these interrupts do not raise soft interrupts.
17610 + if (irq_settings_no_softirq_call(desc))
17611 + _local_bh_enable();
17613 + local_bh_enable();
17617 @@ -980,6 +1027,12 @@ static int irq_thread(void *data)
17618 if (action_ret == IRQ_WAKE_THREAD)
17619 irq_wake_secondary(desc, action);
17621 +#ifdef CONFIG_PREEMPT_RT_FULL
17622 + migrate_disable();
17623 + add_interrupt_randomness(action->irq, 0,
17624 + desc->random_ip ^ (unsigned long) action);
17625 + migrate_enable();
17627 wake_threads_waitq(desc);
17630 @@ -1378,6 +1431,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
17631 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
17634 + if (new->flags & IRQF_NO_SOFTIRQ_CALL)
17635 + irq_settings_set_no_softirq_call(desc);
17637 if (irq_settings_can_autoenable(desc)) {
17638 irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
17640 @@ -2159,7 +2215,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
17641 * This call sets the internal irqchip state of an interrupt,
17642 * depending on the value of @which.
17644 - * This function should be called with preemption disabled if the
17645 + * This function should be called with migration disabled if the
17646 * interrupt controller has per-cpu registers.
17648 int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
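
Interrupts with hard latency requirements can now opt out of softirq processing at the end of their forced-threaded handler via IRQF_NO_SOFTIRQ_CALL; the flag itself is assumed to be defined in the interrupt.h hunk of this series, and such handlers must not raise softirqs. A hedged request example:

    #include <linux/interrupt.h>

    static irqreturn_t my_rt_handler(int irq, void *dev_id)
    {
            /* must not raise softirqs: they are not processed on this exit */
            return IRQ_HANDLED;
    }

    static int my_request(unsigned int irq, void *dev)
    {
            return request_irq(irq, my_rt_handler, IRQF_NO_SOFTIRQ_CALL,
                               "my-rt-dev", dev);
    }
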
17649 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
17650 index e43795cd2ccf..47e2f9e23586 100644
17651 --- a/kernel/irq/settings.h
17652 +++ b/kernel/irq/settings.h
17653 @@ -17,6 +17,7 @@ enum {
17654 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17655 _IRQ_IS_POLLED = IRQ_IS_POLLED,
17656 _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
17657 + _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL,
17658 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17661 @@ -31,6 +32,7 @@ enum {
17662 #define IRQ_PER_CPU_DEVID GOT_YOU_MORON
17663 #define IRQ_IS_POLLED GOT_YOU_MORON
17664 #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
17665 +#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON
17666 #undef IRQF_MODIFY_MASK
17667 #define IRQF_MODIFY_MASK GOT_YOU_MORON
17669 @@ -41,6 +43,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
17670 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
17673 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
17675 + return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
17678 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
17680 + desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
17683 static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
17685 return desc->status_use_accessors & _IRQ_PER_CPU;
17686 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
17687 index 987d7bca4864..75347fb1dfea 100644
17688 --- a/kernel/irq/spurious.c
17689 +++ b/kernel/irq/spurious.c
17690 @@ -445,6 +445,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
17692 static int __init irqfixup_setup(char *str)
17694 +#ifdef CONFIG_PREEMPT_RT_BASE
17695 + pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17699 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
17700 printk(KERN_WARNING "This may impact system performance.\n");
17701 @@ -457,6 +461,10 @@ module_param(irqfixup, int, 0644);
17703 static int __init irqpoll_setup(char *str)
17705 +#ifdef CONFIG_PREEMPT_RT_BASE
17706 + pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17710 printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
17712 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
17713 index bcf107ce0854..2899ba0d23d1 100644
17714 --- a/kernel/irq_work.c
17715 +++ b/kernel/irq_work.c
17717 #include <linux/cpu.h>
17718 #include <linux/notifier.h>
17719 #include <linux/smp.h>
17720 +#include <linux/interrupt.h>
17721 #include <asm/processor.h>
17724 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
17726 bool irq_work_queue_on(struct irq_work *work, int cpu)
17728 + struct llist_head *list;
17730 /* All work should have been flushed before going offline */
17731 WARN_ON_ONCE(cpu_is_offline(cpu));
17733 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
17734 if (!irq_work_claim(work))
17737 - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
17738 + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
17739 + list = &per_cpu(lazy_list, cpu);
17741 + list = &per_cpu(raised_list, cpu);
17743 + if (llist_add(&work->llnode, list))
17744 arch_send_call_function_single_ipi(cpu);
17747 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
17748 /* Enqueue the irq work @work on the current CPU */
17749 bool irq_work_queue(struct irq_work *work)
17751 + struct llist_head *list;
17752 + bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17754 /* Only queue if not already pending */
17755 if (!irq_work_claim(work))
17757 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
17758 /* Queue the entry and raise the IPI if needed. */
17761 - /* If the work is "lazy", handle it from next tick if any */
17762 - if (work->flags & IRQ_WORK_LAZY) {
17763 - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
17764 - tick_nohz_tick_stopped())
17765 - arch_irq_work_raise();
17767 - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
17768 + lazy_work = work->flags & IRQ_WORK_LAZY;
17770 + if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
17771 + list = this_cpu_ptr(&lazy_list);
17773 + list = this_cpu_ptr(&raised_list);
17775 + if (llist_add(&work->llnode, list)) {
17776 + if (!lazy_work || tick_nohz_tick_stopped())
17777 arch_irq_work_raise();
17780 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
17781 raised = this_cpu_ptr(&raised_list);
17782 lazy = this_cpu_ptr(&lazy_list);
17784 - if (llist_empty(raised) || arch_irq_work_has_interrupt())
17785 - if (llist_empty(lazy))
17787 + if (llist_empty(raised) && llist_empty(lazy))
17790 /* All work should have been flushed before going offline */
17791 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
17792 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
17793 struct irq_work *work;
17794 struct llist_node *llnode;
17796 - BUG_ON(!irqs_disabled());
17797 + BUG_ON_NONRT(!irqs_disabled());
17799 if (llist_empty(list))
17801 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
17802 void irq_work_run(void)
17804 irq_work_run_list(this_cpu_ptr(&raised_list));
17805 - irq_work_run_list(this_cpu_ptr(&lazy_list));
17806 + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
17808 + * NOTE: we raise softirq via IPI for safety,
17809 + * and execute in irq_work_tick() to move the
17810 + * overhead from hard to soft irq context.
17812 + if (!llist_empty(this_cpu_ptr(&lazy_list)))
17813 + raise_softirq(TIMER_SOFTIRQ);
17815 + irq_work_run_list(this_cpu_ptr(&lazy_list));
17817 EXPORT_SYMBOL_GPL(irq_work_run);
17819 @@ -179,8 +200,17 @@ void irq_work_tick(void)
17821 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
17822 irq_work_run_list(raised);
17824 + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
17825 + irq_work_run_list(this_cpu_ptr(&lazy_list));
17828 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
17829 +void irq_work_tick_soft(void)
17831 irq_work_run_list(this_cpu_ptr(&lazy_list));
17836 * Synchronize against the irq_work @entry, ensures the entry is not
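
On RT, plain irq_work is routed to the lazy list and executed from TIMER_SOFTIRQ via irq_work_tick_soft(); only work flagged IRQ_WORK_HARD_IRQ (per the irq_work.h hunk of this series) still runs from the hard interrupt. A sketch of the two classes:

    #include <linux/irq_work.h>

    static void my_cb(struct irq_work *w)
    {
    }

    static struct irq_work my_soft_work = {
            .func  = my_cb,                 /* TIMER_SOFTIRQ on RT */
    };

    static struct irq_work my_hard_work = {
            .flags = IRQ_WORK_HARD_IRQ,     /* hard irq even on RT */
            .func  = my_cb,
    };

    static void my_kick(void)
    {
            irq_work_queue(&my_soft_work);
            irq_work_queue(&my_hard_work);
    }
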
17837 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
17838 index 46ba853656f6..9a23632b6294 100644
17839 --- a/kernel/ksysfs.c
17840 +++ b/kernel/ksysfs.c
17841 @@ -140,6 +140,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
17843 #endif /* CONFIG_CRASH_CORE */
17845 +#if defined(CONFIG_PREEMPT_RT_FULL)
17846 +static ssize_t realtime_show(struct kobject *kobj,
17847 + struct kobj_attribute *attr, char *buf)
17849 + return sprintf(buf, "%d\n", 1);
17851 +KERNEL_ATTR_RO(realtime);
17854 /* whether file capabilities are enabled */
17855 static ssize_t fscaps_show(struct kobject *kobj,
17856 struct kobj_attribute *attr, char *buf)
17857 @@ -230,6 +239,9 @@ static struct attribute * kernel_attrs[] = {
17858 #ifndef CONFIG_TINY_RCU
17859 &rcu_expedited_attr.attr,
17860 &rcu_normal_attr.attr,
17862 +#ifdef CONFIG_PREEMPT_RT_FULL
17863 + &realtime_attr.attr,
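
The new attribute gives userspace a stable probe for an RT kernel: /sys/kernel/realtime exists and reads 1 only under PREEMPT_RT_FULL. A small userspace check, for illustration:

    #include <stdio.h>

    static int kernel_is_rt_full(void)
    {
            FILE *f = fopen("/sys/kernel/realtime", "r");
            int rt = 0;

            if (!f)
                    return 0;       /* attribute absent: not an RT kernel */
            if (fscanf(f, "%d", &rt) != 1)
                    rt = 0;
            fclose(f);
            return rt == 1;
    }
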
17867 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
17868 index 392c7f23af76..c0bf04b6b965 100644
17869 --- a/kernel/locking/Makefile
17870 +++ b/kernel/locking/Makefile
17872 # and is generally not a function of system call inputs.
17873 KCOV_INSTRUMENT := n
17875 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
17876 +obj-y += semaphore.o percpu-rwsem.o
17878 ifdef CONFIG_FUNCTION_TRACER
17879 CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
17880 @@ -12,7 +12,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
17881 CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
17884 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17886 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
17889 obj-$(CONFIG_LOCKDEP) += lockdep.o
17890 ifeq ($(CONFIG_PROC_FS),y)
17891 obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
17892 @@ -25,8 +29,11 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
17893 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
17894 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
17895 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
17896 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17897 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
17898 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
17900 +obj-$(CONFIG_PREEMPT_RT_FULL) += mutex-rt.o rwsem-rt.o rwlock-rt.o
17901 obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
17902 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
17903 obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
17904 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
17905 index d7c155048ea9..def51a27f20f 100644
17906 --- a/kernel/locking/lockdep.c
17907 +++ b/kernel/locking/lockdep.c
17908 @@ -3914,6 +3914,7 @@ static void check_flags(unsigned long flags)
17912 +#ifndef CONFIG_PREEMPT_RT_FULL
17914 * We dont accurately track softirq state in e.g.
17915 * hardirq contexts (such as on 4KSTACKS), so only
17916 @@ -3928,6 +3929,7 @@ static void check_flags(unsigned long flags)
17917 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
17923 print_irqtrace_events(current);
17924 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
17925 index 6dca260eeccf..5d01ac590d4c 100644
17926 --- a/kernel/locking/locktorture.c
17927 +++ b/kernel/locking/locktorture.c
17929 #include <linux/kthread.h>
17930 #include <linux/sched/rt.h>
17931 #include <linux/spinlock.h>
17932 -#include <linux/rwlock.h>
17933 #include <linux/mutex.h>
17934 #include <linux/rwsem.h>
17935 #include <linux/smp.h>
17936 diff --git a/kernel/locking/mutex-rt.c b/kernel/locking/mutex-rt.c
17937 new file mode 100644
17938 index 000000000000..4f81595c0f52
17940 +++ b/kernel/locking/mutex-rt.c
17945 + * Real-Time Preemption Support
17947 + * started by Ingo Molnar:
17949 + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17950 + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17952 + * historic credit for proving that Linux spinlocks can be implemented via
17953 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
17954 + * and others) who prototyped it on 2.4 and did lots of comparative
17955 + * research and analysis; TimeSys, for proving that you can implement a
17956 + * fully preemptible kernel via the use of IRQ threading and mutexes;
17957 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
17958 + * right one; and to MontaVista, who ported pmutexes to 2.6.
17960 + * This code is a from-scratch implementation and is not based on pmutexes,
17961 + * but the idea of converting spinlocks to mutexes is used here too.
17963 + * lock debugging, locking tree, deadlock detection:
17965 + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
17966 + * Released under the General Public License (GPL).
17968 + * Includes portions of the generic R/W semaphore implementation from:
17970 + * Copyright (c) 2001 David Howells (dhowells@redhat.com).
17971 + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
17972 + * - Derived also from comments by Linus
17974 + * Pending ownership of locks and ownership stealing:
17976 + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
17978 + * (also by Steven Rostedt)
17979 + * - Converted single pi_lock to individual task locks.
17981 + * By Esben Nielsen:
17982 + * Doing priority inheritance with help of the scheduler.
17984 + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17985 + * - major rework based on Esben Nielsens initial patch
17986 + * - replaced thread_info references by task_struct refs
17987 + * - removed task->pending_owner dependency
17988 + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
17989 + * in the scheduler return path as discussed with Steven Rostedt
17991 + * Copyright (C) 2006, Kihon Technologies Inc.
17992 + * Steven Rostedt <rostedt@goodmis.org>
17993 + * - debugged and patched Thomas Gleixner's rework.
17994 + * - added back the cmpxchg to the rework.
17995 + * - turned atomic require back on for SMP.
17998 +#include <linux/spinlock.h>
17999 +#include <linux/rtmutex.h>
18000 +#include <linux/sched.h>
18001 +#include <linux/delay.h>
18002 +#include <linux/module.h>
18003 +#include <linux/kallsyms.h>
18004 +#include <linux/syscalls.h>
18005 +#include <linux/interrupt.h>
18006 +#include <linux/plist.h>
18007 +#include <linux/fs.h>
18008 +#include <linux/futex.h>
18009 +#include <linux/hrtimer.h>
18011 +#include "rtmutex_common.h"
18014 + * struct mutex functions
18016 +void __mutex_do_init(struct mutex *mutex, const char *name,
18017 + struct lock_class_key *key)
18019 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18021 + * Make sure we are not reinitializing a held lock:
18023 + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
18024 + lockdep_init_map(&mutex->dep_map, name, key, 0);
18026 + mutex->lock.save_state = 0;
18028 +EXPORT_SYMBOL(__mutex_do_init);
18030 +void __lockfunc _mutex_lock(struct mutex *lock)
18032 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18033 + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
18035 +EXPORT_SYMBOL(_mutex_lock);
18037 +void __lockfunc _mutex_lock_io(struct mutex *lock)
18041 + token = io_schedule_prepare();
18042 + _mutex_lock(lock);
18043 + io_schedule_finish(token);
18045 +EXPORT_SYMBOL_GPL(_mutex_lock_io);
18047 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
18051 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18052 + ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
18054 + mutex_release(&lock->dep_map, 1, _RET_IP_);
18057 +EXPORT_SYMBOL(_mutex_lock_interruptible);
18059 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
18063 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18064 + ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
18066 + mutex_release(&lock->dep_map, 1, _RET_IP_);
18069 +EXPORT_SYMBOL(_mutex_lock_killable);
18071 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18072 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
18074 + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18075 + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
18077 +EXPORT_SYMBOL(_mutex_lock_nested);
18079 +void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass)
18083 + token = io_schedule_prepare();
18085 + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18086 + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
18088 + io_schedule_finish(token);
18090 +EXPORT_SYMBOL_GPL(_mutex_lock_io_nested);
18092 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
18094 + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
18095 + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
18097 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
18099 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
18103 + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18104 + ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
18106 + mutex_release(&lock->dep_map, 1, _RET_IP_);
18109 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
18111 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
18115 + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
18116 + ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
18118 + mutex_release(&lock->dep_map, 1, _RET_IP_);
18121 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
18124 +int __lockfunc _mutex_trylock(struct mutex *lock)
18126 + int ret = __rt_mutex_trylock(&lock->lock);
18129 + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18133 +EXPORT_SYMBOL(_mutex_trylock);
18135 +void __lockfunc _mutex_unlock(struct mutex *lock)
18137 + mutex_release(&lock->dep_map, 1, _RET_IP_);
18138 + __rt_mutex_unlock(&lock->lock);
18140 +EXPORT_SYMBOL(_mutex_unlock);
18143 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
18144 + * @cnt: the atomic which we are to dec
18145 + * @lock: the mutex to return holding if we dec to 0
18147 + * return true and hold lock if we dec to 0, return false otherwise
18149 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
18151 + /* dec if we can't possibly hit 0 */
18152 + if (atomic_add_unless(cnt, -1, 1))
18154 + /* we might hit 0, so take the lock */
18155 + mutex_lock(lock);
18156 + if (!atomic_dec_and_test(cnt)) {
18157 + /* when we actually did the dec, we didn't hit 0 */
18158 + mutex_unlock(lock);
18161 + /* we hit 0, and we hold the lock */
18164 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
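
atomic_dec_and_mutex_lock() keeps its usual semantics in this RT reimplementation: the mutex is taken only when the count can actually reach zero, so teardown is serialized for free. Typical use, sketched with illustrative my_* names:

    #include <linux/atomic.h>
    #include <linux/mutex.h>

    static DEFINE_MUTEX(my_teardown_lock);

    static void my_put(atomic_t *refs)
    {
            if (!atomic_dec_and_mutex_lock(refs, &my_teardown_lock))
                    return;                 /* not the last reference */
            /* last reference: tear down while holding my_teardown_lock */
            mutex_unlock(&my_teardown_lock);
    }
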
18165 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
18166 index 4ad35718f123..08e233b7dc21 100644
18167 --- a/kernel/locking/rtmutex.c
18168 +++ b/kernel/locking/rtmutex.c
18170 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18171 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
18172 * Copyright (C) 2006 Esben Nielsen
18173 + * Adaptive Spinlocks:
18174 + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
18175 + * and Peter Morreale,
18176 + * Adaptive Spinlocks simplification:
18177 + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
18179 * See Documentation/locking/rt-mutex-design.txt for details.
18182 #include <linux/sched/wake_q.h>
18183 #include <linux/sched/debug.h>
18184 #include <linux/timer.h>
18185 +#include <linux/ww_mutex.h>
18186 +#include <linux/blkdev.h>
18188 #include "rtmutex_common.h"
18190 @@ -135,6 +142,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
18191 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
18194 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
18196 + return waiter && waiter != PI_WAKEUP_INPROGRESS &&
18197 + waiter != PI_REQUEUE_INPROGRESS;
18201 * We can speed up the acquire/release, if there's no debugging state to be
18203 @@ -228,7 +241,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
18204 * Only use with rt_mutex_waiter_{less,equal}()
18206 #define task_to_waiter(p) \
18207 - &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
18208 + &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) }
18211 rt_mutex_waiter_less(struct rt_mutex_waiter *left,
18212 @@ -268,6 +281,27 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
18216 +#define STEAL_NORMAL 0
18217 +#define STEAL_LATERAL 1
18220 +rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode)
18222 + struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
18224 + if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter))
18228 + * Note that RT tasks are excluded from lateral-steals
18229 + * to prevent the introduction of an unbounded latency.
18231 + if (mode == STEAL_NORMAL || rt_task(waiter->task))
18234 + return rt_mutex_waiter_equal(waiter, top_waiter);
18238 rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
18240 @@ -372,6 +406,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
18241 return debug_rt_mutex_detect_deadlock(waiter, chwalk);
18244 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
18246 + if (waiter->savestate)
18247 + wake_up_lock_sleeper(waiter->task);
18249 + wake_up_process(waiter->task);
18253 * Max number of times we'll walk the boosting chain:
18255 @@ -379,7 +421,8 @@ int max_lock_depth = 1024;
18257 static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
18259 - return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
18260 + return rt_mutex_real_waiter(p->pi_blocked_on) ?
18261 + p->pi_blocked_on->lock : NULL;
18265 @@ -515,7 +558,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18266 * reached or the state of the chain has changed while we
18267 * dropped the locks.
18270 + if (!rt_mutex_real_waiter(waiter))
18271 goto out_unlock_pi;
18274 @@ -696,13 +739,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18275 * follow here. This is the end of the chain we are walking.
18277 if (!rt_mutex_owner(lock)) {
18278 + struct rt_mutex_waiter *lock_top_waiter;
18281 * If the requeue [7] above changed the top waiter,
18282 * then we need to wake the new top waiter up to try
18285 - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
18286 - wake_up_process(rt_mutex_top_waiter(lock)->task);
18287 + lock_top_waiter = rt_mutex_top_waiter(lock);
18288 + if (prerequeue_top_waiter != lock_top_waiter)
18289 + rt_mutex_wake_waiter(lock_top_waiter);
18290 raw_spin_unlock_irq(&lock->wait_lock);
18293 @@ -804,9 +850,11 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18294 * @task: The task which wants to acquire the lock
18295 * @waiter: The waiter that is queued to the lock's wait tree if the
18296 * callsite called task_blocked_on_lock(), otherwise NULL
18297 + * @mode: Lock steal mode (STEAL_NORMAL, STEAL_LATERAL)
18299 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18300 - struct rt_mutex_waiter *waiter)
18301 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
18302 + struct task_struct *task,
18303 + struct rt_mutex_waiter *waiter, int mode)
18305 lockdep_assert_held(&lock->wait_lock);
18307 @@ -842,12 +890,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18311 - * If waiter is not the highest priority waiter of
18312 - * @lock, give up.
18313 + * If waiter is not the highest priority waiter of @lock,
18314 + * or its peer when lateral steal is allowed, give up.
18316 - if (waiter != rt_mutex_top_waiter(lock))
18317 + if (!rt_mutex_steal(lock, waiter, mode))
18321 * We can acquire the lock. Remove the waiter from the
18322 * lock waiters tree.
18323 @@ -865,14 +912,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18325 if (rt_mutex_has_waiters(lock)) {
18327 - * If @task->prio is greater than or equal to
18328 - * the top waiter priority (kernel view),
18330 + * If @task->prio is greater than the top waiter
18331 + * priority (kernel view), or equal to it when a
18332 + * lateral steal is forbidden, @task lost.
18334 - if (!rt_mutex_waiter_less(task_to_waiter(task),
18335 - rt_mutex_top_waiter(lock)))
18336 + if (!rt_mutex_steal(lock, task_to_waiter(task), mode))
18340 * The current top waiter stays enqueued. We
18341 * don't have to change anything in the lock
18342 @@ -919,6 +964,351 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18346 +#ifdef CONFIG_PREEMPT_RT_FULL
18348 + * preemptible spin_lock functions:
18350 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
18351 + void (*slowfn)(struct rt_mutex *lock))
18353 + might_sleep_no_state_check();
18355 + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18361 +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
18362 + void (*slowfn)(struct rt_mutex *lock))
18364 + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
18371 + * Note that owner is a speculative pointer and dereferencing relies
18372 + * on rcu_read_lock() and the check against the lock owner.
18374 +static int adaptive_wait(struct rt_mutex *lock,
18375 + struct task_struct *owner)
18381 + if (owner != rt_mutex_owner(lock))
18384 + * Ensure that owner->on_cpu is dereferenced _after_
18385 + * checking the above to be valid.
18388 + if (!owner->on_cpu) {
18394 + rcu_read_unlock();
18398 +static int adaptive_wait(struct rt_mutex *lock,
18399 + struct task_struct *orig_owner)
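As a rough userspace model of the adaptive-wait heuristic above (a sketch under stated assumptions: on_cpu stands in for owner->on_cpu, and the rcu_read_lock() protection of the speculative owner pointer is elided):

    #include <stdbool.h>

    struct task_model { bool on_cpu; };
    struct lock_model { struct task_model *owner; };

    /* Models the decision in adaptive_wait(): spin only while the owner
     * we blocked behind still holds the lock and is running on a CPU;
     * in every other case blocking is cheaper than burning cycles. */
    static bool keep_spinning(const struct lock_model *l,
                              const struct task_model *owner)
    {
        if (owner != l->owner)
            return false;       /* owner changed: retry the acquisition */
        return owner->on_cpu;   /* owner preempted: go to sleep instead */
    }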
18405 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18406 + struct rt_mutex_waiter *waiter,
18407 + struct task_struct *task,
18408 + enum rtmutex_chainwalk chwalk);
18410 + * Slow path lock function spin_lock style: this variant is very
18411 + * careful not to miss any non-lock wakeups.
18413 + * We store the current state under p->pi_lock in p->saved_state and
18414 + * the try_to_wake_up() code handles this accordingly.
18416 +void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
18417 + struct rt_mutex_waiter *waiter,
18418 + unsigned long flags)
18420 + struct task_struct *lock_owner, *self = current;
18421 + struct rt_mutex_waiter *top_waiter;
18424 + if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL))
18427 + BUG_ON(rt_mutex_owner(lock) == self);
18430 + * We save whatever state the task is in and we'll restore it
18431 + * after acquiring the lock taking real wakeups into account
18432 + * as well. We are serialized via pi_lock against wakeups. See
18433 + * try_to_wake_up().
18435 + raw_spin_lock(&self->pi_lock);
18436 + self->saved_state = self->state;
18437 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
18438 + raw_spin_unlock(&self->pi_lock);
18440 + ret = task_blocks_on_rt_mutex(lock, waiter, self, RT_MUTEX_MIN_CHAINWALK);
18444 + /* Try to acquire the lock again. */
18445 + if (__try_to_take_rt_mutex(lock, self, waiter, STEAL_LATERAL))
18448 + top_waiter = rt_mutex_top_waiter(lock);
18449 + lock_owner = rt_mutex_owner(lock);
18451 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18453 + debug_rt_mutex_print_deadlock(waiter);
18455 + if (top_waiter != waiter || adaptive_wait(lock, lock_owner))
18458 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
18460 + raw_spin_lock(&self->pi_lock);
18461 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
18462 + raw_spin_unlock(&self->pi_lock);
18466 + * Restore the task state to current->saved_state. We set it
18467 + * to the original state above and the try_to_wake_up() code
18468 + * has possibly updated it when a real (non-rtmutex) wakeup
18469 + * happened while we were blocked. Clear saved_state so
18470 + * try_to_wake_up() does not get confused.
18472 + raw_spin_lock(&self->pi_lock);
18473 + __set_current_state_no_track(self->saved_state);
18474 + self->saved_state = TASK_RUNNING;
18475 + raw_spin_unlock(&self->pi_lock);
18478 + * try_to_take_rt_mutex() sets the waiter bit
18479 + * unconditionally. We might have to fix that up:
18481 + fixup_rt_mutex_waiters(lock);
18483 + BUG_ON(rt_mutex_has_waiters(lock) && waiter == rt_mutex_top_waiter(lock));
18484 + BUG_ON(!RB_EMPTY_NODE(&waiter->tree_entry));
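The saved_state handshake used above can be modeled in userspace as follows (a hedged sketch, not the kernel implementation; pi_lock serialization is elided). The point it shows: a real wakeup arriving while the task sleeps for the spinlock lands in saved_state and is replayed once the lock is acquired, so it is never lost:

    enum state { RUNNING, INTERRUPTIBLE, UNINTERRUPTIBLE };

    struct task_model {
        enum state state;       /* what the scheduler acts on */
        enum state saved_state; /* what the caller originally set */
    };

    static void block_on_spinlock(struct task_model *t)
    {
        t->saved_state = t->state;      /* stash the caller's state */
        t->state = UNINTERRUPTIBLE;     /* sleep for the lock only */
    }

    /* Models try_to_wake_up(): a lock-sleeper wakeup hits 'state'; a
     * real wakeup is recorded in 'saved_state' instead of being lost. */
    static void wake_model(struct task_model *t, int lock_wakeup)
    {
        if (lock_wakeup)
            t->state = RUNNING;
        else
            t->saved_state = RUNNING;
    }

    static void acquired_spinlock(struct task_model *t)
    {
        t->state = t->saved_state;      /* replay any real wakeup */
        t->saved_state = RUNNING;
    }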
18487 +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
18489 + struct rt_mutex_waiter waiter;
18490 + unsigned long flags;
18492 + rt_mutex_init_waiter(&waiter, true);
18494 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
18495 + rt_spin_lock_slowlock_locked(lock, &waiter, flags);
18496 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18497 + debug_rt_mutex_free_waiter(&waiter);
18500 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
18501 + struct wake_q_head *wake_q,
18502 + struct wake_q_head *wq_sleeper);
18504 + * Slow path to release a rt_mutex spin_lock style
18506 +void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
18508 + unsigned long flags;
18509 + DEFINE_WAKE_Q(wake_q);
18510 + DEFINE_WAKE_Q(wake_sleeper_q);
18513 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
18514 + postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q);
18515 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18518 + rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
18521 +void __lockfunc rt_spin_lock(spinlock_t *lock)
18523 + sleeping_lock_inc();
18524 + migrate_disable();
18525 + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18526 + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
18528 +EXPORT_SYMBOL(rt_spin_lock);
18530 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
18532 + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
18535 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18536 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
18538 + sleeping_lock_inc();
18539 + migrate_disable();
18540 + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
18541 + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
18543 +EXPORT_SYMBOL(rt_spin_lock_nested);
18546 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
18548 + /* NOTE: we always pass in '1' for nested, for simplicity */
18549 + spin_release(&lock->dep_map, 1, _RET_IP_);
18550 + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
18551 + migrate_enable();
18552 + sleeping_lock_dec();
18554 +EXPORT_SYMBOL(rt_spin_unlock);
18556 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
18558 + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
18560 +EXPORT_SYMBOL(__rt_spin_unlock);
18563 + * Wait for the lock to get unlocked: instead of polling for an unlock
18564 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
18565 + * schedule if there's contention:
18567 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
18570 + spin_unlock(lock);
18572 +EXPORT_SYMBOL(rt_spin_unlock_wait);
18574 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
18578 + sleeping_lock_inc();
18579 + migrate_disable();
18580 + ret = __rt_mutex_trylock(&lock->lock);
18582 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18584 + migrate_enable();
18585 + sleeping_lock_dec();
18589 +EXPORT_SYMBOL(rt_spin_trylock);
18591 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
18595 + local_bh_disable();
18596 + ret = __rt_mutex_trylock(&lock->lock);
18598 + sleeping_lock_inc();
18599 + migrate_disable();
18600 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18602 + local_bh_enable();
18605 +EXPORT_SYMBOL(rt_spin_trylock_bh);
18607 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
18612 + ret = __rt_mutex_trylock(&lock->lock);
18614 + sleeping_lock_inc();
18615 + migrate_disable();
18616 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18620 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
18622 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
18624 + /* Subtract 1 from counter unless that drops it to 0 (i.e. it was 1) */
18625 + if (atomic_add_unless(atomic, -1, 1))
18627 + rt_spin_lock(lock);
18628 + if (atomic_dec_and_test(atomic))
18630 + rt_spin_unlock(lock);
18633 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
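A typical caller looks roughly like this (a usage sketch, not taken from this patch; 'struct obj' with an embedded refcount, lock and list node is hypothetical):

    /* Drop a reference; only the final put takes the lock, so the
     * common case stays lock-free. On a nonzero return the lock is
     * held and must be released by the caller. */
    static void obj_put(struct obj *obj)
    {
        if (!atomic_dec_and_spin_lock(&obj->refcount, &obj->lock))
            return;                 /* not the last reference */
        list_del(&obj->node);       /* teardown under the lock */
        spin_unlock(&obj->lock);
        kfree(obj);
    }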
18636 +__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key)
18638 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18640 + * Make sure we are not reinitializing a held lock:
18642 + debug_check_no_locks_freed((void *)lock, sizeof(*lock));
18643 + lockdep_init_map(&lock->dep_map, name, key, 0);
18646 +EXPORT_SYMBOL(__rt_spin_lock_init);
18648 +#endif /* PREEMPT_RT_FULL */
18650 +#ifdef CONFIG_PREEMPT_RT_FULL
18651 + static inline int __sched
18652 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18654 + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18655 + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
18660 + if (unlikely(ctx == hold_ctx))
18661 + return -EALREADY;
18663 + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
18664 + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
18665 +#ifdef CONFIG_DEBUG_MUTEXES
18666 + DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
18667 + ctx->contending_lock = ww;
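The stamp test above implements the wound/wait ordering: the acquire context with the older (smaller) stamp has precedence, and the pointer comparison breaks exact ties. A userspace model of just that predicate (a sketch; the unsigned subtraction handles stamp wraparound, and the raw pointer comparison mirrors the kernel's tie-break):

    #include <limits.h>
    #include <stdbool.h>

    struct acquire_ctx { unsigned long stamp; };

    /* True when the contender 'ctx' is younger than the holder 'hold'
     * (or tied and ordered after it) and must back off with -EDEADLK,
     * exactly as in the check above. */
    static bool contender_backs_off(const struct acquire_ctx *ctx,
                                    const struct acquire_ctx *hold)
    {
        return (ctx->stamp - hold->stamp <= LONG_MAX) &&
               (ctx->stamp != hold->stamp || ctx > hold);
    }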
18675 + static inline int __sched
18676 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18685 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18686 + struct rt_mutex_waiter *waiter)
18688 + return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
18692 * Task blocks on lock.
18694 @@ -951,6 +1341,22 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18697 raw_spin_lock(&task->pi_lock);
18699 + * In the case of futex requeue PI, this will be a proxy
18700 + * lock. The task will wake unaware that it is enqueued on
18701 + * this lock. Avoid blocking on two locks and corrupting
18702 + * pi_blocked_on via the PI_WAKEUP_INPROGRESS
18703 + * flag. futex_wait_requeue_pi() sets this when it wakes up
18704 + * before requeue (due to a signal or timeout). Do not enqueue
18705 + * the task if PI_WAKEUP_INPROGRESS is set.
18707 + if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
18708 + raw_spin_unlock(&task->pi_lock);
18712 + BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
18714 waiter->task = task;
18715 waiter->lock = lock;
18716 waiter->prio = task->prio;
18717 @@ -974,7 +1380,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18718 rt_mutex_enqueue_pi(owner, waiter);
18720 rt_mutex_adjust_prio(owner);
18721 - if (owner->pi_blocked_on)
18722 + if (rt_mutex_real_waiter(owner->pi_blocked_on))
18724 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
18726 @@ -1016,6 +1422,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18727 * Called with lock->wait_lock held and interrupts disabled.
18729 static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
18730 + struct wake_q_head *wake_sleeper_q,
18731 struct rt_mutex *lock)
18733 struct rt_mutex_waiter *waiter;
18734 @@ -1055,7 +1462,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
18735 * Pairs with preempt_enable() in rt_mutex_postunlock();
18738 - wake_q_add(wake_q, waiter->task);
18739 + if (waiter->savestate)
18740 + wake_q_add_sleeper(wake_sleeper_q, waiter->task);
18742 + wake_q_add(wake_q, waiter->task);
18743 raw_spin_unlock(&current->pi_lock);
18746 @@ -1070,7 +1480,7 @@ static void remove_waiter(struct rt_mutex *lock,
18748 bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
18749 struct task_struct *owner = rt_mutex_owner(lock);
18750 - struct rt_mutex *next_lock;
18751 + struct rt_mutex *next_lock = NULL;
18753 lockdep_assert_held(&lock->wait_lock);
18755 @@ -1096,7 +1506,8 @@ static void remove_waiter(struct rt_mutex *lock,
18756 rt_mutex_adjust_prio(owner);
18758 /* Store the lock on which owner is blocked or NULL */
18759 - next_lock = task_blocked_on_lock(owner);
18760 + if (rt_mutex_real_waiter(owner->pi_blocked_on))
18761 + next_lock = task_blocked_on_lock(owner);
18763 raw_spin_unlock(&owner->pi_lock);
18765 @@ -1132,26 +1543,28 @@ void rt_mutex_adjust_pi(struct task_struct *task)
18766 raw_spin_lock_irqsave(&task->pi_lock, flags);
18768 waiter = task->pi_blocked_on;
18769 - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18770 + if (!rt_mutex_real_waiter(waiter) ||
18771 + rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18772 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18775 next_lock = waiter->lock;
18776 - raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18778 /* gets dropped in rt_mutex_adjust_prio_chain()! */
18779 get_task_struct(task);
18781 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18782 rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
18783 next_lock, NULL, task);
18786 -void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
18787 +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
18789 debug_rt_mutex_init_waiter(waiter);
18790 RB_CLEAR_NODE(&waiter->pi_tree_entry);
18791 RB_CLEAR_NODE(&waiter->tree_entry);
18792 waiter->task = NULL;
18793 + waiter->savestate = savestate;
18797 @@ -1167,7 +1580,8 @@ void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
18799 __rt_mutex_slowlock(struct rt_mutex *lock, int state,
18800 struct hrtimer_sleeper *timeout,
18801 - struct rt_mutex_waiter *waiter)
18802 + struct rt_mutex_waiter *waiter,
18803 + struct ww_acquire_ctx *ww_ctx)
18807 @@ -1176,16 +1590,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
18808 if (try_to_take_rt_mutex(lock, current, waiter))
18812 - * TASK_INTERRUPTIBLE checks for signals and
18813 - * timeout. Ignored otherwise.
18815 - if (likely(state == TASK_INTERRUPTIBLE)) {
18816 - /* Signal pending? */
18817 - if (signal_pending(current))
18819 - if (timeout && !timeout->task)
18820 - ret = -ETIMEDOUT;
18821 + if (timeout && !timeout->task) {
18822 + ret = -ETIMEDOUT;
18825 + if (signal_pending_state(state, current)) {
18830 + if (ww_ctx && ww_ctx->acquired > 0) {
18831 + ret = __mutex_lock_check_stamp(lock, ww_ctx);
18835 @@ -1224,33 +1639,104 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
18840 - * Slow path lock function:
18842 -static int __sched
18843 -rt_mutex_slowlock(struct rt_mutex *lock, int state,
18844 - struct hrtimer_sleeper *timeout,
18845 - enum rtmutex_chainwalk chwalk)
18846 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
18847 + struct ww_acquire_ctx *ww_ctx)
18849 - struct rt_mutex_waiter waiter;
18850 - unsigned long flags;
18852 +#ifdef CONFIG_DEBUG_MUTEXES
18854 + * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
18855 + * but released with a normal mutex_unlock in this call.
18857 + * This should never happen, always use ww_mutex_unlock.
18859 + DEBUG_LOCKS_WARN_ON(ww->ctx);
18861 - rt_mutex_init_waiter(&waiter);
18863 + * Not quite done after calling ww_acquire_done() ?
18865 + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
18867 + if (ww_ctx->contending_lock) {
18869 + * After -EDEADLK you tried to
18870 + * acquire a different ww_mutex? Bad!
18872 + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
18875 + * You called ww_mutex_lock after receiving -EDEADLK,
18876 + * but 'forgot' to unlock everything else first?
18878 + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
18879 + ww_ctx->contending_lock = NULL;
18883 - * Technically we could use raw_spin_[un]lock_irq() here, but this can
18884 - * be called in early boot if the cmpxchg() fast path is disabled
18885 - * (debug, no architecture support). In this case we will acquire the
18886 - * rtmutex with lock->wait_lock held. But we cannot unconditionally
18887 - * enable interrupts in that early boot case. So we need to use the
18888 - * irqsave/restore variants.
18889 + * Naughty, using a different class will lead to undefined behavior!
18891 - raw_spin_lock_irqsave(&lock->wait_lock, flags);
18892 + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
18894 + ww_ctx->acquired++;
18897 +#ifdef CONFIG_PREEMPT_RT_FULL
18898 +static void ww_mutex_account_lock(struct rt_mutex *lock,
18899 + struct ww_acquire_ctx *ww_ctx)
18901 + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18902 + struct rt_mutex_waiter *waiter, *n;
18905 + * This branch gets optimized out for the common case,
18906 + * and is only important for ww_mutex_lock.
18908 + ww_mutex_lock_acquired(ww, ww_ctx);
18909 + ww->ctx = ww_ctx;
18912 + * Give any possible sleeping processes the chance to wake up,
18913 + * so they can recheck if they have to back off.
18915 + rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters.rb_root,
18917 + /* XXX debug rt mutex waiter wakeup */
18919 + BUG_ON(waiter->lock != lock);
18920 + rt_mutex_wake_waiter(waiter);
18926 +static void ww_mutex_account_lock(struct rt_mutex *lock,
18927 + struct ww_acquire_ctx *ww_ctx)
18933 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
18934 + struct hrtimer_sleeper *timeout,
18935 + enum rtmutex_chainwalk chwalk,
18936 + struct ww_acquire_ctx *ww_ctx,
18937 + struct rt_mutex_waiter *waiter)
18941 +#ifdef CONFIG_PREEMPT_RT_FULL
18943 + struct ww_mutex *ww;
18945 + ww = container_of(lock, struct ww_mutex, base.lock);
18946 + if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
18947 + return -EALREADY;
18951 /* Try to acquire the lock again: */
18952 if (try_to_take_rt_mutex(lock, current, NULL)) {
18953 - raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18955 + ww_mutex_account_lock(lock, ww_ctx);
18959 @@ -1260,17 +1746,27 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
18960 if (unlikely(timeout))
18961 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
18963 - ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
18964 + ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
18966 - if (likely(!ret))
18967 + if (likely(!ret)) {
18968 /* sleep on the mutex */
18969 - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
18970 + ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
18972 + } else if (ww_ctx) {
18973 + /* ww_mutex received EDEADLK, let it become EALREADY */
18974 + ret = __mutex_lock_check_stamp(lock, ww_ctx);
18978 if (unlikely(ret)) {
18979 __set_current_state(TASK_RUNNING);
18980 if (rt_mutex_has_waiters(lock))
18981 - remove_waiter(lock, &waiter);
18982 - rt_mutex_handle_deadlock(ret, chwalk, &waiter);
18983 + remove_waiter(lock, waiter);
18984 + /* ww_mutex wants to report EDEADLK/EALREADY, let it */
18986 + rt_mutex_handle_deadlock(ret, chwalk, waiter);
18987 + } else if (ww_ctx) {
18988 + ww_mutex_account_lock(lock, ww_ctx);
18992 @@ -1278,6 +1774,36 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
18993 * unconditionally. We might have to fix that up.
18995 fixup_rt_mutex_waiters(lock);
19000 + * Slow path lock function:
19002 +static int __sched
19003 +rt_mutex_slowlock(struct rt_mutex *lock, int state,
19004 + struct hrtimer_sleeper *timeout,
19005 + enum rtmutex_chainwalk chwalk,
19006 + struct ww_acquire_ctx *ww_ctx)
19008 + struct rt_mutex_waiter waiter;
19009 + unsigned long flags;
19012 + rt_mutex_init_waiter(&waiter, false);
19015 + * Technically we could use raw_spin_[un]lock_irq() here, but this can
19016 + * be called in early boot if the cmpxchg() fast path is disabled
19017 + * (debug, no architecture support). In this case we will acquire the
19018 + * rtmutex with lock->wait_lock held. But we cannot unconditionally
19019 + * enable interrupts in that early boot case. So we need to use the
19020 + * irqsave/restore variants.
19022 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
19024 + ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
19027 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19029 @@ -1338,7 +1864,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19030 * Return whether the current task needs to call rt_mutex_postunlock().
19032 static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19033 - struct wake_q_head *wake_q)
19034 + struct wake_q_head *wake_q,
19035 + struct wake_q_head *wake_sleeper_q)
19037 unsigned long flags;
19039 @@ -1392,7 +1919,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19041 * Queue the next waiter for wakeup once we release the wait_lock.
19043 - mark_wakeup_next_waiter(wake_q, lock);
19044 + mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
19045 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19047 return true; /* call rt_mutex_postunlock() */
19048 @@ -1406,29 +1933,45 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19051 rt_mutex_fastlock(struct rt_mutex *lock, int state,
19052 + struct ww_acquire_ctx *ww_ctx,
19053 int (*slowfn)(struct rt_mutex *lock, int state,
19054 struct hrtimer_sleeper *timeout,
19055 - enum rtmutex_chainwalk chwalk))
19056 + enum rtmutex_chainwalk chwalk,
19057 + struct ww_acquire_ctx *ww_ctx))
19059 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
19062 - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
19064 + * If rt_mutex blocks, the function sched_submit_work will not call
19065 + * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true).
19066 + * We must call blk_schedule_flush_plug here, if we don't call it,
19067 + * a deadlock in device mapper may happen.
19069 + if (unlikely(blk_needs_flush_plug(current)))
19070 + blk_schedule_flush_plug(current);
19072 + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
19076 rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
19077 struct hrtimer_sleeper *timeout,
19078 enum rtmutex_chainwalk chwalk,
19079 + struct ww_acquire_ctx *ww_ctx,
19080 int (*slowfn)(struct rt_mutex *lock, int state,
19081 struct hrtimer_sleeper *timeout,
19082 - enum rtmutex_chainwalk chwalk))
19083 + enum rtmutex_chainwalk chwalk,
19084 + struct ww_acquire_ctx *ww_ctx))
19086 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
19087 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
19090 - return slowfn(lock, state, timeout, chwalk);
19091 + if (unlikely(blk_needs_flush_plug(current)))
19092 + blk_schedule_flush_plug(current);
19094 + return slowfn(lock, state, timeout, chwalk, ww_ctx);
19098 @@ -1444,9 +1987,11 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
19100 * Performs the wakeup of the top-waiter and re-enables preemption.
19102 -void rt_mutex_postunlock(struct wake_q_head *wake_q)
19103 +void rt_mutex_postunlock(struct wake_q_head *wake_q,
19104 + struct wake_q_head *wake_sleeper_q)
19107 + wake_up_q_sleeper(wake_sleeper_q);
19109 /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
19111 @@ -1455,23 +2000,40 @@ void rt_mutex_postunlock(struct wake_q_head *wake_q)
19113 rt_mutex_fastunlock(struct rt_mutex *lock,
19114 bool (*slowfn)(struct rt_mutex *lock,
19115 - struct wake_q_head *wqh))
19116 + struct wake_q_head *wqh,
19117 + struct wake_q_head *wq_sleeper))
19119 DEFINE_WAKE_Q(wake_q);
19120 + DEFINE_WAKE_Q(wake_sleeper_q);
19122 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
19125 - if (slowfn(lock, &wake_q))
19126 - rt_mutex_postunlock(&wake_q);
19127 + if (slowfn(lock, &wake_q, &wake_sleeper_q))
19128 + rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
19131 -static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass)
19132 +int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state)
19135 + return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
19139 + * rt_mutex_lock_state - lock a rt_mutex with a given state
19141 + * @lock: The rt_mutex to be locked
19142 + * @state: The state to set when blocking on the rt_mutex
19144 +static int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state, unsigned int subclass)
19148 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
19149 - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
19150 + ret = __rt_mutex_lock_state(lock, state);
19152 + mutex_release(&lock->dep_map, 1, _RET_IP_);
19156 #ifdef CONFIG_DEBUG_LOCK_ALLOC
19157 @@ -1483,7 +2045,7 @@ static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass)
19159 void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
19161 - __rt_mutex_lock(lock, subclass);
19162 + rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE, subclass);
19164 EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
19166 @@ -1496,7 +2058,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
19168 void __sched rt_mutex_lock(struct rt_mutex *lock)
19170 - __rt_mutex_lock(lock, 0);
19171 + rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE, 0);
19173 EXPORT_SYMBOL_GPL(rt_mutex_lock);
19175 @@ -1512,16 +2074,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
19177 int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
19183 - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19184 - ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
19186 - mutex_release(&lock->dep_map, 1, _RET_IP_);
19189 + return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE, 0);
19191 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
19193 @@ -1538,6 +2091,22 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
19194 return __rt_mutex_slowtrylock(lock);
19198 + * rt_mutex_lock_killable - lock a rt_mutex killable
19200 + * @lock: the rt_mutex to be locked
19205 + * -EINTR when interrupted by a signal
19207 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
19209 + return rt_mutex_lock_state(lock, TASK_KILLABLE, 0);
19211 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
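Usage follows the usual pattern (a hypothetical sketch; 'dev_lock' and the critical section are illustrative, not part of this patch):

    /* 'dev_lock' is a hypothetical rt_mutex initialized elsewhere. */
    static int do_dev_op(void)
    {
        int ret = rt_mutex_lock_killable(&dev_lock);

        if (ret)        /* -EINTR: a fatal signal aborted the wait */
            return ret;
        /* ... critical section ... */
        rt_mutex_unlock(&dev_lock);
        return 0;
    }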
19214 * rt_mutex_timed_lock - lock a rt_mutex interruptible
19215 * the timeout structure is provided
19216 @@ -1561,6 +2130,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
19217 mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19218 ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
19219 RT_MUTEX_MIN_CHAINWALK,
19221 rt_mutex_slowlock);
19223 mutex_release(&lock->dep_map, 1, _RET_IP_);
19224 @@ -1569,6 +2139,18 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
19226 EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
19228 +int __sched __rt_mutex_trylock(struct rt_mutex *lock)
19230 +#ifdef CONFIG_PREEMPT_RT_FULL
19231 + if (WARN_ON_ONCE(in_irq() || in_nmi()))
19233 + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
19237 + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
19241 * rt_mutex_trylock - try to lock a rt_mutex
19243 @@ -1584,10 +2166,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock)
19247 - if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
19250 - ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
19251 + ret = __rt_mutex_trylock(lock);
19253 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19255 @@ -1595,6 +2174,11 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock)
19257 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
19259 +void __sched __rt_mutex_unlock(struct rt_mutex *lock)
19261 + rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
19265 * rt_mutex_unlock - unlock a rt_mutex
19267 @@ -1603,16 +2187,13 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock);
19268 void __sched rt_mutex_unlock(struct rt_mutex *lock)
19270 mutex_release(&lock->dep_map, 1, _RET_IP_);
19271 - rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
19272 + __rt_mutex_unlock(lock);
19274 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
19277 - * Futex variant, that since futex variants do not use the fast-path, can be
19278 - * simple and will not need to retry.
19280 -bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
19281 - struct wake_q_head *wake_q)
19282 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
19283 + struct wake_q_head *wake_q,
19284 + struct wake_q_head *wq_sleeper)
19286 lockdep_assert_held(&lock->wait_lock);
19288 @@ -1629,22 +2210,35 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
19289 * avoid inversion prior to the wakeup. preempt_disable()
19290 * therein pairs with rt_mutex_postunlock().
19292 - mark_wakeup_next_waiter(wake_q, lock);
19293 + mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
19295 return true; /* call postunlock() */
19299 + * Futex variant, that since futex variants do not use the fast-path, can be
19300 + * simple and will not need to retry.
19302 +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
19303 + struct wake_q_head *wake_q,
19304 + struct wake_q_head *wq_sleeper)
19306 + return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper);
19309 void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
19311 DEFINE_WAKE_Q(wake_q);
19312 + DEFINE_WAKE_Q(wake_sleeper_q);
19313 + unsigned long flags;
19316 - raw_spin_lock_irq(&lock->wait_lock);
19317 - postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
19318 - raw_spin_unlock_irq(&lock->wait_lock);
19319 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
19320 + postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
19321 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19324 - rt_mutex_postunlock(&wake_q);
19325 + rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
19329 @@ -1683,7 +2277,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name,
19331 debug_rt_mutex_init(lock, name, key);
19333 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
19334 +EXPORT_SYMBOL(__rt_mutex_init);
19337 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
19338 @@ -1703,6 +2297,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
19339 struct task_struct *proxy_owner)
19341 __rt_mutex_init(lock, NULL, NULL);
19342 +#ifdef CONFIG_DEBUG_SPINLOCK
19344 + * get another key class for the wait_lock. LOCK_PI and UNLOCK_PI are
19345 + * holding the ->wait_lock of the proxy_lock while unlocking a sleeping
19348 + raw_spin_lock_init(&lock->wait_lock);
19350 debug_rt_mutex_proxy_lock(lock, proxy_owner);
19351 rt_mutex_set_owner(lock, proxy_owner);
19353 @@ -1735,6 +2337,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
19354 if (try_to_take_rt_mutex(lock, task, NULL))
19357 +#ifdef CONFIG_PREEMPT_RT_FULL
19359 + * In PREEMPT_RT there's an added race.
19360 + * If the task that we are about to requeue times out,
19361 + * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
19362 + * to skip this task. But right after the task sets
19363 + * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
19364 + * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
19365 + * This will replace the PI_WAKEUP_INPROGRESS with the actual
19366 + * lock that it blocks on. We *must not* place this task
19367 + * on this proxy lock in that case.
19369 + * To prevent this race, we first take the task's pi_lock
19370 + * and check if it has updated its pi_blocked_on. If it has,
19371 + * we assume that it woke up and we return -EAGAIN.
19372 + * Otherwise, we set the task's pi_blocked_on to
19373 + * PI_REQUEUE_INPROGRESS, so that if the task is waking up
19374 + * it will know that we are in the process of requeuing it.
19376 + raw_spin_lock(&task->pi_lock);
19377 + if (task->pi_blocked_on) {
19378 + raw_spin_unlock(&task->pi_lock);
19381 + task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
19382 + raw_spin_unlock(&task->pi_lock);
19385 /* We enforce deadlock detection for futexes */
19386 ret = task_blocks_on_rt_mutex(lock, waiter, task,
19387 RT_MUTEX_FULL_CHAINWALK);
19388 @@ -1749,7 +2379,7 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
19392 - if (unlikely(ret))
19393 + if (ret && rt_mutex_has_waiters(lock))
19394 remove_waiter(lock, waiter);
19396 debug_rt_mutex_print_deadlock(waiter);
19397 @@ -1824,17 +2454,36 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
19398 struct hrtimer_sleeper *to,
19399 struct rt_mutex_waiter *waiter)
19401 + struct task_struct *tsk = current;
19404 raw_spin_lock_irq(&lock->wait_lock);
19405 /* sleep on the mutex */
19406 set_current_state(TASK_INTERRUPTIBLE);
19407 - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
19408 + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
19410 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
19411 * have to fix that up.
19413 fixup_rt_mutex_waiters(lock);
19415 + * RT has a problem here when the wait got interrupted by a timeout
19416 + * or a signal. task->pi_blocked_on is still set. The task must
19417 + * acquire the hash bucket lock when returning from this function.
19419 + * If the hash bucket lock is contended then the
19420 + * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in
19421 + * task_blocks_on_rt_mutex() will trigger. This can be avoided by
19422 + * clearing task->pi_blocked_on which removes the task from the
19423 + * boosting chain of the rtmutex. That's correct because the task
19424 + * is not longer blocked on it.
19427 + raw_spin_lock(&tsk->pi_lock);
19428 + tsk->pi_blocked_on = NULL;
19429 + raw_spin_unlock(&tsk->pi_lock);
19432 raw_spin_unlock_irq(&lock->wait_lock);
19435 @@ -1895,3 +2544,99 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
19441 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
19443 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
19446 + if (ctx->deadlock_inject_countdown-- == 0) {
19447 + tmp = ctx->deadlock_inject_interval;
19448 + if (tmp > UINT_MAX/4)
19451 + tmp = tmp*2 + tmp + tmp/2;
19453 + ctx->deadlock_inject_interval = tmp;
19454 + ctx->deadlock_inject_countdown = tmp;
19455 + ctx->contending_lock = lock;
19457 + ww_mutex_unlock(lock);
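For reference, the interval update above grows the injection period by a factor of 3.5 per injected deadlock (a worked example, not kernel code):

    #include <assert.h>

    int main(void)
    {
        unsigned int tmp = 16;              /* example starting interval */

        tmp = tmp * 2 + tmp + tmp / 2;      /* the step above: 3.5x */
        assert(tmp == 56);
        return 0;
    }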
19466 +#ifdef CONFIG_PREEMPT_RT_FULL
19468 +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
19474 + mutex_acquire_nest(&lock->base.dep_map, 0, 0,
19475 + ctx ? &ctx->dep_map : NULL, _RET_IP_);
19476 + ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0,
19479 + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
19480 + else if (!ret && ctx && ctx->acquired > 1)
19481 + return ww_mutex_deadlock_injection(lock, ctx);
19485 +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
19488 +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
19494 + mutex_acquire_nest(&lock->base.dep_map, 0, 0,
19495 + ctx ? &ctx->dep_map : NULL, _RET_IP_);
19496 + ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0,
19499 + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
19500 + else if (!ret && ctx && ctx->acquired > 1)
19501 + return ww_mutex_deadlock_injection(lock, ctx);
19505 +EXPORT_SYMBOL_GPL(ww_mutex_lock);
19507 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
19509 + int nest = !!lock->ctx;
19512 + * The unlocking fastpath is the 0->1 transition from 'locked'
19513 + * into 'unlocked' state:
19516 +#ifdef CONFIG_DEBUG_MUTEXES
19517 + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
19519 + if (lock->ctx->acquired > 0)
19520 + lock->ctx->acquired--;
19521 + lock->ctx = NULL;
19524 + mutex_release(&lock->base.dep_map, nest, _RET_IP_);
19525 + __rt_mutex_unlock(&lock->base.lock);
19527 +EXPORT_SYMBOL(ww_mutex_unlock);
19529 +int __rt_mutex_owner_current(struct rt_mutex *lock)
19531 + return rt_mutex_owner(lock) == current;
19533 +EXPORT_SYMBOL(__rt_mutex_owner_current);
19535 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
19536 index 68686b3ec3c1..2a157c78e18c 100644
19537 --- a/kernel/locking/rtmutex_common.h
19538 +++ b/kernel/locking/rtmutex_common.h
19541 #include <linux/rtmutex.h>
19542 #include <linux/sched/wake_q.h>
19543 +#include <linux/sched/debug.h>
19546 * This is the control structure for tasks blocked on a rt_mutex,
19547 @@ -29,6 +30,7 @@ struct rt_mutex_waiter {
19548 struct rb_node pi_tree_entry;
19549 struct task_struct *task;
19550 struct rt_mutex *lock;
19552 #ifdef CONFIG_DEBUG_RT_MUTEXES
19554 struct pid *deadlock_task_pid;
19555 @@ -129,12 +131,15 @@ enum rtmutex_chainwalk {
19557 * PI-futex support (proxy locking functions, etc.):
19559 +#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)
19560 +#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2)
19562 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
19563 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
19564 struct task_struct *proxy_owner);
19565 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
19566 struct task_struct *proxy_owner);
19567 -extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
19568 +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
19569 extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
19570 struct rt_mutex_waiter *waiter,
19571 struct task_struct *task);
19572 @@ -152,9 +157,27 @@ extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
19574 extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
19575 extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
19576 - struct wake_q_head *wqh);
19578 -extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
19579 + struct wake_q_head *wqh,
19580 + struct wake_q_head *wq_sleeper);
19582 +extern void rt_mutex_postunlock(struct wake_q_head *wake_q,
19583 + struct wake_q_head *wake_sleeper_q);
19585 +/* RW semaphore special interface */
19586 +struct ww_acquire_ctx;
19588 +extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state);
19589 +extern int __rt_mutex_trylock(struct rt_mutex *lock);
19590 +extern void __rt_mutex_unlock(struct rt_mutex *lock);
19591 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
19592 + struct hrtimer_sleeper *timeout,
19593 + enum rtmutex_chainwalk chwalk,
19594 + struct ww_acquire_ctx *ww_ctx,
19595 + struct rt_mutex_waiter *waiter);
19596 +void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
19597 + struct rt_mutex_waiter *waiter,
19598 + unsigned long flags);
19599 +void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock);
19601 #ifdef CONFIG_DEBUG_RT_MUTEXES
19602 # include "rtmutex-debug.h"
19603 diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c
19604 new file mode 100644
19605 index 000000000000..f2e155b2c4a8
19607 +++ b/kernel/locking/rwlock-rt.c
19611 +#include <linux/sched/debug.h>
19612 +#include <linux/export.h>
19614 +#include "rtmutex_common.h"
19615 +#include <linux/rwlock_types_rt.h>
19618 + * RT-specific reader/writer locks
19621 + * 1) Lock lock->rtmutex
19622 + * 2) Remove the reader BIAS to force readers into the slow path
19623 + * 3) Wait until all readers have left the critical region
19624 + * 4) Mark it write locked
19627 + * 1) Remove the write locked marker
19628 + * 2) Set the reader BIAS so readers can use the fast path again
19629 + * 3) Unlock lock->rtmutex to release blocked readers
19632 + * 1) Try fast path acquisition (reader BIAS is set)
19633 + * 2) Take lock->rtmutex.wait_lock which protects the writelocked flag
19634 + * 3) If !writelocked, acquire it for read
19635 + * 4) If writelocked, block on lock->rtmutex
19636 + * 5) unlock lock->rtmutex, goto 1)
19639 + * 1) Try fast path release (reader count != 1)
19640 + * 2) Wake the writer waiting in write_lock()#3
19642 + * read_lock()#3 has the consequence that rw locks on RT are not writer
19643 + * fair, but writers, which should be avoided in RT tasks (think tasklist
19644 + * lock), are subject to the rtmutex priority/DL inheritance mechanism.
19646 + * It's possible to make the rw locks writer fair by keeping a list of
19647 + * active readers. A blocked writer would force all newly incoming readers
19648 + * to block on the rtmutex, but the rtmutex would have to be proxy locked
19649 + * for one reader after the other. We can't use multi-reader inheritance
19650 + * because there is no way to support that with
19651 + * SCHED_DEADLINE. Implementing the one by one reader boosting/handover
19652 + * mechanism is a major surgery for a very dubious value.
19654 + * The risk of writer starvation is there, but the pathological use cases
19655 + * which trigger it are not necessarily the typical RT workloads.
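The reader fast path sketched in 1) above can be modeled with C11 atomics (a sketch; the BIAS values are assumed from the RT headers, which are not part of this hunk, and the cast relies on two's complement as the kernel does — the essential property is that the counter is negative exactly while readers may take the fast path):

    #include <stdatomic.h>
    #include <stdbool.h>

    #define READER_BIAS (1U << 31)      /* assumed value */
    #define WRITER_BIAS (1U << 30)      /* assumed value */

    /* Models __read_rt_trylock(): succeed only while the counter is
     * negative, i.e. READER_BIAS is in place and no writer has diverted
     * readers into the slow path yet. */
    static bool read_trylock_model(atomic_int *readers)
    {
        int r = atomic_load(readers);

        while (r < 0) {
            /* on failure, r is reloaded and the sign re-checked */
            if (atomic_compare_exchange_weak(readers, &r, r + 1))
                return true;
        }
        return false;
    }

    int main(void)
    {
        atomic_int readers;

        atomic_init(&readers, (int)READER_BIAS);    /* unlocked */
        return read_trylock_model(&readers) ? 0 : 1;
    }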
19658 +void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
19659 + struct lock_class_key *key)
19661 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19663 + * Make sure we are not reinitializing a held lock:
19665 + debug_check_no_locks_freed((void *)lock, sizeof(*lock));
19666 + lockdep_init_map(&lock->dep_map, name, key, 0);
19668 + atomic_set(&lock->readers, READER_BIAS);
19669 + rt_mutex_init(&lock->rtmutex);
19670 + lock->rtmutex.save_state = 1;
19673 +int __read_rt_trylock(struct rt_rw_lock *lock)
19678 + * Increment reader count, if lock->readers < 0, i.e. READER_BIAS is
19681 + for (r = atomic_read(&lock->readers); r < 0;) {
19682 + old = atomic_cmpxchg(&lock->readers, r, r + 1);
19683 + if (likely(old == r))
19690 +void __sched __read_rt_lock(struct rt_rw_lock *lock)
19692 + struct rt_mutex *m = &lock->rtmutex;
19693 + struct rt_mutex_waiter waiter;
19694 + unsigned long flags;
19696 + if (__read_rt_trylock(lock))
19699 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19701 + * Allow readers as long as the writer has not completely
19702 + * acquired the semaphore for write.
19704 + if (atomic_read(&lock->readers) != WRITER_BIAS) {
19705 + atomic_inc(&lock->readers);
19706 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19711 + * Call into the slow lock path with the rtmutex->wait_lock
19712 + * held, so this can't result in the following race:
19714 + * Reader1 Reader2 Writer
19717 + * rtmutex_lock(m)
19720 + * unlock(m->wait_lock)
19723 + * lock(m->wait_lock)
19724 + * lock->writelocked=true
19725 + * unlock(m->wait_lock)
19728 + * lock->writelocked=false
19729 + * rtmutex_unlock(m)
19732 + * rtmutex_lock(m)
19734 + * rtmutex_lock(m)
19736 + * That would put Reader1 behind the writer waiting on
19737 + * Reader2 to call read_unlock() which might be unbound.
19739 + rt_mutex_init_waiter(&waiter, false);
19740 + rt_spin_lock_slowlock_locked(m, &waiter, flags);
19742 + * The slowlock() above is guaranteed to return with the rtmutex
19743 + * now held, so there can't be a writer active. Increment the reader
19744 + * count and immediately drop the rtmutex again.
19746 + atomic_inc(&lock->readers);
19747 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19748 + rt_spin_lock_slowunlock(m);
19750 + debug_rt_mutex_free_waiter(&waiter);
19753 +void __read_rt_unlock(struct rt_rw_lock *lock)
19755 + struct rt_mutex *m = &lock->rtmutex;
19756 + struct task_struct *tsk;
19759 + * lock->readers can only hit 0 when a writer is waiting for the
19760 + * active readers to leave the critical region.
19762 + if (!atomic_dec_and_test(&lock->readers))
19765 + raw_spin_lock_irq(&m->wait_lock);
19767 + * Wake the writer, i.e. the rtmutex owner. It might release the
19768 + * rtmutex concurrently in the fast path, but to clean up the rw
19769 + * lock it needs to acquire m->wait_lock. The worst case which can
19770 + * happen is a spurious wakeup.
19772 + tsk = rt_mutex_owner(m);
19774 + wake_up_process(tsk);
19776 + raw_spin_unlock_irq(&m->wait_lock);
19779 +static void __write_unlock_common(struct rt_rw_lock *lock, int bias,
19780 + unsigned long flags)
19782 + struct rt_mutex *m = &lock->rtmutex;
19784 + atomic_add(READER_BIAS - bias, &lock->readers);
19785 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19786 + rt_spin_lock_slowunlock(m);
19789 +void __sched __write_rt_lock(struct rt_rw_lock *lock)
19791 + struct rt_mutex *m = &lock->rtmutex;
19792 + struct task_struct *self = current;
19793 + unsigned long flags;
19795 + /* Take the rtmutex as a first step */
19796 + __rt_spin_lock(m);
19798 + /* Force readers into slow path */
19799 + atomic_sub(READER_BIAS, &lock->readers);
19801 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19803 + raw_spin_lock(&self->pi_lock);
19804 + self->saved_state = self->state;
19805 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19806 + raw_spin_unlock(&self->pi_lock);
19809 + /* Have all readers left the critical region? */
19810 + if (!atomic_read(&lock->readers)) {
19811 + atomic_set(&lock->readers, WRITER_BIAS);
19812 + raw_spin_lock(&self->pi_lock);
19813 + __set_current_state_no_track(self->saved_state);
19814 + self->saved_state = TASK_RUNNING;
19815 + raw_spin_unlock(&self->pi_lock);
19816 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19820 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19822 + if (atomic_read(&lock->readers) != 0)
19825 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19827 + raw_spin_lock(&self->pi_lock);
19828 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19829 + raw_spin_unlock(&self->pi_lock);
19833 +int __write_rt_trylock(struct rt_rw_lock *lock)
19835 + struct rt_mutex *m = &lock->rtmutex;
19836 + unsigned long flags;
19838 + if (!__rt_mutex_trylock(m))
19841 + atomic_sub(READER_BIAS, &lock->readers);
19843 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19844 + if (!atomic_read(&lock->readers)) {
19845 + atomic_set(&lock->readers, WRITER_BIAS);
19846 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19849 + __write_unlock_common(lock, 0, flags);
19853 +void __write_rt_unlock(struct rt_rw_lock *lock)
19855 + struct rt_mutex *m = &lock->rtmutex;
19856 + unsigned long flags;
19858 + raw_spin_lock_irqsave(&m->wait_lock, flags);
19859 + __write_unlock_common(lock, WRITER_BIAS, flags);
19862 +/* Map the reader biased implementation */
19863 +static inline int do_read_rt_trylock(rwlock_t *rwlock)
19865 + return __read_rt_trylock(rwlock);
19868 +static inline int do_write_rt_trylock(rwlock_t *rwlock)
19870 + return __write_rt_trylock(rwlock);
19873 +static inline void do_read_rt_lock(rwlock_t *rwlock)
19875 + __read_rt_lock(rwlock);
19878 +static inline void do_write_rt_lock(rwlock_t *rwlock)
19880 + __write_rt_lock(rwlock);
19883 +static inline void do_read_rt_unlock(rwlock_t *rwlock)
19885 + __read_rt_unlock(rwlock);
19888 +static inline void do_write_rt_unlock(rwlock_t *rwlock)
19890 + __write_rt_unlock(rwlock);
19893 +static inline void do_rwlock_rt_init(rwlock_t *rwlock, const char *name,
19894 + struct lock_class_key *key)
19896 + __rwlock_biased_rt_init(rwlock, name, key);
19899 +int __lockfunc rt_read_can_lock(rwlock_t *rwlock)
19901 + return atomic_read(&rwlock->readers) < 0;
19904 +int __lockfunc rt_write_can_lock(rwlock_t *rwlock)
19906 + return atomic_read(&rwlock->readers) == READER_BIAS;
19910 + * The common functions which get wrapped into the rwlock API.
19912 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
19916 + sleeping_lock_inc();
19917 + migrate_disable();
19918 + ret = do_read_rt_trylock(rwlock);
19920 + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
19922 + migrate_enable();
19923 + sleeping_lock_dec();
19927 +EXPORT_SYMBOL(rt_read_trylock);
19929 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
19933 + sleeping_lock_inc();
19934 + migrate_disable();
19935 + ret = do_write_rt_trylock(rwlock);
19937 + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
19939 + migrate_enable();
19940 + sleeping_lock_dec();
19944 +EXPORT_SYMBOL(rt_write_trylock);
19946 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
19948 + sleeping_lock_inc();
19949 + migrate_disable();
19950 + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
19951 + do_read_rt_lock(rwlock);
19953 +EXPORT_SYMBOL(rt_read_lock);
19955 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
19957 + sleeping_lock_inc();
19958 + migrate_disable();
19959 + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
19960 + do_write_rt_lock(rwlock);
19962 +EXPORT_SYMBOL(rt_write_lock);
19964 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
19966 + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19967 + do_read_rt_unlock(rwlock);
19968 + migrate_enable();
19969 + sleeping_lock_dec();
19971 +EXPORT_SYMBOL(rt_read_unlock);
19973 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
19975 + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19976 + do_write_rt_unlock(rwlock);
19977 + migrate_enable();
19978 + sleeping_lock_dec();
19980 +EXPORT_SYMBOL(rt_write_unlock);
19982 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
19984 + do_rwlock_rt_init(rwlock, name, key);
19986 +EXPORT_SYMBOL(__rt_rwlock_init);
19987 diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c
19988 new file mode 100644
19989 index 000000000000..26991ddb6c5a
19991 +++ b/kernel/locking/rwsem-rt.c
19995 +#include <linux/rwsem.h>
19996 +#include <linux/sched/debug.h>
19997 +#include <linux/sched/signal.h>
19998 +#include <linux/export.h>
20000 +#include "rtmutex_common.h"
20003 + * RT-specific reader/writer semaphores
20006 + * 1) Lock sem->rtmutex
20007 + * 2) Remove the reader BIAS to force readers into the slow path
20008 + * 3) Wait until all readers have left the critical region
20009 + * 4) Mark it write locked
20012 + * 1) Remove the write locked marker
20013 + * 2) Set the reader BIAS so readers can use the fast path again
20014 + * 3) Unlock sem->rtmutex to release blocked readers
20017 + * 1) Try fast path acquisition (reader BIAS is set)
20018 + * 2) Take sem->rtmutex.wait_lock which protects the writelocked flag
20019 + * 3) If !writelocked, acquire it for read
20020 + * 4) If writelocked, block on sem->rtmutex
20021 + * 5) unlock sem->rtmutex, goto 1)
20024 + * 1) Try fast path release (reader count != 1)
20025 + * 2) Wake the writer waiting in down_write()#3
20027 + * down_read()#3 has the consequence that rw semaphores on RT are not writer
20028 + * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
20029 + * are subject to the rtmutex priority/DL inheritance mechanism.
20031 + * It's possible to make the rw semaphores writer fair by keeping a list of
20032 + * active readers. A blocked writer would force all newly incoming readers to
20033 + * block on the rtmutex, but the rtmutex would have to be proxy locked for one
20034 + * reader after the other. We can't use multi-reader inheritance because there
20035 + * is no way to support that with SCHED_DEADLINE. Implementing the one by one
20036 + * reader boosting/handover mechanism is a major surgery for a very dubious
20039 + * The risk of writer starvation is there, but the pathological use cases
20040 + * which trigger it are not necessarily the typical RT workloads.
20043 +void __rwsem_init(struct rw_semaphore *sem, const char *name,
20044 + struct lock_class_key *key)
20046 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
20048 + * Make sure we are not reinitializing a held semaphore:
20050 + debug_check_no_locks_freed((void *)sem, sizeof(*sem));
20051 + lockdep_init_map(&sem->dep_map, name, key, 0);
20053 + atomic_set(&sem->readers, READER_BIAS);
20055 +EXPORT_SYMBOL(__rwsem_init);
20057 +int __down_read_trylock(struct rw_semaphore *sem)
20062 + * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
20065 + for (r = atomic_read(&sem->readers); r < 0;) {
20066 + old = atomic_cmpxchg(&sem->readers, r, r + 1);
20067 + if (likely(old == r))
20074 +void __sched __down_read(struct rw_semaphore *sem)
20076 + struct rt_mutex *m = &sem->rtmutex;
20077 + struct rt_mutex_waiter waiter;
20079 + if (__down_read_trylock(sem))
20083 + raw_spin_lock_irq(&m->wait_lock);
20085 + * Allow readers as long as the writer has not completely
20086 + * acquired the semaphore for write.
20088 + if (atomic_read(&sem->readers) != WRITER_BIAS) {
20089 + atomic_inc(&sem->readers);
20090 + raw_spin_unlock_irq(&m->wait_lock);
20095 + * Call into the slow lock path with the rtmutex->wait_lock
20096 + * held, so this can't result in the following race:
20098 + * Reader1 Reader2 Writer
20101 + * rtmutex_lock(m)
20104 + * unlock(m->wait_lock)
20107 + * lock(m->wait_lock)
20108 + * sem->writelocked=true
20109 + * unlock(m->wait_lock)
20112 + * sem->writelocked=false
20113 + * rtmutex_unlock(m)
20116 + * rtmutex_lock(m)
20118 + * rtmutex_lock(m)
20120 + * That would put Reader1 behind the writer waiting on
20121 + * Reader2 to call up_read() which might be unbound.
20123 + rt_mutex_init_waiter(&waiter, false);
20124 + rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL,
20125 + RT_MUTEX_MIN_CHAINWALK, NULL,
20128 + * The slowlock() above is guaranteed to return with the rtmutex
20129 + * now held, so there can't be a writer active. Increment the reader
20130 + * count and immediately drop the rtmutex again.
20132 + atomic_inc(&sem->readers);
20133 + raw_spin_unlock_irq(&m->wait_lock);
20134 + __rt_mutex_unlock(m);
20136 + debug_rt_mutex_free_waiter(&waiter);
20139 +void __up_read(struct rw_semaphore *sem)
20141 + struct rt_mutex *m = &sem->rtmutex;
20142 + struct task_struct *tsk;
20145 + * sem->readers can only hit 0 when a writer is waiting for the
20146 + * active readers to leave the critical region.
20148 + if (!atomic_dec_and_test(&sem->readers))
20152 + raw_spin_lock_irq(&m->wait_lock);
20154 + * Wake the writer, i.e. the rtmutex owner. It might release the
20155 + * rtmutex concurrently in the fast path (due to a signal), but to
20156 + * clean up the rwsem it needs to acquire m->wait_lock. The worst
20157 + * case which can happen is a spurious wakeup.
20159 + tsk = rt_mutex_owner(m);
20161 + wake_up_process(tsk);
20163 + raw_spin_unlock_irq(&m->wait_lock);
20166 +static void __up_write_unlock(struct rw_semaphore *sem, int bias,
20167 + unsigned long flags)
20169 + struct rt_mutex *m = &sem->rtmutex;
20171 + atomic_add(READER_BIAS - bias, &sem->readers);
20172 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
20173 + __rt_mutex_unlock(m);
20176 +static int __sched __down_write_common(struct rw_semaphore *sem, int state)
20178 + struct rt_mutex *m = &sem->rtmutex;
20179 + unsigned long flags;
20181 + /* Take the rtmutex as a first step */
20182 + if (__rt_mutex_lock_state(m, state))
20185 + /* Force readers into slow path */
20186 + atomic_sub(READER_BIAS, &sem->readers);
20189 + set_current_state(state);
20191 + raw_spin_lock_irqsave(&m->wait_lock, flags);
20192 + /* Have all readers left the critical region? */
20193 + if (!atomic_read(&sem->readers)) {
20194 + atomic_set(&sem->readers, WRITER_BIAS);
20195 + __set_current_state(TASK_RUNNING);
20196 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
20200 + if (signal_pending_state(state, current)) {
20201 + __set_current_state(TASK_RUNNING);
20202 + __up_write_unlock(sem, 0, flags);
20205 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
20207 + if (atomic_read(&sem->readers) != 0) {
20209 + set_current_state(state);
20214 +void __sched __down_write(struct rw_semaphore *sem)
20216 + __down_write_common(sem, TASK_UNINTERRUPTIBLE);
20219 +int __sched __down_write_killable(struct rw_semaphore *sem)
20221 + return __down_write_common(sem, TASK_KILLABLE);
20224 +int __down_write_trylock(struct rw_semaphore *sem)
20226 + struct rt_mutex *m = &sem->rtmutex;
20227 + unsigned long flags;
20229 + if (!__rt_mutex_trylock(m))
20232 + atomic_sub(READER_BIAS, &sem->readers);
20234 + raw_spin_lock_irqsave(&m->wait_lock, flags);
20235 + if (!atomic_read(&sem->readers)) {
20236 + atomic_set(&sem->readers, WRITER_BIAS);
20237 + raw_spin_unlock_irqrestore(&m->wait_lock, flags);
20240 + __up_write_unlock(sem, 0, flags);
20244 +void __up_write(struct rw_semaphore *sem)
20246 + struct rt_mutex *m = &sem->rtmutex;
20247 + unsigned long flags;
20249 + raw_spin_lock_irqsave(&m->wait_lock, flags);
20250 + __up_write_unlock(sem, WRITER_BIAS, flags);
20253 +void __downgrade_write(struct rw_semaphore *sem)
20255 + struct rt_mutex *m = &sem->rtmutex;
20256 + unsigned long flags;
20258 + raw_spin_lock_irqsave(&m->wait_lock, flags);
20259 + /* Release it and account current as reader */
20260 + __up_write_unlock(sem, WRITER_BIAS - 1, flags);
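A worked example of the downgrade accounting above, using the same assumed BIAS values as before: starting from a write-locked count of WRITER_BIAS, passing WRITER_BIAS - 1 as the bias restores READER_BIAS while leaving the caller counted as one active reader, so no reader wakeup can slip through:

    #include <assert.h>

    #define READER_BIAS (1U << 31)      /* assumed value */
    #define WRITER_BIAS (1U << 30)      /* assumed value */

    int main(void)
    {
        unsigned int readers = WRITER_BIAS;     /* write locked */

        /* __up_write_unlock(sem, WRITER_BIAS - 1, flags) does: */
        readers += READER_BIAS - (WRITER_BIAS - 1);

        assert(readers == READER_BIAS + 1);     /* BIAS back + one reader */
        return 0;
    }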
20262 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
20263 index 6e40fdfba326..401bda23f786 100644
20264 --- a/kernel/locking/spinlock.c
20265 +++ b/kernel/locking/spinlock.c
20266 @@ -125,8 +125,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
20267 * __[spin|read|write]_lock_bh()
20269 BUILD_LOCK_OPS(spin, raw_spinlock);
20271 +#ifndef CONFIG_PREEMPT_RT_FULL
20272 BUILD_LOCK_OPS(read, rwlock);
20273 BUILD_LOCK_OPS(write, rwlock);
20278 @@ -210,6 +213,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
20279 EXPORT_SYMBOL(_raw_spin_unlock_bh);
20282 +#ifndef CONFIG_PREEMPT_RT_FULL
20284 #ifndef CONFIG_INLINE_READ_TRYLOCK
20285 int __lockfunc _raw_read_trylock(rwlock_t *lock)
20287 @@ -354,6 +359,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
20288 EXPORT_SYMBOL(_raw_write_unlock_bh);
20291 +#endif /* !PREEMPT_RT_FULL */
20293 #ifdef CONFIG_DEBUG_LOCK_ALLOC
20295 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
20296 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
20297 index 9aa0fccd5d43..76d0b40d9193 100644
20298 --- a/kernel/locking/spinlock_debug.c
20299 +++ b/kernel/locking/spinlock_debug.c
20300 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
20302 EXPORT_SYMBOL(__raw_spin_lock_init);
20304 +#ifndef CONFIG_PREEMPT_RT_FULL
20305 void __rwlock_init(rwlock_t *lock, const char *name,
20306 struct lock_class_key *key)
20308 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
20311 EXPORT_SYMBOL(__rwlock_init);
20314 static void spin_dump(raw_spinlock_t *lock, const char *msg)
20316 @@ -135,6 +137,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
20317 arch_spin_unlock(&lock->raw_lock);
20320 +#ifndef CONFIG_PREEMPT_RT_FULL
20321 static void rwlock_bug(rwlock_t *lock, const char *msg)
20323 if (!debug_locks_off())
20324 @@ -224,3 +227,5 @@ void do_raw_write_unlock(rwlock_t *lock)
20325 debug_write_unlock(lock);
20326 arch_write_unlock(&lock->raw_lock);
20330 diff --git a/kernel/panic.c b/kernel/panic.c
20331 index bdd18afa19a4..5da649633795 100644
20332 --- a/kernel/panic.c
20333 +++ b/kernel/panic.c
20334 @@ -482,9 +482,11 @@ static u64 oops_id;
20336 static int init_oops_id(void)
20338 +#ifndef CONFIG_PREEMPT_RT_FULL
20340 get_random_bytes(&oops_id, sizeof(oops_id));
20346 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
20347 index a5c36e9c56a6..a4b83cb0c6e5 100644
20348 --- a/kernel/power/hibernate.c
20349 +++ b/kernel/power/hibernate.c
20350 @@ -287,6 +287,8 @@ static int create_image(int platform_mode)
20352 local_irq_disable();
20354 + system_state = SYSTEM_SUSPEND;
20356 error = syscore_suspend();
20358 pr_err("Some system devices failed to power down, aborting hibernation\n");
20359 @@ -317,6 +319,7 @@ static int create_image(int platform_mode)
20363 + system_state = SYSTEM_RUNNING;
20364 local_irq_enable();
20367 @@ -445,6 +448,7 @@ static int resume_target_kernel(bool platform_mode)
20370 local_irq_disable();
20371 + system_state = SYSTEM_SUSPEND;
20373 error = syscore_suspend();
20375 @@ -478,6 +482,7 @@ static int resume_target_kernel(bool platform_mode)
20379 + system_state = SYSTEM_RUNNING;
20380 local_irq_enable();
20383 @@ -563,6 +568,7 @@ int hibernation_platform_enter(void)
20386 local_irq_disable();
20387 + system_state = SYSTEM_SUSPEND;
20389 if (pm_wakeup_pending()) {
20391 @@ -575,6 +581,7 @@ int hibernation_platform_enter(void)
20395 + system_state = SYSTEM_RUNNING;
20396 local_irq_enable();
20399 @@ -672,6 +679,10 @@ static int load_image_and_restore(void)
20403 +#ifndef CONFIG_SUSPEND
20404 +bool pm_in_action;
20408 * hibernate - Carry out system hibernation, including saving the image.
20410 @@ -685,6 +696,8 @@ int hibernate(void)
20414 + pm_in_action = true;
20416 lock_system_sleep();
20417 /* The snapshot device should not be opened while we're running */
20418 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
20419 @@ -763,6 +776,7 @@ int hibernate(void)
20420 atomic_inc(&snapshot_device_available);
20422 unlock_system_sleep();
20423 + pm_in_action = false;
20424 pr_info("hibernation exit\n");
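
Each hibernate/suspend hunk above brackets the IRQs-off syscore section between SYSTEM_SUSPEND and SYSTEM_RUNNING. That lets other code distinguish an expected long interrupts-off window from a stuck one; a sketch of a consumer check (illustrative helper, not part of the patch):

    /* Hypothetical debug helper that benefits from the bracketing above. */
    static bool long_irqs_off_is_expected(void)
    {
        /* Interrupts are off by design around syscore_suspend(), so
         * latency and lockup checks should stay quiet here. */
        return system_state == SYSTEM_SUSPEND || oops_in_progress;
    }
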
20427 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
20428 index c0bc2c89697a..b89605fe0e88 100644
20429 --- a/kernel/power/suspend.c
20430 +++ b/kernel/power/suspend.c
20432 #include <linux/export.h>
20433 #include <linux/suspend.h>
20434 #include <linux/syscore_ops.h>
20435 +#include <linux/swait.h>
20436 #include <linux/ftrace.h>
20437 #include <trace/events/power.h>
20438 #include <linux/compiler.h>
20439 @@ -57,7 +58,7 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
20441 static const struct platform_suspend_ops *suspend_ops;
20442 static const struct platform_s2idle_ops *s2idle_ops;
20443 -static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head);
20444 +static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
20446 enum s2idle_states __read_mostly s2idle_state;
20447 static DEFINE_RAW_SPINLOCK(s2idle_lock);
20448 @@ -91,8 +92,8 @@ static void s2idle_enter(void)
20449 /* Push all the CPUs into the idle loop. */
20450 wake_up_all_idle_cpus();
20451 /* Make the current CPU wait so it can enter the idle loop too. */
20452 - wait_event(s2idle_wait_head,
20453 - s2idle_state == S2IDLE_STATE_WAKE);
20454 + swait_event(s2idle_wait_head,
20455 + s2idle_state == S2IDLE_STATE_WAKE);
20459 @@ -159,7 +160,7 @@ void s2idle_wake(void)
20460 raw_spin_lock_irqsave(&s2idle_lock, flags);
20461 if (s2idle_state > S2IDLE_STATE_NONE) {
20462 s2idle_state = S2IDLE_STATE_WAKE;
20463 - wake_up(&s2idle_wait_head);
20464 + swake_up(&s2idle_wait_head);
20466 raw_spin_unlock_irqrestore(&s2idle_lock, flags);
20468 @@ -428,6 +429,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20469 arch_suspend_disable_irqs();
20470 BUG_ON(!irqs_disabled());
20472 + system_state = SYSTEM_SUSPEND;
20474 error = syscore_suspend();
20476 *wakeup = pm_wakeup_pending();
20477 @@ -443,6 +446,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20481 + system_state = SYSTEM_RUNNING;
20483 arch_suspend_enable_irqs();
20484 BUG_ON(irqs_disabled());
20486 @@ -589,6 +594,8 @@ static int enter_state(suspend_state_t state)
20490 +bool pm_in_action;
20493 * pm_suspend - Externally visible function for suspending the system.
20494 * @state: System sleep state to enter.
20495 @@ -603,6 +610,7 @@ int pm_suspend(suspend_state_t state)
20496 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
20499 + pm_in_action = true;
20500 pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
20501 error = enter_state(state);
20503 @@ -612,6 +620,7 @@ int pm_suspend(suspend_state_t state)
20504 suspend_stats.success++;
20506 pr_info("suspend exit\n");
20507 + pm_in_action = false;
20510 EXPORT_SYMBOL(pm_suspend);
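
The s2idle conversion above swaps the classic waitqueue for a simple waitqueue: swait heads are protected by a raw spinlock and perform bounded wakeups, so swake_up() remains safe from contexts that stay atomic even on PREEMPT_RT. A minimal kernel-style sketch of the same pattern, with made-up names:

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(demo_wait);
    static bool demo_cond;

    static void demo_sleep(void)
    {
        swait_event(demo_wait, demo_cond);  /* blocks until the condition holds */
    }

    static void demo_wake(void)
    {
        demo_cond = true;
        swake_up(&demo_wait);   /* v4.14 API; later kernels use swake_up_one() */
    }
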
20511 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
20512 index f0223a7d9ed1..13fd0bcf2367 100644
20513 --- a/kernel/printk/printk.c
20514 +++ b/kernel/printk/printk.c
20515 @@ -1348,6 +1348,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20519 + int attempts = 0;
20522 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
20524 @@ -1359,6 +1361,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20530 + if (attempts > 10) {
20537 * Find first record that fits, including all following records,
20538 * into the user-provided buffer for this dump.
20539 @@ -1371,6 +1381,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20540 len += msg_print_text(msg, true, NULL, 0);
20541 idx = log_next(idx);
20544 + if (num_msg > 5) {
20546 + logbuf_unlock_irq();
20547 + logbuf_lock_irq();
20548 + if (clear_seq < log_first_seq)
20553 /* move first record forward until length fits into the buffer */
20554 @@ -1382,6 +1400,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20555 len -= msg_print_text(msg, true, NULL, 0);
20556 idx = log_next(idx);
20559 + if (num_msg > 5) {
20561 + logbuf_unlock_irq();
20562 + logbuf_lock_irq();
20563 + if (clear_seq < log_first_seq)
20568 /* last message fitting into this dump */
20569 @@ -1420,6 +1446,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20570 clear_seq = log_next_seq;
20571 clear_idx = log_next_idx;
20574 logbuf_unlock_irq();
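
The three hunks above add the same latency cap to the log-buffer scan: every few records the logbuf lock is dropped and immediately retaken, and the cursor is revalidated in case the ring advanced meanwhile. The excerpt elides a couple of lines; the intended shape is presumably:

    num_msg++;
    if (num_msg > 5) {
        num_msg = 0;
        logbuf_unlock_irq();            /* bound the IRQs-off window */
        logbuf_lock_irq();
        if (clear_seq < log_first_seq)
            goto try_again;             /* the ring moved underneath us */
    }
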
20577 @@ -1558,6 +1585,12 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
20578 if (!console_drivers)
20581 + if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20582 + if (in_irq() || in_nmi())
20586 + migrate_disable();
20587 for_each_console(con) {
20588 if (exclusive_console && con != exclusive_console)
20590 @@ -1573,6 +1606,7 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
20592 con->write(con, text, len);
20594 + migrate_enable();
20597 int printk_delay_msec __read_mostly;
20598 @@ -1757,12 +1791,22 @@ asmlinkage int vprintk_emit(int facility, int level,
20600 /* If called from the scheduler, we can not call up(). */
20602 + int may_trylock = 1;
20604 +#ifdef CONFIG_PREEMPT_RT_FULL
20606 + * we can't take a sleeping lock with IRQs or preemption disabled
20607 + * so we can't print in these contexts
20609 + if (!(preempt_count() == 0 && !irqs_disabled()))
20613 * Try to acquire and then immediately release the console
20614 * semaphore. The release will print out buffers and wake up
20615 * /dev/kmsg and syslog() users.
20617 - if (console_trylock())
20618 + if (may_trylock && console_trylock())
20622 @@ -1872,26 +1916,6 @@ static bool suppress_message_printing(int level) { return false; }
20624 #endif /* CONFIG_PRINTK */
20626 -#ifdef CONFIG_EARLY_PRINTK
20627 -struct console *early_console;
20629 -asmlinkage __visible void early_printk(const char *fmt, ...)
20635 - if (!early_console)
20638 - va_start(ap, fmt);
20639 - n = vscnprintf(buf, sizeof(buf), fmt, ap);
20642 - early_console->write(early_console, buf, n);
20646 static int __add_preferred_console(char *name, int idx, char *options,
20649 @@ -2238,10 +2262,15 @@ void console_unlock(void)
20651 raw_spin_unlock(&logbuf_lock);
20653 +#ifdef CONFIG_PREEMPT_RT_FULL
20654 + printk_safe_exit_irqrestore(flags);
20655 + call_console_drivers(ext_text, ext_len, text, len);
20657 stop_critical_timings(); /* don't trace print latency */
20658 call_console_drivers(ext_text, ext_len, text, len);
20659 start_critical_timings();
20660 printk_safe_exit_irqrestore(flags);
20663 if (do_cond_resched)
20665 @@ -2295,6 +2324,11 @@ void console_unblank(void)
20669 + if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20670 + if (in_irq() || in_nmi())
20675 * console_unblank can no longer be called in interrupt context unless
20676 * oops_in_progress is set to 1..
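
On PREEMPT_RT the console drivers may take sleeping locks, so console_unlock() above calls them only after interrupts are restored. With the elided #else/#endif lines filled back in (assumed placement), the hunk reads roughly:

    #ifdef CONFIG_PREEMPT_RT_FULL
        printk_safe_exit_irqrestore(flags);
        call_console_drivers(ext_text, ext_len, text, len);
    #else
        stop_critical_timings();        /* don't trace print latency */
        call_console_drivers(ext_text, ext_len, text, len);
        start_critical_timings();
        printk_safe_exit_irqrestore(flags);
    #endif
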
20677 diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
20678 index 64f8046586b6..a24e16bef51c 100644
20679 --- a/kernel/printk/printk_safe.c
20680 +++ b/kernel/printk/printk_safe.c
20682 #include <linux/cpumask.h>
20683 #include <linux/irq_work.h>
20684 #include <linux/printk.h>
20685 +#include <linux/console.h>
20687 #include "internal.h"
20689 @@ -373,8 +374,74 @@ void __printk_safe_exit(void)
20690 this_cpu_dec(printk_context);
20693 +#ifdef CONFIG_EARLY_PRINTK
20694 +struct console *early_console;
20696 +static void early_vprintk(const char *fmt, va_list ap)
20698 + if (early_console) {
20700 + int n = vscnprintf(buf, sizeof(buf), fmt, ap);
20702 + early_console->write(early_console, buf, n);
20706 +asmlinkage void early_printk(const char *fmt, ...)
20710 + va_start(ap, fmt);
20711 + early_vprintk(fmt, ap);
20716 + * This is independent of any log levels - a global
20717 + * kill switch that turns off all of printk.
20719 + * Used by the NMI watchdog if early-printk is enabled.
20721 +static bool __read_mostly printk_killswitch;
20723 +static int __init force_early_printk_setup(char *str)
20725 + printk_killswitch = true;
20728 +early_param("force_early_printk", force_early_printk_setup);
20730 +void printk_kill(void)
20732 + printk_killswitch = true;
20735 +#ifdef CONFIG_PRINTK
20736 +static int forced_early_printk(const char *fmt, va_list ap)
20738 + if (!printk_killswitch)
20740 + early_vprintk(fmt, ap);
20746 +static inline int forced_early_printk(const char *fmt, va_list ap)
20752 __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
20755 + * Fall back to early_printk if a debugging subsystem has
20756 + * killed printk output
20758 + if (unlikely(forced_early_printk(fmt, args)))
20762 * Try to use the main logbuf even in NMI. But avoid calling console
20763 * drivers that might have their own locks.
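
The killswitch that moves into printk_safe.c above serves debugging paths that can no longer trust the regular printk machinery. A sketch of the intended use, with an illustrative caller that is not part of the patch:

    static void demo_hard_lockup_report(void)
    {
        printk_kill();  /* route all further printk() to the early console */
        early_printk("hard lockup detected on CPU %d\n",
                     raw_smp_processor_id());
    }
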
20764 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
20765 index 84b1367935e4..b32a86f63522 100644
20766 --- a/kernel/ptrace.c
20767 +++ b/kernel/ptrace.c
20768 @@ -175,7 +175,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
20770 spin_lock_irq(&task->sighand->siglock);
20771 if (task_is_traced(task) && !__fatal_signal_pending(task)) {
20772 - task->state = __TASK_TRACED;
20773 + unsigned long flags;
20775 + raw_spin_lock_irqsave(&task->pi_lock, flags);
20776 + if (task->state & __TASK_TRACED)
20777 + task->state = __TASK_TRACED;
20779 + task->saved_state = __TASK_TRACED;
20780 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20783 spin_unlock_irq(&task->sighand->siglock);
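
On PREEMPT_RT a task that blocks on a converted spinlock parks its real state in task->saved_state, which is why the ptrace hunk above must patch whichever field currently holds __TASK_TRACED, under pi_lock. A single-threaded toy model of that dance; the constants and the blocking helper are assumptions:

    #include <assert.h>

    #define TASK_UNINTERRUPTIBLE    0x2     /* values as in v4.14, assumed */
    #define __TASK_TRACED           0x8

    struct task { unsigned int state, saved_state; };

    /* rtmutex-based spinlock slow path: stash the real state. */
    static void rt_spin_lock_block(struct task *t)
    {
        t->saved_state = t->state;
        t->state = TASK_UNINTERRUPTIBLE;
    }

    /* The logic added by the hunk above. */
    static void freeze_traced(struct task *t)
    {
        if (t->state & __TASK_TRACED)
            t->state = __TASK_TRACED;
        else
            t->saved_state = __TASK_TRACED;
    }

    int main(void)
    {
        struct task t = { .state = __TASK_TRACED, .saved_state = 0 };

        rt_spin_lock_block(&t);     /* tracee blocks on a spinlock_t */
        freeze_traced(&t);          /* must hit saved_state, not state */
        assert(t.saved_state == __TASK_TRACED);
        return 0;
    }
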
20784 diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
20785 index 9210379c0353..0be2c96fb640 100644
20786 --- a/kernel/rcu/Kconfig
20787 +++ b/kernel/rcu/Kconfig
20788 @@ -36,7 +36,7 @@ config TINY_RCU
20791 bool "Make expert-level adjustments to RCU configuration"
20793 + default y if PREEMPT_RT_FULL
20795 This option needs to be enabled if you wish to make
20796 expert-level adjustments to RCU configuration. By default,
20797 @@ -172,7 +172,7 @@ config RCU_FANOUT_LEAF
20799 config RCU_FAST_NO_HZ
20800 bool "Accelerate last non-dyntick-idle CPU's grace periods"
20801 - depends on NO_HZ_COMMON && SMP && RCU_EXPERT
20802 + depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
20805 This option permits CPUs to enter dynticks-idle state even if
20806 @@ -191,7 +191,7 @@ config RCU_FAST_NO_HZ
20808 bool "Enable RCU priority boosting"
20809 depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
20811 + default y if PREEMPT_RT_FULL
20813 This option boosts the priority of preempted RCU readers that
20814 block the current preemptible RCU grace period for too long.
20815 diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
20816 index e4b43fef89f5..0b056c30e9b1 100644
20817 --- a/kernel/rcu/rcu.h
20818 +++ b/kernel/rcu/rcu.h
20819 @@ -462,18 +462,26 @@ static inline void show_rcu_gp_kthreads(void) { }
20820 extern unsigned long rcutorture_testseq;
20821 extern unsigned long rcutorture_vernum;
20822 unsigned long rcu_batches_started(void);
20823 -unsigned long rcu_batches_started_bh(void);
20824 unsigned long rcu_batches_started_sched(void);
20825 unsigned long rcu_batches_completed(void);
20826 -unsigned long rcu_batches_completed_bh(void);
20827 unsigned long rcu_batches_completed_sched(void);
20828 unsigned long rcu_exp_batches_completed(void);
20829 unsigned long rcu_exp_batches_completed_sched(void);
20830 unsigned long srcu_batches_completed(struct srcu_struct *sp);
20831 void show_rcu_gp_kthreads(void);
20832 void rcu_force_quiescent_state(void);
20833 -void rcu_bh_force_quiescent_state(void);
20834 void rcu_sched_force_quiescent_state(void);
20836 +#ifndef CONFIG_PREEMPT_RT_FULL
20837 +void rcu_bh_force_quiescent_state(void);
20838 +unsigned long rcu_batches_started_bh(void);
20839 +unsigned long rcu_batches_completed_bh(void);
20841 +# define rcu_bh_force_quiescent_state rcu_force_quiescent_state
20842 +# define rcu_batches_completed_bh rcu_batches_completed
20843 +# define rcu_batches_started_bh rcu_batches_started
20846 #endif /* #else #ifdef CONFIG_TINY_RCU */
20848 #ifdef CONFIG_RCU_NOCB_CPU
20849 diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
20850 index 7649fcd2c4c7..88cba7c2956c 100644
20851 --- a/kernel/rcu/rcu_segcblist.c
20852 +++ b/kernel/rcu/rcu_segcblist.c
20854 #include <linux/types.h>
20855 #include <linux/kernel.h>
20856 #include <linux/interrupt.h>
20857 +#include <linux/rcupdate.h>
20859 #include "rcu_segcblist.h"
20861 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
20862 index 45f2ffbc1e78..2e9dbb734d5a 100644
20863 --- a/kernel/rcu/rcutorture.c
20864 +++ b/kernel/rcu/rcutorture.c
20865 @@ -417,6 +417,7 @@ static struct rcu_torture_ops rcu_ops = {
20869 +#ifndef CONFIG_PREEMPT_RT_FULL
20871 * Definitions for rcu_bh torture testing.
20873 @@ -456,6 +457,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
20878 +static struct rcu_torture_ops rcu_bh_ops = {
20879 + .ttype = INVALID_RCU_FLAVOR,
20884 * Don't even think about trying any of these in real life!!!
20885 * The names include "busted", and they really mean it!
20886 diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
20887 index 6d5880089ff6..0e3b2bd3f2ac 100644
20888 --- a/kernel/rcu/srcutree.c
20889 +++ b/kernel/rcu/srcutree.c
20891 #include <linux/delay.h>
20892 #include <linux/module.h>
20893 #include <linux/srcu.h>
20894 +#include <linux/cpu.h>
20895 +#include <linux/locallock.h>
20898 #include "rcu_segcblist.h"
20899 @@ -53,6 +55,33 @@ static void srcu_invoke_callbacks(struct work_struct *work);
20900 static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
20901 static void process_srcu(struct work_struct *work);
20903 +/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
20904 +#define spin_lock_rcu_node(p) \
20906 + spin_lock(&ACCESS_PRIVATE(p, lock)); \
20907 + smp_mb__after_unlock_lock(); \
20910 +#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
20912 +#define spin_lock_irq_rcu_node(p) \
20914 + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
20915 + smp_mb__after_unlock_lock(); \
20918 +#define spin_unlock_irq_rcu_node(p) \
20919 + spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
20921 +#define spin_lock_irqsave_rcu_node(p, flags) \
20923 + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
20924 + smp_mb__after_unlock_lock(); \
20927 +#define spin_unlock_irqrestore_rcu_node(p, flags) \
20928 + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
20931 * Initialize SRCU combining tree. Note that statically allocated
20932 * srcu_struct structures might already have srcu_read_lock() and
20933 @@ -77,7 +106,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
20935 /* Each pass through this loop initializes one srcu_node structure. */
20936 rcu_for_each_node_breadth_first(sp, snp) {
20937 - raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20938 + spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20939 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
20940 ARRAY_SIZE(snp->srcu_data_have_cbs));
20941 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
20942 @@ -111,7 +140,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
20943 snp_first = sp->level[level];
20944 for_each_possible_cpu(cpu) {
20945 sdp = per_cpu_ptr(sp->sda, cpu);
20946 - raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20947 + spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20948 rcu_segcblist_init(&sdp->srcu_cblist);
20949 sdp->srcu_cblist_invoking = false;
20950 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
20951 @@ -170,7 +199,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
20952 /* Don't re-initialize a lock while it is held. */
20953 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
20954 lockdep_init_map(&sp->dep_map, name, key, 0);
20955 - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20956 + spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20957 return init_srcu_struct_fields(sp, false);
20959 EXPORT_SYMBOL_GPL(__init_srcu_struct);
20960 @@ -187,7 +216,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
20962 int init_srcu_struct(struct srcu_struct *sp)
20964 - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20965 + spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20966 return init_srcu_struct_fields(sp, false);
20968 EXPORT_SYMBOL_GPL(init_srcu_struct);
20969 @@ -210,13 +239,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp)
20970 /* The smp_load_acquire() pairs with the smp_store_release(). */
20971 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
20972 return; /* Already initialized. */
20973 - raw_spin_lock_irqsave_rcu_node(sp, flags);
20974 + spin_lock_irqsave_rcu_node(sp, flags);
20975 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
20976 - raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20977 + spin_unlock_irqrestore_rcu_node(sp, flags);
20980 init_srcu_struct_fields(sp, true);
20981 - raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20982 + spin_unlock_irqrestore_rcu_node(sp, flags);
20986 @@ -424,21 +453,6 @@ static void srcu_gp_start(struct srcu_struct *sp)
20987 WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
20991 - * Track online CPUs to guide callback workqueue placement.
20993 -DEFINE_PER_CPU(bool, srcu_online);
20995 -void srcu_online_cpu(unsigned int cpu)
20997 - WRITE_ONCE(per_cpu(srcu_online, cpu), true);
21000 -void srcu_offline_cpu(unsigned int cpu)
21002 - WRITE_ONCE(per_cpu(srcu_online, cpu), false);
21006 * Place the workqueue handler on the specified CPU if online, otherwise
21007 * just run it wherever. This is useful for placing workqueue handlers
21008 @@ -450,12 +464,12 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
21012 - preempt_disable();
21013 - if (READ_ONCE(per_cpu(srcu_online, cpu)))
21014 + cpus_read_lock();
21015 + if (cpu_online(cpu))
21016 ret = queue_delayed_work_on(cpu, wq, dwork, delay);
21018 ret = queue_delayed_work(wq, dwork, delay);
21019 - preempt_enable();
21020 + cpus_read_unlock();
21024 @@ -513,7 +527,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
21025 mutex_lock(&sp->srcu_cb_mutex);
21027 /* End the current grace period. */
21028 - raw_spin_lock_irq_rcu_node(sp);
21029 + spin_lock_irq_rcu_node(sp);
21030 idx = rcu_seq_state(sp->srcu_gp_seq);
21031 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
21032 cbdelay = srcu_get_delay(sp);
21033 @@ -522,7 +536,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
21034 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
21035 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
21036 sp->srcu_gp_seq_needed_exp = gpseq;
21037 - raw_spin_unlock_irq_rcu_node(sp);
21038 + spin_unlock_irq_rcu_node(sp);
21039 mutex_unlock(&sp->srcu_gp_mutex);
21040 /* A new grace period can start at this point. But only one. */
21042 @@ -530,7 +544,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
21043 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
21044 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
21045 rcu_for_each_node_breadth_first(sp, snp) {
21046 - raw_spin_lock_irq_rcu_node(snp);
21047 + spin_lock_irq_rcu_node(snp);
21049 if (snp >= sp->level[rcu_num_lvls - 1])
21050 cbs = snp->srcu_have_cbs[idx] == gpseq;
21051 @@ -540,7 +554,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
21052 snp->srcu_gp_seq_needed_exp = gpseq;
21053 mask = snp->srcu_data_have_cbs[idx];
21054 snp->srcu_data_have_cbs[idx] = 0;
21055 - raw_spin_unlock_irq_rcu_node(snp);
21056 + spin_unlock_irq_rcu_node(snp);
21058 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
21060 @@ -548,11 +562,11 @@ static void srcu_gp_end(struct srcu_struct *sp)
21061 if (!(gpseq & counter_wrap_check))
21062 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
21063 sdp = per_cpu_ptr(sp->sda, cpu);
21064 - raw_spin_lock_irqsave_rcu_node(sdp, flags);
21065 + spin_lock_irqsave_rcu_node(sdp, flags);
21066 if (ULONG_CMP_GE(gpseq,
21067 sdp->srcu_gp_seq_needed + 100))
21068 sdp->srcu_gp_seq_needed = gpseq;
21069 - raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
21070 + spin_unlock_irqrestore_rcu_node(sdp, flags);
21074 @@ -560,17 +574,17 @@ static void srcu_gp_end(struct srcu_struct *sp)
21075 mutex_unlock(&sp->srcu_cb_mutex);
21077 /* Start a new grace period if needed. */
21078 - raw_spin_lock_irq_rcu_node(sp);
21079 + spin_lock_irq_rcu_node(sp);
21080 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
21081 if (!rcu_seq_state(gpseq) &&
21082 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
21084 - raw_spin_unlock_irq_rcu_node(sp);
21085 + spin_unlock_irq_rcu_node(sp);
21086 /* Throttle expedited grace periods: Should be rare! */
21087 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
21088 ? 0 : SRCU_INTERVAL);
21090 - raw_spin_unlock_irq_rcu_node(sp);
21091 + spin_unlock_irq_rcu_node(sp);
21095 @@ -590,18 +604,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
21096 if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
21097 ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
21099 - raw_spin_lock_irqsave_rcu_node(snp, flags);
21100 + spin_lock_irqsave_rcu_node(snp, flags);
21101 if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
21102 - raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21103 + spin_unlock_irqrestore_rcu_node(snp, flags);
21106 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
21107 - raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21108 + spin_unlock_irqrestore_rcu_node(snp, flags);
21110 - raw_spin_lock_irqsave_rcu_node(sp, flags);
21111 + spin_lock_irqsave_rcu_node(sp, flags);
21112 if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
21113 sp->srcu_gp_seq_needed_exp = s;
21114 - raw_spin_unlock_irqrestore_rcu_node(sp, flags);
21115 + spin_unlock_irqrestore_rcu_node(sp, flags);
21119 @@ -623,12 +637,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
21120 for (; snp != NULL; snp = snp->srcu_parent) {
21121 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
21122 return; /* GP already done and CBs recorded. */
21123 - raw_spin_lock_irqsave_rcu_node(snp, flags);
21124 + spin_lock_irqsave_rcu_node(snp, flags);
21125 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
21126 snp_seq = snp->srcu_have_cbs[idx];
21127 if (snp == sdp->mynode && snp_seq == s)
21128 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
21129 - raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21130 + spin_unlock_irqrestore_rcu_node(snp, flags);
21131 if (snp == sdp->mynode && snp_seq != s) {
21132 srcu_schedule_cbs_sdp(sdp, do_norm
21134 @@ -644,11 +658,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
21135 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
21136 if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
21137 snp->srcu_gp_seq_needed_exp = s;
21138 - raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21139 + spin_unlock_irqrestore_rcu_node(snp, flags);
21142 /* Top of tree, must ensure the grace period will be started. */
21143 - raw_spin_lock_irqsave_rcu_node(sp, flags);
21144 + spin_lock_irqsave_rcu_node(sp, flags);
21145 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
21147 * Record need for grace period s. Pair with load
21148 @@ -667,7 +681,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
21149 queue_delayed_work(system_power_efficient_wq, &sp->work,
21150 srcu_get_delay(sp));
21152 - raw_spin_unlock_irqrestore_rcu_node(sp, flags);
21153 + spin_unlock_irqrestore_rcu_node(sp, flags);
21157 @@ -736,6 +750,8 @@ static void srcu_flip(struct srcu_struct *sp)
21158 * negligible when amortized over that time period, and the extra latency
21159 * of a needlessly non-expedited grace period is similarly negligible.
21161 +static DEFINE_LOCAL_IRQ_LOCK(sp_llock);
21163 static bool srcu_might_be_idle(struct srcu_struct *sp)
21165 unsigned long curseq;
21166 @@ -744,13 +760,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp)
21169 /* If the local srcu_data structure has callbacks, not idle. */
21170 - local_irq_save(flags);
21171 + local_lock_irqsave(sp_llock, flags);
21172 sdp = this_cpu_ptr(sp->sda);
21173 if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
21174 - local_irq_restore(flags);
21175 + local_unlock_irqrestore(sp_llock, flags);
21176 return false; /* Callbacks already present, so not idle. */
21178 - local_irq_restore(flags);
21179 + local_unlock_irqrestore(sp_llock, flags);
21182 * No local callbacks, so probabilistically probe global state.
21183 @@ -828,9 +844,9 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
21187 - local_irq_save(flags);
21188 + local_lock_irqsave(sp_llock, flags);
21189 sdp = this_cpu_ptr(sp->sda);
21190 - raw_spin_lock_rcu_node(sdp);
21191 + spin_lock_rcu_node(sdp);
21192 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
21193 rcu_segcblist_advance(&sdp->srcu_cblist,
21194 rcu_seq_current(&sp->srcu_gp_seq));
21195 @@ -844,7 +860,8 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
21196 sdp->srcu_gp_seq_needed_exp = s;
21199 - raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
21200 + spin_unlock_rcu_node(sdp);
21201 + local_unlock_irqrestore(sp_llock, flags);
21203 srcu_funnel_gp_start(sp, sdp, s, do_norm);
21205 @@ -900,7 +917,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
21208 * Make sure that later code is ordered after the SRCU grace
21209 - * period. This pairs with the raw_spin_lock_irq_rcu_node()
21210 + * period. This pairs with the spin_lock_irq_rcu_node()
21211 * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
21212 * because the current CPU might have been totally uninvolved with
21213 * (and thus unordered against) that grace period.
21214 @@ -1024,7 +1041,7 @@ void srcu_barrier(struct srcu_struct *sp)
21216 for_each_possible_cpu(cpu) {
21217 sdp = per_cpu_ptr(sp->sda, cpu);
21218 - raw_spin_lock_irq_rcu_node(sdp);
21219 + spin_lock_irq_rcu_node(sdp);
21220 atomic_inc(&sp->srcu_barrier_cpu_cnt);
21221 sdp->srcu_barrier_head.func = srcu_barrier_cb;
21222 debug_rcu_head_queue(&sdp->srcu_barrier_head);
21223 @@ -1033,7 +1050,7 @@ void srcu_barrier(struct srcu_struct *sp)
21224 debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
21225 atomic_dec(&sp->srcu_barrier_cpu_cnt);
21227 - raw_spin_unlock_irq_rcu_node(sdp);
21228 + spin_unlock_irq_rcu_node(sdp);
21231 /* Remove the initial count, at which point reaching zero can happen. */
21232 @@ -1082,17 +1099,17 @@ static void srcu_advance_state(struct srcu_struct *sp)
21234 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
21235 if (idx == SRCU_STATE_IDLE) {
21236 - raw_spin_lock_irq_rcu_node(sp);
21237 + spin_lock_irq_rcu_node(sp);
21238 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
21239 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
21240 - raw_spin_unlock_irq_rcu_node(sp);
21241 + spin_unlock_irq_rcu_node(sp);
21242 mutex_unlock(&sp->srcu_gp_mutex);
21245 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
21246 if (idx == SRCU_STATE_IDLE)
21248 - raw_spin_unlock_irq_rcu_node(sp);
21249 + spin_unlock_irq_rcu_node(sp);
21250 if (idx != SRCU_STATE_IDLE) {
21251 mutex_unlock(&sp->srcu_gp_mutex);
21252 return; /* Someone else started the grace period. */
21253 @@ -1141,19 +1158,19 @@ static void srcu_invoke_callbacks(struct work_struct *work)
21254 sdp = container_of(work, struct srcu_data, work.work);
21256 rcu_cblist_init(&ready_cbs);
21257 - raw_spin_lock_irq_rcu_node(sdp);
21258 + spin_lock_irq_rcu_node(sdp);
21259 rcu_segcblist_advance(&sdp->srcu_cblist,
21260 rcu_seq_current(&sp->srcu_gp_seq));
21261 if (sdp->srcu_cblist_invoking ||
21262 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
21263 - raw_spin_unlock_irq_rcu_node(sdp);
21264 + spin_unlock_irq_rcu_node(sdp);
21265 return; /* Someone else on the job or nothing to do. */
21268 /* We are on the job! Extract and invoke ready callbacks. */
21269 sdp->srcu_cblist_invoking = true;
21270 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
21271 - raw_spin_unlock_irq_rcu_node(sdp);
21272 + spin_unlock_irq_rcu_node(sdp);
21273 rhp = rcu_cblist_dequeue(&ready_cbs);
21274 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
21275 debug_rcu_head_unqueue(rhp);
21276 @@ -1166,13 +1183,13 @@ static void srcu_invoke_callbacks(struct work_struct *work)
21277 * Update counts, accelerate new callbacks, and if needed,
21278 * schedule another round of callback invocation.
21280 - raw_spin_lock_irq_rcu_node(sdp);
21281 + spin_lock_irq_rcu_node(sdp);
21282 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
21283 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
21284 rcu_seq_snap(&sp->srcu_gp_seq));
21285 sdp->srcu_cblist_invoking = false;
21286 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
21287 - raw_spin_unlock_irq_rcu_node(sdp);
21288 + spin_unlock_irq_rcu_node(sdp);
21290 srcu_schedule_cbs_sdp(sdp, 0);
21292 @@ -1185,7 +1202,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
21294 bool pushgp = true;
21296 - raw_spin_lock_irq_rcu_node(sp);
21297 + spin_lock_irq_rcu_node(sp);
21298 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
21299 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
21300 /* All requests fulfilled, time to go idle. */
21301 @@ -1195,7 +1212,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
21302 /* Outstanding request and no GP. Start one. */
21305 - raw_spin_unlock_irq_rcu_node(sp);
21306 + spin_unlock_irq_rcu_node(sp);
21309 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
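
Besides demoting the per-node locks from raw to sleeping spinlocks, the SRCU changes above replace the bare local_irq_save() around the per-CPU srcu_data with the sp_llock local lock. On !RT a local lock compiles down to local_irq_save(); on RT it is a per-CPU sleeping lock, so the section stays preemptible while remaining CPU-local. A minimal kernel-style sketch of the pattern, assuming the RT patch's <linux/locallock.h>:

    static DEFINE_LOCAL_IRQ_LOCK(demo_llock);
    static DEFINE_PER_CPU(int, demo_counter);

    static void demo_update(void)
    {
        unsigned long flags;

        local_lock_irqsave(demo_llock, flags);
        this_cpu_inc(demo_counter);     /* per-CPU data, reentrancy-safe */
        local_unlock_irqrestore(demo_llock, flags);
    }
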
21310 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
21311 index 3e3650e94ae6..0a722b56d90b 100644
21312 --- a/kernel/rcu/tree.c
21313 +++ b/kernel/rcu/tree.c
21315 #include <linux/trace_events.h>
21316 #include <linux/suspend.h>
21317 #include <linux/ftrace.h>
21318 +#include <linux/delay.h>
21319 +#include <linux/gfp.h>
21320 +#include <linux/oom.h>
21321 +#include <linux/smpboot.h>
21322 +#include "../time/tick-internal.h"
21326 @@ -243,6 +248,19 @@ void rcu_sched_qs(void)
21327 this_cpu_ptr(&rcu_sched_data), true);
21330 +#ifdef CONFIG_PREEMPT_RT_FULL
21331 +static void rcu_preempt_qs(void);
21333 +void rcu_bh_qs(void)
21335 + unsigned long flags;
21337 + /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
21338 + local_irq_save(flags);
21339 + rcu_preempt_qs();
21340 + local_irq_restore(flags);
21343 void rcu_bh_qs(void)
21345 RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
21346 @@ -253,6 +271,7 @@ void rcu_bh_qs(void)
21347 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
21353 * Steal a bit from the bottom of ->dynticks for idle entry/exit
21354 @@ -564,11 +583,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
21356 * Return the number of RCU BH batches started thus far for debug & stats.
21358 +#ifndef CONFIG_PREEMPT_RT_FULL
21359 unsigned long rcu_batches_started_bh(void)
21361 return rcu_bh_state.gpnum;
21363 EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
21367 * Return the number of RCU batches completed thus far for debug & stats.
21368 @@ -588,6 +609,7 @@ unsigned long rcu_batches_completed_sched(void)
21370 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
21372 +#ifndef CONFIG_PREEMPT_RT_FULL
21374 * Return the number of RCU BH batches completed thus far for debug & stats.
21376 @@ -596,6 +618,7 @@ unsigned long rcu_batches_completed_bh(void)
21377 return rcu_bh_state.completed;
21379 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
21383 * Return the number of RCU expedited batches completed thus far for
21384 @@ -619,6 +642,7 @@ unsigned long rcu_exp_batches_completed_sched(void)
21386 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
21388 +#ifndef CONFIG_PREEMPT_RT_FULL
21390 * Force a quiescent state.
21392 @@ -637,6 +661,13 @@ void rcu_bh_force_quiescent_state(void)
21394 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
21397 +void rcu_force_quiescent_state(void)
21400 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
21404 * Force a quiescent state for RCU-sched.
21406 @@ -687,9 +718,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
21410 +#ifndef CONFIG_PREEMPT_RT_FULL
21411 case RCU_BH_FLAVOR:
21412 rsp = &rcu_bh_state;
21415 case RCU_SCHED_FLAVOR:
21416 rsp = &rcu_sched_state;
21418 @@ -2918,18 +2951,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
21420 * Do RCU core processing for the current CPU.
21422 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
21423 +static __latent_entropy void rcu_process_callbacks(void)
21425 struct rcu_state *rsp;
21427 if (cpu_is_offline(smp_processor_id()))
21429 - trace_rcu_utilization(TPS("Start RCU core"));
21430 for_each_rcu_flavor(rsp)
21431 __rcu_process_callbacks(rsp);
21432 - trace_rcu_utilization(TPS("End RCU core"));
21435 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21437 * Schedule RCU callback invocation. If the specified type of RCU
21438 * does not support RCU priority boosting, just do a direct call,
21439 @@ -2941,19 +2973,106 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
21441 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
21443 - if (likely(!rsp->boost)) {
21444 - rcu_do_batch(rsp, rdp);
21447 - invoke_rcu_callbacks_kthread();
21448 + rcu_do_batch(rsp, rdp);
21451 +static void rcu_wake_cond(struct task_struct *t, int status)
21454 + * If the thread is yielding, only wake it when this
21455 + * is invoked from idle
21457 + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
21458 + wake_up_process(t);
21462 + * Wake up this CPU's rcuc kthread to do RCU core processing.
21464 static void invoke_rcu_core(void)
21466 - if (cpu_online(smp_processor_id()))
21467 - raise_softirq(RCU_SOFTIRQ);
21468 + unsigned long flags;
21469 + struct task_struct *t;
21471 + if (!cpu_online(smp_processor_id()))
21473 + local_irq_save(flags);
21474 + __this_cpu_write(rcu_cpu_has_work, 1);
21475 + t = __this_cpu_read(rcu_cpu_kthread_task);
21476 + if (t != NULL && current != t)
21477 + rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
21478 + local_irq_restore(flags);
21481 +static void rcu_cpu_kthread_park(unsigned int cpu)
21483 + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21486 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
21488 + return __this_cpu_read(rcu_cpu_has_work);
21492 + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
21493 + * RCU softirq used in flavors and configurations of RCU that do not
21494 + * support RCU priority boosting.
21496 +static void rcu_cpu_kthread(unsigned int cpu)
21498 + unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21499 + char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21502 + for (spincnt = 0; spincnt < 10; spincnt++) {
21503 + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21504 + local_bh_disable();
21505 + *statusp = RCU_KTHREAD_RUNNING;
21506 + this_cpu_inc(rcu_cpu_kthread_loops);
21507 + local_irq_disable();
21510 + local_irq_enable();
21512 + rcu_process_callbacks();
21513 + local_bh_enable();
21514 + if (*workp == 0) {
21515 + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21516 + *statusp = RCU_KTHREAD_WAITING;
21520 + *statusp = RCU_KTHREAD_YIELDING;
21521 + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21522 + schedule_timeout_interruptible(2);
21523 + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21524 + *statusp = RCU_KTHREAD_WAITING;
21527 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21528 + .store = &rcu_cpu_kthread_task,
21529 + .thread_should_run = rcu_cpu_kthread_should_run,
21530 + .thread_fn = rcu_cpu_kthread,
21531 + .thread_comm = "rcuc/%u",
21532 + .setup = rcu_cpu_kthread_setup,
21533 + .park = rcu_cpu_kthread_park,
21537 + * Spawn per-CPU RCU core processing kthreads.
21539 +static int __init rcu_spawn_core_kthreads(void)
21543 + for_each_possible_cpu(cpu)
21544 + per_cpu(rcu_cpu_has_work, cpu) = 0;
21545 + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21548 +early_initcall(rcu_spawn_core_kthreads);
21551 * Handle any core-RCU processing required by a call_rcu() invocation.
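
The block above moves RCU core processing off RCU_SOFTIRQ and onto per-CPU rcuc kthreads in every configuration, not just under RCU_BOOST; the smpboot framework handles thread creation, parking, and hotplug. A stand-alone sketch of the same pattern, with made-up demo_ names:

    #include <linux/smpboot.h>
    #include <linux/percpu.h>

    static DEFINE_PER_CPU(struct task_struct *, demo_task);
    static DEFINE_PER_CPU(char, demo_has_work);

    static int demo_should_run(unsigned int cpu)
    {
        return __this_cpu_read(demo_has_work);
    }

    static void demo_fn(unsigned int cpu)
    {
        __this_cpu_write(demo_has_work, 0);
        /* ... the work that used to run in the softirq handler ... */
    }

    static struct smp_hotplug_thread demo_thread_spec = {
        .store              = &demo_task,
        .thread_should_run  = demo_should_run,
        .thread_fn          = demo_fn,
        .thread_comm        = "demo/%u",
    };

    static int __init demo_spawn(void)
    {
        BUG_ON(smpboot_register_percpu_thread(&demo_thread_spec));
        return 0;
    }
    early_initcall(demo_spawn);
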
21553 @@ -3113,6 +3232,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
21555 EXPORT_SYMBOL_GPL(call_rcu_sched);
21557 +#ifndef CONFIG_PREEMPT_RT_FULL
21559 * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
21560 * @head: structure to be used for queueing the RCU updates.
21561 @@ -3140,6 +3260,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
21562 __call_rcu(head, func, &rcu_bh_state, -1, 0);
21564 EXPORT_SYMBOL_GPL(call_rcu_bh);
21568 * Queue an RCU callback for lazy invocation after a grace period.
21569 @@ -3225,6 +3346,7 @@ void synchronize_sched(void)
21571 EXPORT_SYMBOL_GPL(synchronize_sched);
21573 +#ifndef CONFIG_PREEMPT_RT_FULL
21575 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
21577 @@ -3251,6 +3373,7 @@ void synchronize_rcu_bh(void)
21578 wait_rcu_gp(call_rcu_bh);
21580 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
21584 * get_state_synchronize_rcu - Snapshot current RCU state
21585 @@ -3601,6 +3724,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
21586 mutex_unlock(&rsp->barrier_mutex);
21589 +#ifndef CONFIG_PREEMPT_RT_FULL
21591 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
21593 @@ -3609,6 +3733,7 @@ void rcu_barrier_bh(void)
21594 _rcu_barrier(&rcu_bh_state);
21596 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
21600 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
21601 @@ -3741,8 +3866,6 @@ int rcutree_online_cpu(unsigned int cpu)
21603 sync_sched_exp_online_cleanup(cpu);
21604 rcutree_affinity_setting(cpu, -1);
21605 - if (IS_ENABLED(CONFIG_TREE_SRCU))
21606 - srcu_online_cpu(cpu);
21610 @@ -3753,8 +3876,6 @@ int rcutree_online_cpu(unsigned int cpu)
21611 int rcutree_offline_cpu(unsigned int cpu)
21613 rcutree_affinity_setting(cpu, cpu);
21614 - if (IS_ENABLED(CONFIG_TREE_SRCU))
21615 - srcu_offline_cpu(cpu);
21619 @@ -4184,12 +4305,13 @@ void __init rcu_init(void)
21621 rcu_bootup_announce();
21622 rcu_init_geometry();
21623 +#ifndef CONFIG_PREEMPT_RT_FULL
21624 rcu_init_one(&rcu_bh_state);
21626 rcu_init_one(&rcu_sched_state);
21628 rcu_dump_rcu_node_tree(&rcu_sched_state);
21629 __rcu_init_preempt();
21630 - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
21633 * We don't need protection against CPU-hotplug here because
21634 @@ -4200,8 +4322,6 @@ void __init rcu_init(void)
21635 for_each_online_cpu(cpu) {
21636 rcutree_prepare_cpu(cpu);
21637 rcu_cpu_starting(cpu);
21638 - if (IS_ENABLED(CONFIG_TREE_SRCU))
21639 - srcu_online_cpu(cpu);
21643 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
21644 index 8e1f285f0a70..7acc23da94e2 100644
21645 --- a/kernel/rcu/tree.h
21646 +++ b/kernel/rcu/tree.h
21647 @@ -427,7 +427,9 @@ extern struct list_head rcu_struct_flavors;
21649 extern struct rcu_state rcu_sched_state;
21651 +#ifndef CONFIG_PREEMPT_RT_FULL
21652 extern struct rcu_state rcu_bh_state;
21655 #ifdef CONFIG_PREEMPT_RCU
21656 extern struct rcu_state rcu_preempt_state;
21657 @@ -436,12 +438,10 @@ extern struct rcu_state rcu_preempt_state;
21658 int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
21659 bool rcu_eqs_special_set(int cpu);
21661 -#ifdef CONFIG_RCU_BOOST
21662 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21663 DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
21664 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21665 DECLARE_PER_CPU(char, rcu_cpu_has_work);
21666 -#endif /* #ifdef CONFIG_RCU_BOOST */
21668 #ifndef RCU_TREE_NONCORE
21670 @@ -461,10 +461,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
21671 static void __init __rcu_init_preempt(void);
21672 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
21673 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
21674 -static void invoke_rcu_callbacks_kthread(void);
21675 static bool rcu_is_callbacks_kthread(void);
21676 +static void rcu_cpu_kthread_setup(unsigned int cpu);
21677 #ifdef CONFIG_RCU_BOOST
21678 -static void rcu_preempt_do_callbacks(void);
21679 static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21680 struct rcu_node *rnp);
21681 #endif /* #ifdef CONFIG_RCU_BOOST */
21682 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
21683 index 8b3102d22823..17ee8d1f38c4 100644
21684 --- a/kernel/rcu/tree_plugin.h
21685 +++ b/kernel/rcu/tree_plugin.h
21686 @@ -24,39 +24,16 @@
21687 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21690 -#include <linux/delay.h>
21691 -#include <linux/gfp.h>
21692 -#include <linux/oom.h>
21693 -#include <linux/sched/debug.h>
21694 -#include <linux/smpboot.h>
21695 -#include <uapi/linux/sched/types.h>
21696 -#include "../time/tick-internal.h"
21698 -#ifdef CONFIG_RCU_BOOST
21700 #include "../locking/rtmutex_common.h"
21703 * Control variables for per-CPU and per-rcu_node kthreads. These
21704 * handle all flavors of RCU.
21706 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21707 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21708 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21709 DEFINE_PER_CPU(char, rcu_cpu_has_work);
21711 -#else /* #ifdef CONFIG_RCU_BOOST */
21714 - * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
21715 - * all uses are in dead code. Provide a definition to keep the compiler
21716 - * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
21717 - * This probably needs to be excluded from -rt builds.
21719 -#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
21721 -#endif /* #else #ifdef CONFIG_RCU_BOOST */
21723 #ifdef CONFIG_RCU_NOCB_CPU
21724 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
21725 static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
21726 @@ -324,9 +301,13 @@ static void rcu_preempt_note_context_switch(bool preempt)
21727 struct task_struct *t = current;
21728 struct rcu_data *rdp;
21729 struct rcu_node *rnp;
21730 + int sleeping_l = 0;
21732 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
21733 - WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
21734 +#if defined(CONFIG_PREEMPT_RT_FULL)
21735 + sleeping_l = t->sleeping_lock;
21737 + WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l);
21738 if (t->rcu_read_lock_nesting > 0 &&
21739 !t->rcu_read_unlock_special.b.blocked) {
21741 @@ -463,7 +444,7 @@ void rcu_read_unlock_special(struct task_struct *t)
21744 /* Hardware IRQ handlers cannot block, complain if they get here. */
21745 - if (in_irq() || in_serving_softirq()) {
21746 + if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
21747 lockdep_rcu_suspicious(__FILE__, __LINE__,
21748 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
21749 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
21750 @@ -530,7 +511,7 @@ void rcu_read_unlock_special(struct task_struct *t)
21752 /* Unboost if we were boosted. */
21753 if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
21754 - rt_mutex_unlock(&rnp->boost_mtx);
21755 + rt_mutex_futex_unlock(&rnp->boost_mtx);
21758 * If this was the last task on the expedited lists,
21759 @@ -684,15 +665,6 @@ static void rcu_preempt_check_callbacks(void)
21760 t->rcu_read_unlock_special.b.need_qs = true;
21763 -#ifdef CONFIG_RCU_BOOST
21765 -static void rcu_preempt_do_callbacks(void)
21767 - rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
21770 -#endif /* #ifdef CONFIG_RCU_BOOST */
21773 * call_rcu() - Queue an RCU callback for invocation after a grace period.
21774 * @head: structure to be used for queueing the RCU updates.
21775 @@ -915,20 +887,23 @@ void exit_rcu(void)
21777 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
21780 + * If boosting, set rcuc kthreads to realtime priority.
21782 +static void rcu_cpu_kthread_setup(unsigned int cpu)
21784 +#ifdef CONFIG_RCU_BOOST
21785 + struct sched_param sp;
21787 + sp.sched_priority = kthread_prio;
21788 + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21789 +#endif /* #ifdef CONFIG_RCU_BOOST */
21792 #ifdef CONFIG_RCU_BOOST
21794 #include "../locking/rtmutex_common.h"
21796 -static void rcu_wake_cond(struct task_struct *t, int status)
21799 - * If the thread is yielding, only wake it when this
21800 - * is invoked from idle
21802 - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
21803 - wake_up_process(t);
21807 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
21808 * or ->boost_tasks, advancing the pointer to the next task in the
21809 @@ -1070,23 +1045,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
21814 - * Wake up the per-CPU kthread to invoke RCU callbacks.
21816 -static void invoke_rcu_callbacks_kthread(void)
21818 - unsigned long flags;
21820 - local_irq_save(flags);
21821 - __this_cpu_write(rcu_cpu_has_work, 1);
21822 - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
21823 - current != __this_cpu_read(rcu_cpu_kthread_task)) {
21824 - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
21825 - __this_cpu_read(rcu_cpu_kthread_status));
21827 - local_irq_restore(flags);
21831 * Is the current CPU running the RCU-callbacks kthread?
21832 * Caller must have preemption disabled.
21833 @@ -1141,67 +1099,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21837 -static void rcu_kthread_do_work(void)
21839 - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
21840 - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
21841 - rcu_preempt_do_callbacks();
21844 -static void rcu_cpu_kthread_setup(unsigned int cpu)
21846 - struct sched_param sp;
21848 - sp.sched_priority = kthread_prio;
21849 - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21852 -static void rcu_cpu_kthread_park(unsigned int cpu)
21854 - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21857 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
21859 - return __this_cpu_read(rcu_cpu_has_work);
21863 - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
21864 - * RCU softirq used in flavors and configurations of RCU that do not
21865 - * support RCU priority boosting.
21867 -static void rcu_cpu_kthread(unsigned int cpu)
21869 - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21870 - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21873 - for (spincnt = 0; spincnt < 10; spincnt++) {
21874 - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21875 - local_bh_disable();
21876 - *statusp = RCU_KTHREAD_RUNNING;
21877 - this_cpu_inc(rcu_cpu_kthread_loops);
21878 - local_irq_disable();
21881 - local_irq_enable();
21883 - rcu_kthread_do_work();
21884 - local_bh_enable();
21885 - if (*workp == 0) {
21886 - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21887 - *statusp = RCU_KTHREAD_WAITING;
21891 - *statusp = RCU_KTHREAD_YIELDING;
21892 - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21893 - schedule_timeout_interruptible(2);
21894 - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21895 - *statusp = RCU_KTHREAD_WAITING;
21899 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
21900 * served by the rcu_node in question. The CPU hotplug lock is still
21901 @@ -1232,26 +1129,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
21902 free_cpumask_var(cm);
21905 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21906 - .store = &rcu_cpu_kthread_task,
21907 - .thread_should_run = rcu_cpu_kthread_should_run,
21908 - .thread_fn = rcu_cpu_kthread,
21909 - .thread_comm = "rcuc/%u",
21910 - .setup = rcu_cpu_kthread_setup,
21911 - .park = rcu_cpu_kthread_park,
21915 * Spawn boost kthreads -- called as soon as the scheduler is running.
21917 static void __init rcu_spawn_boost_kthreads(void)
21919 struct rcu_node *rnp;
21922 - for_each_possible_cpu(cpu)
21923 - per_cpu(rcu_cpu_has_work, cpu) = 0;
21924 - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21925 rcu_for_each_leaf_node(rcu_state_p, rnp)
21926 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
21928 @@ -1274,11 +1157,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
21929 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
21932 -static void invoke_rcu_callbacks_kthread(void)
21937 static bool rcu_is_callbacks_kthread(void)
21940 @@ -1302,7 +1180,7 @@ static void rcu_prepare_kthreads(int cpu)
21942 #endif /* #else #ifdef CONFIG_RCU_BOOST */
21944 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
21945 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
21948 * Check to see if any future RCU-related work will need to be done
21949 @@ -1318,7 +1196,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
21950 *nextevt = KTIME_MAX;
21951 return rcu_cpu_has_callbacks(NULL);
21953 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
21955 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
21957 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
21959 @@ -1414,6 +1294,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
21963 +#ifndef CONFIG_PREEMPT_RT_FULL
21966 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
21967 * to invoke. If the CPU has callbacks, try to advance them. Tell the
21968 @@ -1456,6 +1338,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
21969 *nextevt = basemono + dj * TICK_NSEC;
21972 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
21975 * Prepare a CPU for idle from an RCU perspective. The first major task
21976 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
21977 index 7a577bd989a4..2006a09680aa 100644
21978 --- a/kernel/rcu/update.c
21979 +++ b/kernel/rcu/update.c
21980 @@ -66,7 +66,7 @@ extern int rcu_expedited; /* from sysctl */
21981 module_param(rcu_expedited, int, 0);
21982 extern int rcu_normal; /* from sysctl */
21983 module_param(rcu_normal, int, 0);
21984 -static int rcu_normal_after_boot;
21985 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
21986 module_param(rcu_normal_after_boot, int, 0);
21987 #endif /* #ifndef CONFIG_TINY_RCU */
21989 @@ -333,6 +333,7 @@ int rcu_read_lock_held(void)
21991 EXPORT_SYMBOL_GPL(rcu_read_lock_held);
21993 +#ifndef CONFIG_PREEMPT_RT_FULL
21995 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
21997 @@ -359,6 +360,7 @@ int rcu_read_lock_bh_held(void)
21998 return in_softirq() || irqs_disabled();
22000 EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
22003 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
22005 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
22006 index a9ee16bbc693..9943019095e9 100644
22007 --- a/kernel/sched/Makefile
22008 +++ b/kernel/sched/Makefile
22009 @@ -18,7 +18,7 @@ endif
22011 obj-y += core.o loadavg.o clock.o cputime.o
22012 obj-y += idle_task.o fair.o rt.o deadline.o
22013 -obj-y += wait.o wait_bit.o swait.o completion.o idle.o
22014 +obj-y += wait.o wait_bit.o swait.o swork.o completion.o idle.o
22015 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
22016 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
22017 obj-$(CONFIG_SCHEDSTATS) += stats.o
22018 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
22019 index 2ddaec40956f..0fe2982e46a0 100644
22020 --- a/kernel/sched/completion.c
22021 +++ b/kernel/sched/completion.c
22022 @@ -32,7 +32,7 @@ void complete(struct completion *x)
22024 unsigned long flags;
22026 - spin_lock_irqsave(&x->wait.lock, flags);
22027 + raw_spin_lock_irqsave(&x->wait.lock, flags);
22030 * Perform commit of crossrelease here.
22031 @@ -41,8 +41,8 @@ void complete(struct completion *x)
22033 if (x->done != UINT_MAX)
22035 - __wake_up_locked(&x->wait, TASK_NORMAL, 1);
22036 - spin_unlock_irqrestore(&x->wait.lock, flags);
22037 + swake_up_locked(&x->wait);
22038 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22040 EXPORT_SYMBOL(complete);
22042 @@ -66,10 +66,10 @@ void complete_all(struct completion *x)
22044 unsigned long flags;
22046 - spin_lock_irqsave(&x->wait.lock, flags);
22047 + raw_spin_lock_irqsave(&x->wait.lock, flags);
22048 x->done = UINT_MAX;
22049 - __wake_up_locked(&x->wait, TASK_NORMAL, 0);
22050 - spin_unlock_irqrestore(&x->wait.lock, flags);
22051 + swake_up_all_locked(&x->wait);
22052 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22054 EXPORT_SYMBOL(complete_all);
22056 @@ -78,20 +78,20 @@ do_wait_for_common(struct completion *x,
22057 long (*action)(long), long timeout, int state)
22060 - DECLARE_WAITQUEUE(wait, current);
22061 + DECLARE_SWAITQUEUE(wait);
22063 - __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
22064 + __prepare_to_swait(&x->wait, &wait);
22066 if (signal_pending_state(state, current)) {
22067 timeout = -ERESTARTSYS;
22070 __set_current_state(state);
22071 - spin_unlock_irq(&x->wait.lock);
22072 + raw_spin_unlock_irq(&x->wait.lock);
22073 timeout = action(timeout);
22074 - spin_lock_irq(&x->wait.lock);
22075 + raw_spin_lock_irq(&x->wait.lock);
22076 } while (!x->done && timeout);
22077 - __remove_wait_queue(&x->wait, &wait);
22078 + __finish_swait(&x->wait, &wait);
22082 @@ -108,9 +108,9 @@ __wait_for_common(struct completion *x,
22084 complete_acquire(x);
22086 - spin_lock_irq(&x->wait.lock);
22087 + raw_spin_lock_irq(&x->wait.lock);
22088 timeout = do_wait_for_common(x, action, timeout, state);
22089 - spin_unlock_irq(&x->wait.lock);
22090 + raw_spin_unlock_irq(&x->wait.lock);
22092 complete_release(x);
22094 @@ -299,12 +299,12 @@ bool try_wait_for_completion(struct completion *x)
22095 if (!READ_ONCE(x->done))
22098 - spin_lock_irqsave(&x->wait.lock, flags);
22099 + raw_spin_lock_irqsave(&x->wait.lock, flags);
22102 else if (x->done != UINT_MAX)
22104 - spin_unlock_irqrestore(&x->wait.lock, flags);
22105 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22108 EXPORT_SYMBOL(try_wait_for_completion);
22109 @@ -330,8 +330,8 @@ bool completion_done(struct completion *x)
22110 * otherwise we can end up freeing the completion before complete()
22111 * is done referencing it.
22113 - spin_lock_irqsave(&x->wait.lock, flags);
22114 - spin_unlock_irqrestore(&x->wait.lock, flags);
22115 + raw_spin_lock_irqsave(&x->wait.lock, flags);
22116 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22119 EXPORT_SYMBOL(completion_done);
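The conversion above swaps the completion's wait_queue_head (whose spinlock_t
becomes a sleeping lock on RT) for a simple waitqueue guarded by a
raw_spinlock_t, so complete() stays callable from hard-irq context on
PREEMPT_RT_FULL. A minimal usage sketch under that assumption (the demo_*
names are hypothetical):

    static DECLARE_COMPLETION(demo_done);

    /* producer -- may run in hard-irq context even on RT, because
     * complete() now only takes the raw lock and wakes one swaiter: */
    static irqreturn_t demo_irq(int irq, void *data)
    {
            complete(&demo_done);
            return IRQ_HANDLED;
    }

    /* consumer -- ordinary process context: */
    static int demo_thread(void *data)
    {
            wait_for_completion(&demo_done);
            return 0;
    }

The swait variant deliberately supports only exclusive FIFO wakeups, which is
all completions need.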
22120 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
22121 index 4e89ed8a0fb2..6e6bd5262f23 100644
22122 --- a/kernel/sched/core.c
22123 +++ b/kernel/sched/core.c
22124 @@ -59,7 +59,11 @@ const_debug unsigned int sysctl_sched_features =
22125 * Number of tasks to iterate in a single balance run.
22126 * Limited because this is done with IRQs disabled.
22128 +#ifndef CONFIG_PREEMPT_RT_FULL
22129 const_debug unsigned int sysctl_sched_nr_migrate = 32;
22131 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
22135 * period over which we average the RT time consumption, measured
22136 @@ -341,7 +345,7 @@ static void init_rq_hrtick(struct rq *rq)
22137 rq->hrtick_csd.info = rq;
22140 - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22141 + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
22142 rq->hrtick_timer.function = hrtick;
22144 #else /* CONFIG_SCHED_HRTICK */
22145 @@ -423,9 +427,15 @@ static bool set_nr_if_polling(struct task_struct *p)
22149 -void wake_q_add(struct wake_q_head *head, struct task_struct *task)
22150 +void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
22153 - struct wake_q_node *node = &task->wake_q;
22154 + struct wake_q_node *node;
22157 + node = &task->wake_q_sleeper;
22159 + node = &task->wake_q;
22162 * Atomically grab the task, if ->wake_q is !nil already it means
22163 @@ -447,24 +457,32 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
22164 head->lastp = &node->next;
22167 -void wake_up_q(struct wake_q_head *head)
22168 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
22170 struct wake_q_node *node = head->first;
22172 while (node != WAKE_Q_TAIL) {
22173 struct task_struct *task;
22175 - task = container_of(node, struct task_struct, wake_q);
22177 + task = container_of(node, struct task_struct, wake_q_sleeper);
22179 + task = container_of(node, struct task_struct, wake_q);
22181 /* Task can safely be re-inserted now: */
22183 - task->wake_q.next = NULL;
22186 + task->wake_q_sleeper.next = NULL;
22188 + task->wake_q.next = NULL;
22190 * wake_up_process() implies a wmb() to pair with the queueing
22191 * in wake_q_add() so as not to miss wakeups.
22193 - wake_up_process(task);
22195 + wake_up_lock_sleeper(task);
22197 + wake_up_process(task);
22198 put_task_struct(task);
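The rework above threads a 'sleeper' flag through wake_q so rtmutex wakeups
can use wake_up_lock_sleeper() and a second per-task list node
(wake_q_sleeper). The old entry points are expected to remain as thin
wrappers on the header side of this series; a sketch under that assumption:

    static inline void wake_q_add(struct wake_q_head *head,
                                  struct task_struct *task)
    {
            __wake_q_add(head, task, false);
    }

    static inline void wake_q_add_sleeper(struct wake_q_head *head,
                                          struct task_struct *task)
    {
            __wake_q_add(head, task, true);  /* queues on task->wake_q_sleeper */
    }

    static inline void wake_up_q(struct wake_q_head *head)
    {
            __wake_up_q(head, false);
    }

    static inline void wake_up_q_sleeper(struct wake_q_head *head)
    {
            __wake_up_q(head, true);         /* wakes via wake_up_lock_sleeper() */
    }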
22201 @@ -500,6 +518,48 @@ void resched_curr(struct rq *rq)
22202 trace_sched_wake_idle_without_ipi(cpu);
22205 +#ifdef CONFIG_PREEMPT_LAZY
22207 +static int tsk_is_polling(struct task_struct *p)
22209 +#ifdef TIF_POLLING_NRFLAG
22210 + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
22216 +void resched_curr_lazy(struct rq *rq)
22218 + struct task_struct *curr = rq->curr;
22221 + if (!sched_feat(PREEMPT_LAZY)) {
22222 + resched_curr(rq);
22226 + lockdep_assert_held(&rq->lock);
22228 + if (test_tsk_need_resched(curr))
22231 + if (test_tsk_need_resched_lazy(curr))
22234 + set_tsk_need_resched_lazy(curr);
22236 + cpu = cpu_of(rq);
22237 + if (cpu == smp_processor_id())
22240 + /* NEED_RESCHED_LAZY must be visible before we test polling */
22242 + if (!tsk_is_polling(curr))
22243 + smp_send_reschedule(cpu);
22247 void resched_cpu(int cpu)
22249 struct rq *rq = cpu_rq(cpu);
22250 @@ -523,11 +583,14 @@ void resched_cpu(int cpu)
22252 int get_nohz_timer_target(void)
22254 - int i, cpu = smp_processor_id();
22256 struct sched_domain *sd;
22258 + preempt_disable_rt();
22259 + cpu = smp_processor_id();
22261 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
22263 + goto preempt_en_rt;
22266 for_each_domain(cpu, sd) {
22267 @@ -546,6 +609,8 @@ int get_nohz_timer_target(void)
22268 cpu = housekeeping_any_cpu();
22272 + preempt_enable_rt();
22276 @@ -912,10 +977,10 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
22278 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
22280 - if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
22281 + if (!cpumask_test_cpu(cpu, p->cpus_ptr))
22284 - if (is_per_cpu_kthread(p))
22285 + if (is_per_cpu_kthread(p) || __migrate_disabled(p))
22286 return cpu_online(cpu);
22288 return cpu_active(cpu);
22289 @@ -1007,7 +1072,7 @@ static int migration_cpu_stop(void *data)
22290 local_irq_disable();
22292 * We need to explicitly wake pending tasks before running
22293 - * __migrate_task() such that we will not miss enforcing cpus_allowed
22294 + * __migrate_task() such that we will not miss enforcing cpus_ptr
22295 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
22297 sched_ttwu_pending();
22298 @@ -1038,11 +1103,19 @@ static int migration_cpu_stop(void *data)
22300 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
22302 - cpumask_copy(&p->cpus_allowed, new_mask);
22303 + cpumask_copy(&p->cpus_mask, new_mask);
22304 p->nr_cpus_allowed = cpumask_weight(new_mask);
22307 -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22308 +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
22309 +int __migrate_disabled(struct task_struct *p)
22311 + return p->migrate_disable;
22315 +static void __do_set_cpus_allowed_tail(struct task_struct *p,
22316 + const struct cpumask *new_mask)
22318 struct rq *rq = task_rq(p);
22319 bool queued, running;
22320 @@ -1071,6 +1144,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22321 set_curr_task(rq, p);
22324 +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22326 +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
22327 + if (__migrate_disabled(p)) {
22328 + lockdep_assert_held(&p->pi_lock);
22330 + cpumask_copy(&p->cpus_mask, new_mask);
22331 + p->migrate_disable_update = 1;
22335 + __do_set_cpus_allowed_tail(p, new_mask);
22339 * Change a given task's CPU affinity. Migrate the thread to a
22340 * proper CPU and schedule it away if the CPU it's executing on
22341 @@ -1108,7 +1195,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
22345 - if (cpumask_equal(&p->cpus_allowed, new_mask))
22346 + if (cpumask_equal(p->cpus_ptr, new_mask))
22349 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
22350 @@ -1129,9 +1216,16 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
22353 /* Can the task run on the task's current CPU? If so, we're done */
22354 - if (cpumask_test_cpu(task_cpu(p), new_mask))
22355 + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
22358 +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
22359 + if (__migrate_disabled(p)) {
22360 + p->migrate_disable_update = 1;
22365 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
22366 if (task_running(rq, p) || p->state == TASK_WAKING) {
22367 struct migration_arg arg = { p, dest_cpu };
22368 @@ -1269,10 +1363,10 @@ static int migrate_swap_stop(void *data)
22369 if (task_cpu(arg->src_task) != arg->src_cpu)
22372 - if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
22373 + if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
22376 - if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
22377 + if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
22380 __migrate_swap_task(arg->src_task, arg->dst_cpu);
22381 @@ -1313,10 +1407,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
22382 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
22385 - if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
22386 + if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
22389 - if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
22390 + if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
22393 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
22394 @@ -1326,6 +1420,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
22398 +static bool check_task_state(struct task_struct *p, long match_state)
22400 + bool match = false;
22402 + raw_spin_lock_irq(&p->pi_lock);
22403 + if (p->state == match_state || p->saved_state == match_state)
22405 + raw_spin_unlock_irq(&p->pi_lock);
22411 * wait_task_inactive - wait for a thread to unschedule.
22413 @@ -1370,7 +1476,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22414 * is actually now running somewhere else!
22416 while (task_running(rq, p)) {
22417 - if (match_state && unlikely(p->state != match_state))
22418 + if (match_state && !check_task_state(p, match_state))
22422 @@ -1385,7 +1491,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22423 running = task_running(rq, p);
22424 queued = task_on_rq_queued(p);
22426 - if (!match_state || p->state == match_state)
22427 + if (!match_state || p->state == match_state ||
22428 + p->saved_state == match_state)
22429 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
22430 task_rq_unlock(rq, p, &rf);
22432 @@ -1460,7 +1567,7 @@ void kick_process(struct task_struct *p)
22433 EXPORT_SYMBOL_GPL(kick_process);
22436 - * ->cpus_allowed is protected by both rq->lock and p->pi_lock
22437 + * ->cpus_ptr is protected by both rq->lock and p->pi_lock
22439 * A few notes on cpu_active vs cpu_online:
22441 @@ -1500,14 +1607,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
22442 for_each_cpu(dest_cpu, nodemask) {
22443 if (!cpu_active(dest_cpu))
22445 - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
22446 + if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
22452 /* Any allowed, online CPU? */
22453 - for_each_cpu(dest_cpu, &p->cpus_allowed) {
22454 + for_each_cpu(dest_cpu, p->cpus_ptr) {
22455 if (!is_cpu_allowed(p, dest_cpu))
22458 @@ -1551,7 +1658,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
22462 - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
22463 + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
22466 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22467 @@ -1561,11 +1668,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22468 if (p->nr_cpus_allowed > 1)
22469 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22471 - cpu = cpumask_any(&p->cpus_allowed);
22472 + cpu = cpumask_any(p->cpus_ptr);
22475 * In order not to call set_task_cpu() on a blocking task we need
22476 - * to rely on ttwu() to place the task on a valid ->cpus_allowed
22477 + * to rely on ttwu() to place the task on a valid ->cpus_ptr
22480 * Since this is common to all placement strategies, this lives here.
22481 @@ -1668,10 +1775,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
22483 activate_task(rq, p, en_flags);
22484 p->on_rq = TASK_ON_RQ_QUEUED;
22486 - /* If a worker is waking up, notify the workqueue: */
22487 - if (p->flags & PF_WQ_WORKER)
22488 - wq_worker_waking_up(p, cpu_of(rq));
22492 @@ -1995,8 +2098,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
22494 raw_spin_lock_irqsave(&p->pi_lock, flags);
22495 smp_mb__after_spinlock();
22496 - if (!(p->state & state))
22497 + if (!(p->state & state)) {
22499 + * The task might be running due to a spinlock sleeper
22500 + * wakeup. Check the saved state and set it to running
22501 + * if the wakeup condition is true.
22503 + if (!(wake_flags & WF_LOCK_SLEEPER)) {
22504 + if (p->saved_state & state) {
22505 + p->saved_state = TASK_RUNNING;
22513 + * If this is a regular wakeup, then we can unconditionally
22514 + * clear the saved state of a "lock sleeper".
22516 + if (!(wake_flags & WF_LOCK_SLEEPER))
22517 + p->saved_state = TASK_RUNNING;
22519 trace_sched_waking(p);
22521 @@ -2092,56 +2214,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
22526 - * try_to_wake_up_local - try to wake up a local task with rq lock held
22527 - * @p: the thread to be awakened
22528 - * @rf: request-queue flags for pinning
22530 - * Put @p on the run-queue if it's not already there. The caller must
22531 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
22532 - * the current task.
22534 -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
22536 - struct rq *rq = task_rq(p);
22538 - if (WARN_ON_ONCE(rq != this_rq()) ||
22539 - WARN_ON_ONCE(p == current))
22542 - lockdep_assert_held(&rq->lock);
22544 - if (!raw_spin_trylock(&p->pi_lock)) {
22546 - * This is OK, because current is on_cpu, which avoids it being
22547 - * picked for load-balance and preemption/IRQs are still
22548 - * disabled avoiding further scheduler activity on it and we've
22549 - * not yet picked a replacement task.
22551 - rq_unlock(rq, rf);
22552 - raw_spin_lock(&p->pi_lock);
22553 - rq_relock(rq, rf);
22556 - if (!(p->state & TASK_NORMAL))
22559 - trace_sched_waking(p);
22561 - if (!task_on_rq_queued(p)) {
22562 - if (p->in_iowait) {
22563 - delayacct_blkio_end(p);
22564 - atomic_dec(&rq->nr_iowait);
22566 - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
22569 - ttwu_do_wakeup(rq, p, 0, rf);
22570 - ttwu_stat(p, smp_processor_id(), 0);
22572 - raw_spin_unlock(&p->pi_lock);
22576 * wake_up_process - Wake up a specific process
22577 * @p: The process to be woken up.
22578 @@ -2160,6 +2232,18 @@ int wake_up_process(struct task_struct *p)
22580 EXPORT_SYMBOL(wake_up_process);
22583 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
22584 + * @p: The process to be woken up.
22586 + * Same as wake_up_process() above, but passes wake_flags=WF_LOCK_SLEEPER
22587 + * to indicate the nature of the wakeup.
22589 +int wake_up_lock_sleeper(struct task_struct *p)
22591 + return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
22594 int wake_up_state(struct task_struct *p, unsigned int state)
22596 return try_to_wake_up(p, state, 0);
22597 @@ -2420,6 +2504,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
22600 init_task_preempt_count(p);
22601 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
22602 + task_thread_info(p)->preempt_lazy_count = 0;
22605 plist_node_init(&p->pushable_tasks, MAX_PRIO);
22606 RB_CLEAR_NODE(&p->pushable_dl_tasks);
22607 @@ -2462,7 +2549,7 @@ void wake_up_new_task(struct task_struct *p)
22610 * Fork balancing, do it here and not earlier because:
22611 - * - cpus_allowed can change in the fork path
22612 + * - cpus_ptr can change in the fork path
22613 * - any previously selected CPU might disappear through hotplug
22615 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
22616 @@ -2675,21 +2762,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
22617 finish_arch_post_lock_switch();
22619 fire_sched_in_preempt_notifiers(current);
22621 + * We use mmdrop_delayed() here so we don't have to do the
22622 + * full __mmdrop() when we are the last user.
22626 + mmdrop_delayed(mm);
22627 if (unlikely(prev_state == TASK_DEAD)) {
22628 if (prev->sched_class->task_dead)
22629 prev->sched_class->task_dead(prev);
22632 - * Remove function-return probe instances associated with this
22633 - * task and put them back on the free list.
22635 - kprobe_flush_task(prev);
22637 - /* Task is done with its stack. */
22638 - put_task_stack(prev);
22640 put_task_struct(prev);
22643 @@ -3336,25 +3418,13 @@ static void __sched notrace __schedule(bool preempt)
22644 atomic_inc(&rq->nr_iowait);
22645 delayacct_blkio_start();
22649 - * If a worker went to sleep, notify and ask workqueue
22650 - * whether it wants to wake up a task to maintain
22653 - if (prev->flags & PF_WQ_WORKER) {
22654 - struct task_struct *to_wakeup;
22656 - to_wakeup = wq_worker_sleeping(prev);
22658 - try_to_wake_up_local(to_wakeup, &rf);
22661 switch_count = &prev->nvcsw;
22664 next = pick_next_task(rq, prev, &rf);
22665 clear_tsk_need_resched(prev);
22666 + clear_tsk_need_resched_lazy(prev);
22667 clear_preempt_need_resched();
22669 if (likely(prev != next)) {
22670 @@ -3407,8 +3477,24 @@ void __noreturn do_task_dead(void)
22672 static inline void sched_submit_work(struct task_struct *tsk)
22674 - if (!tsk->state || tsk_is_pi_blocked(tsk))
22678 + * If a worker went to sleep, notify and ask workqueue whether
22679 + * it wants to wake up a task to maintain concurrency.
22680 + * As this function is called inside the schedule() context,
22681 + * we disable preemption to avoid it calling schedule() again
22682 + * in the possible wakeup of a kworker.
22684 + if (tsk->flags & PF_WQ_WORKER) {
22685 + preempt_disable();
22686 + wq_worker_sleeping(tsk);
22687 + preempt_enable_no_resched();
22690 + if (tsk_is_pi_blocked(tsk))
22694 * If we are going to sleep and we have plugged IO queued,
22695 * make sure to submit it to avoid deadlocks.
22696 @@ -3417,6 +3503,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
22697 blk_schedule_flush_plug(tsk);
22700 +static void sched_update_worker(struct task_struct *tsk)
22702 + if (tsk->flags & PF_WQ_WORKER)
22703 + wq_worker_running(tsk);
22706 asmlinkage __visible void __sched schedule(void)
22708 struct task_struct *tsk = current;
22709 @@ -3427,6 +3519,7 @@ asmlinkage __visible void __sched schedule(void)
22711 sched_preempt_enable_no_resched();
22712 } while (need_resched());
22713 + sched_update_worker(tsk);
22715 EXPORT_SYMBOL(schedule);
22717 @@ -3515,6 +3608,30 @@ static void __sched notrace preempt_schedule_common(void)
22718 } while (need_resched());
22721 +#ifdef CONFIG_PREEMPT_LAZY
22723 + * If TIF_NEED_RESCHED is set then we allow being scheduled away, since it is
22724 + * set by an RT task. Otherwise we try to avoid being scheduled out as long as
22725 + * the preempt_lazy_count counter is > 0.
22727 +static __always_inline int preemptible_lazy(void)
22729 + if (test_thread_flag(TIF_NEED_RESCHED))
22731 + if (current_thread_info()->preempt_lazy_count)
22738 +static inline int preemptible_lazy(void)
22745 #ifdef CONFIG_PREEMPT
22747 * this is the entry point to schedule() from in-kernel preemption
22748 @@ -3529,7 +3646,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
22750 if (likely(!preemptible()))
22753 + if (!preemptible_lazy())
22755 preempt_schedule_common();
22757 NOKPROBE_SYMBOL(preempt_schedule);
22758 @@ -3556,6 +3674,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
22759 if (likely(!preemptible()))
22762 + if (!preemptible_lazy())
22767 * Because the function tracer can trace preempt_count_sub()
22768 @@ -3578,7 +3699,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
22769 * an infinite recursion.
22771 prev_ctx = exception_enter();
22773 + * The add/subtract must not be traced by the function
22774 + * tracer. But we still want to account for the
22775 + * preempt off latency tracer. Since the _notrace versions
22776 + * of add/subtract skip the accounting for the latency tracer
22777 + * we must force it manually.
22779 + start_critical_timings();
22781 + stop_critical_timings();
22782 exception_exit(prev_ctx);
22784 preempt_latency_stop(1);
22785 @@ -4164,7 +4294,7 @@ static int __sched_setscheduler(struct task_struct *p,
22786 * the entire root_domain to become SCHED_DEADLINE. We
22787 * will also fail if there's no bandwidth available.
22789 - if (!cpumask_subset(span, &p->cpus_allowed) ||
22790 + if (!cpumask_subset(span, p->cpus_ptr) ||
22791 rq->rd->dl_bw.bw == 0) {
22792 task_rq_unlock(rq, p, &rf);
22794 @@ -4758,7 +4888,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
22797 raw_spin_lock_irqsave(&p->pi_lock, flags);
22798 - cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
22799 + cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
22800 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
22803 @@ -4877,6 +5007,7 @@ int __cond_resched_lock(spinlock_t *lock)
22805 EXPORT_SYMBOL(__cond_resched_lock);
22807 +#ifndef CONFIG_PREEMPT_RT_FULL
22808 int __sched __cond_resched_softirq(void)
22810 BUG_ON(!in_softirq());
22811 @@ -4890,6 +5021,7 @@ int __sched __cond_resched_softirq(void)
22814 EXPORT_SYMBOL(__cond_resched_softirq);
22818 * yield - yield the current processor to other threads.
22819 @@ -5284,7 +5416,9 @@ void init_idle(struct task_struct *idle, int cpu)
22821 /* Set the preempt count _outside_ the spinlocks! */
22822 init_idle_preempt_count(idle, cpu);
22824 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
22825 + task_thread_info(idle)->preempt_lazy_count = 0;
22828 * The idle tasks have their own, simple scheduling class:
22830 @@ -5323,7 +5457,7 @@ int task_can_attach(struct task_struct *p,
22831 * allowed nodes is unnecessary. Thus, cpusets are not
22832 * applicable for such threads. This prevents checking for
22833 * success of set_cpus_allowed_ptr() on all attached tasks
22834 - * before cpus_allowed may be changed.
22835 + * before cpus_mask may be changed.
22837 if (p->flags & PF_NO_SETAFFINITY) {
22839 @@ -5350,7 +5484,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
22840 if (curr_cpu == target_cpu)
22843 - if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
22844 + if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
22847 /* TODO: This is not properly updating schedstats */
22848 @@ -5389,6 +5523,8 @@ void sched_setnuma(struct task_struct *p, int nid)
22849 #endif /* CONFIG_NUMA_BALANCING */
22851 #ifdef CONFIG_HOTPLUG_CPU
22852 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
22855 * Ensure that the idle task is using init_mm right before its CPU goes
22857 @@ -5403,7 +5539,12 @@ void idle_task_exit(void)
22858 switch_mm(mm, &init_mm, current);
22859 finish_arch_post_lock_switch();
22863 + * Defer the cleanup to a live CPU. On RT we can neither
22864 + * call mmdrop() nor mmdrop_delayed() from here.
22866 + per_cpu(idle_last_mm, smp_processor_id()) = mm;
22871 @@ -5487,7 +5628,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
22872 put_prev_task(rq, next);
22875 - * Rules for changing task_struct::cpus_allowed are holding
22876 + * Rules for changing task_struct::cpus_mask are holding
22877 * both pi_lock and rq->lock, such that holding either
22878 * stabilizes the mask.
22880 @@ -5718,6 +5859,10 @@ int sched_cpu_dying(unsigned int cpu)
22881 update_max_interval();
22882 nohz_balance_exit_idle(cpu);
22884 + if (per_cpu(idle_last_mm, cpu)) {
22885 + mmdrop_delayed(per_cpu(idle_last_mm, cpu));
22886 + per_cpu(idle_last_mm, cpu) = NULL;
22891 @@ -5964,7 +6109,7 @@ void __init sched_init(void)
22892 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
22893 static inline int preempt_count_equals(int preempt_offset)
22895 - int nested = preempt_count() + rcu_preempt_depth();
22896 + int nested = preempt_count() + sched_rcu_preempt_depth();
22898 return (nested == preempt_offset);
22900 @@ -6756,3 +6901,196 @@ const u32 sched_prio_to_wmult[40] = {
22901 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
22902 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
22905 +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
22907 +static inline void
22908 +update_nr_migratory(struct task_struct *p, long delta)
22910 + if (unlikely((p->sched_class == &rt_sched_class ||
22911 + p->sched_class == &dl_sched_class) &&
22912 + p->nr_cpus_allowed > 1)) {
22913 + if (p->sched_class == &rt_sched_class)
22914 + task_rq(p)->rt.rt_nr_migratory += delta;
22916 + task_rq(p)->dl.dl_nr_migratory += delta;
22920 +static inline void
22921 +migrate_disable_update_cpus_allowed(struct task_struct *p)
22924 + struct rq_flags rf;
22926 + p->cpus_ptr = cpumask_of(smp_processor_id());
22928 + rq = task_rq_lock(p, &rf);
22929 + update_nr_migratory(p, -1);
22930 + p->nr_cpus_allowed = 1;
22931 + task_rq_unlock(rq, p, &rf);
22934 +static inline void
22935 +migrate_enable_update_cpus_allowed(struct task_struct *p)
22938 + struct rq_flags rf;
22940 + p->cpus_ptr = &p->cpus_mask;
22942 + rq = task_rq_lock(p, &rf);
22943 + p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
22944 + update_nr_migratory(p, 1);
22945 + task_rq_unlock(rq, p, &rf);
22948 +void migrate_disable(void)
22950 + struct task_struct *p = current;
22952 + if (in_atomic() || irqs_disabled()) {
22953 +#ifdef CONFIG_SCHED_DEBUG
22954 + p->migrate_disable_atomic++;
22958 +#ifdef CONFIG_SCHED_DEBUG
22959 + if (unlikely(p->migrate_disable_atomic)) {
22965 + if (p->migrate_disable) {
22966 + p->migrate_disable++;
22970 + preempt_disable();
22971 + preempt_lazy_disable();
22972 + pin_current_cpu();
22974 + migrate_disable_update_cpus_allowed(p);
22975 + p->migrate_disable = 1;
22977 + preempt_enable();
22979 +EXPORT_SYMBOL(migrate_disable);
22981 +void migrate_enable(void)
22983 + struct task_struct *p = current;
22985 + if (in_atomic() || irqs_disabled()) {
22986 +#ifdef CONFIG_SCHED_DEBUG
22987 + p->migrate_disable_atomic--;
22992 +#ifdef CONFIG_SCHED_DEBUG
22993 + if (unlikely(p->migrate_disable_atomic)) {
22999 + WARN_ON_ONCE(p->migrate_disable <= 0);
23000 + if (p->migrate_disable > 1) {
23001 + p->migrate_disable--;
23005 + preempt_disable();
23007 + p->migrate_disable = 0;
23008 + migrate_enable_update_cpus_allowed(p);
23010 + if (p->migrate_disable_update) {
23012 + struct rq_flags rf;
23014 + rq = task_rq_lock(p, &rf);
23015 + update_rq_clock(rq);
23017 + __do_set_cpus_allowed_tail(p, &p->cpus_mask);
23018 + task_rq_unlock(rq, p, &rf);
23020 + p->migrate_disable_update = 0;
23022 + WARN_ON(smp_processor_id() != task_cpu(p));
23023 + if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
23024 + const struct cpumask *cpu_valid_mask = cpu_active_mask;
23025 + struct migration_arg arg;
23026 + unsigned int dest_cpu;
23028 + if (p->flags & PF_KTHREAD) {
23030 + * Kernel threads are allowed on online && !active CPUs
23032 + cpu_valid_mask = cpu_online_mask;
23034 + dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_mask);
23036 + arg.dest_cpu = dest_cpu;
23038 + unpin_current_cpu();
23039 + preempt_lazy_enable();
23040 + preempt_enable();
23041 + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
23042 + tlb_migrate_finish(p->mm);
23047 + unpin_current_cpu();
23048 + preempt_lazy_enable();
23049 + preempt_enable();
23051 +EXPORT_SYMBOL(migrate_enable);
23053 +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
23054 +void migrate_disable(void)
23056 +#ifdef CONFIG_SCHED_DEBUG
23057 + struct task_struct *p = current;
23059 + if (in_atomic() || irqs_disabled()) {
23060 + p->migrate_disable_atomic++;
23064 + if (unlikely(p->migrate_disable_atomic)) {
23069 + p->migrate_disable++;
23073 +EXPORT_SYMBOL(migrate_disable);
23075 +void migrate_enable(void)
23077 +#ifdef CONFIG_SCHED_DEBUG
23078 + struct task_struct *p = current;
23080 + if (in_atomic() || irqs_disabled()) {
23081 + p->migrate_disable_atomic--;
23085 + if (unlikely(p->migrate_disable_atomic)) {
23090 + WARN_ON_ONCE(p->migrate_disable <= 0);
23091 + p->migrate_disable--;
23095 +EXPORT_SYMBOL(migrate_enable);
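migrate_disable()/migrate_enable() above pin the current task to its CPU
(cpus_ptr is pointed at a single-CPU mask) while leaving it preemptible,
unlike preempt_disable(). A usage sketch (the demo_* names are hypothetical):

    struct demo_stat {
            unsigned long count;
    };
    static DEFINE_PER_CPU(struct demo_stat, demo_stats);

    static void demo_percpu_update(void)
    {
            struct demo_stat *s;

            migrate_disable();               /* pinned, but still preemptible */
            s = this_cpu_ptr(&demo_stats);
            s->count++;                      /* no migration; sleeping locks OK */
            migrate_enable();                /* unpin; a deferred affinity change
                                              * (migrate_disable_update) is
                                              * applied here if one was queued */
    }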
23097 diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
23098 index 8d9562d890d3..91a0702fe3df 100644
23099 --- a/kernel/sched/cpudeadline.c
23100 +++ b/kernel/sched/cpudeadline.c
23101 @@ -127,13 +127,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
23102 const struct sched_dl_entity *dl_se = &p->dl;
23105 - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
23106 + cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
23109 int best_cpu = cpudl_maximum(cp);
23110 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
23112 - if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
23113 + if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
23114 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
23116 cpumask_set_cpu(best_cpu, later_mask);
23117 diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
23118 index 2511aba36b89..7b9bc1de0e6c 100644
23119 --- a/kernel/sched/cpupri.c
23120 +++ b/kernel/sched/cpupri.c
23121 @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
23125 - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
23126 + if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
23130 - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
23131 + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
23134 * We have to ensure that we have at least one bit
23135 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
23136 index b2589c7e9439..28a75a9526ac 100644
23137 --- a/kernel/sched/deadline.c
23138 +++ b/kernel/sched/deadline.c
23139 @@ -504,7 +504,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
23140 * If we cannot preempt any rq, fall back to pick any
23143 - cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
23144 + cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
23145 if (cpu >= nr_cpu_ids) {
23147 * Fail to find any suitable cpu.
23148 @@ -1020,7 +1020,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
23150 struct hrtimer *timer = &dl_se->dl_timer;
23152 - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23153 + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
23154 timer->function = dl_task_timer;
23157 @@ -1753,7 +1753,7 @@ static void set_curr_task_dl(struct rq *rq)
23158 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
23160 if (!task_running(rq, p) &&
23161 - cpumask_test_cpu(cpu, &p->cpus_allowed))
23162 + cpumask_test_cpu(cpu, p->cpus_ptr))
23166 @@ -1903,7 +1903,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
23167 /* Retry if something changed. */
23168 if (double_lock_balance(rq, later_rq)) {
23169 if (unlikely(task_rq(task) != rq ||
23170 - !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
23171 + !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
23172 task_running(rq, task) ||
23174 !task_on_rq_queued(task))) {
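The HRTIMER_MODE_*_HARD conversions in this file (and in core.c above and
fair.c/rt.c below) mark timers whose handlers must keep expiring in hard-irq
context on RT instead of being deferred to the softirq-based timer handling
this series introduces. A sketch of the pattern, with demo_* as hypothetical
names:

    static struct hrtimer demo_timer;

    static enum hrtimer_restart demo_fn(struct hrtimer *t)
    {
            /* hard-irq context on RT as well: only raw locks allowed here */
            return HRTIMER_NORESTART;
    }

    static void demo_timer_setup(void)
    {
            hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
            demo_timer.function = demo_fn;
    }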
23175 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
23176 index 2f93e4a2d9f6..b5b43861c2b6 100644
23177 --- a/kernel/sched/debug.c
23178 +++ b/kernel/sched/debug.c
23179 @@ -1017,6 +1017,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
23183 +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
23184 + P(migrate_disable);
23186 + P(nr_cpus_allowed);
23187 #undef PN_SCHEDSTAT
23190 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
23191 index b2d699f28304..20e7d867af7a 100644
23192 --- a/kernel/sched/fair.c
23193 +++ b/kernel/sched/fair.c
23194 @@ -1598,7 +1598,7 @@ static void task_numa_compare(struct task_numa_env *env,
23197 /* Skip this swap candidate if cannot move to the source cpu */
23198 - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
23199 + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
23203 @@ -1708,7 +1708,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
23205 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
23206 /* Skip this CPU if the source task cannot migrate */
23207 - if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
23208 + if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
23211 env->dst_cpu = cpu;
23212 @@ -3842,7 +3842,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
23213 ideal_runtime = sched_slice(cfs_rq, curr);
23214 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
23215 if (delta_exec > ideal_runtime) {
23216 - resched_curr(rq_of(cfs_rq));
23217 + resched_curr_lazy(rq_of(cfs_rq));
23219 * The current task ran long enough, ensure it doesn't get
23220 * re-elected due to buddy favours.
23221 @@ -3866,7 +3866,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
23224 if (delta > ideal_runtime)
23225 - resched_curr(rq_of(cfs_rq));
23226 + resched_curr_lazy(rq_of(cfs_rq));
23230 @@ -4008,7 +4008,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
23231 * validating it and just reschedule.
23234 - resched_curr(rq_of(cfs_rq));
23235 + resched_curr_lazy(rq_of(cfs_rq));
23239 @@ -4190,7 +4190,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
23240 * hierarchy can be throttled
23242 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
23243 - resched_curr(rq_of(cfs_rq));
23244 + resched_curr_lazy(rq_of(cfs_rq));
23247 static __always_inline
23248 @@ -4686,9 +4686,9 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
23249 cfs_b->period = ns_to_ktime(default_cfs_period());
23251 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
23252 - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
23253 + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
23254 cfs_b->period_timer.function = sched_cfs_period_timer;
23255 - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23256 + hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
23257 cfs_b->slack_timer.function = sched_cfs_slack_timer;
23260 @@ -4839,7 +4839,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
23264 - resched_curr(rq);
23265 + resched_curr_lazy(rq);
23268 hrtick_start(rq, delta);
23269 @@ -5477,7 +5477,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
23271 /* Skip over this group if it has no CPUs allowed */
23272 if (!cpumask_intersects(sched_group_span(group),
23273 - &p->cpus_allowed))
23277 local_group = cpumask_test_cpu(this_cpu,
23278 @@ -5597,7 +5597,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
23279 return cpumask_first(sched_group_span(group));
23281 /* Traverse only the allowed CPUs */
23282 - for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
23283 + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
23285 struct rq *rq = cpu_rq(i);
23286 struct cpuidle_state *idle = idle_get_state(rq);
23287 @@ -5700,7 +5700,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
23288 if (!test_idle_cores(target, false))
23291 - cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
23292 + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
23294 for_each_cpu_wrap(core, cpus, target) {
23296 @@ -5734,7 +5734,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
23299 for_each_cpu(cpu, cpu_smt_mask(target)) {
23300 - if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
23301 + if (!cpumask_test_cpu(cpu, p->cpus_ptr))
23305 @@ -5797,7 +5797,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
23306 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
23309 - if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
23310 + if (!cpumask_test_cpu(cpu, p->cpus_ptr))
23314 @@ -5952,7 +5952,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
23315 if (sd_flag & SD_BALANCE_WAKE) {
23317 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
23318 - && cpumask_test_cpu(cpu, &p->cpus_allowed);
23319 + && cpumask_test_cpu(cpu, p->cpus_ptr);
23323 @@ -6233,7 +6233,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
23327 - resched_curr(rq);
23328 + resched_curr_lazy(rq);
23330 * Only set the backward buddy when the current task is still
23331 * on the rq. This can happen when a wakeup gets interleaved
23332 @@ -6701,14 +6701,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
23334 * We do not migrate tasks that are:
23335 * 1) throttled_lb_pair, or
23336 - * 2) cannot be migrated to this CPU due to cpus_allowed, or
23337 + * 2) cannot be migrated to this CPU due to cpus_ptr, or
23338 * 3) running (obviously), or
23339 * 4) are cache-hot on their current CPU.
23341 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
23344 - if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
23345 + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
23348 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
23349 @@ -6728,7 +6728,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
23351 /* Prevent to re-select dst_cpu via env's cpus */
23352 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
23353 - if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
23354 + if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
23355 env->flags |= LBF_DST_PINNED;
23356 env->new_dst_cpu = cpu;
23358 @@ -7297,7 +7297,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
23361 * Group imbalance indicates (and tries to solve) the problem where balancing
23362 - * groups is inadequate due to ->cpus_allowed constraints.
23363 + * groups is inadequate due to ->cpus_ptr constraints.
23365 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
23366 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
23367 @@ -7873,7 +7873,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
23369 * If the busiest group is imbalanced the below checks don't
23370 * work because they assume all things are equal, which typically
23371 - * isn't true due to cpus_allowed constraints and the like.
23372 + * isn't true due to cpus_ptr constraints and the like.
23374 if (busiest->group_type == group_imbalanced)
23375 goto force_balance;
23376 @@ -8265,7 +8265,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
23377 * if the curr task on busiest cpu can't be
23378 * moved to this_cpu
23380 - if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
23381 + if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
23382 raw_spin_unlock_irqrestore(&busiest->lock,
23384 env.flags |= LBF_ALL_PINNED;
23385 @@ -9087,7 +9087,7 @@ static void task_fork_fair(struct task_struct *p)
23386 * 'current' within the tree based on its new key value.
23388 swap(curr->vruntime, se->vruntime);
23389 - resched_curr(rq);
23390 + resched_curr_lazy(rq);
23393 se->vruntime -= cfs_rq->min_vruntime;
23394 @@ -9111,7 +9111,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
23396 if (rq->curr == p) {
23397 if (p->prio > oldprio)
23398 - resched_curr(rq);
23399 + resched_curr_lazy(rq);
23401 check_preempt_curr(rq, p, 0);
23403 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
23404 index 9552fd5854bf..fb069998b518 100644
23405 --- a/kernel/sched/features.h
23406 +++ b/kernel/sched/features.h
23407 @@ -46,11 +46,19 @@ SCHED_FEAT(LB_BIAS, true)
23409 SCHED_FEAT(NONTASK_CAPACITY, true)
23411 +#ifdef CONFIG_PREEMPT_RT_FULL
23412 +SCHED_FEAT(TTWU_QUEUE, false)
23413 +# ifdef CONFIG_PREEMPT_LAZY
23414 +SCHED_FEAT(PREEMPT_LAZY, true)
23419 * Queue remote wakeups on the target CPU and process them
23420 * using the scheduler IPI. Reduces rq->lock contention/bounces.
23422 SCHED_FEAT(TTWU_QUEUE, true)
23426 * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
23427 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
23428 index cb9a5b8532fa..6c72332dab3f 100644
23429 --- a/kernel/sched/rt.c
23430 +++ b/kernel/sched/rt.c
23431 @@ -47,8 +47,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
23433 raw_spin_lock_init(&rt_b->rt_runtime_lock);
23435 - hrtimer_init(&rt_b->rt_period_timer,
23436 - CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23437 + hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
23438 + HRTIMER_MODE_REL_HARD);
23439 rt_b->rt_period_timer.function = sched_rt_period_timer;
23442 @@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
23443 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
23445 if (!task_running(rq, p) &&
23446 - cpumask_test_cpu(cpu, &p->cpus_allowed))
23447 + cpumask_test_cpu(cpu, p->cpus_ptr))
23451 @@ -1731,7 +1731,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
23452 * Also make sure that it wasn't scheduled on its rq.
23454 if (unlikely(task_rq(task) != rq ||
23455 - !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
23456 + !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
23457 task_running(rq, task) ||
23459 !task_on_rq_queued(task))) {
23460 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
23461 index b29376169f3f..96481980c8c7 100644
23462 --- a/kernel/sched/sched.h
23463 +++ b/kernel/sched/sched.h
23464 @@ -1354,6 +1354,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
23465 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
23466 #define WF_FORK 0x02 /* child wakeup after fork */
23467 #define WF_MIGRATED 0x4 /* internal use, task got migrated */
23468 +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
23471 * To aid in avoiding the subversion of "niceness" due to uneven distribution
23472 @@ -1545,6 +1546,15 @@ extern void init_sched_fair_class(void);
23473 extern void resched_curr(struct rq *rq);
23474 extern void resched_cpu(int cpu);
23476 +#ifdef CONFIG_PREEMPT_LAZY
23477 +extern void resched_curr_lazy(struct rq *rq);
23479 +static inline void resched_curr_lazy(struct rq *rq)
23481 + resched_curr(rq);
23485 extern struct rt_bandwidth def_rt_bandwidth;
23486 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
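resched_curr_lazy() relies on a TIF_NEED_RESCHED_LAZY thread flag plus
set/clear/test helpers that the preempt-lazy part of this series adds next to
the regular TIF_NEED_RESCHED ones; a sketch of those helpers under that
assumption:

    static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
    {
            set_tsk_thread_flag(tsk, TIF_NEED_RESCHED_LAZY);
    }

    static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
    {
            clear_tsk_thread_flag(tsk, TIF_NEED_RESCHED_LAZY);
    }

    static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
    {
            return unlikely(test_tsk_thread_flag(tsk, TIF_NEED_RESCHED_LAZY));
    }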
23488 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
23489 index 9ff1555341ed..b14638a05ec9 100644
23490 --- a/kernel/sched/swait.c
23491 +++ b/kernel/sched/swait.c
23493 // SPDX-License-Identifier: GPL-2.0
23494 #include <linux/sched/signal.h>
23495 #include <linux/swait.h>
23496 +#include <linux/suspend.h>
23498 void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
23499 struct lock_class_key *key)
23500 @@ -30,6 +31,25 @@ void swake_up_locked(struct swait_queue_head *q)
23502 EXPORT_SYMBOL(swake_up_locked);
23504 +void swake_up_all_locked(struct swait_queue_head *q)
23506 + struct swait_queue *curr;
23509 + while (!list_empty(&q->task_list)) {
23511 + curr = list_first_entry(&q->task_list, typeof(*curr),
23513 + wake_up_process(curr->task);
23514 + list_del_init(&curr->task_list);
23517 + if (pm_in_action)
23519 + WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
23521 +EXPORT_SYMBOL(swake_up_all_locked);
23523 void swake_up(struct swait_queue_head *q)
23525 unsigned long flags;
23526 @@ -49,6 +69,7 @@ void swake_up_all(struct swait_queue_head *q)
23527 struct swait_queue *curr;
23530 + WARN_ON(irqs_disabled());
23531 raw_spin_lock_irq(&q->lock);
23532 list_splice_init(&q->task_list, &tmp);
23533 while (!list_empty(&tmp)) {
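Several lines of swake_up_all_locked() were lost above; a reconstruction of
the whole function as it reads in the RT series (the 'wakes' counter feeding
the WARN is assumed from context):

    void swake_up_all_locked(struct swait_queue_head *q)
    {
            struct swait_queue *curr;
            int wakes = 0;

            while (!list_empty(&q->task_list)) {
                    curr = list_first_entry(&q->task_list, typeof(*curr),
                                            task_list);
                    wake_up_process(curr->task);
                    list_del_init(&curr->task_list);
                    wakes++;
            }
            if (pm_in_action)
                    return;
            WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
    }

The WARN exists because this runs with a raw spinlock held and interrupts
off: waking long waiter lists there is exactly the latency RT tries to avoid,
so complete_all() callers with many waiters get flagged.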
23534 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
23535 new file mode 100644
23536 index 000000000000..1950f40ca725
23538 +++ b/kernel/sched/swork.c
23541 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
23543 + * Provides a PREEMPT_RT_FULL-safe framework for enqueuing callbacks from
23544 + * irq context. The callbacks are executed in kthread context.
23547 +#include <linux/swait.h>
23548 +#include <linux/swork.h>
23549 +#include <linux/kthread.h>
23550 +#include <linux/slab.h>
23551 +#include <linux/spinlock.h>
23552 +#include <linux/export.h>
23554 +#define SWORK_EVENT_PENDING (1 << 0)
23556 +static DEFINE_MUTEX(worker_mutex);
23557 +static struct sworker *glob_worker;
23560 + struct list_head events;
23561 + struct swait_queue_head wq;
23563 + raw_spinlock_t lock;
23565 + struct task_struct *task;
23569 +static bool swork_readable(struct sworker *worker)
23573 + if (kthread_should_stop())
23576 + raw_spin_lock_irq(&worker->lock);
23577 + r = !list_empty(&worker->events);
23578 + raw_spin_unlock_irq(&worker->lock);
23583 +static int swork_kthread(void *arg)
23585 + struct sworker *worker = arg;
23588 + swait_event_interruptible(worker->wq,
23589 + swork_readable(worker));
23590 + if (kthread_should_stop())
23593 + raw_spin_lock_irq(&worker->lock);
23594 + while (!list_empty(&worker->events)) {
23595 + struct swork_event *sev;
23597 + sev = list_first_entry(&worker->events,
23598 + struct swork_event, item);
23599 + list_del(&sev->item);
23600 + raw_spin_unlock_irq(&worker->lock);
23602 + WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
23605 + raw_spin_lock_irq(&worker->lock);
23607 + raw_spin_unlock_irq(&worker->lock);
23612 +static struct sworker *swork_create(void)
23614 + struct sworker *worker;
23616 + worker = kzalloc(sizeof(*worker), GFP_KERNEL);
23618 + return ERR_PTR(-ENOMEM);
23620 + INIT_LIST_HEAD(&worker->events);
23621 + raw_spin_lock_init(&worker->lock);
23622 + init_swait_queue_head(&worker->wq);
23624 + worker->task = kthread_run(swork_kthread, worker, "kswork");
23625 + if (IS_ERR(worker->task)) {
23627 + return ERR_PTR(-ENOMEM);
23633 +static void swork_destroy(struct sworker *worker)
23635 + kthread_stop(worker->task);
23637 + WARN_ON(!list_empty(&worker->events));
23642 + * swork_queue - queue swork
23644 + * Returns %false if @sev was already on a queue, %true otherwise.
23646 + * The work is queued and processed on a random CPU
23648 +bool swork_queue(struct swork_event *sev)
23650 + unsigned long flags;
23652 + if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
23655 + raw_spin_lock_irqsave(&glob_worker->lock, flags);
23656 + list_add_tail(&sev->item, &glob_worker->events);
23657 + raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
23659 + swake_up(&glob_worker->wq);
23662 +EXPORT_SYMBOL_GPL(swork_queue);
23665 + * swork_get - get an instance of the sworker
23667 + * Returns a negative error code if the initialization of the worker
23668 + * failed, %0 otherwise.
23671 +int swork_get(void)
23673 + struct sworker *worker;
23675 + mutex_lock(&worker_mutex);
23676 + if (!glob_worker) {
23677 + worker = swork_create();
23678 + if (IS_ERR(worker)) {
23679 + mutex_unlock(&worker_mutex);
23683 + glob_worker = worker;
23686 + glob_worker->refs++;
23687 + mutex_unlock(&worker_mutex);
23691 +EXPORT_SYMBOL_GPL(swork_get);
23694 + * swork_put - puts an instance of the sworker
23696 + * Will destroy the sworker thread. This function must not be called until all
23697 + * queued events have been completed.
23699 +void swork_put(void)
23701 + mutex_lock(&worker_mutex);
23703 + glob_worker->refs--;
23704 + if (glob_worker->refs > 0)
23707 + swork_destroy(glob_worker);
23708 + glob_worker = NULL;
23710 + mutex_unlock(&worker_mutex);
23712 +EXPORT_SYMBOL_GPL(swork_put);
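A usage sketch for the swork API (INIT_SWORK and struct swork_event live in
linux/swork.h elsewhere in this series; the demo_* names are made up):

    #include <linux/swork.h>

    static void demo_cb(struct swork_event *sev)
    {
            pr_info("demo: running in kswork kthread context\n");
    }

    static struct swork_event demo_ev;

    static int demo_setup(void)
    {
            int err = swork_get();          /* create or ref the global worker */

            if (err)
                    return err;
            INIT_SWORK(&demo_ev, demo_cb);
            return 0;
    }

    static irqreturn_t demo_irq(int irq, void *data)
    {
            swork_queue(&demo_ev);          /* safe from hard-irq context */
            return IRQ_HANDLED;
    }

    static void demo_teardown(void)
    {
            swork_put();                    /* drop ref; destroyed on last put */
    }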
23713 diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
23714 index 659e075ef70b..bb22e3620a90 100644
23715 --- a/kernel/sched/topology.c
23716 +++ b/kernel/sched/topology.c
23717 @@ -286,6 +286,7 @@ static int init_rootdomain(struct root_domain *rd)
23719 raw_spin_lock_init(&rd->rto_lock);
23720 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
23721 + rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
23724 init_dl_bw(&rd->dl_bw);
23725 diff --git a/kernel/signal.c b/kernel/signal.c
23726 index 4439ba9dc5d9..d8f75a030292 100644
23727 --- a/kernel/signal.c
23728 +++ b/kernel/signal.c
23730 #include <linux/sched/task.h>
23731 #include <linux/sched/task_stack.h>
23732 #include <linux/sched/cputime.h>
23733 +#include <linux/sched/rt.h>
23734 #include <linux/fs.h>
23735 #include <linux/tty.h>
23736 #include <linux/binfmts.h>
23737 @@ -360,13 +361,30 @@ static bool task_participate_group_stop(struct task_struct *task)
23741 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
23743 + struct sigqueue *q = t->sigqueue_cache;
23745 + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
23750 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
23752 + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
23758 * allocate a new signal queue record
23759 * - this may be called without locks if and only if t == current, otherwise an
23760 * appropriate lock must be held to stop the target task from exiting
23762 static struct sigqueue *
23763 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
23764 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
23765 + int override_rlimit, int fromslab)
23767 struct sigqueue *q = NULL;
23768 struct user_struct *user;
23769 @@ -383,7 +401,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
23770 if (override_rlimit ||
23771 atomic_read(&user->sigpending) <=
23772 task_rlimit(t, RLIMIT_SIGPENDING)) {
23773 - q = kmem_cache_alloc(sigqueue_cachep, flags);
23775 + q = get_task_cache(t);
23777 + q = kmem_cache_alloc(sigqueue_cachep, flags);
23779 print_dropped_signal(sig);
23781 @@ -400,6 +421,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
23785 +static struct sigqueue *
23786 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
23787 + int override_rlimit)
23789 + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
23792 static void __sigqueue_free(struct sigqueue *q)
23794 if (q->flags & SIGQUEUE_PREALLOC)
23795 @@ -409,6 +437,21 @@ static void __sigqueue_free(struct sigqueue *q)
23796 kmem_cache_free(sigqueue_cachep, q);
23799 +static void sigqueue_free_current(struct sigqueue *q)
23801 + struct user_struct *up;
23803 + if (q->flags & SIGQUEUE_PREALLOC)
23807 + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
23808 + atomic_dec(&up->sigpending);
23811 + __sigqueue_free(q);
23814 void flush_sigqueue(struct sigpending *queue)
23816 struct sigqueue *q;
23817 @@ -421,6 +464,21 @@ void flush_sigqueue(struct sigpending *queue)
23822 + * Called from __exit_signal. Flush tsk->pending and
23823 + * tsk->sigqueue_cache
23825 +void flush_task_sigqueue(struct task_struct *tsk)
23827 + struct sigqueue *q;
23829 + flush_sigqueue(&tsk->pending);
23831 + q = get_task_cache(tsk);
23833 + kmem_cache_free(sigqueue_cachep, q);
23837 * Flush all pending signals for this kthread.
23839 @@ -542,7 +600,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
23840 (info->si_code == SI_TIMER) &&
23841 (info->si_sys_private);
23843 - __sigqueue_free(first);
23844 + sigqueue_free_current(first);
23847 * Ok, it wasn't in the queue. This must be
23848 @@ -578,6 +636,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
23849 bool resched_timer = false;
23852 + WARN_ON_ONCE(tsk != current);
23854 /* We only dequeue private signals from ourselves, we don't let
23855 * signalfd steal them
23857 @@ -1177,8 +1237,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
23858 * We don't want to have recursive SIGSEGV's etc, for example,
23859 * that is why we also clear SIGNAL_UNKILLABLE.
23862 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23864 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23866 unsigned long int flags;
23867 int ret, blocked, ignored;
23868 @@ -1207,6 +1267,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23872 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23875 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
23876 + * since it cannot enable preemption, and the signal code's spin_locks
23877 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
23878 + * send the signal on exit of the trap.
23880 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
23881 + if (in_atomic()) {
23882 + if (WARN_ON_ONCE(t != current))
23884 + if (WARN_ON_ONCE(t->forced_info.si_signo))
23887 + if (is_si_special(info)) {
23888 + WARN_ON_ONCE(info != SEND_SIG_PRIV);
23889 + t->forced_info.si_signo = sig;
23890 + t->forced_info.si_errno = 0;
23891 + t->forced_info.si_code = SI_KERNEL;
23892 + t->forced_info.si_pid = 0;
23893 + t->forced_info.si_uid = 0;
23895 + t->forced_info = *info;
23898 + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
23902 + return do_force_sig_info(sig, info, t);
23906 * Nuke all other threads in the group.
23908 @@ -1241,12 +1334,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
23909 * Disable interrupts early to avoid deadlocks.
23910 * See rcu_read_unlock() comment header for details.
23912 - local_irq_save(*flags);
23913 + local_irq_save_nort(*flags);
23915 sighand = rcu_dereference(tsk->sighand);
23916 if (unlikely(sighand == NULL)) {
23918 - local_irq_restore(*flags);
23919 + local_irq_restore_nort(*flags);
23923 @@ -1267,7 +1360,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
23925 spin_unlock(&sighand->siglock);
23927 - local_irq_restore(*flags);
23928 + local_irq_restore_nort(*flags);
23932 @@ -1514,7 +1607,8 @@ EXPORT_SYMBOL(kill_pid);
23934 struct sigqueue *sigqueue_alloc(void)
23936 - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
23937 + /* Preallocated sigqueue objects always come from the slabcache! */
23938 + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
23941 q->flags |= SIGQUEUE_PREALLOC;
23942 @@ -1888,15 +1982,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
23943 if (gstop_done && ptrace_reparented(current))
23944 do_notify_parent_cldstop(current, false, why);
23947 - * Don't want to allow preemption here, because
23948 - * sys_ptrace() needs this task to be inactive.
23950 - * XXX: implement read_unlock_no_resched().
23952 - preempt_disable();
23953 read_unlock(&tasklist_lock);
23954 - preempt_enable_no_resched();
23955 freezable_schedule();
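The per-task sigqueue cache above keeps one spare struct sigqueue per task,
handed back and forth with single cmpxchg() operations so no lock is needed;
only RT-priority tasks stash into it (see sigqueue_free_current()), which
keeps their signal hot path allocation-free. A minimal user-space model of
the one-slot cmpxchg cache (illustrative only, not kernel code):

    #include <stdatomic.h>
    #include <stddef.h>

    static _Atomic(void *) slot;            /* models t->sigqueue_cache */

    static void *cache_get(void)
    {
            void *q = atomic_load(&slot);

            /* take the cached object iff it is still in the slot */
            if (q && atomic_compare_exchange_strong(&slot, &q, NULL))
                    return q;
            return NULL;
    }

    static int cache_put(void *q)
    {
            void *expect = NULL;

            /* stash iff the slot is empty; caller frees q on failure */
            return atomic_compare_exchange_strong(&slot, &expect, q) ? 0 : -1;
    }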
23958 diff --git a/kernel/softirq.c b/kernel/softirq.c
23959 index a4c87cf27f9d..583c9ecf04e3 100644
23960 --- a/kernel/softirq.c
23961 +++ b/kernel/softirq.c
23962 @@ -21,11 +21,14 @@
23963 #include <linux/freezer.h>
23964 #include <linux/kthread.h>
23965 #include <linux/rcupdate.h>
23966 +#include <linux/delay.h>
23967 #include <linux/ftrace.h>
23968 #include <linux/smp.h>
23969 #include <linux/smpboot.h>
23970 #include <linux/tick.h>
23971 +#include <linux/locallock.h>
23972 #include <linux/irq.h>
23973 +#include <linux/sched/types.h>
23975 #define CREATE_TRACE_POINTS
23976 #include <trace/events/irq.h>
23977 @@ -56,12 +59,108 @@ EXPORT_SYMBOL(irq_stat);
23978 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
23980 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
23981 +#ifdef CONFIG_PREEMPT_RT_FULL
23982 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
23983 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
23986 const char * const softirq_to_name[NR_SOFTIRQS] = {
23987 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
23988 "TASKLET", "SCHED", "HRTIMER", "RCU"
23991 +#ifdef CONFIG_NO_HZ_COMMON
23992 +# ifdef CONFIG_PREEMPT_RT_FULL
23994 +struct softirq_runner {
23995 + struct task_struct *runner[NR_SOFTIRQS];
23998 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
24000 +static inline void softirq_set_runner(unsigned int sirq)
24002 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24004 + sr->runner[sirq] = current;
24007 +static inline void softirq_clr_runner(unsigned int sirq)
24009 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24011 + sr->runner[sirq] = NULL;
24015 + * On preempt-rt a softirq running context might be blocked on a
24016 + * lock. There might be no other runnable task on this CPU because the
24017 + * lock owner runs on some other CPU. So we have to go into idle with
24018 + * the pending bit set. Therefore we need to check this, otherwise we
24019 + * warn about false positives, which confuses users and defeats the
24020 + * whole purpose of this test.
24022 + * This code is called with interrupts disabled.
24024 +void softirq_check_pending_idle(void)
24026 + static int rate_limit;
24027 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24031 + if (rate_limit >= 10)
24034 + warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
24035 + for (i = 0; i < NR_SOFTIRQS; i++) {
24036 + struct task_struct *tsk = sr->runner[i];
24039 + * The wakeup code in rtmutex.c wakes up the task
24040 + * _before_ it sets pi_blocked_on to NULL under
24041 + * tsk->pi_lock. So we need to check for both: state
24042 + * and pi_blocked_on.
24045 + raw_spin_lock(&tsk->pi_lock);
24046 + if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
24047 + /* Clear all bits pending in that task */
24048 + warnpending &= ~(tsk->softirqs_raised);
24049 + warnpending &= ~(1 << i);
24051 + raw_spin_unlock(&tsk->pi_lock);
24055 + if (warnpending) {
24056 + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
24063 + * On !PREEMPT_RT we just printk rate limited:
24065 +void softirq_check_pending_idle(void)
24067 + static int rate_limit;
24069 + if (rate_limit < 10 && !in_softirq() &&
24070 + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
24071 + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
24072 + local_softirq_pending());
24078 +#else /* !CONFIG_NO_HZ_COMMON */
24079 +static inline void softirq_set_runner(unsigned int sirq) { }
24080 +static inline void softirq_clr_runner(unsigned int sirq) { }
24084 * we cannot loop indefinitely here to avoid userspace starvation,
24085 * but we also don't want to introduce a worst case 1/HZ latency
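The warning filter above is plain bitmask bookkeeping: start from the pending mask, then clear the bit of every softirq whose runner is merely blocked on a lock or already runnable, since that runner will still finish the job. A minimal userspace model of that filtering step (the runner flags stand in for tsk->pi_blocked_on and tsk->state == TASK_RUNNING; all names here are illustrative only):

  #include <stdio.h>

  #define NR_SOFTIRQS 10

  struct runner {
          int blocked_on_lock;    /* models tsk->pi_blocked_on != NULL */
          int runnable;           /* models tsk->state == TASK_RUNNING */
  };

  /* Return only the bits that still deserve a "pending while idle" warning. */
  static unsigned int filter_warnpending(unsigned int pending,
                                         const struct runner *r)
  {
          unsigned int warn = pending;
          int i;

          for (i = 0; i < NR_SOFTIRQS; i++) {
                  /* A blocked or already-runnable runner gets there eventually. */
                  if ((warn & (1U << i)) && (r[i].blocked_on_lock || r[i].runnable))
                          warn &= ~(1U << i);
          }
          return warn;
  }

  int main(void)
  {
          struct runner r[NR_SOFTIRQS] = { { 0 } };
          unsigned int pending = 0x06;    /* TIMER and NET_TX raised */

          r[1].blocked_on_lock = 1;       /* TIMER runner waits on a lock */
          printf("warn mask: %02x\n", filter_warnpending(pending, r)); /* 04 */
          return 0;
  }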
24086 @@ -77,6 +176,38 @@ static void wakeup_softirqd(void)
24087 wake_up_process(tsk);
24090 +#ifdef CONFIG_PREEMPT_RT_FULL
24091 +static void wakeup_timer_softirqd(void)
24093 + /* Interrupts are disabled: no need to stop preemption */
24094 + struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
24096 + if (tsk && tsk->state != TASK_RUNNING)
24097 + wake_up_process(tsk);
24101 +static void handle_softirq(unsigned int vec_nr)
24103 + struct softirq_action *h = softirq_vec + vec_nr;
24106 + prev_count = preempt_count();
24108 + kstat_incr_softirqs_this_cpu(vec_nr);
24110 + trace_softirq_entry(vec_nr);
24112 + trace_softirq_exit(vec_nr);
24113 + if (unlikely(prev_count != preempt_count())) {
24114 + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
24115 + vec_nr, softirq_to_name[vec_nr], h->action,
24116 + prev_count, preempt_count());
24117 + preempt_count_set(prev_count);
24121 +#ifndef CONFIG_PREEMPT_RT_FULL
24123 * If ksoftirqd is scheduled, we do not want to process pending softirqs
24124 * right now. Let ksoftirqd handle this at its own rate, to get fairness,
24125 @@ -92,6 +223,47 @@ static bool ksoftirqd_running(unsigned long pending)
24126 return tsk && (tsk->state == TASK_RUNNING);
24129 +static inline int ksoftirqd_softirq_pending(void)
24131 + return local_softirq_pending();
24134 +static void handle_pending_softirqs(u32 pending)
24136 + struct softirq_action *h = softirq_vec;
24139 + local_irq_enable();
24143 + while ((softirq_bit = ffs(pending))) {
24144 + unsigned int vec_nr;
24146 + h += softirq_bit - 1;
24147 + vec_nr = h - softirq_vec;
24148 + handle_softirq(vec_nr);
24151 + pending >>= softirq_bit;
24155 + local_irq_disable();
24158 +static void run_ksoftirqd(unsigned int cpu)
24160 + local_irq_disable();
24161 + if (ksoftirqd_softirq_pending()) {
24163 + local_irq_enable();
24164 + cond_resched_rcu_qs();
24167 + local_irq_enable();
24171 * preempt_count and SOFTIRQ_OFFSET usage:
24172 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
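handle_pending_softirqs() walks the pending word with ffs(), so whole runs of clear bits are skipped in one step instead of testing every vector. The same loop shape as a standalone program (ffs() here is the userspace one from <strings.h>; the handler is a stand-in for the softirq_vec[] action):

  #include <stdio.h>
  #include <strings.h>    /* ffs() */

  static void handler(unsigned int vec_nr)
  {
          printf("running softirq %u\n", vec_nr);
  }

  static void handle_pending(unsigned int pending)
  {
          unsigned int base = 0;  /* vector number of bit 0 of 'pending' */
          int softirq_bit;

          while ((softirq_bit = ffs(pending))) {
                  handler(base + softirq_bit - 1);
                  /* Consume everything up to and including this bit. */
                  base += softirq_bit;
                  pending >>= softirq_bit;
          }
  }

  int main(void)
  {
          handle_pending(0x29);   /* bits 0, 3 and 5 -> three handlers */
          return 0;
  }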
24173 @@ -247,10 +419,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
24174 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
24175 unsigned long old_flags = current->flags;
24176 int max_restart = MAX_SOFTIRQ_RESTART;
24177 - struct softirq_action *h;
24183 * Mask out PF_MEMALLOC as current task context is borrowed for the
24184 @@ -269,36 +439,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
24185 /* Reset the pending bitmask before enabling irqs */
24186 set_softirq_pending(0);
24188 - local_irq_enable();
24192 - while ((softirq_bit = ffs(pending))) {
24193 - unsigned int vec_nr;
24196 - h += softirq_bit - 1;
24198 - vec_nr = h - softirq_vec;
24199 - prev_count = preempt_count();
24201 - kstat_incr_softirqs_this_cpu(vec_nr);
24203 - trace_softirq_entry(vec_nr);
24205 - trace_softirq_exit(vec_nr);
24206 - if (unlikely(prev_count != preempt_count())) {
24207 - pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
24208 - vec_nr, softirq_to_name[vec_nr], h->action,
24209 - prev_count, preempt_count());
24210 - preempt_count_set(prev_count);
24213 - pending >>= softirq_bit;
24217 - local_irq_disable();
24218 + handle_pending_softirqs(pending);
24220 pending = local_softirq_pending();
24222 @@ -334,6 +475,309 @@ asmlinkage __visible void do_softirq(void)
24223 local_irq_restore(flags);
24227 + * This function must run with irqs disabled!
24229 +void raise_softirq_irqoff(unsigned int nr)
24231 + __raise_softirq_irqoff(nr);
24234 + * If we're in an interrupt or softirq, we're done
24235 + * (this also catches softirq-disabled code). We will
24236 + * actually run the softirq once we return from
24237 + * the irq or softirq.
24239 + * Otherwise we wake up ksoftirqd to make sure we
24240 + * schedule the softirq soon.
24242 + if (!in_interrupt())
24243 + wakeup_softirqd();
24246 +void __raise_softirq_irqoff(unsigned int nr)
24248 + trace_softirq_raise(nr);
24249 + or_softirq_pending(1UL << nr);
24252 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
24253 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
24254 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
24256 +#else /* !PREEMPT_RT_FULL */
24259 + * On RT we serialize softirq execution with a cpu local lock per softirq
24261 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
24263 +void __init softirq_early_init(void)
24267 + for (i = 0; i < NR_SOFTIRQS; i++)
24268 + local_irq_lock_init(local_softirq_locks[i]);
24271 +static void lock_softirq(int which)
24273 + local_lock(local_softirq_locks[which]);
24276 +static void unlock_softirq(int which)
24278 + local_unlock(local_softirq_locks[which]);
24281 +static void do_single_softirq(int which)
24283 + unsigned long old_flags = current->flags;
24285 + current->flags &= ~PF_MEMALLOC;
24286 + vtime_account_irq_enter(current);
24287 + current->flags |= PF_IN_SOFTIRQ;
24288 + lockdep_softirq_enter();
24289 + local_irq_enable();
24290 + handle_softirq(which);
24291 + local_irq_disable();
24292 + lockdep_softirq_exit();
24293 + current->flags &= ~PF_IN_SOFTIRQ;
24294 + vtime_account_irq_enter(current);
24295 + current_restore_flags(old_flags, PF_MEMALLOC);
24299 + * Called with interrupts disabled. Process softirqs which were raised
24300 + * in current context (or on behalf of ksoftirqd).
24302 +static void do_current_softirqs(void)
24304 + while (current->softirqs_raised) {
24305 + int i = __ffs(current->softirqs_raised);
24306 + unsigned int pending, mask = (1U << i);
24308 + current->softirqs_raised &= ~mask;
24309 + local_irq_enable();
24312 + * If the lock is contended, we boost the owner to
24313 + * process the softirq or leave the critical section
24317 + local_irq_disable();
24318 + softirq_set_runner(i);
24320 + * Check the local_softirq_pending() bits to see whether
24321 + * we still need to process this or if someone
24322 + * else took care of it.
24324 + pending = local_softirq_pending();
24325 + if (pending & mask) {
24326 + set_softirq_pending(pending & ~mask);
24327 + do_single_softirq(i);
24329 + softirq_clr_runner(i);
24330 + WARN_ON(current->softirq_nestcnt != 1);
24331 + local_irq_enable();
24332 + unlock_softirq(i);
24333 + local_irq_disable();
24337 +void __local_bh_disable(void)
24339 + if (++current->softirq_nestcnt == 1)
24340 + migrate_disable();
24342 +EXPORT_SYMBOL(__local_bh_disable);
24344 +void __local_bh_enable(void)
24346 + if (WARN_ON(current->softirq_nestcnt == 0))
24349 + local_irq_disable();
24350 + if (current->softirq_nestcnt == 1 && current->softirqs_raised)
24351 + do_current_softirqs();
24352 + local_irq_enable();
24354 + if (--current->softirq_nestcnt == 0)
24355 + migrate_enable();
24357 +EXPORT_SYMBOL(__local_bh_enable);
24359 +void _local_bh_enable(void)
24361 + if (WARN_ON(current->softirq_nestcnt == 0))
24363 + if (--current->softirq_nestcnt == 0)
24364 + migrate_enable();
24366 +EXPORT_SYMBOL(_local_bh_enable);
24368 +int in_serving_softirq(void)
24370 + return current->flags & PF_IN_SOFTIRQ;
24372 +EXPORT_SYMBOL(in_serving_softirq);
24374 +/* Called with preemption disabled */
24375 +static void run_ksoftirqd(unsigned int cpu)
24377 + local_irq_disable();
24378 + current->softirq_nestcnt++;
24380 + do_current_softirqs();
24381 + current->softirq_nestcnt--;
24382 + local_irq_enable();
24383 + cond_resched_rcu_qs();
24387 + * Called from netif_rx_ni(). Preemption enabled, but migration
24388 + * disabled. So the cpu can't go away under us.
24390 +void thread_do_softirq(void)
24392 + if (!in_serving_softirq() && current->softirqs_raised) {
24393 + current->softirq_nestcnt++;
24394 + do_current_softirqs();
24395 + current->softirq_nestcnt--;
24399 +static void do_raise_softirq_irqoff(unsigned int nr)
24401 + unsigned int mask;
24403 + mask = 1UL << nr;
24405 + trace_softirq_raise(nr);
24406 + or_softirq_pending(mask);
24409 + * If we are not in a hard interrupt and inside a bh disabled
24410 + * region, we simply raise the flag on current. local_bh_enable()
24411 + * will make sure that the softirq is executed. Otherwise we
24412 + * delegate it to ksoftirqd.
24414 + if (!in_irq() && current->softirq_nestcnt)
24415 + current->softirqs_raised |= mask;
24416 + else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
24419 + if (mask & TIMER_SOFTIRQS)
24420 + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24422 + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24425 +static void wakeup_proper_softirq(unsigned int nr)
24427 + if ((1UL << nr) & TIMER_SOFTIRQS)
24428 + wakeup_timer_softirqd();
24430 + wakeup_softirqd();
24433 +void __raise_softirq_irqoff(unsigned int nr)
24435 + do_raise_softirq_irqoff(nr);
24436 + if (!in_irq() && !current->softirq_nestcnt)
24437 + wakeup_proper_softirq(nr);
24441 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
24443 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
24445 + unsigned int mask;
24447 + if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
24448 + !__this_cpu_read(ktimer_softirqd)))
24450 + mask = 1UL << nr;
24452 + trace_softirq_raise(nr);
24453 + or_softirq_pending(mask);
24454 + if (mask & TIMER_SOFTIRQS)
24455 + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24457 + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24458 + wakeup_proper_softirq(nr);
24462 + * This function must run with irqs disabled!
24464 +void raise_softirq_irqoff(unsigned int nr)
24466 + do_raise_softirq_irqoff(nr);
24469 + * If we're in a hard interrupt we let the irq return code deal
24470 + * with the wakeup of ksoftirqd.
24475 + * If we are in thread context but outside of a bh disabled
24476 + * region, we need to wake ksoftirqd as well.
24478 + * CHECKME: Some of the places which do that could be wrapped
24479 + * into local_bh_disable/enable pairs. Though it's unclear
24480 + * whether this is worth the effort. To find those places just
24481 + * raise a WARN() if the condition is met.
24483 + if (!current->softirq_nestcnt)
24484 + wakeup_proper_softirq(nr);
24487 +static inline int ksoftirqd_softirq_pending(void)
24489 + return current->softirqs_raised;
24492 +static inline void local_bh_disable_nort(void) { }
24493 +static inline void _local_bh_enable_nort(void) { }
24495 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
24497 + /* Take over all but timer pending softirqs when starting */
24498 + local_irq_disable();
24499 + current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
24500 + local_irq_enable();
24503 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
24505 + struct sched_param param = { .sched_priority = 1 };
25507 + sched_setscheduler(current, SCHED_FIFO, &param);
24509 + /* Take over timer pending softirqs when starting */
24510 + local_irq_disable();
24511 + current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
24512 + local_irq_enable();
24515 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
24518 + struct sched_param param = { .sched_priority = 0 };
25520 + sched_setscheduler(current, SCHED_NORMAL, &param);
24523 +static int ktimer_softirqd_should_run(unsigned int cpu)
24525 + return current->softirqs_raised;
24528 +#endif /* PREEMPT_RT_FULL */
24530 * Enter an interrupt context.
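The RT raise path above routes a softirq to one of three places: it is latched on current when raised from a bh-disabled section (local_bh_enable() runs it), handed to ktimersoftd for the timer vectors, and to ksoftirqd for everything else. A compact userspace model of that decision (the globals mimic in_irq()/softirq_nestcnt state and are illustrative only):

  #include <stdio.h>

  #define TIMER_SOFTIRQ   1
  #define HRTIMER_SOFTIRQ 8
  #define TIMER_SOFTIRQS  ((1U << TIMER_SOFTIRQ) | (1U << HRTIMER_SOFTIRQ))

  struct thread { unsigned int softirqs_raised; };

  static struct thread curr, ksoftirqd, ktimersoftd;
  static int in_irq_flag, softirq_nestcnt;

  /* Models do_raise_softirq_irqoff() on PREEMPT_RT_FULL. */
  static void raise_softirq_model(unsigned int nr)
  {
          unsigned int mask = 1U << nr;

          if (!in_irq_flag && softirq_nestcnt)
                  curr.softirqs_raised |= mask;         /* run at bh enable */
          else if (mask & TIMER_SOFTIRQS)
                  ktimersoftd.softirqs_raised |= mask;  /* timer daemon */
          else
                  ksoftirqd.softirqs_raised |= mask;    /* generic daemon */
  }

  int main(void)
  {
          raise_softirq_model(TIMER_SOFTIRQ);   /* -> ktimersoftd */
          softirq_nestcnt = 1;                  /* inside local_bh_disable() */
          raise_softirq_model(3);               /* -> latched on current */
          printf("curr=%x ksoftirqd=%x ktimersoftd=%x\n",
                 curr.softirqs_raised, ksoftirqd.softirqs_raised,
                 ktimersoftd.softirqs_raised);  /* curr=8 ktimersoftd=2 */
          return 0;
  }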
24532 @@ -345,9 +789,9 @@ void irq_enter(void)
24533 * Prevent raise_softirq from needlessly waking up ksoftirqd
24534 * here, as softirq will be serviced on return from interrupt.
24536 - local_bh_disable();
24537 + local_bh_disable_nort();
24539 - _local_bh_enable();
24540 + _local_bh_enable_nort();
24544 @@ -355,6 +799,7 @@ void irq_enter(void)
24546 static inline void invoke_softirq(void)
24548 +#ifndef CONFIG_PREEMPT_RT_FULL
24549 if (ksoftirqd_running(local_softirq_pending()))
24552 @@ -377,6 +822,18 @@ static inline void invoke_softirq(void)
24556 +#else /* PREEMPT_RT_FULL */
24557 + unsigned long flags;
24559 + local_irq_save(flags);
24560 + if (__this_cpu_read(ksoftirqd) &&
24561 + __this_cpu_read(ksoftirqd)->softirqs_raised)
24562 + wakeup_softirqd();
24563 + if (__this_cpu_read(ktimer_softirqd) &&
24564 + __this_cpu_read(ktimer_softirqd)->softirqs_raised)
24565 + wakeup_timer_softirqd();
24566 + local_irq_restore(flags);
24570 static inline void tick_irq_exit(void)
24571 @@ -385,7 +842,8 @@ static inline void tick_irq_exit(void)
24572 int cpu = smp_processor_id();
24574 /* Make sure that timer wheel updates are propagated */
24575 - if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
24576 + if ((idle_cpu(cpu) || tick_nohz_full_cpu(cpu)) &&
24577 + !need_resched() && !local_softirq_pending()) {
24579 tick_nohz_irq_exit();
24581 @@ -413,26 +871,6 @@ void irq_exit(void)
24582 trace_hardirq_exit(); /* must be last! */
24586 - * This function must run with irqs disabled!
24588 -inline void raise_softirq_irqoff(unsigned int nr)
24590 - __raise_softirq_irqoff(nr);
24593 - * If we're in an interrupt or softirq, we're done
24594 - * (this also catches softirq-disabled code). We will
24595 - * actually run the softirq once we return from
24596 - * the irq or softirq.
24598 - * Otherwise we wake up ksoftirqd to make sure we
24599 - * schedule the softirq soon.
24601 - if (!in_interrupt())
24602 - wakeup_softirqd();
24605 void raise_softirq(unsigned int nr)
24607 unsigned long flags;
24608 @@ -442,12 +880,6 @@ void raise_softirq(unsigned int nr)
24609 local_irq_restore(flags);
24612 -void __raise_softirq_irqoff(unsigned int nr)
24614 - trace_softirq_raise(nr);
24615 - or_softirq_pending(1UL << nr);
24618 void open_softirq(int nr, void (*action)(struct softirq_action *))
24620 softirq_vec[nr].action = action;
24621 @@ -464,15 +896,45 @@ struct tasklet_head {
24622 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
24623 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
24625 +static void inline
24626 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
24628 + if (tasklet_trylock(t)) {
24630 + /* We may have been preempted before tasklet_trylock
24631 + * and __tasklet_action may have already run.
24632 + * So double check the sched bit while the tasklet
24633 + * is locked before adding it to the list.
24635 + if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
24638 + head->tail = &(t->next);
24639 + raise_softirq_irqoff(nr);
24640 + tasklet_unlock(t);
24642 + /* This is subtle. If we hit the corner case above,
24643 + * it is possible that we get preempted right here,
24644 + * and another task has successfully called
24645 + * tasklet_schedule(), then this function, and
24646 + * failed on the trylock. Thus we must be sure
24647 + * before releasing the tasklet lock, that the
24648 + * SCHED_BIT is clear. Otherwise the tasklet
24649 + * may get its SCHED_BIT set, but not added to the
24652 + if (!tasklet_tryunlock(t))
24658 void __tasklet_schedule(struct tasklet_struct *t)
24660 unsigned long flags;
24662 local_irq_save(flags);
24664 - *__this_cpu_read(tasklet_vec.tail) = t;
24665 - __this_cpu_write(tasklet_vec.tail, &(t->next));
24666 - raise_softirq_irqoff(TASKLET_SOFTIRQ);
24667 + __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
24668 local_irq_restore(flags);
24670 EXPORT_SYMBOL(__tasklet_schedule);
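__tasklet_common_schedule() only queues a tasklet whose RUN lock it can take and whose SCHED bit is still set when inspected under that lock. The two-bit protocol can be modelled in userspace with C11 atomics (list handling and the softirq raise are elided; this is a sketch of the state machine, not the kernel code):

  #include <stdatomic.h>
  #include <stdio.h>

  #define SCHED_BIT  (1UL << 0)   /* TASKLET_STATE_SCHED */
  #define RUN_BIT    (1UL << 1)   /* TASKLET_STATE_RUN */

  struct tasklet { _Atomic unsigned long state; };

  static int trylock(struct tasklet *t)
  {
          return !(atomic_fetch_or(&t->state, RUN_BIT) & RUN_BIT);
  }

  /* Only the exact RUN -> 0 transition may drop the lock. */
  static int tryunlock(struct tasklet *t)
  {
          unsigned long expected = RUN_BIT;

          return atomic_compare_exchange_strong(&t->state, &expected, 0UL);
  }

  static void common_schedule(struct tasklet *t)
  {
          if (!trylock(t))
                  return;         /* it is running and requeues itself */
          for (;;) {
                  if (atomic_load(&t->state) & SCHED_BIT) {
                          puts("queued for the softirq"); /* SCHED stays set */
                          atomic_fetch_and(&t->state, ~RUN_BIT);
                          return;
                  }
                  /* SCHED was consumed elsewhere: drop RUN, but only if
                   * SCHED is still clear, otherwise queue it after all. */
                  if (tryunlock(t))
                          return;
          }
  }

  int main(void)
  {
          struct tasklet t = { SCHED_BIT };

          common_schedule(&t);
          return 0;
  }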
24671 @@ -482,50 +944,108 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
24672 unsigned long flags;
24674 local_irq_save(flags);
24676 - *__this_cpu_read(tasklet_hi_vec.tail) = t;
24677 - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24678 - raise_softirq_irqoff(HI_SOFTIRQ);
24679 + __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
24680 local_irq_restore(flags);
24682 EXPORT_SYMBOL(__tasklet_hi_schedule);
24684 -static __latent_entropy void tasklet_action(struct softirq_action *a)
24685 +void tasklet_enable(struct tasklet_struct *t)
24687 - struct tasklet_struct *list;
24688 + if (!atomic_dec_and_test(&t->count))
24690 + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
24691 + tasklet_schedule(t);
24693 +EXPORT_SYMBOL(tasklet_enable);
24695 - local_irq_disable();
24696 - list = __this_cpu_read(tasklet_vec.head);
24697 - __this_cpu_write(tasklet_vec.head, NULL);
24698 - __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24699 - local_irq_enable();
24700 +static void __tasklet_action(struct softirq_action *a,
24701 + struct tasklet_struct *list)
24703 + int loops = 1000000;
24706 struct tasklet_struct *t = list;
24710 - if (tasklet_trylock(t)) {
24711 - if (!atomic_read(&t->count)) {
24712 - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24715 - t->func(t->data);
24716 - tasklet_unlock(t);
24719 - tasklet_unlock(t);
24721 + * Should always succeed - after a tasklet got on the
24722 + * list (after getting the SCHED bit set from 0 to 1),
24723 + * nothing but the tasklet softirq it got queued to can
24726 + if (!tasklet_trylock(t)) {
24731 - local_irq_disable();
24733 - *__this_cpu_read(tasklet_vec.tail) = t;
24734 - __this_cpu_write(tasklet_vec.tail, &(t->next));
24735 - __raise_softirq_irqoff(TASKLET_SOFTIRQ);
24736 - local_irq_enable();
24739 + * If we cannot handle the tasklet because it's disabled,
24740 + * mark it as pending. tasklet_enable() will later
24741 + * re-schedule the tasklet.
24743 + if (unlikely(atomic_read(&t->count))) {
24745 + /* implicit unlock: */
24747 + t->state = TASKLET_STATEF_PENDING;
24752 + * From this point on the tasklet might be rescheduled
24753 + * on another CPU, but it can only be added to another
24754 + * CPU's tasklet list if we unlock the tasklet (which we
24757 + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24761 + t->func(t->data);
24764 + * Try to unlock the tasklet. We must use cmpxchg, because
24765 + * another CPU might have scheduled or disabled the tasklet.
24766 + * We only allow the STATE_RUN -> 0 transition here.
24768 + while (!tasklet_tryunlock(t)) {
24770 + * If it got disabled meanwhile, bail out:
24772 + if (atomic_read(&t->count))
24773 + goto out_disabled;
24775 + * If it got scheduled meanwhile, re-execute
24776 + * the tasklet function:
24778 + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24781 + printk("hm, tasklet state: %08lx\n", t->state);
24783 + tasklet_unlock(t);
24790 +static __latent_entropy void tasklet_action(struct softirq_action *a)
24792 + struct tasklet_struct *list;
24794 + local_irq_disable();
24795 + list = __this_cpu_read(tasklet_vec.head);
24796 + __this_cpu_write(tasklet_vec.head, NULL);
24797 + __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24798 + local_irq_enable();
24800 + __tasklet_action(a, list);
24803 static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
24805 struct tasklet_struct *list;
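The new PENDING state closes the gap for disabled tasklets: instead of spinning or dropping the work, __tasklet_action() parks the tasklet and the final tasklet_enable() requeues it. A self-contained model of that hand-off (disable nesting is a plain counter here; names are illustrative):

  #include <stdatomic.h>
  #include <stdio.h>

  enum { SCHED, RUN, PENDING };           /* bit numbers as in the patch */

  struct tasklet {
          _Atomic unsigned long state;
          _Atomic int count;              /* > 0 means disabled */
  };

  /* __tasklet_action() hit a disabled tasklet: park it. */
  static void mark_pending(struct tasklet *t)
  {
          atomic_store(&t->state, 1UL << PENDING); /* TASKLET_STATEF_PENDING */
  }

  /* Models the patched tasklet_enable(): the last enabler turns a
   * parked tasklet back into a scheduled one. */
  static void tasklet_enable_model(struct tasklet *t,
                                   void (*schedule)(struct tasklet *))
  {
          if (atomic_fetch_sub(&t->count, 1) != 1)
                  return;                 /* still disabled by someone */
          if (atomic_fetch_and(&t->state, ~(1UL << PENDING)) & (1UL << PENDING))
                  schedule(t);
  }

  static void resched(struct tasklet *t) { (void)t; puts("rescheduled"); }

  int main(void)
  {
          struct tasklet t = { 0, 1 };    /* disabled once */

          mark_pending(&t);               /* hit while disabled */
          tasklet_enable_model(&t, resched);
          return 0;
  }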
24806 @@ -536,30 +1056,7 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
24807 __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
24808 local_irq_enable();
24811 - struct tasklet_struct *t = list;
24813 - list = list->next;
24815 - if (tasklet_trylock(t)) {
24816 - if (!atomic_read(&t->count)) {
24817 - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24820 - t->func(t->data);
24821 - tasklet_unlock(t);
24824 - tasklet_unlock(t);
24827 - local_irq_disable();
24829 - *__this_cpu_read(tasklet_hi_vec.tail) = t;
24830 - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24831 - __raise_softirq_irqoff(HI_SOFTIRQ);
24832 - local_irq_enable();
24834 + __tasklet_action(a, list);
24837 void tasklet_init(struct tasklet_struct *t,
24838 @@ -580,7 +1077,7 @@ void tasklet_kill(struct tasklet_struct *t)
24840 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
24844 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
24846 tasklet_unlock_wait(t);
24847 @@ -588,57 +1085,6 @@ void tasklet_kill(struct tasklet_struct *t)
24849 EXPORT_SYMBOL(tasklet_kill);
24852 - * tasklet_hrtimer
24856 - * The trampoline is called when the hrtimer expires. It schedules a tasklet
24857 - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
24858 - * hrtimer callback, but from softirq context.
24860 -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
24862 - struct tasklet_hrtimer *ttimer =
24863 - container_of(timer, struct tasklet_hrtimer, timer);
24865 - tasklet_hi_schedule(&ttimer->tasklet);
24866 - return HRTIMER_NORESTART;
24870 - * Helper function which calls the hrtimer callback from
24871 - * tasklet/softirq context
24873 -static void __tasklet_hrtimer_trampoline(unsigned long data)
24875 - struct tasklet_hrtimer *ttimer = (void *)data;
24876 - enum hrtimer_restart restart;
24878 - restart = ttimer->function(&ttimer->timer);
24879 - if (restart != HRTIMER_NORESTART)
24880 - hrtimer_restart(&ttimer->timer);
24884 - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
24885 - * @ttimer: tasklet_hrtimer which is initialized
24886 - * @function: hrtimer callback function which gets called from softirq context
24887 - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
24888 - * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
24890 -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
24891 - enum hrtimer_restart (*function)(struct hrtimer *),
24892 - clockid_t which_clock, enum hrtimer_mode mode)
24894 - hrtimer_init(&ttimer->timer, which_clock, mode);
24895 - ttimer->timer.function = __hrtimer_tasklet_trampoline;
24896 - tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
24897 - (unsigned long)ttimer);
24898 - ttimer->function = function;
24900 -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
24902 void __init softirq_init(void)
24905 @@ -654,25 +1100,26 @@ void __init softirq_init(void)
24906 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
24909 -static int ksoftirqd_should_run(unsigned int cpu)
24911 - return local_softirq_pending();
24914 -static void run_ksoftirqd(unsigned int cpu)
24915 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
24916 +void tasklet_unlock_wait(struct tasklet_struct *t)
24918 - local_irq_disable();
24919 - if (local_softirq_pending()) {
24920 + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
24922 - * We can safely run softirq on inline stack, as we are not deep
24923 - * in the task stack here.
24924 + * Hack for now to avoid this busy-loop:
24927 - local_irq_enable();
24928 - cond_resched_rcu_qs();
24930 +#ifdef CONFIG_PREEMPT_RT_FULL
24936 - local_irq_enable();
24938 +EXPORT_SYMBOL(tasklet_unlock_wait);
24941 +static int ksoftirqd_should_run(unsigned int cpu)
24943 + return ksoftirqd_softirq_pending();
24946 #ifdef CONFIG_HOTPLUG_CPU
24947 @@ -739,17 +1186,31 @@ static int takeover_tasklets(unsigned int cpu)
24949 static struct smp_hotplug_thread softirq_threads = {
24950 .store = &ksoftirqd,
24951 + .setup = ksoftirqd_set_sched_params,
24952 .thread_should_run = ksoftirqd_should_run,
24953 .thread_fn = run_ksoftirqd,
24954 .thread_comm = "ksoftirqd/%u",
24957 +#ifdef CONFIG_PREEMPT_RT_FULL
24958 +static struct smp_hotplug_thread softirq_timer_threads = {
24959 + .store = &ktimer_softirqd,
24960 + .setup = ktimer_softirqd_set_sched_params,
24961 + .cleanup = ktimer_softirqd_clr_sched_params,
24962 + .thread_should_run = ktimer_softirqd_should_run,
24963 + .thread_fn = run_ksoftirqd,
24964 + .thread_comm = "ktimersoftd/%u",
24968 static __init int spawn_ksoftirqd(void)
24970 cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
24971 takeover_tasklets);
24972 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
24974 +#ifdef CONFIG_PREEMPT_RT_FULL
24975 + BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
24979 early_initcall(spawn_ksoftirqd);
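With the split into ksoftirqd and ktimersoftd, anything raised before the per-cpu threads exist is divided along the TIMER_SOFTIRQS mask by the two .setup callbacks. The takeover is a two-line partition, modelled here (vector numbers as in softirq_to_name[]):

  #include <stdio.h>

  #define TIMER_SOFTIRQS  ((1U << 1) | (1U << 8))  /* TIMER | HRTIMER */

  /* Models ksoftirqd_set_sched_params()/ktimer_softirqd_set_sched_params():
   * each thread takes over its share of the already-pending bits. */
  static void split_pending(unsigned int pending,
                            unsigned int *ksoftirqd_raised,
                            unsigned int *ktimersoftd_raised)
  {
          *ksoftirqd_raised   = pending & ~TIMER_SOFTIRQS;
          *ktimersoftd_raised = pending &  TIMER_SOFTIRQS;
  }

  int main(void)
  {
          unsigned int ks, kt;

          split_pending(0x10f, &ks, &kt);
          printf("ksoftirqd=%03x ktimersoftd=%03x\n", ks, kt); /* 00d 102 */
          return 0;
  }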
24980 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
24981 index 067cb83f37ea..56f2f2e01229 100644
24982 --- a/kernel/stop_machine.c
24983 +++ b/kernel/stop_machine.c
24984 @@ -503,6 +503,8 @@ static void cpu_stopper_thread(unsigned int cpu)
24985 struct cpu_stop_done *done = work->done;
24990 /* cpu stop callbacks must not sleep, make in_atomic() == T */
24991 preempt_count_inc();
24993 diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
24994 index fa5de5e8de61..6020ee66e517 100644
24995 --- a/kernel/time/alarmtimer.c
24996 +++ b/kernel/time/alarmtimer.c
24997 @@ -436,7 +436,7 @@ int alarm_cancel(struct alarm *alarm)
24998 int ret = alarm_try_to_cancel(alarm);
25002 + hrtimer_wait_for_timer(&alarm->timer);
25005 EXPORT_SYMBOL_GPL(alarm_cancel);
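The alarmtimer hunk swaps a busy cpu_relax() loop for hrtimer_wait_for_timer(), so a canceller sleeps on the base's waitqueue while the callback finishes instead of spinning. The resulting cancel pattern has roughly this shape (a sketch of the call flow, not the literal alarmtimer code):

  /* Sketch: cancel-and-wait.  alarm_try_to_cancel() returns a
   * negative value while the callback is running; rather than
   * spinning, sleep until the softirq wakes the waitqueue. */
  int alarm_cancel_sketch(struct alarm *alarm)
  {
          for (;;) {
                  int ret = alarm_try_to_cancel(alarm);

                  if (ret >= 0)
                          return ret;
                  hrtimer_wait_for_timer(&alarm->timer);
          }
  }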
25006 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
25007 index d00e85ac10d6..b59e009087a9 100644
25008 --- a/kernel/time/hrtimer.c
25009 +++ b/kernel/time/hrtimer.c
25012 #include "tick-internal.h"
25015 + * Masks for selecting the soft and hard context timers from
25016 + * cpu_base->active
25018 +#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT)
25019 +#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1)
25020 +#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
25021 +#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
25027 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
25029 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
25030 - .seq = SEQCNT_ZERO(hrtimer_bases.seq),
25034 @@ -93,6 +101,26 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
25035 .clockid = CLOCK_TAI,
25036 .get_time = &ktime_get_clocktai,
25039 + .index = HRTIMER_BASE_MONOTONIC_SOFT,
25040 + .clockid = CLOCK_MONOTONIC,
25041 + .get_time = &ktime_get,
25044 + .index = HRTIMER_BASE_REALTIME_SOFT,
25045 + .clockid = CLOCK_REALTIME,
25046 + .get_time = &ktime_get_real,
25049 + .index = HRTIMER_BASE_BOOTTIME_SOFT,
25050 + .clockid = CLOCK_BOOTTIME,
25051 + .get_time = &ktime_get_boottime,
25054 + .index = HRTIMER_BASE_TAI_SOFT,
25055 + .clockid = CLOCK_TAI,
25056 + .get_time = &ktime_get_clocktai,
25061 @@ -118,7 +146,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
25062 * timer->base->cpu_base
25064 static struct hrtimer_cpu_base migration_cpu_base = {
25065 - .seq = SEQCNT_ZERO(migration_cpu_base),
25066 .clock_base = { { .cpu_base = &migration_cpu_base, }, },
25069 @@ -156,45 +183,33 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
25073 - * With HIGHRES=y we do not migrate the timer when it is expiring
25074 - * before the next event on the target cpu because we cannot reprogram
25075 - * the target cpu hardware and we would cause it to fire late.
25076 + * We do not migrate the timer when it is expiring before the next
25077 + * event on the target cpu. When high resolution is enabled, we cannot
25078 + * reprogram the target cpu hardware and we would cause it to fire
25079 + * late. To keep it simple, we handle the high resolution enabled and
25080 + * disabled cases similarly.
25082 * Called with cpu_base->lock of target cpu held.
25085 hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
25087 -#ifdef CONFIG_HIGH_RES_TIMERS
25090 - if (!new_base->cpu_base->hres_active)
25093 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
25094 - return expires <= new_base->cpu_base->expires_next;
25098 + return expires < new_base->cpu_base->expires_next;
25101 -#ifdef CONFIG_NO_HZ_COMMON
25103 -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
25106 - if (pinned || !base->migration_enabled)
25108 - return &per_cpu(hrtimer_bases, get_nohz_timer_target());
25112 struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
25115 +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
25116 + if (static_branch_unlikely(&timers_migration_enabled) && !pinned)
25117 + return &per_cpu(hrtimer_bases, get_nohz_timer_target());
25124 * We switch the timer base to a power-optimized selected CPU target,
25125 @@ -396,7 +411,8 @@ static inline void debug_hrtimer_init(struct hrtimer *timer)
25126 debug_object_init(timer, &hrtimer_debug_descr);
25129 -static inline void debug_hrtimer_activate(struct hrtimer *timer)
25130 +static inline void debug_hrtimer_activate(struct hrtimer *timer,
25131 + enum hrtimer_mode mode)
25133 debug_object_activate(timer, &hrtimer_debug_descr);
25135 @@ -429,8 +445,10 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer)
25136 EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
25140 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
25141 -static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
25142 +static inline void debug_hrtimer_activate(struct hrtimer *timer,
25143 + enum hrtimer_mode mode) { }
25144 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
25147 @@ -442,10 +460,11 @@ debug_init(struct hrtimer *timer, clockid_t clockid,
25148 trace_hrtimer_init(timer, clockid, mode);
25151 -static inline void debug_activate(struct hrtimer *timer)
25152 +static inline void debug_activate(struct hrtimer *timer,
25153 + enum hrtimer_mode mode)
25155 - debug_hrtimer_activate(timer);
25156 - trace_hrtimer_start(timer);
25157 + debug_hrtimer_activate(timer, mode);
25158 + trace_hrtimer_start(timer, mode);
25161 static inline void debug_deactivate(struct hrtimer *timer)
25162 @@ -454,35 +473,43 @@ static inline void debug_deactivate(struct hrtimer *timer)
25163 trace_hrtimer_cancel(timer);
25166 -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
25167 -static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
25168 - struct hrtimer *timer)
25169 +static struct hrtimer_clock_base *
25170 +__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
25172 -#ifdef CONFIG_HIGH_RES_TIMERS
25173 - cpu_base->next_timer = timer;
25175 + unsigned int idx;
25180 + idx = __ffs(*active);
25181 + *active &= ~(1U << idx);
25183 + return &cpu_base->clock_base[idx];
25186 -static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
25187 +#define for_each_active_base(base, cpu_base, active) \
25188 + while ((base = __next_base((cpu_base), &(active))))
25190 +static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
25191 + unsigned int active,
25192 + ktime_t expires_next)
25194 - struct hrtimer_clock_base *base = cpu_base->clock_base;
25195 - unsigned int active = cpu_base->active_bases;
25196 - ktime_t expires, expires_next = KTIME_MAX;
25197 + struct hrtimer_clock_base *base;
25200 - hrtimer_update_next_timer(cpu_base, NULL);
25201 - for (; active; base++, active >>= 1) {
25202 + for_each_active_base(base, cpu_base, active) {
25203 struct timerqueue_node *next;
25204 struct hrtimer *timer;
25206 - if (!(active & 0x01))
25209 next = timerqueue_getnext(&base->active);
25210 timer = container_of(next, struct hrtimer, node);
25211 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
25212 if (expires < expires_next) {
25213 expires_next = expires;
25214 - hrtimer_update_next_timer(cpu_base, timer);
25215 + if (timer->is_soft)
25216 + cpu_base->softirq_next_timer = timer;
25218 + cpu_base->next_timer = timer;
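__next_base() and for_each_active_base() replace the old walk over all clock bases with an __ffs()-driven walk over the active_bases bitmap, visiting only bases that actually contain timers. A standalone rendering of the iterator (__builtin_ctz plays the role of the kernel's __ffs()):

  #include <stdio.h>

  /* Models __next_base(): pluck the lowest set bit out of *active and
   * hand back the corresponding clock base index, or -1 when done. */
  static int next_base(unsigned int *active)
  {
          unsigned int idx;

          if (!*active)
                  return -1;
          idx = (unsigned int)__builtin_ctz(*active);
          *active &= ~(1U << idx);
          return (int)idx;
  }

  #define for_each_active_base(idx, active) \
          while (((idx) = next_base(&(active))) >= 0)

  int main(void)
  {
          unsigned int active = 0x32;     /* bases 1, 4 and 5 are active */
          int idx;

          for_each_active_base(idx, active)
                  printf("visiting base %d\n", idx);
          return 0;
  }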
25222 @@ -494,7 +521,47 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
25224 return expires_next;
25229 + * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
25230 + * does not set cpu_base::*expires_next; that is done by hrtimer_reprogram.
25232 + * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
25233 + * those timers will get run whenever the softirq gets handled, at the end of
25234 + * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
25236 + * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
25237 + * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
25238 + * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
25240 + * @active_mask must be one of:
25241 + * - HRTIMER_ACTIVE_ALL,
25242 + * - HRTIMER_ACTIVE_SOFT, or
25243 + * - HRTIMER_ACTIVE_HARD.
25246 +__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
25248 + unsigned int active;
25249 + struct hrtimer *next_timer = NULL;
25250 + ktime_t expires_next = KTIME_MAX;
25252 + if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
25253 + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
25254 + cpu_base->softirq_next_timer = NULL;
25255 + expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
25257 + next_timer = cpu_base->softirq_next_timer;
25260 + if (active_mask & HRTIMER_ACTIVE_HARD) {
25261 + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
25262 + cpu_base->next_timer = next_timer;
25263 + expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
25266 + return expires_next;
25269 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
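The comment block above fixes the policy: soft bases contribute to the next expiry only while no hrtimer softirq is in flight, hard bases always do. Reduced to the two intermediate minima, the selection looks like this (a model of the policy, not the kernel function):

  #include <stdio.h>

  #define KTIME_MAX ((long long)(~0ULL >> 1))

  static long long next_event(long long soft_min, long long hard_min,
                              int softirq_activated)
  {
          long long expires = KTIME_MAX;

          if (!softirq_activated && soft_min < expires)
                  expires = soft_min;     /* HRTIMER_ACTIVE_SOFT pass */
          if (hard_min < expires)
                  expires = hard_min;     /* HRTIMER_ACTIVE_HARD pass */
          return expires;
  }

  int main(void)
  {
          /* Soft timer due first, but its softirq already runs: arm the
           * hardware for the first hard timer instead. */
          printf("%lld\n", next_event(100, 250, 1));      /* 250 */
          printf("%lld\n", next_event(100, 250, 0));      /* 100 */
          return 0;
  }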
25271 @@ -502,36 +569,14 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
25272 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
25273 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
25275 - return ktime_get_update_offsets_now(&base->clock_was_set_seq,
25276 + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
25277 offs_real, offs_boot, offs_tai);
25280 -/* High resolution timer related functions */
25281 -#ifdef CONFIG_HIGH_RES_TIMERS
25284 - * High resolution timer enabled ?
25286 -static bool hrtimer_hres_enabled __read_mostly = true;
25287 -unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
25288 -EXPORT_SYMBOL_GPL(hrtimer_resolution);
25291 - * Enable / Disable high resolution mode
25293 -static int __init setup_hrtimer_hres(char *str)
25295 - return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
25298 -__setup("highres=", setup_hrtimer_hres);
25299 + base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
25300 + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
25301 + base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
25304 - * hrtimer_high_res_enabled - query, if the highres mode is enabled
25306 -static inline int hrtimer_is_hres_enabled(void)
25308 - return hrtimer_hres_enabled;
25313 @@ -539,7 +584,8 @@ static inline int hrtimer_is_hres_enabled(void)
25315 static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
25317 - return cpu_base->hres_active;
25318 + return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
25319 + cpu_base->hres_active : 0;
25322 static inline int hrtimer_hres_active(void)
25323 @@ -557,10 +603,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
25325 ktime_t expires_next;
25327 - if (!cpu_base->hres_active)
25330 + * Find the current next expiration time.
25332 + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
25334 - expires_next = __hrtimer_get_next_event(cpu_base);
25335 + if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
25337 + * When the softirq is activated, hrtimer has to be
25338 + * programmed with the first hard hrtimer because the soft
25339 + * timer interrupt could occur too late.
25341 + if (cpu_base->softirq_activated)
25342 + expires_next = __hrtimer_get_next_event(cpu_base,
25343 + HRTIMER_ACTIVE_HARD);
25345 + cpu_base->softirq_expires_next = expires_next;
25348 if (skip_equal && expires_next == cpu_base->expires_next)
25350 @@ -568,6 +627,9 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
25351 cpu_base->expires_next = expires_next;
25354 + * If hres is not active, hardware does not have to be
25355 + * reprogrammed yet.
25357 * If a hang was detected in the last timer interrupt then we
25358 * leave the hang delay active in the hardware. We want the
25359 * system to make progress. That also prevents the following
25360 @@ -581,83 +643,38 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
25361 * set. So we'd effectively block all timers until the T2 event
25364 - if (cpu_base->hang_detected)
25365 + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
25368 tick_program_event(cpu_base->expires_next, 1);
25371 +/* High resolution timer related functions */
25372 +#ifdef CONFIG_HIGH_RES_TIMERS
25375 - * When a timer is enqueued and expires earlier than the already enqueued
25376 - * timers, we have to check, whether it expires earlier than the timer for
25377 - * which the clock event device was armed.
25379 - * Called with interrupts disabled and base->cpu_base.lock held
25380 + * High resolution timer enabled ?
25382 -static void hrtimer_reprogram(struct hrtimer *timer,
25383 - struct hrtimer_clock_base *base)
25385 - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25386 - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
25388 - WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
25391 - * If the timer is not on the current cpu, we cannot reprogram
25392 - * the other cpus clock event device.
25394 - if (base->cpu_base != cpu_base)
25398 - * If the hrtimer interrupt is running, then it will
25399 - * reevaluate the clock bases and reprogram the clock event
25400 - * device. The callbacks are always executed in hard interrupt
25401 - * context so we don't need an extra check for a running
25404 - if (cpu_base->in_hrtirq)
25408 - * CLOCK_REALTIME timer might be requested with an absolute
25409 - * expiry time which is less than base->offset. Set it to 0.
25414 - if (expires >= cpu_base->expires_next)
25417 - /* Update the pointer to the next expiring timer */
25418 - cpu_base->next_timer = timer;
25421 - * If a hang was detected in the last timer interrupt then we
25422 - * do not schedule a timer which is earlier than the expiry
25423 - * which we enforced in the hang detection. We want the system
25424 - * to make progress.
25426 - if (cpu_base->hang_detected)
25428 +static bool hrtimer_hres_enabled __read_mostly = true;
25429 +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
25430 +EXPORT_SYMBOL_GPL(hrtimer_resolution);
25433 - * Program the timer hardware. We enforce the expiry for
25434 - * events which are already in the past.
25436 - cpu_base->expires_next = expires;
25437 - tick_program_event(expires, 1);
25439 + * Enable / Disable high resolution mode
25441 +static int __init setup_hrtimer_hres(char *str)
25443 + return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
25446 +__setup("highres=", setup_hrtimer_hres);
25449 - * Initialize the high resolution related parts of cpu_base
25450 + * hrtimer_high_res_enabled - query, if the highres mode is enabled
25452 -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
25453 +static inline int hrtimer_is_hres_enabled(void)
25455 - base->expires_next = KTIME_MAX;
25456 - base->hang_detected = 0;
25457 - base->hres_active = 0;
25458 - base->next_timer = NULL;
25459 + return hrtimer_hres_enabled;
25463 @@ -669,7 +686,7 @@ static void retrigger_next_event(void *arg)
25465 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
25467 - if (!base->hres_active)
25468 + if (!__hrtimer_hres_active(base))
25471 raw_spin_lock(&base->lock);
25472 @@ -698,6 +715,29 @@ static void hrtimer_switch_to_hres(void)
25473 retrigger_next_event(NULL);
25476 +#ifdef CONFIG_PREEMPT_RT_FULL
25478 +static struct swork_event clock_set_delay_work;
25480 +static void run_clock_set_delay(struct swork_event *event)
25485 +void clock_was_set_delayed(void)
25487 + swork_queue(&clock_set_delay_work);
25490 +static __init int create_clock_set_delay_thread(void)
25492 + WARN_ON(swork_get());
25493 + INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
25496 +early_initcall(create_clock_set_delay_thread);
25497 +#else /* PREEMPT_RT_FULL */
25499 static void clock_was_set_work(struct work_struct *work)
25502 @@ -713,25 +753,105 @@ void clock_was_set_delayed(void)
25504 schedule_work(&hrtimer_work);
25510 -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
25511 -static inline int hrtimer_hres_active(void) { return 0; }
25512 static inline int hrtimer_is_hres_enabled(void) { return 0; }
25513 static inline void hrtimer_switch_to_hres(void) { }
25514 -static inline void
25515 -hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
25516 -static inline int hrtimer_reprogram(struct hrtimer *timer,
25517 - struct hrtimer_clock_base *base)
25521 -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
25522 static inline void retrigger_next_event(void *arg) { }
25524 #endif /* CONFIG_HIGH_RES_TIMERS */
25527 + * When a timer is enqueued and expires earlier than the already enqueued
25528 + * timers, we have to check, whether it expires earlier than the timer for
25529 + * which the clock event device was armed.
25531 + * Called with interrupts disabled and base->cpu_base.lock held
25533 +static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
25535 + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25536 + struct hrtimer_clock_base *base = timer->base;
25537 + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
25539 + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
25542 + * CLOCK_REALTIME timer might be requested with an absolute
25543 + * expiry time which is less than base->offset. Set it to 0.
25548 + if (timer->is_soft) {
25550 + * A soft hrtimer can be started on a remote CPU. In this
25551 + * case softirq_expires_next needs to be updated on the
25552 + * remote CPU. The soft hrtimer will not expire before the
25553 + * first hard hrtimer on the remote CPU -
25554 + * hrtimer_check_target() prevents this case.
25556 + struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
25558 + if (timer_cpu_base->softirq_activated)
25561 + if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
25564 + timer_cpu_base->softirq_next_timer = timer;
25565 + timer_cpu_base->softirq_expires_next = expires;
25567 + if (!ktime_before(expires, timer_cpu_base->expires_next) ||
25573 + * If the timer is not on the current cpu, we cannot reprogram
25574 + * the other cpus clock event device.
25576 + if (base->cpu_base != cpu_base)
25580 + * If the hrtimer interrupt is running, then it will
25581 + * reevaluate the clock bases and reprogram the clock event
25582 + * device. The callbacks are always executed in hard interrupt
25583 + * context so we don't need an extra check for a running
25586 + if (cpu_base->in_hrtirq)
25589 + if (expires >= cpu_base->expires_next)
25592 + /* Update the pointer to the next expiring timer */
25593 + cpu_base->next_timer = timer;
25594 + cpu_base->expires_next = expires;
25597 + * If hres is not active, hardware does not have to be
25598 + * programmed yet.
25600 + * If a hang was detected in the last timer interrupt then we
25601 + * do not schedule a timer which is earlier than the expiry
25602 + * which we enforced in the hang detection. We want the system
25603 + * to make progress.
25605 + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
25609 + * Program the timer hardware. We enforce the expiry for
25610 + * events which are already in the past.
25612 + tick_program_event(expires, 1);
25616 * Clock realtime was set
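For a newly armed soft timer, hrtimer_reprogram() now answers two questions in order: does this timer move softirq_expires_next forward at all, and if so, does it also beat the hard expiry so the hardware must be retouched? The decision distilled into a model (the reprogram flag and remote-CPU handling are elided):

  #include <stdio.h>

  static int soft_timer_needs_hw_reprogram(long long expires,
                                           long long *softirq_expires_next,
                                           long long expires_next,
                                           int softirq_activated)
  {
          if (softirq_activated)
                  return 0;               /* the softirq rearms itself */
          if (expires >= *softirq_expires_next)
                  return 0;               /* an earlier soft timer wins */
          *softirq_expires_next = expires;
          /* Touch the hardware only if we also beat the hard expiry. */
          return expires < expires_next;
  }

  int main(void)
  {
          long long soft_next = 500;

          /* New soft timer at t=200 beats the soft (500) and the hard
           * (400) expiry, so the clock event device must be rearmed. */
          printf("%d\n", soft_timer_needs_hw_reprogram(200, &soft_next, 400, 0));
          return 0;
  }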
25618 @@ -830,6 +950,33 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
25620 EXPORT_SYMBOL_GPL(hrtimer_forward);
25622 +#ifdef CONFIG_PREEMPT_RT_BASE
25623 +# define wake_up_timer_waiters(b) wake_up(&(b)->wait)
25626 + * hrtimer_wait_for_timer - Wait for a running timer
25628 + * @timer: timer to wait for
25630 + * The function waits in case the timers callback function is
25631 + * currently executed on the waitqueue of the timer base. The
25632 + * waitqueue is woken up after the timer callback function has
25633 + * finished execution.
25635 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
25637 + struct hrtimer_clock_base *base = timer->base;
25639 + if (base && base->cpu_base &&
25640 + base->index >= HRTIMER_BASE_MONOTONIC_SOFT)
25641 + wait_event(base->cpu_base->wait,
25642 + !(hrtimer_callback_running(timer)));
25646 +# define wake_up_timer_waiters(b) do { } while (0)
25650 * enqueue_hrtimer - internal function to (re)start a timer
25652 @@ -839,9 +986,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
25653 * Returns 1 when the new timer is the leftmost timer in the tree.
25655 static int enqueue_hrtimer(struct hrtimer *timer,
25656 - struct hrtimer_clock_base *base)
25657 + struct hrtimer_clock_base *base,
25658 + enum hrtimer_mode mode)
25660 - debug_activate(timer);
25661 + debug_activate(timer, mode);
25663 base->cpu_base->active_bases |= 1 << base->index;
25665 @@ -874,7 +1022,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
25666 if (!timerqueue_del(&base->active, &timer->node))
25667 cpu_base->active_bases &= ~(1 << base->index);
25669 -#ifdef CONFIG_HIGH_RES_TIMERS
25671 * Note: If reprogram is false we do not update
25672 * cpu_base->next_timer. This happens when we remove the first
25673 @@ -885,7 +1032,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
25675 if (reprogram && timer == cpu_base->next_timer)
25676 hrtimer_force_reprogram(cpu_base, 1);
25681 @@ -934,22 +1080,36 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
25686 - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
25687 - * @timer: the timer to be added
25688 - * @tim: expiry time
25689 - * @delta_ns: "slack" range for the timer
25690 - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
25691 - * relative (HRTIMER_MODE_REL)
25693 -void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25694 - u64 delta_ns, const enum hrtimer_mode mode)
25696 +hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
25698 - struct hrtimer_clock_base *base, *new_base;
25699 - unsigned long flags;
25703 - base = lock_hrtimer_base(timer, &flags);
25705 + * Find the next SOFT expiration.
25707 + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
25710 + * reprogramming needs to be triggered, even if the next soft
25711 + * hrtimer expires at the same time than the next hard
25712 + * hrtimer. cpu_base->softirq_expires_next needs to be updated!
25714 + if (expires == KTIME_MAX)
25718 + * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
25719 + * cpu_base->*expires_next is only set by hrtimer_reprogram()
25721 + hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
25724 +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25725 + u64 delta_ns, const enum hrtimer_mode mode,
25726 + struct hrtimer_clock_base *base)
25728 + struct hrtimer_clock_base *new_base;
25730 /* Remove an active timer from the queue: */
25731 remove_hrtimer(timer, base, true);
25732 @@ -964,21 +1124,37 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25733 /* Switch the timer base, if necessary: */
25734 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
25736 - leftmost = enqueue_hrtimer(timer, new_base);
25739 + return enqueue_hrtimer(timer, new_base, mode);
25743 + * hrtimer_start_range_ns - (re)start an hrtimer
25744 + * @timer: the timer to be added
25745 + * @tim: expiry time
25746 + * @delta_ns: "slack" range for the timer
25747 + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
25748 + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
25749 + * softirq based mode is considered for debug purposes only!
25751 +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25752 + u64 delta_ns, const enum hrtimer_mode mode)
25754 + struct hrtimer_clock_base *base;
25755 + unsigned long flags;
25758 + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
25761 +#ifndef CONFIG_PREEMPT_RT_BASE
25762 + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
25765 + base = lock_hrtimer_base(timer, &flags);
25767 + if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
25768 + hrtimer_reprogram(timer, true);
25770 - if (!hrtimer_is_hres_active(timer)) {
25772 - * Kick to reschedule the next tick to handle the new timer
25773 - * on dynticks target.
25775 - if (new_base->cpu_base->nohz_active)
25776 - wake_up_nohz_cpu(new_base->cpu_base->cpu);
25778 - hrtimer_reprogram(timer, new_base);
25781 unlock_hrtimer_base(timer, &flags);
25783 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
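With the mode bits deciding the execution context, driver-side usage amounts to passing a _SOFT mode at both init and start time. A hedged sketch of what that looks like in this tree (callback name and period are made up; on PREEMPT_RT_FULL plain modes default to soft anyway):

  /* Sketch: a periodic timer whose callback runs from the hrtimer
   * softirq rather than from hard interrupt context. */
  static struct hrtimer my_timer;

  static enum hrtimer_restart my_cb(struct hrtimer *t)
  {
          hrtimer_forward_now(t, ms_to_ktime(10));
          return HRTIMER_RESTART;
  }

  static void my_timer_setup(void)
  {
          hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
          my_timer.function = my_cb;
          hrtimer_start(&my_timer, ms_to_ktime(10), HRTIMER_MODE_REL_SOFT);
  }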
25784 @@ -1035,7 +1211,7 @@ int hrtimer_cancel(struct hrtimer *timer)
25789 + hrtimer_wait_for_timer(timer);
25792 EXPORT_SYMBOL_GPL(hrtimer_cancel);
25793 @@ -1076,7 +1252,7 @@ u64 hrtimer_get_next_event(void)
25794 raw_spin_lock_irqsave(&cpu_base->lock, flags);
25796 if (!__hrtimer_hres_active(cpu_base))
25797 - expires = __hrtimer_get_next_event(cpu_base);
25798 + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
25800 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25802 @@ -1099,8 +1275,16 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
25803 static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25804 enum hrtimer_mode mode)
25806 - struct hrtimer_cpu_base *cpu_base;
25809 + struct hrtimer_cpu_base *cpu_base;
25811 + softtimer = !!(mode & HRTIMER_MODE_SOFT);
25812 +#ifdef CONFIG_PREEMPT_RT_FULL
25813 + if (!softtimer && !(mode & HRTIMER_MODE_HARD))
25814 + softtimer = true;
25816 + base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
25818 memset(timer, 0, sizeof(struct hrtimer));
25820 @@ -1114,7 +1298,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25821 if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
25822 clock_id = CLOCK_MONOTONIC;
25824 - base = hrtimer_clockid_to_base(clock_id);
25825 + base += hrtimer_clockid_to_base(clock_id);
25826 + timer->is_soft = softtimer;
25827 timer->base = &cpu_base->clock_base[base];
25828 timerqueue_init(&timer->node);
25830 @@ -1123,7 +1308,13 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25831 * hrtimer_init - initialize a timer to the given clock
25832 * @timer: the timer to be initialized
25833 * @clock_id: the clock to be used
25834 - * @mode: timer mode abs/rel
25835 + * @mode: The modes which are relevant for initialization:
25836 + * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
25837 + * HRTIMER_MODE_REL_SOFT
25839 + * The PINNED variants of the above can be handed in,
25840 + * but the PINNED bit is ignored as pinning happens
25841 + * when the hrtimer is started
25843 void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25844 enum hrtimer_mode mode)
25845 @@ -1142,19 +1333,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
25847 bool hrtimer_active(const struct hrtimer *timer)
25849 - struct hrtimer_cpu_base *cpu_base;
25850 + struct hrtimer_clock_base *base;
25854 - cpu_base = READ_ONCE(timer->base->cpu_base);
25855 - seq = raw_read_seqcount_begin(&cpu_base->seq);
25856 + base = READ_ONCE(timer->base);
25857 + seq = raw_read_seqcount_begin(&base->seq);
25859 if (timer->state != HRTIMER_STATE_INACTIVE ||
25860 - cpu_base->running == timer)
25861 + base->running == timer)
25864 - } while (read_seqcount_retry(&cpu_base->seq, seq) ||
25865 - cpu_base != READ_ONCE(timer->base->cpu_base));
25866 + } while (read_seqcount_retry(&base->seq, seq) ||
25867 + base != READ_ONCE(timer->base));
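hrtimer_active() keeps its lockless read side: sample the (now per clock base) sequence count, read state and running, and retry if a writer was active or the timer migrated in between. The retry skeleton in userspace form (memory-ordering details and the migration re-check are simplified):

  #include <stdatomic.h>
  #include <stdio.h>

  struct base_model {
          _Atomic unsigned int seq;       /* odd while the expiry code writes */
          _Atomic int inactive;           /* timer->state == INACTIVE */
          _Atomic int running;            /* base->running == timer */
  };

  static int timer_active(struct base_model *b)
  {
          unsigned int seq;
          int active;

          do {
                  seq = atomic_load(&b->seq);
                  active = !atomic_load(&b->inactive) ||
                           atomic_load(&b->running);
          } while ((seq & 1) || atomic_load(&b->seq) != seq);

          return active;
  }

  int main(void)
  {
          struct base_model b = { 0, 1, 0 };      /* quiescent, not queued */

          printf("active=%d\n", timer_active(&b)); /* 0 */
          return 0;
  }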
25871 @@ -1180,7 +1371,8 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
25873 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25874 struct hrtimer_clock_base *base,
25875 - struct hrtimer *timer, ktime_t *now)
25876 + struct hrtimer *timer, ktime_t *now,
25877 + unsigned long flags)
25879 enum hrtimer_restart (*fn)(struct hrtimer *);
25881 @@ -1188,16 +1380,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25882 lockdep_assert_held(&cpu_base->lock);
25884 debug_deactivate(timer);
25885 - cpu_base->running = timer;
25886 + base->running = timer;
25889 * Separate the ->running assignment from the ->state assignment.
25891 * As with a regular write barrier, this ensures the read side in
25892 - * hrtimer_active() cannot observe cpu_base->running == NULL &&
25893 + * hrtimer_active() cannot observe base->running == NULL &&
25894 * timer->state == INACTIVE.
25896 - raw_write_seqcount_barrier(&cpu_base->seq);
25897 + raw_write_seqcount_barrier(&base->seq);
25899 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
25900 fn = timer->function;
25901 @@ -1211,15 +1403,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25902 timer->is_rel = false;
25905 - * Because we run timers from hardirq context, there is no chance
25906 - * they get migrated to another cpu, therefore its safe to unlock
25907 - * the timer base.
25908 + * The timer is marked as running in the cpu base, so it is
25909 + * protected against migration to a different CPU even if the lock
25912 - raw_spin_unlock(&cpu_base->lock);
25913 + raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25914 trace_hrtimer_expire_entry(timer, now);
25915 restart = fn(timer);
25916 trace_hrtimer_expire_exit(timer);
25917 - raw_spin_lock(&cpu_base->lock);
25918 + raw_spin_lock_irq(&cpu_base->lock);
25921 * Note: We clear the running state after enqueue_hrtimer and
25922 @@ -1232,33 +1424,31 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25924 if (restart != HRTIMER_NORESTART &&
25925 !(timer->state & HRTIMER_STATE_ENQUEUED))
25926 - enqueue_hrtimer(timer, base);
25927 + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
25930 * Separate the ->running assignment from the ->state assignment.
25932 * As with a regular write barrier, this ensures the read side in
25933 - * hrtimer_active() cannot observe cpu_base->running == NULL &&
25934 + * hrtimer_active() cannot observe base->running == NULL &&
25935 * timer->state == INACTIVE.
25937 - raw_write_seqcount_barrier(&cpu_base->seq);
25938 + raw_write_seqcount_barrier(&base->seq);
25940 - WARN_ON_ONCE(cpu_base->running != timer);
25941 - cpu_base->running = NULL;
25942 + WARN_ON_ONCE(base->running != timer);
25943 + base->running = NULL;
25946 -static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25947 +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
25948 + unsigned long flags, unsigned int active_mask)
25950 - struct hrtimer_clock_base *base = cpu_base->clock_base;
25951 - unsigned int active = cpu_base->active_bases;
25952 + struct hrtimer_clock_base *base;
25953 + unsigned int active = cpu_base->active_bases & active_mask;
25955 - for (; active; base++, active >>= 1) {
25956 + for_each_active_base(base, cpu_base, active) {
25957 struct timerqueue_node *node;
25960 - if (!(active & 0x01))
25963 basenow = ktime_add(now, base->offset);
25965 while ((node = timerqueue_getnext(&base->active))) {
25966 @@ -1281,11 +1471,29 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25967 if (basenow < hrtimer_get_softexpires_tv64(timer))
25970 - __run_hrtimer(cpu_base, base, timer, &basenow);
25971 + __run_hrtimer(cpu_base, base, timer, &basenow, flags);
25976 +static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
25978 + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25979 + unsigned long flags;
25982 + raw_spin_lock_irqsave(&cpu_base->lock, flags);
25984 + now = hrtimer_update_base(cpu_base);
25985 + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
25987 + cpu_base->softirq_activated = 0;
25988 + hrtimer_update_softirq_timer(cpu_base, true);
25990 + raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25991 + wake_up_timer_waiters(cpu_base);
25994 #ifdef CONFIG_HIGH_RES_TIMERS
25997 @@ -1296,13 +1504,14 @@ void hrtimer_interrupt(struct clock_event_device *dev)
25999 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
26000 ktime_t expires_next, now, entry_time, delta;
26001 + unsigned long flags;
26004 BUG_ON(!cpu_base->hres_active);
26005 cpu_base->nr_events++;
26006 dev->next_event = KTIME_MAX;
26008 - raw_spin_lock(&cpu_base->lock);
26009 + raw_spin_lock_irqsave(&cpu_base->lock, flags);
26010 entry_time = now = hrtimer_update_base(cpu_base);
26012 cpu_base->in_hrtirq = 1;
26013 @@ -1315,17 +1524,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
26015 cpu_base->expires_next = KTIME_MAX;
26017 - __hrtimer_run_queues(cpu_base, now);
26018 + if (!ktime_before(now, cpu_base->softirq_expires_next)) {
26019 + cpu_base->softirq_expires_next = KTIME_MAX;
26020 + cpu_base->softirq_activated = 1;
26021 + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
26024 + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
26026 /* Reevaluate the clock bases for the next expiry */
26027 - expires_next = __hrtimer_get_next_event(cpu_base);
26028 + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
26030 * Store the new expiry value so the migration code can verify
26033 cpu_base->expires_next = expires_next;
26034 cpu_base->in_hrtirq = 0;
26035 - raw_spin_unlock(&cpu_base->lock);
26036 + raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
26038 /* Reprogramming necessary ? */
26039 if (!tick_program_event(expires_next, 0)) {
26040 @@ -1346,7 +1561,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
26041 * Acquire base lock for updating the offsets and retrieving
26042 * the current time.
26044 - raw_spin_lock(&cpu_base->lock);
26045 + raw_spin_lock_irqsave(&cpu_base->lock, flags);
26046 now = hrtimer_update_base(cpu_base);
26047 cpu_base->nr_retries++;
26049 @@ -1359,7 +1574,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
26051 cpu_base->nr_hangs++;
26052 cpu_base->hang_detected = 1;
26053 - raw_spin_unlock(&cpu_base->lock);
26054 + raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
26056 delta = ktime_sub(now, entry_time);
26057 if ((unsigned int)delta > cpu_base->max_hang_time)
26058 cpu_base->max_hang_time = (unsigned int) delta;
26059 @@ -1401,6 +1617,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
26060 void hrtimer_run_queues(void)
26062 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
26063 + unsigned long flags;
26066 if (__hrtimer_hres_active(cpu_base))
26067 @@ -1418,10 +1635,17 @@ void hrtimer_run_queues(void)
26071 - raw_spin_lock(&cpu_base->lock);
26072 + raw_spin_lock_irqsave(&cpu_base->lock, flags);
26073 now = hrtimer_update_base(cpu_base);
26074 - __hrtimer_run_queues(cpu_base, now);
26075 - raw_spin_unlock(&cpu_base->lock);
26077 + if (!ktime_before(now, cpu_base->softirq_expires_next)) {
26078 + cpu_base->softirq_expires_next = KTIME_MAX;
26079 + cpu_base->softirq_activated = 1;
26080 + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
26083 + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
26084 + raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
26088 @@ -1440,13 +1664,65 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
26089 return HRTIMER_NORESTART;
26092 -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
26093 +#ifdef CONFIG_PREEMPT_RT_FULL
26094 +static bool task_is_realtime(struct task_struct *tsk)
26096 + int policy = tsk->policy;
26098 + if (policy == SCHED_FIFO || policy == SCHED_RR)
26100 + if (policy == SCHED_DEADLINE)
26106 +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
26107 + clockid_t clock_id,
26108 + enum hrtimer_mode mode,
26109 + struct task_struct *task)
26111 +#ifdef CONFIG_PREEMPT_RT_FULL
26112 + if (!(mode & (HRTIMER_MODE_SOFT | HRTIMER_MODE_HARD))) {
26113 + if (task_is_realtime(current) || system_state != SYSTEM_RUNNING)
26114 + mode |= HRTIMER_MODE_HARD;
26116 + mode |= HRTIMER_MODE_SOFT;
26119 + __hrtimer_init(&sl->timer, clock_id, mode);
26120 sl->timer.function = hrtimer_wakeup;
26125 + * hrtimer_init_sleeper - initialize sleeper to the given clock
26126 + * @sl: sleeper to be initialized
26127 + * @clock_id: the clock to be used
26128 + * @mode: timer mode abs/rel
26129 + * @task: the task to wake up
26131 +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
26132 + enum hrtimer_mode mode, struct task_struct *task)
26134 + debug_init(&sl->timer, clock_id, mode);
26135 + __hrtimer_init_sleeper(sl, clock_id, mode, task);
26138 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
26140 +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
26141 +void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
26142 + clockid_t clock_id,
26143 + enum hrtimer_mode mode,
26144 + struct task_struct *task)
26146 + debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
26147 + __hrtimer_init_sleeper(sl, clock_id, mode, task);
26149 +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
26152 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
26154 switch(restart->nanosleep.type) {
26155 @@ -1470,8 +1746,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
26157 struct restart_block *restart;
26159 - hrtimer_init_sleeper(t, current);
26162 set_current_state(TASK_INTERRUPTIBLE);
26163 hrtimer_start_expires(&t->timer, mode);
26164 @@ -1508,10 +1782,9 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
26165 struct hrtimer_sleeper t;
26168 - hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
26169 - HRTIMER_MODE_ABS);
26170 + hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
26171 + HRTIMER_MODE_ABS, current);
26172 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
26174 ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
26175 destroy_hrtimer_on_stack(&t.timer);
26177 @@ -1529,7 +1802,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,
26178 if (dl_task(current) || rt_task(current))
26181 - hrtimer_init_on_stack(&t.timer, clockid, mode);
26182 + hrtimer_init_sleeper_on_stack(&t, clockid, mode, current);
26183 hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
26184 ret = do_nanosleep(&t, mode);
26185 if (ret != -ERESTART_RESTARTBLOCK)
26186 @@ -1585,6 +1858,27 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
26190 +#ifdef CONFIG_PREEMPT_RT_FULL
26192 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
26194 +void cpu_chill(void)
26196 + ktime_t chill_time;
26197 + unsigned int freeze_flag = current->flags & PF_NOFREEZE;
26199 + chill_time = ktime_set(0, NSEC_PER_MSEC);
26200 + set_current_state(TASK_UNINTERRUPTIBLE);
26201 + current->flags |= PF_NOFREEZE;
26202 + sleeping_lock_inc();
26203 + schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD);
26204 + sleeping_lock_dec();
26205 + if (!freeze_flag)
26206 + current->flags &= ~PF_NOFREEZE;
26208 +EXPORT_SYMBOL(cpu_chill);
26212 * Functions related to boot-time initialization:
26214 @@ -1598,9 +1892,17 @@ int hrtimers_prepare_cpu(unsigned int cpu)
26215 timerqueue_init_head(&cpu_base->clock_base[i].active);
26218 - cpu_base->active_bases = 0;
26219 cpu_base->cpu = cpu;
26220 - hrtimer_init_hres(cpu_base);
26221 + cpu_base->active_bases = 0;
26222 + cpu_base->hres_active = 0;
26223 + cpu_base->hang_detected = 0;
26224 + cpu_base->next_timer = NULL;
26225 + cpu_base->softirq_next_timer = NULL;
26226 + cpu_base->expires_next = KTIME_MAX;
26227 + cpu_base->softirq_expires_next = KTIME_MAX;
26228 +#ifdef CONFIG_PREEMPT_RT_BASE
26229 + init_waitqueue_head(&cpu_base->wait);
26234 @@ -1632,7 +1934,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
26235 * sort out already expired timers and reprogram the
26238 - enqueue_hrtimer(timer, new_base);
26239 + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
26243 @@ -1644,6 +1946,12 @@ int hrtimers_dead_cpu(unsigned int scpu)
26244 BUG_ON(cpu_online(scpu));
26245 tick_cancel_sched_timer(scpu);
26248 + * This BH disable ensures that raise_softirq_irqoff() does
26249 + * not wake up ksoftirqd (and acquire the pi-lock) while
26250 + * holding the cpu_base lock
26252 + local_bh_disable();
26253 local_irq_disable();
26254 old_base = &per_cpu(hrtimer_bases, scpu);
26255 new_base = this_cpu_ptr(&hrtimer_bases);
26256 @@ -1659,12 +1967,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
26257 &new_base->clock_base[i]);
26261 + * The migration might have changed the first expiring softirq
26262 + * timer on this CPU. Update it.
26264 + hrtimer_update_softirq_timer(new_base, false);
26266 raw_spin_unlock(&old_base->lock);
26267 raw_spin_unlock(&new_base->lock);
26269 /* Check, if we got expired work to do */
26270 __hrtimer_peek_ahead_timers();
26271 local_irq_enable();
26272 + local_bh_enable();
26276 @@ -1673,18 +1988,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
26277 void __init hrtimers_init(void)
26279 hrtimers_prepare_cpu(smp_processor_id());
26280 + open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
26284 * schedule_hrtimeout_range_clock - sleep until timeout
26285 * @expires: timeout value (ktime_t)
26286 * @delta: slack in expires timeout (ktime_t)
26287 - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
26288 - * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
26289 + * @mode: timer mode
26290 + * @clock_id: timer clock to be used
26293 schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
26294 - const enum hrtimer_mode mode, int clock)
26295 + const enum hrtimer_mode mode, clockid_t clock_id)
26297 struct hrtimer_sleeper t;
26299 @@ -1705,11 +2021,9 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
26303 - hrtimer_init_on_stack(&t.timer, clock, mode);
26304 + hrtimer_init_sleeper_on_stack(&t, clock_id, mode, current);
26305 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
26307 - hrtimer_init_sleeper(&t, current);
26309 hrtimer_start_expires(&t.timer, mode);
26311 if (likely(t.task))
26312 @@ -1727,7 +2041,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
26313 * schedule_hrtimeout_range - sleep until timeout
26314 * @expires: timeout value (ktime_t)
26315 * @delta: slack in expires timeout (ktime_t)
26316 - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
26317 + * @mode: timer mode
26319 * Make the current task sleep until the given expiry time has
26320 * elapsed. The routine will return immediately unless
26321 @@ -1766,7 +2080,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
26323 * schedule_hrtimeout - sleep until timeout
26324 * @expires: timeout value (ktime_t)
26325 - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
26326 + * @mode: timer mode
26328 * Make the current task sleep until the given expiry time has
26329 * elapsed. The routine will return immediately unless
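
The hrtimer changes above fold sleeper setup and wakeup wiring into one call that also carries the clock and mode, so the expiry context (soft vs. hard on RT) is decided at init time. A minimal sketch of the resulting caller pattern, modeled on the patched schedule_hrtimeout_range_clock(); 'timeout' is an assumed local ktime_t:

	struct hrtimer_sleeper t;

	/* One call now initializes the timer, selects soft/hard expiry
	 * on RT, and wires up hrtimer_wakeup for 'current'. */
	hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC,
				      HRTIMER_MODE_REL, current);
	hrtimer_set_expires(&t.timer, timeout);

	set_current_state(TASK_INTERRUPTIBLE);
	hrtimer_start_expires(&t.timer, HRTIMER_MODE_REL);
	if (likely(t.task))
		schedule();
	hrtimer_cancel(&t.timer);
	__set_current_state(TASK_RUNNING);
	destroy_hrtimer_on_stack(&t.timer);
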
26330 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
26331 index f26acef5d7b4..760f38528365 100644
26332 --- a/kernel/time/itimer.c
26333 +++ b/kernel/time/itimer.c
26334 @@ -214,6 +214,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
26335 /* We are sharing ->siglock with it_real_fn() */
26336 if (hrtimer_try_to_cancel(timer) < 0) {
26337 spin_unlock_irq(&tsk->sighand->siglock);
26338 + hrtimer_wait_for_timer(&tsk->signal->real_timer);
26341 expires = timeval_to_ktime(value->it_value);
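
The itimer hunk is the standard RT cancel-retry shape: if the callback is currently running, hrtimer_try_to_cancel() fails, so the code drops the lock, sleeps until the callback finishes, and retries instead of spinning. Sketched here assuming the surrounding do_setitimer() retry label:

again:
	spin_lock_irq(&tsk->sighand->siglock);
	if (hrtimer_try_to_cancel(&tsk->signal->real_timer) < 0) {
		spin_unlock_irq(&tsk->sighand->siglock);
		/* RT: sleep until the running callback completes */
		hrtimer_wait_for_timer(&tsk->signal->real_timer);
		goto again;
	}
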
26342 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
26343 index 497719127bf9..62acb8914c9e 100644
26344 --- a/kernel/time/jiffies.c
26345 +++ b/kernel/time/jiffies.c
26346 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
26350 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
26351 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
26352 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
26354 #if (BITS_PER_LONG < 64)
26355 u64 get_jiffies_64(void)
26356 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
26360 - seq = read_seqbegin(&jiffies_lock);
26361 + seq = read_seqcount_begin(&jiffies_seq);
26363 - } while (read_seqretry(&jiffies_lock, seq));
26364 + } while (read_seqcount_retry(&jiffies_seq, seq));
26367 EXPORT_SYMBOL(get_jiffies_64);
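
The seqlock is split into a raw spinlock (writer serialization that stays valid with interrupts off on RT) and a bare seqcount (reader retry). Every converted site below follows the same two shapes, restated here as a sketch:

	/* Reader: lockless retry loop against jiffies_seq */
	unsigned int seq;
	u64 now;

	do {
		seq = read_seqcount_begin(&jiffies_seq);
		now = jiffies_64;
	} while (read_seqcount_retry(&jiffies_seq, seq));

	/* Writer: raw spinlock around an explicit seqcount section */
	raw_spin_lock(&jiffies_lock);
	write_seqcount_begin(&jiffies_seq);
	jiffies_64 += ticks;	/* 'ticks' stands in for the real update */
	write_seqcount_end(&jiffies_seq);
	raw_spin_unlock(&jiffies_lock);
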
26368 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
26369 index 2da660d53a4b..c7b7d047d12e 100644
26370 --- a/kernel/time/posix-cpu-timers.c
26371 +++ b/kernel/time/posix-cpu-timers.c
26373 * Implement CPU time clocks for the POSIX clock interface.
26376 +#include <uapi/linux/sched/types.h>
26377 #include <linux/sched/signal.h>
26378 #include <linux/sched/cputime.h>
26379 +#include <linux/sched/rt.h>
26380 #include <linux/posix-timers.h>
26381 #include <linux/errno.h>
26382 #include <linux/math64.h>
26384 #include <linux/tick.h>
26385 #include <linux/workqueue.h>
26386 #include <linux/compat.h>
26387 +#include <linux/smpboot.h>
26389 #include "posix-timers.h"
26391 @@ -603,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
26393 * Disarm any old timer after extracting its expiry time.
26395 - WARN_ON_ONCE(!irqs_disabled());
26396 + WARN_ON_ONCE_NONRT(!irqs_disabled());
26399 old_incr = timer->it.cpu.incr;
26400 @@ -1034,7 +1037,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
26402 * Now re-arm for the new expiry time.
26404 - WARN_ON_ONCE(!irqs_disabled());
26405 + WARN_ON_ONCE_NONRT(!irqs_disabled());
26408 unlock_task_sighand(p, &flags);
26409 @@ -1119,13 +1122,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
26410 * already updated our counts. We need to check if any timers fire now.
26411 * Interrupts are disabled.
26413 -void run_posix_cpu_timers(struct task_struct *tsk)
26414 +static void __run_posix_cpu_timers(struct task_struct *tsk)
26417 struct k_itimer *timer, *next;
26418 unsigned long flags;
26420 - WARN_ON_ONCE(!irqs_disabled());
26421 + WARN_ON_ONCE_NONRT(!irqs_disabled());
26424 * The fast path checks that there are no expired thread or thread
26425 @@ -1179,6 +1182,152 @@ void run_posix_cpu_timers(struct task_struct *tsk)
26429 +#ifdef CONFIG_PREEMPT_RT_BASE
26430 +#include <linux/kthread.h>
26431 +#include <linux/cpu.h>
26432 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
26433 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
26434 +DEFINE_PER_CPU(bool, posix_timer_th_active);
26436 +static void posix_cpu_kthread_fn(unsigned int cpu)
26438 + struct task_struct *tsk = NULL;
26439 + struct task_struct *next = NULL;
26441 + BUG_ON(per_cpu(posix_timer_task, cpu) != current);
26443 + /* grab task list */
26444 + raw_local_irq_disable();
26445 + tsk = per_cpu(posix_timer_tasklist, cpu);
26446 + per_cpu(posix_timer_tasklist, cpu) = NULL;
26447 + raw_local_irq_enable();
26449 + /* it's possible the list is empty, just return */
26453 + /* Process task list */
26456 + next = tsk->posix_timer_list;
26458 + /* run the task timers, clear its ptr and
26461 + __run_posix_cpu_timers(tsk);
26462 + tsk->posix_timer_list = NULL;
26463 + put_task_struct(tsk);
26465 + /* check if this is the last on the list */
26472 +static inline int __fastpath_timer_check(struct task_struct *tsk)
26474 + /* tsk == current, ensure it is safe to use ->signal/sighand */
26475 + if (unlikely(tsk->exit_state))
26478 + if (!task_cputime_zero(&tsk->cputime_expires))
26481 + if (!task_cputime_zero(&tsk->signal->cputime_expires))
26487 +void run_posix_cpu_timers(struct task_struct *tsk)
26489 + unsigned int cpu = smp_processor_id();
26490 + struct task_struct *tasklist;
26492 + BUG_ON(!irqs_disabled());
26494 + if (per_cpu(posix_timer_th_active, cpu) != true)
26497 + /* get per-cpu references */
26498 + tasklist = per_cpu(posix_timer_tasklist, cpu);
26500 + /* check to see if we're already queued */
26501 + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
26502 + get_task_struct(tsk);
26504 + tsk->posix_timer_list = tasklist;
26507 + * The list is terminated by a self-pointing
26510 + tsk->posix_timer_list = tsk;
26512 + per_cpu(posix_timer_tasklist, cpu) = tsk;
26514 + wake_up_process(per_cpu(posix_timer_task, cpu));
26518 +static int posix_cpu_kthread_should_run(unsigned int cpu)
26520 + return __this_cpu_read(posix_timer_tasklist) != NULL;
26523 +static void posix_cpu_kthread_park(unsigned int cpu)
26525 + this_cpu_write(posix_timer_th_active, false);
26528 +static void posix_cpu_kthread_unpark(unsigned int cpu)
26530 + this_cpu_write(posix_timer_th_active, true);
26533 +static void posix_cpu_kthread_setup(unsigned int cpu)
26535 + struct sched_param sp;
26537 + sp.sched_priority = MAX_RT_PRIO - 1;
26538 + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
26539 + posix_cpu_kthread_unpark(cpu);
26542 +static struct smp_hotplug_thread posix_cpu_thread = {
26543 + .store = &posix_timer_task,
26544 + .thread_should_run = posix_cpu_kthread_should_run,
26545 + .thread_fn = posix_cpu_kthread_fn,
26546 + .thread_comm = "posixcputmr/%u",
26547 + .setup = posix_cpu_kthread_setup,
26548 + .park = posix_cpu_kthread_park,
26549 + .unpark = posix_cpu_kthread_unpark,
26552 +static int __init posix_cpu_thread_init(void)
26554 + /* Start one for boot CPU. */
26555 + unsigned long cpu;
26558 + /* init the per-cpu posix_timer_tasklets */
26559 + for_each_possible_cpu(cpu)
26560 + per_cpu(posix_timer_tasklist, cpu) = NULL;
26562 + ret = smpboot_register_percpu_thread(&posix_cpu_thread);
26567 +early_initcall(posix_cpu_thread_init);
26568 +#else /* CONFIG_PREEMPT_RT_BASE */
26569 +void run_posix_cpu_timers(struct task_struct *tsk)
26571 + __run_posix_cpu_timers(tsk);
26573 +#endif /* CONFIG_PREEMPT_RT_BASE */
26576 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
26577 * The tsk->sighand->siglock must be held by the caller.
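
One subtlety in the RT path above: the per-CPU task list is terminated by a self-pointing entry rather than NULL, so an unqueued task (NULL ->posix_timer_list) stays distinguishable from the last queued task. Both ends of that convention, sketched with the names used in the hunk:

	/* Enqueue (irqs off): point at the old head, or at self if empty */
	if (tasklist)
		tsk->posix_timer_list = tasklist;
	else
		tsk->posix_timer_list = tsk;
	per_cpu(posix_timer_tasklist, cpu) = tsk;

	/* Drain (kthread side): a self-pointer marks the last entry */
	while (tsk) {
		struct task_struct *next = tsk->posix_timer_list;
		bool last = (next == tsk);

		__run_posix_cpu_timers(tsk);
		tsk->posix_timer_list = NULL;
		put_task_struct(tsk);
		tsk = last ? NULL : next;
	}
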
26578 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
26579 index 55d45fe2cc17..5a59538f3d16 100644
26580 --- a/kernel/time/posix-timers.c
26581 +++ b/kernel/time/posix-timers.c
26582 @@ -443,6 +443,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
26583 static struct pid *good_sigevent(sigevent_t * event)
26585 struct task_struct *rtn = current->group_leader;
26586 + int sig = event->sigev_signo;
26588 switch (event->sigev_notify) {
26589 case SIGEV_SIGNAL | SIGEV_THREAD_ID:
26590 @@ -452,7 +453,8 @@ static struct pid *good_sigevent(sigevent_t * event)
26594 - if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
26595 + if (sig <= 0 || sig > SIGRTMAX ||
26596 + sig_kernel_only(sig) || sig_kernel_coredump(sig))
26600 @@ -478,7 +480,7 @@ static struct k_itimer * alloc_posix_timer(void)
26602 static void k_itimer_rcu_free(struct rcu_head *head)
26604 - struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
26605 + struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
26607 kmem_cache_free(posix_timers_cache, tmr);
26609 @@ -495,7 +497,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
26611 put_pid(tmr->it_pid);
26612 sigqueue_free(tmr->sigq);
26613 - call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
26614 + call_rcu(&tmr->rcu, k_itimer_rcu_free);
26617 static int common_timer_create(struct k_itimer *new_timer)
26618 @@ -834,6 +836,22 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
26619 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
26623 + * Protected by RCU!
26625 +static void timer_wait_for_callback(const struct k_clock *kc, struct k_itimer *timr)
26627 +#ifdef CONFIG_PREEMPT_RT_FULL
26628 + if (kc->timer_arm == common_hrtimer_arm)
26629 + hrtimer_wait_for_timer(&timr->it.real.timer);
26630 + else if (kc == &alarm_clock)
26631 + hrtimer_wait_for_timer(&timr->it.alarm.alarmtimer.timer);
26633 + /* FIXME: Whacky hack for posix-cpu-timers */
26634 + schedule_timeout(1);
26638 static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
26640 return hrtimer_try_to_cancel(&timr->it.real.timer);
26641 @@ -898,6 +916,7 @@ static int do_timer_settime(timer_t timer_id, int flags,
26647 if (WARN_ON_ONCE(!kc || !kc->timer_set))
26649 @@ -906,9 +925,12 @@ static int do_timer_settime(timer_t timer_id, int flags,
26651 unlock_timer(timr, flag);
26652 if (error == TIMER_RETRY) {
26653 + timer_wait_for_callback(kc, timr);
26654 old_spec64 = NULL; // We already got the old time...
26655 + rcu_read_unlock();
26658 + rcu_read_unlock();
26662 @@ -990,10 +1012,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
26667 if (timer_delete_hook(timer) == TIMER_RETRY) {
26668 unlock_timer(timer, flags);
26669 + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
26671 + rcu_read_unlock();
26674 + rcu_read_unlock();
26676 spin_lock(&current->sighand->siglock);
26677 list_del(&timer->list);
26678 @@ -1019,8 +1046,18 @@ static void itimer_delete(struct k_itimer *timer)
26680 spin_lock_irqsave(&timer->it_lock, flags);
26682 + /* On RT we can race with a deletion */
26683 + if (!timer->it_signal) {
26684 + unlock_timer(timer, flags);
26688 if (timer_delete_hook(timer) == TIMER_RETRY) {
26690 unlock_timer(timer, flags);
26691 + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
26693 + rcu_read_unlock();
26696 list_del(&timer->list);
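
The settime and delete hunks share one shape: take rcu_read_lock() before looking the timer up, and on TIMER_RETRY drop the timer lock, wait out the running callback (which RCU keeps safe to reference), drop RCU, and retry. Roughly, with locals as in do_timer_settime():

retry:
	rcu_read_lock();
	timr = lock_timer(timer_id, &flag);
	if (!timr) {
		rcu_read_unlock();
		return -EINVAL;
	}

	error = kc->timer_set(timr, flags, &new_spec64, old_spec64);
	if (error == TIMER_RETRY) {
		unlock_timer(timr, flag);
		/* RCU keeps timr valid while we wait on its hrtimer */
		timer_wait_for_callback(kc, timr);
		rcu_read_unlock();
		old_spec64 = NULL;	/* old time was already captured */
		goto retry;
	}
	rcu_read_unlock();
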
26697 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
26698 index 58045eb976c3..f0a34afbc252 100644
26699 --- a/kernel/time/tick-broadcast-hrtimer.c
26700 +++ b/kernel/time/tick-broadcast-hrtimer.c
26701 @@ -106,7 +106,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
26703 void tick_setup_hrtimer_broadcast(void)
26705 - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26706 + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26707 bctimer.function = bc_handler;
26708 clockevents_register_device(&ce_broadcast_hrtimer);
26710 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
26711 index 49edc1c4f3e6..7a87a4488a5e 100644
26712 --- a/kernel/time/tick-common.c
26713 +++ b/kernel/time/tick-common.c
26714 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
26715 static void tick_periodic(int cpu)
26717 if (tick_do_timer_cpu == cpu) {
26718 - write_seqlock(&jiffies_lock);
26719 + raw_spin_lock(&jiffies_lock);
26720 + write_seqcount_begin(&jiffies_seq);
26722 /* Keep track of the next tick event */
26723 tick_next_period = ktime_add(tick_next_period, tick_period);
26726 - write_sequnlock(&jiffies_lock);
26727 + write_seqcount_end(&jiffies_seq);
26728 + raw_spin_unlock(&jiffies_lock);
26729 update_wall_time();
26732 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
26736 - seq = read_seqbegin(&jiffies_lock);
26737 + seq = read_seqcount_begin(&jiffies_seq);
26738 next = tick_next_period;
26739 - } while (read_seqretry(&jiffies_lock, seq));
26740 + } while (read_seqcount_retry(&jiffies_seq, seq));
26742 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
26744 @@ -490,6 +492,7 @@ void tick_freeze(void)
26745 if (tick_freeze_depth == num_online_cpus()) {
26746 trace_suspend_resume(TPS("timekeeping_freeze"),
26747 smp_processor_id(), true);
26748 + system_state = SYSTEM_SUSPEND;
26749 timekeeping_suspend();
26751 tick_suspend_local();
26752 @@ -513,6 +516,7 @@ void tick_unfreeze(void)
26754 if (tick_freeze_depth == num_online_cpus()) {
26755 timekeeping_resume();
26756 + system_state = SYSTEM_RUNNING;
26757 trace_suspend_resume(TPS("timekeeping_freeze"),
26758 smp_processor_id(), false);
26760 diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
26761 index f8e1845aa464..e277284c2831 100644
26762 --- a/kernel/time/tick-internal.h
26763 +++ b/kernel/time/tick-internal.h
26764 @@ -150,16 +150,15 @@ static inline void tick_nohz_init(void) { }
26766 #ifdef CONFIG_NO_HZ_COMMON
26767 extern unsigned long tick_nohz_active;
26769 +extern void timers_update_nohz(void);
26770 +# ifdef CONFIG_SMP
26771 +extern struct static_key_false timers_migration_enabled;
26773 +#else /* CONFIG_NO_HZ_COMMON */
26774 +static inline void timers_update_nohz(void) { }
26775 #define tick_nohz_active (0)
26778 -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26779 -extern void timers_update_migration(bool update_nohz);
26781 -static inline void timers_update_migration(bool update_nohz) { }
26784 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26786 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
26787 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
26788 index a8fa0a896b78..643b36a0b8e1 100644
26789 --- a/kernel/time/tick-sched.c
26790 +++ b/kernel/time/tick-sched.c
26791 @@ -66,7 +66,8 @@ static void tick_do_update_jiffies64(ktime_t now)
26794 /* Reevaluate with jiffies_lock held */
26795 - write_seqlock(&jiffies_lock);
26796 + raw_spin_lock(&jiffies_lock);
26797 + write_seqcount_begin(&jiffies_seq);
26799 delta = ktime_sub(now, last_jiffies_update);
26800 if (delta >= tick_period) {
26801 @@ -89,10 +90,12 @@ static void tick_do_update_jiffies64(ktime_t now)
26802 /* Keep the tick_next_period variable up to date */
26803 tick_next_period = ktime_add(last_jiffies_update, tick_period);
26805 - write_sequnlock(&jiffies_lock);
26806 + write_seqcount_end(&jiffies_seq);
26807 + raw_spin_unlock(&jiffies_lock);
26810 - write_sequnlock(&jiffies_lock);
26811 + write_seqcount_end(&jiffies_seq);
26812 + raw_spin_unlock(&jiffies_lock);
26813 update_wall_time();
26816 @@ -103,12 +106,14 @@ static ktime_t tick_init_jiffy_update(void)
26820 - write_seqlock(&jiffies_lock);
26821 + raw_spin_lock(&jiffies_lock);
26822 + write_seqcount_begin(&jiffies_seq);
26823 /* Did we start the jiffies update yet ? */
26824 if (last_jiffies_update == 0)
26825 last_jiffies_update = tick_next_period;
26826 period = last_jiffies_update;
26827 - write_sequnlock(&jiffies_lock);
26828 + write_seqcount_end(&jiffies_seq);
26829 + raw_spin_unlock(&jiffies_lock);
26833 @@ -225,6 +230,7 @@ static void nohz_full_kick_func(struct irq_work *work)
26835 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
26836 .func = nohz_full_kick_func,
26837 + .flags = IRQ_WORK_HARD_IRQ,
26841 @@ -689,10 +695,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
26843 /* Read jiffies and the time when jiffies were updated last */
26845 - seq = read_seqbegin(&jiffies_lock);
26846 + seq = read_seqcount_begin(&jiffies_seq);
26847 basemono = last_jiffies_update;
26848 basejiff = jiffies;
26849 - } while (read_seqretry(&jiffies_lock, seq));
26850 + } while (read_seqcount_retry(&jiffies_seq, seq));
26851 ts->last_jiffies = basejiff;
26854 @@ -906,14 +912,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
26857 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
26858 - static int ratelimit;
26860 - if (ratelimit < 10 && !in_softirq() &&
26861 - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
26862 - pr_warn("NOHZ: local_softirq_pending %02x\n",
26863 - (unsigned int) local_softirq_pending());
26866 + softirq_check_pending_idle();
26870 @@ -1132,7 +1131,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
26871 ts->nohz_mode = mode;
26872 /* One update is enough */
26873 if (!test_and_set_bit(0, &tick_nohz_active))
26874 - timers_update_migration(true);
26875 + timers_update_nohz();
26879 @@ -1250,7 +1249,7 @@ void tick_setup_sched_timer(void)
26881 * Emulate tick processing via per-CPU hrtimers:
26883 - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26884 + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26885 ts->sched_timer.function = tick_sched_timer;
26887 /* Get the next period (per-CPU) */
26888 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
26889 index 2cafb49aa65e..2720f2c29a6d 100644
26890 --- a/kernel/time/timekeeping.c
26891 +++ b/kernel/time/timekeeping.c
26892 @@ -2326,8 +2326,10 @@ EXPORT_SYMBOL(hardpps);
26894 void xtime_update(unsigned long ticks)
26896 - write_seqlock(&jiffies_lock);
26897 + raw_spin_lock(&jiffies_lock);
26898 + write_seqcount_begin(&jiffies_seq);
26900 - write_sequnlock(&jiffies_lock);
26901 + write_seqcount_end(&jiffies_seq);
26902 + raw_spin_unlock(&jiffies_lock);
26903 update_wall_time();
26905 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
26906 index c9f9af339914..0c0f52bf1927 100644
26907 --- a/kernel/time/timekeeping.h
26908 +++ b/kernel/time/timekeeping.h
26909 @@ -18,7 +18,8 @@ extern void timekeeping_resume(void);
26910 extern void do_timer(unsigned long ticks);
26911 extern void update_wall_time(void);
26913 -extern seqlock_t jiffies_lock;
26914 +extern raw_spinlock_t jiffies_lock;
26915 +extern seqcount_t jiffies_seq;
26917 #define CS_NAME_LEN 32
26919 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
26920 index f17c76a1a05f..5fadd754ce20 100644
26921 --- a/kernel/time/timer.c
26922 +++ b/kernel/time/timer.c
26924 #include <linux/sched/debug.h>
26925 #include <linux/slab.h>
26926 #include <linux/compat.h>
26927 +#include <linux/swait.h>
26929 #include <linux/uaccess.h>
26930 #include <asm/unistd.h>
26931 @@ -197,11 +198,12 @@ EXPORT_SYMBOL(jiffies_64);
26932 struct timer_base {
26933 raw_spinlock_t lock;
26934 struct timer_list *running_timer;
26935 +#ifdef CONFIG_PREEMPT_RT_FULL
26936 + struct swait_queue_head wait_for_running_timer;
26939 unsigned long next_expiry;
26941 - bool migration_enabled;
26942 - bool nohz_active;
26944 bool must_forward_clk;
26945 DECLARE_BITMAP(pending_map, WHEEL_SIZE);
26946 @@ -210,45 +212,73 @@ struct timer_base {
26948 static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
26950 -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26951 +#ifdef CONFIG_NO_HZ_COMMON
26953 +static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
26954 +static DEFINE_MUTEX(timer_keys_mutex);
26956 +static struct swork_event timer_update_swork;
26959 unsigned int sysctl_timer_migration = 1;
26961 -void timers_update_migration(bool update_nohz)
26962 +DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
26964 +static void timers_update_migration(void)
26966 bool on = sysctl_timer_migration && tick_nohz_active;
26967 - unsigned int cpu;
26969 - /* Avoid the loop, if nothing to update */
26970 - if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
26973 + static_branch_enable(&timers_migration_enabled);
26975 + static_branch_disable(&timers_migration_enabled);
26978 +static inline void timers_update_migration(void) { }
26979 +#endif /* !CONFIG_SMP */
26981 - for_each_possible_cpu(cpu) {
26982 - per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
26983 - per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
26984 - per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
26985 - if (!update_nohz)
26987 - per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
26988 - per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
26989 - per_cpu(hrtimer_bases.nohz_active, cpu) = true;
26991 +static void timer_update_keys(struct swork_event *event)
26993 + mutex_lock(&timer_keys_mutex);
26994 + timers_update_migration();
26995 + static_branch_enable(&timers_nohz_active);
26996 + mutex_unlock(&timer_keys_mutex);
26999 +void timers_update_nohz(void)
27001 + swork_queue(&timer_update_swork);
27004 +static __init int hrtimer_init_thread(void)
27006 + WARN_ON(swork_get());
27007 + INIT_SWORK(&timer_update_swork, timer_update_keys);
27010 +early_initcall(hrtimer_init_thread);
27012 int timer_migration_handler(struct ctl_table *table, int write,
27013 void __user *buffer, size_t *lenp,
27016 - static DEFINE_MUTEX(mutex);
27019 - mutex_lock(&mutex);
27020 + mutex_lock(&timer_keys_mutex);
27021 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
27023 - timers_update_migration(false);
27024 - mutex_unlock(&mutex);
27025 + timers_update_migration();
27026 + mutex_unlock(&timer_keys_mutex);
27031 +static inline bool is_timers_nohz_active(void)
27033 + return static_branch_unlikely(&timers_nohz_active);
27036 +static inline bool is_timers_nohz_active(void) { return false; }
27037 +#endif /* NO_HZ_COMMON */
27039 static unsigned long round_jiffies_common(unsigned long j, int cpu,
27041 @@ -534,7 +564,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer)
27043 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
27045 - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
27046 + if (!is_timers_nohz_active())
27050 @@ -840,21 +870,20 @@ static inline struct timer_base *get_timer_base(u32 tflags)
27051 return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
27054 -#ifdef CONFIG_NO_HZ_COMMON
27055 static inline struct timer_base *
27056 get_target_base(struct timer_base *base, unsigned tflags)
27059 - if ((tflags & TIMER_PINNED) || !base->migration_enabled)
27060 - return get_timer_this_cpu_base(tflags);
27061 - return get_timer_cpu_base(tflags, get_nohz_timer_target());
27063 - return get_timer_this_cpu_base(tflags);
27064 +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
27065 + if (static_branch_unlikely(&timers_migration_enabled) &&
27066 + !(tflags & TIMER_PINNED))
27067 + return get_timer_cpu_base(tflags, get_nohz_timer_target());
27069 + return get_timer_this_cpu_base(tflags);
27072 static inline void forward_timer_base(struct timer_base *base)
27074 +#ifdef CONFIG_NO_HZ_COMMON
27075 unsigned long jnow;
27078 @@ -878,16 +907,8 @@ static inline void forward_timer_base(struct timer_base *base)
27081 base->clk = base->next_expiry;
27084 -static inline struct timer_base *
27085 -get_target_base(struct timer_base *base, unsigned tflags)
27087 - return get_timer_this_cpu_base(tflags);
27090 -static inline void forward_timer_base(struct timer_base *base) { }
27096 @@ -1130,6 +1151,33 @@ void add_timer_on(struct timer_list *timer, int cpu)
27098 EXPORT_SYMBOL_GPL(add_timer_on);
27100 +#ifdef CONFIG_PREEMPT_RT_FULL
27102 + * Wait for a running timer
27104 +static void wait_for_running_timer(struct timer_list *timer)
27106 + struct timer_base *base;
27107 + u32 tf = timer->flags;
27109 + if (tf & TIMER_MIGRATING)
27112 + base = get_timer_base(tf);
27113 + swait_event(base->wait_for_running_timer,
27114 + base->running_timer != timer);
27117 +# define wakeup_timer_waiters(b) swake_up_all(&(b)->wait_for_running_timer)
27119 +static inline void wait_for_running_timer(struct timer_list *timer)
27124 +# define wakeup_timer_waiters(b) do { } while (0)
27128 * del_timer - deactivate a timer.
27129 * @timer: the timer to be deactivated
27130 @@ -1185,7 +1233,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
27132 EXPORT_SYMBOL(try_to_del_timer_sync);
27135 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
27137 * del_timer_sync - deactivate a timer and wait for the handler to finish.
27138 * @timer: the timer to be deactivated
27139 @@ -1245,7 +1293,7 @@ int del_timer_sync(struct timer_list *timer)
27140 int ret = try_to_del_timer_sync(timer);
27144 + wait_for_running_timer(timer);
27147 EXPORT_SYMBOL(del_timer_sync);
27148 @@ -1309,13 +1357,16 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
27149 fn = timer->function;
27150 data = timer->data;
27152 - if (timer->flags & TIMER_IRQSAFE) {
27153 + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
27154 + timer->flags & TIMER_IRQSAFE) {
27155 raw_spin_unlock(&base->lock);
27156 call_timer_fn(timer, fn, data);
27157 + base->running_timer = NULL;
27158 raw_spin_lock(&base->lock);
27160 raw_spin_unlock_irq(&base->lock);
27161 call_timer_fn(timer, fn, data);
27162 + base->running_timer = NULL;
27163 raw_spin_lock_irq(&base->lock);
27166 @@ -1586,7 +1637,7 @@ void update_process_times(int user_tick)
27167 account_process_tick(p, user_tick);
27168 run_local_timers();
27169 rcu_check_callbacks(user_tick);
27170 -#ifdef CONFIG_IRQ_WORK
27171 +#if defined(CONFIG_IRQ_WORK)
27175 @@ -1633,8 +1684,8 @@ static inline void __run_timers(struct timer_base *base)
27177 expire_timers(base, heads + levels);
27179 - base->running_timer = NULL;
27180 raw_spin_unlock_irq(&base->lock);
27181 + wakeup_timer_waiters(base);
27185 @@ -1644,6 +1695,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
27187 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
27189 + irq_work_tick_soft();
27190 __run_timers(base);
27191 if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
27192 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
27193 @@ -1867,6 +1919,9 @@ static void __init init_timer_cpu(int cpu)
27195 raw_spin_lock_init(&base->lock);
27196 base->clk = jiffies;
27197 +#ifdef CONFIG_PREEMPT_RT_FULL
27198 + init_swait_queue_head(&base->wait_for_running_timer);
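
Net effect of the timer.c changes on RT: del_timer_sync() no longer spins while a callback runs; it sleeps on the base's swait queue until expire_timers() clears running_timer and __run_timers() wakes the waiters. The resulting loop, as patched above:

	int del_timer_sync(struct timer_list *timer)
	{
		for (;;) {
			int ret = try_to_del_timer_sync(timer);

			if (ret >= 0)
				return ret;
			/* sleeps until base->running_timer != timer */
			wait_for_running_timer(timer);
		}
	}
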
27203 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
27204 index 4ad6f6ca18c1..55d39a3fbdf7 100644
27205 --- a/kernel/trace/Kconfig
27206 +++ b/kernel/trace/Kconfig
27207 @@ -585,7 +585,10 @@ config HIST_TRIGGERS
27208 event activity as an initial guide for further investigation
27209 using more advanced tools.
27211 - See Documentation/trace/events.txt.
27212 + Inter-event tracing of quantities such as latencies is also
27213 + supported using hist triggers under this option.
27215 + See Documentation/trace/histogram.txt.
27216 If in doubt, say N.
27218 config MMIOTRACE_TEST
27219 diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
27220 index a1d5e0949dcf..e8ca1e01facd 100644
27221 --- a/kernel/trace/ring_buffer.c
27222 +++ b/kernel/trace/ring_buffer.c
27223 @@ -41,6 +41,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
27224 RINGBUF_TYPE_PADDING);
27225 trace_seq_printf(s, "\ttime_extend : type == %d\n",
27226 RINGBUF_TYPE_TIME_EXTEND);
27227 + trace_seq_printf(s, "\ttime_stamp : type == %d\n",
27228 + RINGBUF_TYPE_TIME_STAMP);
27229 trace_seq_printf(s, "\tdata max type_len == %d\n",
27230 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
27232 @@ -140,12 +142,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
27235 RB_LEN_TIME_EXTEND = 8,
27236 - RB_LEN_TIME_STAMP = 16,
27237 + RB_LEN_TIME_STAMP = 8,
27240 #define skip_time_extend(event) \
27241 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
27243 +#define extended_time(event) \
27244 + (event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
27246 static inline int rb_null_event(struct ring_buffer_event *event)
27248 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
27249 @@ -209,7 +214,7 @@ rb_event_ts_length(struct ring_buffer_event *event)
27253 - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
27254 + if (extended_time(event)) {
27255 /* time extends include the data event after it */
27256 len = RB_LEN_TIME_EXTEND;
27257 event = skip_time_extend(event);
27258 @@ -231,7 +236,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
27262 - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
27263 + if (extended_time(event))
27264 event = skip_time_extend(event);
27266 length = rb_event_length(event);
27267 @@ -248,7 +253,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
27268 static __always_inline void *
27269 rb_event_data(struct ring_buffer_event *event)
27271 - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
27272 + if (extended_time(event))
27273 event = skip_time_extend(event);
27274 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
27275 /* If length is in len field, then array[0] has the data */
27276 @@ -275,6 +280,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
27277 #define TS_MASK ((1ULL << TS_SHIFT) - 1)
27278 #define TS_DELTA_TEST (~TS_MASK)
27281 + * ring_buffer_event_time_stamp - return the event's extended timestamp
27282 + * @event: the event to get the timestamp of
27284 + * Returns the extended timestamp associated with a data event.
27285 + * An extended time_stamp is a 64-bit timestamp represented
27286 + * internally in a special way that makes the best use of space
27287 + * contained within a ring buffer event. This function decodes
27288 + * it and maps it to a straight u64 value.
27290 +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
27294 + ts = event->array[0];
27296 + ts += event->time_delta;
27301 /* Flag when events were overwritten */
27302 #define RB_MISSED_EVENTS (1 << 31)
27303 /* Missed count stored at end */
27304 @@ -451,6 +477,7 @@ struct ring_buffer_per_cpu {
27305 struct buffer_page *reader_page;
27306 unsigned long lost_events;
27307 unsigned long last_overrun;
27308 + unsigned long nest;
27309 local_t entries_bytes;
27312 @@ -488,6 +515,7 @@ struct ring_buffer {
27313 u64 (*clock)(void);
27315 struct rb_irq_work irq_work;
27316 + bool time_stamp_abs;
27319 struct ring_buffer_iter {
27320 @@ -1387,6 +1415,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
27321 buffer->clock = clock;
27324 +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
27326 + buffer->time_stamp_abs = abs;
27329 +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
27331 + return buffer->time_stamp_abs;
27334 static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
27336 static inline unsigned long rb_page_entries(struct buffer_page *bpage)
27337 @@ -2219,12 +2257,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
27339 /* Slow path, do not inline */
27340 static noinline struct ring_buffer_event *
27341 -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
27342 +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
27344 - event->type_len = RINGBUF_TYPE_TIME_EXTEND;
27346 + event->type_len = RINGBUF_TYPE_TIME_STAMP;
27348 + event->type_len = RINGBUF_TYPE_TIME_EXTEND;
27350 - /* Not the first event on the page? */
27351 - if (rb_event_index(event)) {
27352 + /* Not the first event on the page, or not delta? */
27353 + if (abs || rb_event_index(event)) {
27354 event->time_delta = delta & TS_MASK;
27355 event->array[0] = delta >> TS_SHIFT;
27357 @@ -2267,7 +2308,9 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
26358 * add it to the start of the reserved space.
27360 if (unlikely(info->add_timestamp)) {
27361 - event = rb_add_time_stamp(event, delta);
27362 + bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
27364 + event = rb_add_time_stamp(event, info->delta, abs);
27365 length -= RB_LEN_TIME_EXTEND;
27368 @@ -2455,7 +2498,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer
27370 static inline void rb_event_discard(struct ring_buffer_event *event)
27372 - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
27373 + if (extended_time(event))
27374 event = skip_time_extend(event);
27376 /* array[0] holds the actual length for the discarded event */
27377 @@ -2499,10 +2542,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
27378 cpu_buffer->write_stamp =
27379 cpu_buffer->commit_page->page->time_stamp;
27380 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
27381 - delta = event->array[0];
27382 - delta <<= TS_SHIFT;
27383 - delta += event->time_delta;
27384 + delta = ring_buffer_event_time_stamp(event);
27385 cpu_buffer->write_stamp += delta;
27386 + } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
27387 + delta = ring_buffer_event_time_stamp(event);
27388 + cpu_buffer->write_stamp = delta;
27390 cpu_buffer->write_stamp += event->time_delta;
27392 @@ -2585,22 +2629,19 @@ static __always_inline int
27393 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
27395 unsigned int val = cpu_buffer->current_context;
27396 + unsigned long pc = preempt_count();
27399 - if (in_interrupt()) {
27401 - bit = RB_CTX_NMI;
27402 - else if (in_irq())
27403 - bit = RB_CTX_IRQ;
27405 - bit = RB_CTX_SOFTIRQ;
27407 + if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
27408 bit = RB_CTX_NORMAL;
27410 + bit = pc & NMI_MASK ? RB_CTX_NMI :
27411 + pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
27413 - if (unlikely(val & (1 << bit)))
27414 + if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
27417 - val |= (1 << bit);
27418 + val |= (1 << (bit + cpu_buffer->nest));
27419 cpu_buffer->current_context = val;
27422 @@ -2609,7 +2650,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
27423 static __always_inline void
27424 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
27426 - cpu_buffer->current_context &= cpu_buffer->current_context - 1;
27427 + cpu_buffer->current_context &=
27428 + cpu_buffer->current_context - (1 << cpu_buffer->nest);
27431 +/* The recursive locking above uses 4 bits */
27432 +#define NESTED_BITS 4
27435 + * ring_buffer_nest_start - Allow tracing while nested
27436 + * @buffer: The ring buffer to modify
27438 + * The ring buffer has a safety mechanism to prevent recursion.
27439 + * But there may be a case where a trace needs to be done while
27440 + * tracing something else. In this case, calling this function
27441 + * allows another ring_buffer_lock_reserve() to nest within the
27442 + * currently active one.
27444 + * Call this function before calling another ring_buffer_lock_reserve() and
27445 + * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
27447 +void ring_buffer_nest_start(struct ring_buffer *buffer)
27449 + struct ring_buffer_per_cpu *cpu_buffer;
27452 + /* Enabled by ring_buffer_nest_end() */
27453 + preempt_disable_notrace();
27454 + cpu = raw_smp_processor_id();
27455 + cpu_buffer = buffer->buffers[cpu];
27456 + /* This is the shift value for the above recursive locking */
27457 + cpu_buffer->nest += NESTED_BITS;
27461 + * ring_buffer_nest_end - Allow tracing while nested
27462 + * @buffer: The ring buffer to modify
27464 + * Must be called after ring_buffer_nest_start() and after the
27465 + * ring_buffer_unlock_commit().
27467 +void ring_buffer_nest_end(struct ring_buffer *buffer)
27469 + struct ring_buffer_per_cpu *cpu_buffer;
27472 + /* disabled by ring_buffer_nest_start() */
27473 + cpu = raw_smp_processor_id();
27474 + cpu_buffer = buffer->buffers[cpu];
27475 + /* This is the shift value for the above recursive locking */
27476 + cpu_buffer->nest -= NESTED_BITS;
27477 + preempt_enable_notrace();
27481 @@ -2685,7 +2776,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
27482 * If this is the first commit on the page, then it has the same
27483 * timestamp as the page itself.
27486 + if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
27489 /* See if we shot pass the end of this buffer page */
27490 @@ -2762,8 +2853,11 @@ rb_reserve_next_event(struct ring_buffer *buffer,
27491 /* make sure this diff is calculated here */
27494 - /* Did the write stamp get updated already? */
27495 - if (likely(info.ts >= cpu_buffer->write_stamp)) {
27496 + if (ring_buffer_time_stamp_abs(buffer)) {
27497 + info.delta = info.ts;
27498 + rb_handle_timestamp(cpu_buffer, &info);
27499 + } else /* Did the write stamp get updated already? */
27500 + if (likely(info.ts >= cpu_buffer->write_stamp)) {
27502 if (unlikely(test_time_stamp(info.delta)))
27503 rb_handle_timestamp(cpu_buffer, &info);
27504 @@ -3461,14 +3555,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
27507 case RINGBUF_TYPE_TIME_EXTEND:
27508 - delta = event->array[0];
27509 - delta <<= TS_SHIFT;
27510 - delta += event->time_delta;
27511 + delta = ring_buffer_event_time_stamp(event);
27512 cpu_buffer->read_stamp += delta;
27515 case RINGBUF_TYPE_TIME_STAMP:
27516 - /* FIXME: not implemented */
27517 + delta = ring_buffer_event_time_stamp(event);
27518 + cpu_buffer->read_stamp = delta;
27521 case RINGBUF_TYPE_DATA:
27522 @@ -3492,14 +3585,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
27525 case RINGBUF_TYPE_TIME_EXTEND:
27526 - delta = event->array[0];
27527 - delta <<= TS_SHIFT;
27528 - delta += event->time_delta;
27529 + delta = ring_buffer_event_time_stamp(event);
27530 iter->read_stamp += delta;
27533 case RINGBUF_TYPE_TIME_STAMP:
27534 - /* FIXME: not implemented */
27535 + delta = ring_buffer_event_time_stamp(event);
27536 + iter->read_stamp = delta;
27539 case RINGBUF_TYPE_DATA:
27540 @@ -3723,6 +3815,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
27541 struct buffer_page *reader;
27548 * We repeat when a time extend is encountered.
27549 @@ -3759,12 +3853,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
27552 case RINGBUF_TYPE_TIME_STAMP:
27553 - /* FIXME: not implemented */
27555 + *ts = ring_buffer_event_time_stamp(event);
27556 + ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
27557 + cpu_buffer->cpu, ts);
27559 + /* Internal data, OK to advance */
27560 rb_advance_reader(cpu_buffer);
27563 case RINGBUF_TYPE_DATA:
27565 + if (ts && !(*ts)) {
27566 *ts = cpu_buffer->read_stamp + event->time_delta;
27567 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
27568 cpu_buffer->cpu, ts);
27569 @@ -3789,6 +3888,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
27570 struct ring_buffer_event *event;
27576 cpu_buffer = iter->cpu_buffer;
27577 buffer = cpu_buffer->buffer;
27579 @@ -3841,12 +3943,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
27582 case RINGBUF_TYPE_TIME_STAMP:
27583 - /* FIXME: not implemented */
27585 + *ts = ring_buffer_event_time_stamp(event);
27586 + ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
27587 + cpu_buffer->cpu, ts);
27589 + /* Internal data, OK to advance */
27590 rb_advance_iter(iter);
27593 case RINGBUF_TYPE_DATA:
27595 + if (ts && !(*ts)) {
27596 *ts = iter->read_stamp + event->time_delta;
27597 ring_buffer_normalize_time_stamp(buffer,
27598 cpu_buffer->cpu, ts);
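
Usage contract for the new nesting API, per the kerneldoc above: the start/end pair must bracket the entire inner reserve/commit. A sketch (the payload struct and its 'value' field are illustrative only):

	ring_buffer_nest_start(buffer);

	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (event) {
		entry = ring_buffer_event_data(event);
		entry->value = 42;	/* hypothetical payload */
		ring_buffer_unlock_commit(buffer, event);
	}

	ring_buffer_nest_end(buffer);
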
27599 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
27600 index e9cbb96cd99e..4fc60e5ec4b9 100644
27601 --- a/kernel/trace/trace.c
27602 +++ b/kernel/trace/trace.c
27603 @@ -1170,6 +1170,14 @@ static struct {
27607 +bool trace_clock_in_ns(struct trace_array *tr)
27609 + if (trace_clocks[tr->clock_id].in_ns)
27616 * trace_parser_get_init - gets the buffer for trace parser
27618 @@ -2127,6 +2135,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
27619 struct task_struct *tsk = current;
27621 entry->preempt_count = pc & 0xff;
27622 + entry->preempt_lazy_count = preempt_lazy_count();
27623 entry->pid = (tsk) ? tsk->pid : 0;
27625 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
27626 @@ -2137,8 +2146,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
27627 ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
27628 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
27629 ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
27630 - (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
27631 + (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
27632 + (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
27633 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
27635 + entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
27637 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
27639 @@ -2275,7 +2287,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
27641 *current_rb = trace_file->tr->trace_buffer.buffer;
27643 - if ((trace_file->flags &
27644 + if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
27645 (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
27646 (entry = this_cpu_read(trace_buffered_event))) {
27647 /* Try to use the per cpu buffer first */
27648 @@ -3342,14 +3354,17 @@ get_total_entries(struct trace_buffer *buf,
27650 static void print_lat_help_header(struct seq_file *m)
27652 - seq_puts(m, "# _------=> CPU# \n"
27653 - "# / _-----=> irqs-off \n"
27654 - "# | / _----=> need-resched \n"
27655 - "# || / _---=> hardirq/softirq \n"
27656 - "# ||| / _--=> preempt-depth \n"
27657 - "# |||| / delay \n"
27658 - "# cmd pid ||||| time | caller \n"
27659 - "# \\ / ||||| \\ | / \n");
27660 + seq_puts(m, "# _--------=> CPU# \n"
27661 + "# / _-------=> irqs-off \n"
27662 + "# | / _------=> need-resched \n"
27663 + "# || / _-----=> need-resched_lazy \n"
27664 + "# ||| / _----=> hardirq/softirq \n"
27665 + "# |||| / _---=> preempt-depth \n"
27666 + "# ||||| / _--=> preempt-lazy-depth\n"
27667 + "# |||||| / _-=> migrate-disable \n"
27668 + "# ||||||| / delay \n"
27669 + "# cmd pid |||||||| time | caller \n"
27670 + "# \\ / |||||||| \\ | / \n");
27673 static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
27674 @@ -3385,15 +3400,17 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
27675 tgid ? tgid_space : space);
27676 seq_printf(m, "# %s / _----=> need-resched\n",
27677 tgid ? tgid_space : space);
27678 - seq_printf(m, "# %s| / _---=> hardirq/softirq\n",
27679 + seq_printf(m, "# %s| / _----=> need-resched_lazy\n",
27680 + tgid ? tgid_space : space);
27681 + seq_printf(m, "# %s|| / _---=> hardirq/softirq\n",
27682 tgid ? tgid_space : space);
27683 - seq_printf(m, "# %s|| / _--=> preempt-depth\n",
27684 + seq_printf(m, "# %s||| / _--=> preempt-depth\n",
27685 tgid ? tgid_space : space);
27686 - seq_printf(m, "# %s||| / delay\n",
27687 + seq_printf(m, "# %s|||| / delay\n",
27688 tgid ? tgid_space : space);
27689 - seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n",
27690 + seq_printf(m, "# TASK-PID %sCPU# ||||| TIMESTAMP FUNCTION\n",
27691 tgid ? " TGID " : space);
27692 - seq_printf(m, "# | | %s | |||| | |\n",
27693 + seq_printf(m, "# | | %s | ||||| | |\n",
27694 tgid ? " | " : space);
27697 @@ -4531,6 +4548,9 @@ static const char readme_msg[] =
27698 #ifdef CONFIG_X86_64
27699 " x86-tsc: TSC cycle counter\n"
27701 + "\n timestamp_mode\t-view the mode used to timestamp events\n"
27702 + " delta: Delta difference against a buffer-wide timestamp\n"
27703 + " absolute: Absolute (standalone) timestamp\n"
27704 "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
27705 "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
27706 " tracing_cpumask\t- Limit which CPUs to trace\n"
27707 @@ -4707,8 +4727,9 @@ static const char readme_msg[] =
27708 "\t .sym display an address as a symbol\n"
27709 "\t .sym-offset display an address as a symbol and offset\n"
27710 "\t .execname display a common_pid as a program name\n"
27711 - "\t .syscall display a syscall id as a syscall name\n\n"
27712 - "\t .log2 display log2 value rather than raw number\n\n"
27713 + "\t .syscall display a syscall id as a syscall name\n"
27714 + "\t .log2 display log2 value rather than raw number\n"
27715 + "\t .usecs display a common_timestamp in microseconds\n\n"
27716 "\t The 'pause' parameter can be used to pause an existing hist\n"
27717 "\t trigger or to start a hist trigger but not log any events\n"
27718 "\t until told to do so. 'continue' can be used to start or\n"
27719 @@ -6218,7 +6239,7 @@ static int tracing_clock_show(struct seq_file *m, void *v)
27723 -static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27724 +int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27728 @@ -6298,6 +6319,71 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
27732 +static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
27734 + struct trace_array *tr = m->private;
27736 + mutex_lock(&trace_types_lock);
27738 + if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
27739 + seq_puts(m, "delta [absolute]\n");
27741 + seq_puts(m, "[delta] absolute\n");
27743 + mutex_unlock(&trace_types_lock);
27748 +static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
27750 + struct trace_array *tr = inode->i_private;
27753 + if (tracing_disabled)
27756 + if (trace_array_get(tr))
27759 + ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
27761 + trace_array_put(tr);
27766 +int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
27770 + mutex_lock(&trace_types_lock);
27772 + if (abs && tr->time_stamp_abs_ref++)
27776 + if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
27781 + if (--tr->time_stamp_abs_ref)
27785 + ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
27787 +#ifdef CONFIG_TRACER_MAX_TRACE
27788 + if (tr->max_buffer.buffer)
27789 + ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
27792 + mutex_unlock(&trace_types_lock);
27797 struct ftrace_buffer_info {
27798 struct trace_iterator iter;
27800 @@ -6545,6 +6631,13 @@ static const struct file_operations trace_clock_fops = {
27801 .write = tracing_clock_write,
27804 +static const struct file_operations trace_time_stamp_mode_fops = {
27805 + .open = tracing_time_stamp_mode_open,
27806 + .read = seq_read,
27807 + .llseek = seq_lseek,
27808 + .release = tracing_single_release_tr,
27811 #ifdef CONFIG_TRACER_SNAPSHOT
27812 static const struct file_operations snapshot_fops = {
27813 .open = tracing_snapshot_open,
27814 @@ -7684,6 +7777,7 @@ static int instance_mkdir(const char *name)
27815 struct trace_array *tr;
27818 + mutex_lock(&event_mutex);
27819 mutex_lock(&trace_types_lock);
27822 @@ -7716,6 +7810,7 @@ static int instance_mkdir(const char *name)
27824 INIT_LIST_HEAD(&tr->systems);
27825 INIT_LIST_HEAD(&tr->events);
27826 + INIT_LIST_HEAD(&tr->hist_vars);
27828 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
27830 @@ -7739,6 +7834,7 @@ static int instance_mkdir(const char *name)
27831 list_add(&tr->list, &ftrace_trace_arrays);
27833 mutex_unlock(&trace_types_lock);
27834 + mutex_unlock(&event_mutex);
27838 @@ -7750,6 +7846,7 @@ static int instance_mkdir(const char *name)
27841 mutex_unlock(&trace_types_lock);
27842 + mutex_unlock(&event_mutex);
27846 @@ -7762,6 +7859,7 @@ static int instance_rmdir(const char *name)
27850 + mutex_lock(&event_mutex);
27851 mutex_lock(&trace_types_lock);
27854 @@ -7807,6 +7905,7 @@ static int instance_rmdir(const char *name)
27857 mutex_unlock(&trace_types_lock);
27858 + mutex_unlock(&event_mutex);
27862 @@ -7864,6 +7963,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
27863 trace_create_file("tracing_on", 0644, d_tracer,
27864 tr, &rb_simple_fops);
27866 + trace_create_file("timestamp_mode", 0444, d_tracer, tr,
27867 + &trace_time_stamp_mode_fops);
27869 create_trace_options_dir(tr);
27871 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
27872 @@ -8275,6 +8377,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
27874 EXPORT_SYMBOL_GPL(ftrace_dump);
27876 +int trace_run_command(const char *buf, int (*createfn)(int, char **))
27883 + argv = argv_split(GFP_KERNEL, buf, &argc);
27888 + ret = createfn(argc, argv);
27895 +#define WRITE_BUFSIZE 4096
27897 +ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
27898 + size_t count, loff_t *ppos,
27899 + int (*createfn)(int, char **))
27901 + char *kbuf, *buf, *tmp;
27906 + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
27910 + while (done < count) {
27911 + size = count - done;
27913 + if (size >= WRITE_BUFSIZE)
27914 + size = WRITE_BUFSIZE - 1;
27916 + if (copy_from_user(kbuf, buffer + done, size)) {
27920 + kbuf[size] = '\0';
27923 + tmp = strchr(buf, '\n');
27926 + size = tmp - buf + 1;
27928 + size = strlen(buf);
27929 + if (done + size < count) {
27932 + /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
27933 + pr_warn("Line length is too long: Should be less than %d\n",
27934 + WRITE_BUFSIZE - 2);
27941 + /* Remove comments */
27942 + tmp = strchr(buf, '#');
27947 + ret = trace_run_command(buf, createfn);
27952 + } while (done < count);
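The parsing contract here is worth spelling out: the user buffer is consumed in WRITE_BUFSIZE chunks, each complete line has any '#' comment stripped, and the remainder is split into words and handed to createfn(argc, argv). A hedged userspace analogue of that contract, with demo_createfn standing in for a real callback (strtok stands in for the kernel's argv_split):

    #include <stdio.h>
    #include <string.h>

    static int demo_createfn(int argc, char **argv)
    {
        int i;

        for (i = 0; i < argc; i++)
            printf("argv[%d] = %s\n", i, argv[i]);
        return 0;
    }

    int main(void)
    {
        char line[] = "wakeup_latency u64 lat; pid_t pid  # comment";
        char *argv[16], *p, *hash;
        int argc = 0;

        hash = strchr(line, '#');   /* strip comments, as above */
        if (hash)
            *hash = '\0';
        for (p = strtok(line, " \t"); p && argc < 16;
             p = strtok(NULL, " \t"))
            argv[argc++] = p;
        return demo_createfn(argc, argv);
    }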
27962 __init static int tracer_alloc_buffers(void)
27965 @@ -8375,6 +8563,7 @@ __init static int tracer_alloc_buffers(void)
27967 INIT_LIST_HEAD(&global_trace.systems);
27968 INIT_LIST_HEAD(&global_trace.events);
27969 + INIT_LIST_HEAD(&global_trace.hist_vars);
27970 list_add(&global_trace.list, &ftrace_trace_arrays);
27972 apply_trace_boot_options();
27973 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
27974 index 851cd1605085..18bf383f46e8 100644
27975 --- a/kernel/trace/trace.h
27976 +++ b/kernel/trace/trace.h
27977 @@ -127,6 +127,7 @@ struct kretprobe_trace_entry_head {
27978 * NEED_RESCHED - reschedule is requested
27979 * HARDIRQ - inside an interrupt handler
27980 * SOFTIRQ - inside a softirq handler
27981 + * NEED_RESCHED_LAZY - lazy reschedule is requested
27983 enum trace_flag_type {
27984 TRACE_FLAG_IRQS_OFF = 0x01,
27985 @@ -136,6 +137,7 @@ enum trace_flag_type {
27986 TRACE_FLAG_SOFTIRQ = 0x10,
27987 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
27988 TRACE_FLAG_NMI = 0x40,
27989 + TRACE_FLAG_NEED_RESCHED_LAZY = 0x80,
27992 #define TRACE_BUF_SIZE 1024
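Since these are single-bit flags, checking for the new condition is a plain mask test; a minimal sketch using the value from the enum above:

    #include <stdbool.h>
    #include <stdio.h>

    #define TRACE_FLAG_NEED_RESCHED_LAZY 0x80  /* value from the enum above */

    static bool needs_lazy_resched(unsigned char flags)
    {
        return flags & TRACE_FLAG_NEED_RESCHED_LAZY;
    }

    int main(void)
    {
        printf("%d\n", needs_lazy_resched(0x90)); /* 1: lazy bit set */
        return 0;
    }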
27993 @@ -273,6 +275,8 @@ struct trace_array {
27994 /* function tracing enabled */
27995 int function_enabled;
27997 + int time_stamp_abs_ref;
27998 + struct list_head hist_vars;
28002 @@ -286,6 +290,11 @@ extern struct mutex trace_types_lock;
28003 extern int trace_array_get(struct trace_array *tr);
28004 extern void trace_array_put(struct trace_array *tr);
28006 +extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
28007 +extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
28009 +extern bool trace_clock_in_ns(struct trace_array *tr);
28012 * The global tracer (top) should be the first trace array added,
28013 * but we check the flag anyway.
28014 @@ -1293,7 +1302,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
28015 unsigned long eflags = file->flags;
28017 if (eflags & EVENT_FILE_FL_TRIGGER_COND)
28018 - *tt = event_triggers_call(file, entry);
28019 + *tt = event_triggers_call(file, entry, event);
28021 if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
28022 (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
28023 @@ -1330,7 +1339,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
28024 trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
28027 - event_triggers_post_call(file, tt, entry);
28028 + event_triggers_post_call(file, tt, entry, event);
28032 @@ -1363,7 +1372,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
28033 irq_flags, pc, regs);
28036 - event_triggers_post_call(file, tt, entry);
28037 + event_triggers_post_call(file, tt, entry, event);
28040 #define FILTER_PRED_INVALID ((unsigned short)-1)
28041 @@ -1545,6 +1554,8 @@ extern void pause_named_trigger(struct event_trigger_data *data);
28042 extern void unpause_named_trigger(struct event_trigger_data *data);
28043 extern void set_named_trigger_data(struct event_trigger_data *data,
28044 struct event_trigger_data *named_data);
28045 +extern struct event_trigger_data *
28046 +get_named_trigger_data(struct event_trigger_data *data);
28047 extern int register_event_command(struct event_command *cmd);
28048 extern int unregister_event_command(struct event_command *cmd);
28049 extern int register_trigger_hist_enable_disable_cmds(void);
28050 @@ -1588,7 +1599,8 @@ extern int register_trigger_hist_enable_disable_cmds(void);
28052 struct event_trigger_ops {
28053 void (*func)(struct event_trigger_data *data,
28056 + struct ring_buffer_event *rbe);
28057 int (*init)(struct event_trigger_ops *ops,
28058 struct event_trigger_data *data);
28059 void (*free)(struct event_trigger_ops *ops,
28060 @@ -1755,6 +1767,13 @@ void trace_printk_start_comm(void);
28061 int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
28062 int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
28064 +#define MAX_EVENT_NAME_LEN 64
28066 +extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
28067 +extern ssize_t trace_parse_run_command(struct file *file,
28068 + const char __user *buffer, size_t count, loff_t *ppos,
28069 + int (*createfn)(int, char**));
28072 * Normal trace_printk() and friends allocates special buffers
28073 * to do the manipulation, as well as saves the print formats
28074 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
28075 index d53268a4e167..9ba230a4052f 100644
28076 --- a/kernel/trace/trace_events.c
28077 +++ b/kernel/trace/trace_events.c
28078 @@ -187,6 +187,8 @@ static int trace_define_common_fields(void)
28079 __common_field(unsigned char, flags);
28080 __common_field(unsigned char, preempt_count);
28081 __common_field(int, pid);
28082 + __common_field(unsigned short, migrate_disable);
28083 + __common_field(unsigned short, padding);
28087 @@ -1406,8 +1408,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
28090 /* Make sure the system still exists */
28091 - mutex_lock(&trace_types_lock);
28092 mutex_lock(&event_mutex);
28093 + mutex_lock(&trace_types_lock);
28094 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
28095 list_for_each_entry(dir, &tr->systems, list) {
28096 if (dir == inode->i_private) {
28097 @@ -1421,8 +1423,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
28101 - mutex_unlock(&event_mutex);
28102 mutex_unlock(&trace_types_lock);
28103 + mutex_unlock(&event_mutex);
28107 @@ -2308,15 +2310,15 @@ static void __add_event_to_tracers(struct trace_event_call *call);
28108 int trace_add_event_call(struct trace_event_call *call)
28111 - mutex_lock(&trace_types_lock);
28112 mutex_lock(&event_mutex);
28113 + mutex_lock(&trace_types_lock);
28115 ret = __register_event(call, NULL);
28117 __add_event_to_tracers(call);
28119 - mutex_unlock(&event_mutex);
28120 mutex_unlock(&trace_types_lock);
28121 + mutex_unlock(&event_mutex);
28125 @@ -2370,13 +2372,13 @@ int trace_remove_event_call(struct trace_event_call *call)
28129 - mutex_lock(&trace_types_lock);
28130 mutex_lock(&event_mutex);
28131 + mutex_lock(&trace_types_lock);
28132 down_write(&trace_event_sem);
28133 ret = probe_remove_event_call(call);
28134 up_write(&trace_event_sem);
28135 - mutex_unlock(&event_mutex);
28136 mutex_unlock(&trace_types_lock);
28137 + mutex_unlock(&event_mutex);
28141 @@ -2438,8 +2440,8 @@ static int trace_module_notify(struct notifier_block *self,
28143 struct module *mod = data;
28145 - mutex_lock(&trace_types_lock);
28146 mutex_lock(&event_mutex);
28147 + mutex_lock(&trace_types_lock);
28149 case MODULE_STATE_COMING:
28150 trace_module_add_events(mod);
28151 @@ -2448,8 +2450,8 @@ static int trace_module_notify(struct notifier_block *self,
28152 trace_module_remove_events(mod);
28155 - mutex_unlock(&event_mutex);
28156 mutex_unlock(&trace_types_lock);
28157 + mutex_unlock(&event_mutex);
28161 @@ -2964,24 +2966,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
28162  * creates the event hierarchy in the @parent/events directory.
28164 * Returns 0 on success.
28166 + * Must be called with event_mutex held.
28168 int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
28172 - mutex_lock(&event_mutex);
28173 + lockdep_assert_held(&event_mutex);
28175 ret = create_event_toplevel_files(parent, tr);
28180 down_write(&trace_event_sem);
28181 __trace_add_event_dirs(tr);
28182 up_write(&trace_event_sem);
28185 - mutex_unlock(&event_mutex);
28191 @@ -3010,9 +3012,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
28195 +/* Must be called with event_mutex held */
28196 int event_trace_del_tracer(struct trace_array *tr)
28198 - mutex_lock(&event_mutex);
28199 + lockdep_assert_held(&event_mutex);
28201 /* Disable any event triggers and associated soft-disabled events */
28202 clear_event_triggers(tr);
28203 @@ -3033,8 +3036,6 @@ int event_trace_del_tracer(struct trace_array *tr)
28205 tr->event_dir = NULL;
28207 - mutex_unlock(&event_mutex);
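The common thread in the trace_events.c hunks above is lock ordering: event_mutex is now always taken before trace_types_lock and released in the reverse order, matching the order that instance_mkdir()/instance_rmdir() now use. A pthread-based sketch of the discipline, with userspace mutexes standing in for the kernel ones:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t trace_types_lock = PTHREAD_MUTEX_INITIALIZER;

    static void add_event_call(void)
    {
        pthread_mutex_lock(&event_mutex);      /* outer lock first */
        pthread_mutex_lock(&trace_types_lock); /* inner lock second */

        /* ... register the event and add it to tracers ... */

        pthread_mutex_unlock(&trace_types_lock);
        pthread_mutex_unlock(&event_mutex);    /* outer lock last */
    }

    int main(void)
    {
        add_event_call();
        puts("done");
        return 0;
    }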
28212 diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
28213 index 7eb975a2d0e1..24bc0769fdd6 100644
28214 --- a/kernel/trace/trace_events_hist.c
28215 +++ b/kernel/trace/trace_events_hist.c
28216 @@ -20,13 +20,39 @@
28217 #include <linux/slab.h>
28218 #include <linux/stacktrace.h>
28219 #include <linux/rculist.h>
28220 +#include <linux/tracefs.h>
28222 #include "tracing_map.h"
28225 +#define SYNTH_SYSTEM "synthetic"
28226 +#define SYNTH_FIELDS_MAX 16
28228 +#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
28232 -typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
28233 +typedef u64 (*hist_field_fn_t) (struct hist_field *field,
28234 + struct tracing_map_elt *elt,
28235 + struct ring_buffer_event *rbe,
28238 +#define HIST_FIELD_OPERANDS_MAX 2
28239 +#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
28240 +#define HIST_ACTIONS_MAX 8
28242 +enum field_op_id {
28246 + FIELD_OP_UNARY_MINUS,
28251 + struct hist_trigger_data *hist_data;
28252 + unsigned int idx;
28255 struct hist_field {
28256 struct ftrace_event_field *field;
28257 @@ -34,26 +60,50 @@ struct hist_field {
28258 hist_field_fn_t fn;
28260 unsigned int offset;
28261 + unsigned int is_signed;
28262 + const char *type;
28263 + struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
28264 + struct hist_trigger_data *hist_data;
28265 + struct hist_var var;
28266 + enum field_op_id operator;
28268 + char *event_name;
28270 + unsigned int var_idx;
28271 + unsigned int var_ref_idx;
28275 -static u64 hist_field_none(struct hist_field *field, void *event)
28276 +static u64 hist_field_none(struct hist_field *field,
28277 + struct tracing_map_elt *elt,
28278 + struct ring_buffer_event *rbe,
28284 -static u64 hist_field_counter(struct hist_field *field, void *event)
28285 +static u64 hist_field_counter(struct hist_field *field,
28286 + struct tracing_map_elt *elt,
28287 + struct ring_buffer_event *rbe,
28293 -static u64 hist_field_string(struct hist_field *hist_field, void *event)
28294 +static u64 hist_field_string(struct hist_field *hist_field,
28295 + struct tracing_map_elt *elt,
28296 + struct ring_buffer_event *rbe,
28299 char *addr = (char *)(event + hist_field->field->offset);
28301 return (u64)(unsigned long)addr;
28304 -static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
28305 +static u64 hist_field_dynstring(struct hist_field *hist_field,
28306 + struct tracing_map_elt *elt,
28307 + struct ring_buffer_event *rbe,
28310 u32 str_item = *(u32 *)(event + hist_field->field->offset);
28311 int str_loc = str_item & 0xffff;
28312 @@ -62,22 +112,74 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
28313 return (u64)(unsigned long)addr;
28316 -static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
28317 +static u64 hist_field_pstring(struct hist_field *hist_field,
28318 + struct tracing_map_elt *elt,
28319 + struct ring_buffer_event *rbe,
28322 char **addr = (char **)(event + hist_field->field->offset);
28324 return (u64)(unsigned long)*addr;
28327 -static u64 hist_field_log2(struct hist_field *hist_field, void *event)
28328 +static u64 hist_field_log2(struct hist_field *hist_field,
28329 + struct tracing_map_elt *elt,
28330 + struct ring_buffer_event *rbe,
28333 - u64 val = *(u64 *)(event + hist_field->field->offset);
28334 + struct hist_field *operand = hist_field->operands[0];
28336 + u64 val = operand->fn(operand, elt, rbe, event);
28338 return (u64) ilog2(roundup_pow_of_two(val));
28341 +static u64 hist_field_plus(struct hist_field *hist_field,
28342 + struct tracing_map_elt *elt,
28343 + struct ring_buffer_event *rbe,
28346 + struct hist_field *operand1 = hist_field->operands[0];
28347 + struct hist_field *operand2 = hist_field->operands[1];
28349 + u64 val1 = operand1->fn(operand1, elt, rbe, event);
28350 + u64 val2 = operand2->fn(operand2, elt, rbe, event);
28352 + return val1 + val2;
28355 +static u64 hist_field_minus(struct hist_field *hist_field,
28356 + struct tracing_map_elt *elt,
28357 + struct ring_buffer_event *rbe,
28360 + struct hist_field *operand1 = hist_field->operands[0];
28361 + struct hist_field *operand2 = hist_field->operands[1];
28363 + u64 val1 = operand1->fn(operand1, elt, rbe, event);
28364 + u64 val2 = operand2->fn(operand2, elt, rbe, event);
28366 + return val1 - val2;
28369 +static u64 hist_field_unary_minus(struct hist_field *hist_field,
28370 + struct tracing_map_elt *elt,
28371 + struct ring_buffer_event *rbe,
28374 + struct hist_field *operand = hist_field->operands[0];
28376 + s64 sval = (s64)operand->fn(operand, elt, rbe, event);
28377 + u64 val = (u64)-sval;
28382 #define DEFINE_HIST_FIELD_FN(type) \
28383 -static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
28384 + static u64 hist_field_##type(struct hist_field *hist_field, \
28385 + struct tracing_map_elt *elt, \
28386 + struct ring_buffer_event *rbe, \
28389 type *addr = (type *)(event + hist_field->field->offset); \
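The new four-argument signature lets compound fields evaluate their operands through the same function-pointer interface, so an expression like a+b becomes a small tree walked at event time. A standalone sketch of that pattern, with struct demo_field as a simplified stand-in for struct hist_field:

    #include <stdio.h>
    #include <stdint.h>

    struct demo_field;
    typedef uint64_t (*demo_fn_t)(struct demo_field *f, void *event);

    struct demo_field {
        demo_fn_t fn;
        unsigned int offset;            /* used by leaf fields */
        struct demo_field *operands[2]; /* used by compound fields */
    };

    static uint64_t demo_u32(struct demo_field *f, void *event)
    {
        return *(uint32_t *)((char *)event + f->offset);
    }

    static uint64_t demo_plus(struct demo_field *f, void *event)
    {
        return f->operands[0]->fn(f->operands[0], event) +
               f->operands[1]->fn(f->operands[1], event);
    }

    int main(void)
    {
        uint32_t record[2] = { 40, 2 };  /* stand-in trace record */
        struct demo_field a = { .fn = demo_u32, .offset = 0 };
        struct demo_field b = { .fn = demo_u32, .offset = 4 };
        struct demo_field sum = { .fn = demo_plus,
                                  .operands = { &a, &b } };

        printf("%llu\n", (unsigned long long)sum.fn(&sum, record)); /* 42 */
        return 0;
    }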
28391 @@ -110,16 +212,29 @@ DEFINE_HIST_FIELD_FN(u8);
28392 #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
28394 enum hist_field_flags {
28395 - HIST_FIELD_FL_HITCOUNT = 1,
28396 - HIST_FIELD_FL_KEY = 2,
28397 - HIST_FIELD_FL_STRING = 4,
28398 - HIST_FIELD_FL_HEX = 8,
28399 - HIST_FIELD_FL_SYM = 16,
28400 - HIST_FIELD_FL_SYM_OFFSET = 32,
28401 - HIST_FIELD_FL_EXECNAME = 64,
28402 - HIST_FIELD_FL_SYSCALL = 128,
28403 - HIST_FIELD_FL_STACKTRACE = 256,
28404 - HIST_FIELD_FL_LOG2 = 512,
28405 + HIST_FIELD_FL_HITCOUNT = 1 << 0,
28406 + HIST_FIELD_FL_KEY = 1 << 1,
28407 + HIST_FIELD_FL_STRING = 1 << 2,
28408 + HIST_FIELD_FL_HEX = 1 << 3,
28409 + HIST_FIELD_FL_SYM = 1 << 4,
28410 + HIST_FIELD_FL_SYM_OFFSET = 1 << 5,
28411 + HIST_FIELD_FL_EXECNAME = 1 << 6,
28412 + HIST_FIELD_FL_SYSCALL = 1 << 7,
28413 + HIST_FIELD_FL_STACKTRACE = 1 << 8,
28414 + HIST_FIELD_FL_LOG2 = 1 << 9,
28415 + HIST_FIELD_FL_TIMESTAMP = 1 << 10,
28416 + HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11,
28417 + HIST_FIELD_FL_VAR = 1 << 12,
28418 + HIST_FIELD_FL_EXPR = 1 << 13,
28419 + HIST_FIELD_FL_VAR_REF = 1 << 14,
28420 + HIST_FIELD_FL_CPU = 1 << 15,
28421 + HIST_FIELD_FL_ALIAS = 1 << 16,
28425 + unsigned int n_vars;
28426 + char *name[TRACING_MAP_VARS_MAX];
28427 + char *expr[TRACING_MAP_VARS_MAX];
28430 struct hist_trigger_attrs {
28431 @@ -127,298 +242,3585 @@ struct hist_trigger_attrs {
28433 char *sort_key_str;
28439 + bool ts_in_usecs;
28440 unsigned int map_bits;
28442 + char *assignment_str[TRACING_MAP_VARS_MAX];
28443 + unsigned int n_assignments;
28445 + char *action_str[HIST_ACTIONS_MAX];
28446 + unsigned int n_actions;
28448 + struct var_defs var_defs;
28451 +struct field_var {
28452 + struct hist_field *var;
28453 + struct hist_field *val;
28456 +struct field_var_hist {
28457 + struct hist_trigger_data *hist_data;
28461 struct hist_trigger_data {
28462 - struct hist_field *fields[TRACING_MAP_FIELDS_MAX];
28463 + struct hist_field *fields[HIST_FIELDS_MAX];
28464 unsigned int n_vals;
28465 unsigned int n_keys;
28466 unsigned int n_fields;
28467 + unsigned int n_vars;
28468 unsigned int key_size;
28469 struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
28470 unsigned int n_sort_keys;
28471 struct trace_event_file *event_file;
28472 struct hist_trigger_attrs *attrs;
28473 struct tracing_map *map;
28474 + bool enable_timestamps;
28476 + struct hist_field *var_refs[TRACING_MAP_VARS_MAX];
28477 + unsigned int n_var_refs;
28479 + struct action_data *actions[HIST_ACTIONS_MAX];
28480 + unsigned int n_actions;
28482 + struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX];
28483 + unsigned int n_synth_var_refs;
28484 + struct field_var *field_vars[SYNTH_FIELDS_MAX];
28485 + unsigned int n_field_vars;
28486 + unsigned int n_field_var_str;
28487 + struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX];
28488 + unsigned int n_field_var_hists;
28490 + struct field_var *max_vars[SYNTH_FIELDS_MAX];
28491 + unsigned int n_max_vars;
28492 + unsigned int n_max_var_str;
28495 -static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
28497 - hist_field_fn_t fn = NULL;
28498 +struct synth_field {
28506 - switch (field_size) {
28508 - if (field_is_signed)
28509 - fn = hist_field_s64;
28511 - fn = hist_field_u64;
28514 - if (field_is_signed)
28515 - fn = hist_field_s32;
28517 - fn = hist_field_u32;
28520 - if (field_is_signed)
28521 - fn = hist_field_s16;
28523 - fn = hist_field_u16;
28526 - if (field_is_signed)
28527 - fn = hist_field_s8;
28529 - fn = hist_field_u8;
28532 +struct synth_event {
28533 + struct list_head list;
28536 + struct synth_field **fields;
28537 + unsigned int n_fields;
28538 + unsigned int n_u64;
28539 + struct trace_event_class class;
28540 + struct trace_event_call call;
28541 + struct tracepoint *tp;
28545 +struct action_data;
28547 +typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
28548 + struct tracing_map_elt *elt, void *rec,
28549 + struct ring_buffer_event *rbe,
28550 + struct action_data *data, u64 *var_ref_vals);
28552 +struct action_data {
28554 + unsigned int n_params;
28555 + char *params[SYNTH_FIELDS_MAX];
28559 + unsigned int var_ref_idx;
28560 + char *match_event;
28561 + char *match_event_system;
28562 + char *synth_event_name;
28563 + struct synth_event *synth_event;
28569 + unsigned int max_var_ref_idx;
28570 + struct hist_field *max_var;
28571 + struct hist_field *var;
28577 +static char last_hist_cmd[MAX_FILTER_STR_VAL];
28578 +static char hist_err_str[MAX_FILTER_STR_VAL];
28580 +static void last_cmd_set(char *str)
28585 + strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
28588 -static int parse_map_size(char *str)
28589 +static void hist_err(char *str, char *var)
28591 - unsigned long size, map_bits;
28593 + int maxlen = MAX_FILTER_STR_VAL - 1;
28595 - strsep(&str, "=");
28603 - ret = kstrtoul(str, 0, &size);
28606 + if (strlen(hist_err_str))
28609 - map_bits = ilog2(roundup_pow_of_two(size));
28610 - if (map_bits < TRACING_MAP_BITS_MIN ||
28611 - map_bits > TRACING_MAP_BITS_MAX)
28620 + if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen)
28623 + strcat(hist_err_str, str);
28624 + strcat(hist_err_str, var);
28627 -static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
28628 +static void hist_err_event(char *str, char *system, char *event, char *var)
28632 + char err[MAX_FILTER_STR_VAL];
28634 - kfree(attrs->name);
28635 - kfree(attrs->sort_key_str);
28636 - kfree(attrs->keys_str);
28637 - kfree(attrs->vals_str);
28639 + if (system && var)
28640 + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
28642 + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
28644 + strncpy(err, var, MAX_FILTER_STR_VAL);
28646 + hist_err(str, err);
28649 -static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
28650 +static void hist_err_clear(void)
28652 - struct hist_trigger_attrs *attrs;
28654 + hist_err_str[0] = '\0';
28657 - attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
28659 - return ERR_PTR(-ENOMEM);
28660 +static bool have_hist_err(void)
28662 + if (strlen(hist_err_str))
28665 - while (trigger_str) {
28666 - char *str = strsep(&trigger_str, ":");
28670 - if ((strncmp(str, "key=", strlen("key=")) == 0) ||
28671 - (strncmp(str, "keys=", strlen("keys=")) == 0))
28672 - attrs->keys_str = kstrdup(str, GFP_KERNEL);
28673 - else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
28674 - (strncmp(str, "vals=", strlen("vals=")) == 0) ||
28675 - (strncmp(str, "values=", strlen("values=")) == 0))
28676 - attrs->vals_str = kstrdup(str, GFP_KERNEL);
28677 - else if (strncmp(str, "sort=", strlen("sort=")) == 0)
28678 - attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
28679 - else if (strncmp(str, "name=", strlen("name=")) == 0)
28680 - attrs->name = kstrdup(str, GFP_KERNEL);
28681 - else if (strcmp(str, "pause") == 0)
28682 - attrs->pause = true;
28683 - else if ((strcmp(str, "cont") == 0) ||
28684 - (strcmp(str, "continue") == 0))
28685 - attrs->cont = true;
28686 - else if (strcmp(str, "clear") == 0)
28687 - attrs->clear = true;
28688 - else if (strncmp(str, "size=", strlen("size=")) == 0) {
28689 - int map_bits = parse_map_size(str);
28690 +static LIST_HEAD(synth_event_list);
28691 +static DEFINE_MUTEX(synth_event_mutex);
28693 - if (map_bits < 0) {
28697 - attrs->map_bits = map_bits;
28698 +struct synth_trace_event {
28699 + struct trace_entry ent;
28703 +static int synth_event_define_fields(struct trace_event_call *call)
28705 + struct synth_trace_event trace;
28706 + int offset = offsetof(typeof(trace), fields);
28707 + struct synth_event *event = call->data;
28708 + unsigned int i, size, n_u64;
28709 + char *name, *type;
28713 + for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
28714 + size = event->fields[i]->size;
28715 + is_signed = event->fields[i]->is_signed;
28716 + type = event->fields[i]->type;
28717 + name = event->fields[i]->name;
28718 + ret = trace_define_field(call, type, name, offset, size,
28719 + is_signed, FILTER_OTHER);
28723 + if (event->fields[i]->is_string) {
28724 + offset += STR_VAR_LEN_MAX;
28725 + n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
28729 + offset += sizeof(u64);
28734 - if (!attrs->keys_str) {
28738 + event->n_u64 = n_u64;
28742 - destroy_hist_trigger_attrs(attrs);
28746 - return ERR_PTR(ret);
28747 +static bool synth_field_signed(char *type)
28749 + if (strncmp(type, "u", 1) == 0)
28755 -static inline void save_comm(char *comm, struct task_struct *task)
28756 +static int synth_field_is_string(char *type)
28758 - if (!task->pid) {
28759 - strcpy(comm, "<idle>");
28762 + if (strstr(type, "char[") != NULL)
28765 - if (WARN_ON_ONCE(task->pid < 0)) {
28766 - strcpy(comm, "<XXX>");
28772 - memcpy(comm, task->comm, TASK_COMM_LEN);
28773 +static int synth_field_string_size(char *type)
28775 + char buf[4], *end, *start;
28776 + unsigned int len;
28779 + start = strstr(type, "char[");
28780 + if (start == NULL)
28782 + start += strlen("char[");
28784 + end = strchr(type, ']');
28785 + if (!end || end < start)
28788 + len = end - start;
28792 + strncpy(buf, start, len);
28795 + err = kstrtouint(buf, 0, &size);
28799 + if (size > STR_VAR_LEN_MAX)
28805 -static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
28806 +static int synth_field_size(char *type)
28808 - kfree((char *)elt->private_data);
28811 + if (strcmp(type, "s64") == 0)
28812 + size = sizeof(s64);
28813 + else if (strcmp(type, "u64") == 0)
28814 + size = sizeof(u64);
28815 + else if (strcmp(type, "s32") == 0)
28816 + size = sizeof(s32);
28817 + else if (strcmp(type, "u32") == 0)
28818 + size = sizeof(u32);
28819 + else if (strcmp(type, "s16") == 0)
28820 + size = sizeof(s16);
28821 + else if (strcmp(type, "u16") == 0)
28822 + size = sizeof(u16);
28823 + else if (strcmp(type, "s8") == 0)
28824 + size = sizeof(s8);
28825 + else if (strcmp(type, "u8") == 0)
28826 + size = sizeof(u8);
28827 + else if (strcmp(type, "char") == 0)
28828 + size = sizeof(char);
28829 + else if (strcmp(type, "unsigned char") == 0)
28830 + size = sizeof(unsigned char);
28831 + else if (strcmp(type, "int") == 0)
28832 + size = sizeof(int);
28833 + else if (strcmp(type, "unsigned int") == 0)
28834 + size = sizeof(unsigned int);
28835 + else if (strcmp(type, "long") == 0)
28836 + size = sizeof(long);
28837 + else if (strcmp(type, "unsigned long") == 0)
28838 + size = sizeof(unsigned long);
28839 + else if (strcmp(type, "pid_t") == 0)
28840 + size = sizeof(pid_t);
28841 + else if (synth_field_is_string(type))
28842 + size = synth_field_string_size(type);
28847 -static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
28848 +static const char *synth_field_fmt(char *type)
28850 - struct hist_trigger_data *hist_data = elt->map->private_data;
28851 - struct hist_field *key_field;
28853 + const char *fmt = "%llu";
28855 + if (strcmp(type, "s64") == 0)
28857 + else if (strcmp(type, "u64") == 0)
28859 + else if (strcmp(type, "s32") == 0)
28861 + else if (strcmp(type, "u32") == 0)
28863 + else if (strcmp(type, "s16") == 0)
28865 + else if (strcmp(type, "u16") == 0)
28867 + else if (strcmp(type, "s8") == 0)
28869 + else if (strcmp(type, "u8") == 0)
28871 + else if (strcmp(type, "char") == 0)
28873 + else if (strcmp(type, "unsigned char") == 0)
28875 + else if (strcmp(type, "int") == 0)
28877 + else if (strcmp(type, "unsigned int") == 0)
28879 + else if (strcmp(type, "long") == 0)
28881 + else if (strcmp(type, "unsigned long") == 0)
28883 + else if (strcmp(type, "pid_t") == 0)
28885 + else if (synth_field_is_string(type))
28891 - for_each_hist_key_field(i, hist_data) {
28892 - key_field = hist_data->fields[i];
28893 +static enum print_line_t print_synth_event(struct trace_iterator *iter,
28895 + struct trace_event *event)
28897 + struct trace_array *tr = iter->tr;
28898 + struct trace_seq *s = &iter->seq;
28899 + struct synth_trace_event *entry;
28900 + struct synth_event *se;
28901 + unsigned int i, n_u64;
28902 + char print_fmt[32];
28905 - if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
28906 - unsigned int size = TASK_COMM_LEN + 1;
28907 + entry = (struct synth_trace_event *)iter->ent;
28908 + se = container_of(event, struct synth_event, call.event);
28910 - elt->private_data = kzalloc(size, GFP_KERNEL);
28911 - if (!elt->private_data)
28914 + trace_seq_printf(s, "%s: ", se->name);
28916 + for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
28917 + if (trace_seq_has_overflowed(s))
28920 + fmt = synth_field_fmt(se->fields[i]->type);
28922 + /* parameter types */
28923 + if (tr->trace_flags & TRACE_ITER_VERBOSE)
28924 + trace_seq_printf(s, "%s ", fmt);
28926 + snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
28928 + /* parameter values */
28929 + if (se->fields[i]->is_string) {
28930 + trace_seq_printf(s, print_fmt, se->fields[i]->name,
28931 + (char *)&entry->fields[n_u64],
28932 + i == se->n_fields - 1 ? "" : " ");
28933 + n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
28935 + trace_seq_printf(s, print_fmt, se->fields[i]->name,
28936 + entry->fields[n_u64],
28937 + i == se->n_fields - 1 ? "" : " ");
28942 + trace_seq_putc(s, '\n');
28945 + return trace_handle_return(s);
28948 -static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
28949 - struct tracing_map_elt *from)
28950 +static struct trace_event_functions synth_event_funcs = {
28951 + .trace = print_synth_event
28954 +static notrace void trace_event_raw_event_synth(void *__data,
28955 + u64 *var_ref_vals,
28956 + unsigned int var_ref_idx)
28958 - char *comm_from = from->private_data;
28959 - char *comm_to = to->private_data;
28960 + struct trace_event_file *trace_file = __data;
28961 + struct synth_trace_event *entry;
28962 + struct trace_event_buffer fbuffer;
28963 + struct ring_buffer *buffer;
28964 + struct synth_event *event;
28965 + unsigned int i, n_u64;
28966 + int fields_size = 0;
28968 + event = trace_file->event_call->data;
28970 + if (trace_trigger_soft_disabled(trace_file))
28973 + fields_size = event->n_u64 * sizeof(u64);
28976 + * Avoid ring buffer recursion detection, as this event
28977 + * is being performed within another event.
28979 + buffer = trace_file->tr->trace_buffer.buffer;
28980 + ring_buffer_nest_start(buffer);
28982 + entry = trace_event_buffer_reserve(&fbuffer, trace_file,
28983 + sizeof(*entry) + fields_size);
28987 + for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
28988 + if (event->fields[i]->is_string) {
28989 + char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
28990 + char *str_field = (char *)&entry->fields[n_u64];
28992 + strscpy(str_field, str_val, STR_VAR_LEN_MAX);
28993 + n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
28995 + entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
29000 + trace_event_buffer_commit(&fbuffer);
29002 + ring_buffer_nest_end(buffer);
29006 - memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
29007 +static void free_synth_event_print_fmt(struct trace_event_call *call)
29010 + kfree(call->print_fmt);
29011 + call->print_fmt = NULL;
29015 -static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
29016 +static int __set_synth_event_print_fmt(struct synth_event *event,
29017 + char *buf, int len)
29019 - char *comm = elt->private_data;
29024 + /* When len=0, we just calculate the needed length */
29025 +#define LEN_OR_ZERO (len ? len - pos : 0)
29027 + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
29028 + for (i = 0; i < event->n_fields; i++) {
29029 + fmt = synth_field_fmt(event->fields[i]->type);
29030 + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
29031 + event->fields[i]->name, fmt,
29032 + i == event->n_fields - 1 ? "" : ", ");
29034 + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
29036 + for (i = 0; i < event->n_fields; i++) {
29037 + pos += snprintf(buf + pos, LEN_OR_ZERO,
29038 + ", REC->%s", event->fields[i]->name);
29041 +#undef LEN_OR_ZERO
29044 - save_comm(comm, current);
29045 + /* return the length of print_fmt */
29049 -static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
29050 - .elt_alloc = hist_trigger_elt_comm_alloc,
29051 - .elt_copy = hist_trigger_elt_comm_copy,
29052 - .elt_free = hist_trigger_elt_comm_free,
29053 - .elt_init = hist_trigger_elt_comm_init,
29055 +static int set_synth_event_print_fmt(struct trace_event_call *call)
29057 + struct synth_event *event = call->data;
29061 + /* First: called with 0 length to calculate the needed length */
29062 + len = __set_synth_event_print_fmt(event, NULL, 0);
29064 + print_fmt = kmalloc(len + 1, GFP_KERNEL);
29068 + /* Second: actually write the @print_fmt */
29069 + __set_synth_event_print_fmt(event, print_fmt, len + 1);
29070 + call->print_fmt = print_fmt;
29072 -static void destroy_hist_field(struct hist_field *hist_field)
29076 +static void free_synth_field(struct synth_field *field)
29078 - kfree(hist_field);
29079 + kfree(field->type);
29080 + kfree(field->name);
29084 -static struct hist_field *create_hist_field(struct ftrace_event_field *field,
29085 - unsigned long flags)
29086 +static struct synth_field *parse_synth_field(char *field_type,
29087 + char *field_name)
29089 - struct hist_field *hist_field;
29090 + struct synth_field *field;
29091 + int len, ret = 0;
29094 - if (field && is_function_field(field))
29096 + if (field_type[0] == ';')
29099 - hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
29102 + len = strlen(field_name);
29103 + if (field_name[len - 1] == ';')
29104 + field_name[len - 1] = '\0';
29106 - if (flags & HIST_FIELD_FL_HITCOUNT) {
29107 - hist_field->fn = hist_field_counter;
29109 + field = kzalloc(sizeof(*field), GFP_KERNEL);
29111 + return ERR_PTR(-ENOMEM);
29113 + len = strlen(field_type) + 1;
29114 + array = strchr(field_name, '[');
29116 + len += strlen(array);
29117 + field->type = kzalloc(len, GFP_KERNEL);
29118 + if (!field->type) {
29122 + strcat(field->type, field_type);
29124 + strcat(field->type, array);
29128 - if (flags & HIST_FIELD_FL_STACKTRACE) {
29129 - hist_field->fn = hist_field_none;
29131 + field->size = synth_field_size(field->type);
29132 + if (!field->size) {
29137 - if (flags & HIST_FIELD_FL_LOG2) {
29138 - hist_field->fn = hist_field_log2;
29140 + if (synth_field_is_string(field->type))
29141 + field->is_string = true;
29143 + field->is_signed = synth_field_signed(field->type);
29145 + field->name = kstrdup(field_name, GFP_KERNEL);
29146 + if (!field->name) {
29153 + free_synth_field(field);
29154 + field = ERR_PTR(ret);
29158 +static void free_synth_tracepoint(struct tracepoint *tp)
29167 +static struct tracepoint *alloc_synth_tracepoint(char *name)
29169 + struct tracepoint *tp;
29171 + tp = kzalloc(sizeof(*tp), GFP_KERNEL);
29173 + return ERR_PTR(-ENOMEM);
29175 + tp->name = kstrdup(name, GFP_KERNEL);
29178 + return ERR_PTR(-ENOMEM);
29184 +typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
29185 + unsigned int var_ref_idx);
29187 +static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
29188 + unsigned int var_ref_idx)
29190 + struct tracepoint *tp = event->tp;
29192 + if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
29193 + struct tracepoint_func *probe_func_ptr;
29194 + synth_probe_func_t probe_func;
29197 + if (!(cpu_online(raw_smp_processor_id())))
29200 + probe_func_ptr = rcu_dereference_sched((tp)->funcs);
29201 + if (probe_func_ptr) {
29203 + probe_func = probe_func_ptr->func;
29204 + __data = probe_func_ptr->data;
29205 + probe_func(__data, var_ref_vals, var_ref_idx);
29206 + } while ((++probe_func_ptr)->func);
29211 +static struct synth_event *find_synth_event(const char *name)
29213 + struct synth_event *event;
29215 + list_for_each_entry(event, &synth_event_list, list) {
29216 + if (strcmp(event->name, name) == 0)
29223 +static int register_synth_event(struct synth_event *event)
29225 + struct trace_event_call *call = &event->call;
29228 + event->call.class = &event->class;
29229 + event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
29230 + if (!event->class.system) {
29235 + event->tp = alloc_synth_tracepoint(event->name);
29236 + if (IS_ERR(event->tp)) {
29237 + ret = PTR_ERR(event->tp);
29238 + event->tp = NULL;
29242 + INIT_LIST_HEAD(&call->class->fields);
29243 + call->event.funcs = &synth_event_funcs;
29244 + call->class->define_fields = synth_event_define_fields;
29246 + ret = register_trace_event(&call->event);
29251 + call->flags = TRACE_EVENT_FL_TRACEPOINT;
29252 + call->class->reg = trace_event_reg;
29253 + call->class->probe = trace_event_raw_event_synth;
29254 + call->data = event;
29255 + call->tp = event->tp;
29257 + ret = trace_add_event_call(call);
29259 + pr_warn("Failed to register synthetic event: %s\n",
29260 + trace_event_name(call));
29264 + ret = set_synth_event_print_fmt(call);
29266 + trace_remove_event_call(call);
29272 + unregister_trace_event(&call->event);
29276 +static int unregister_synth_event(struct synth_event *event)
29278 + struct trace_event_call *call = &event->call;
29281 + ret = trace_remove_event_call(call);
29286 +static void free_synth_event(struct synth_event *event)
29293 + for (i = 0; i < event->n_fields; i++)
29294 + free_synth_field(event->fields[i]);
29296 + kfree(event->fields);
29297 + kfree(event->name);
29298 + kfree(event->class.system);
29299 + free_synth_tracepoint(event->tp);
29300 + free_synth_event_print_fmt(&event->call);
29304 +static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
29305 + struct synth_field **fields)
29307 + struct synth_event *event;
29310 + event = kzalloc(sizeof(*event), GFP_KERNEL);
29312 + event = ERR_PTR(-ENOMEM);
29316 + event->name = kstrdup(event_name, GFP_KERNEL);
29317 + if (!event->name) {
29319 + event = ERR_PTR(-ENOMEM);
29323 + event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
29324 + if (!event->fields) {
29325 + free_synth_event(event);
29326 + event = ERR_PTR(-ENOMEM);
29330 + for (i = 0; i < n_fields; i++)
29331 + event->fields[i] = fields[i];
29333 + event->n_fields = n_fields;
29338 +static void action_trace(struct hist_trigger_data *hist_data,
29339 + struct tracing_map_elt *elt, void *rec,
29340 + struct ring_buffer_event *rbe,
29341 + struct action_data *data, u64 *var_ref_vals)
29343 + struct synth_event *event = data->onmatch.synth_event;
29345 + trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
29348 +struct hist_var_data {
29349 + struct list_head list;
29350 + struct hist_trigger_data *hist_data;
29353 +static void add_or_delete_synth_event(struct synth_event *event, int delete)
29356 + free_synth_event(event);
29358 + mutex_lock(&synth_event_mutex);
29359 + if (!find_synth_event(event->name))
29360 + list_add(&event->list, &synth_event_list);
29362 + free_synth_event(event);
29363 + mutex_unlock(&synth_event_mutex);
29367 +static int create_synth_event(int argc, char **argv)
29369 + struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
29370 + struct synth_event *event = NULL;
29371 + bool delete_event = false;
29372 + int i, n_fields = 0, ret = 0;
29375 + mutex_lock(&synth_event_mutex);
29378 + * Argument syntax:
29379 + * - Add synthetic event: <event_name> field[;field] ...
29380 + * - Remove synthetic event: !<event_name> field[;field] ...
29381 + * where 'field' = type field_name
29389 + if (name[0] == '!') {
29390 + delete_event = true;
29394 + event = find_synth_event(name);
29396 + if (delete_event) {
29397 + if (event->ref) {
29402 + list_del(&event->list);
29408 + } else if (delete_event)
29416 + for (i = 1; i < argc - 1; i++) {
29417 + if (strcmp(argv[i], ";") == 0)
29419 + if (n_fields == SYNTH_FIELDS_MAX) {
29424 + field = parse_synth_field(argv[i], argv[i + 1]);
29425 + if (IS_ERR(field)) {
29426 + ret = PTR_ERR(field);
29429 + fields[n_fields] = field;
29438 + event = alloc_synth_event(name, n_fields, fields);
29439 + if (IS_ERR(event)) {
29440 + ret = PTR_ERR(event);
29445 + mutex_unlock(&synth_event_mutex);
29448 + if (delete_event) {
29449 + ret = unregister_synth_event(event);
29450 + add_or_delete_synth_event(event, !ret);
29452 + ret = register_synth_event(event);
29453 + add_or_delete_synth_event(event, ret);
29459 + mutex_unlock(&synth_event_mutex);
29461 + for (i = 0; i < n_fields; i++)
29462 + free_synth_field(fields[i]);
29463 + free_synth_event(event);
29468 +static int release_all_synth_events(void)
29470 + struct list_head release_events;
29471 + struct synth_event *event, *e;
29474 + INIT_LIST_HEAD(&release_events);
29476 + mutex_lock(&synth_event_mutex);
29478 + list_for_each_entry(event, &synth_event_list, list) {
29479 + if (event->ref) {
29480 + mutex_unlock(&synth_event_mutex);
29485 + list_splice_init(&event->list, &release_events);
29487 + mutex_unlock(&synth_event_mutex);
29489 + list_for_each_entry_safe(event, e, &release_events, list) {
29490 + list_del(&event->list);
29492 + ret = unregister_synth_event(event);
29493 + add_or_delete_synth_event(event, !ret);
29500 +static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
29502 + mutex_lock(&synth_event_mutex);
29504 + return seq_list_start(&synth_event_list, *pos);
29507 +static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
29509 + return seq_list_next(v, &synth_event_list, pos);
29512 +static void synth_events_seq_stop(struct seq_file *m, void *v)
29514 + mutex_unlock(&synth_event_mutex);
29517 +static int synth_events_seq_show(struct seq_file *m, void *v)
29519 + struct synth_field *field;
29520 + struct synth_event *event = v;
29523 + seq_printf(m, "%s\t", event->name);
29525 + for (i = 0; i < event->n_fields; i++) {
29526 + field = event->fields[i];
29528 + /* parameter values */
29529 + seq_printf(m, "%s %s%s", field->type, field->name,
29530 + i == event->n_fields - 1 ? "" : "; ");
29533 + seq_putc(m, '\n');
29538 +static const struct seq_operations synth_events_seq_op = {
29539 + .start = synth_events_seq_start,
29540 + .next = synth_events_seq_next,
29541 + .stop = synth_events_seq_stop,
29542 + .show = synth_events_seq_show
29545 +static int synth_events_open(struct inode *inode, struct file *file)
29549 + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
29550 + ret = release_all_synth_events();
29555 + return seq_open(file, &synth_events_seq_op);
29558 +static ssize_t synth_events_write(struct file *file,
29559 + const char __user *buffer,
29560 + size_t count, loff_t *ppos)
29562 + return trace_parse_run_command(file, buffer, count, ppos,
29563 + create_synth_event);
29566 +static const struct file_operations synth_events_fops = {
29567 + .open = synth_events_open,
29568 + .write = synth_events_write,
29569 + .read = seq_read,
29570 + .llseek = seq_lseek,
29571 + .release = seq_release,
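With synth_events registered, synthetic events are defined by writing "<name> type field[; type field ...]" and removed by writing the same line prefixed with '!', per the syntax comment in create_synth_event(). A hedged sketch; the path and field names are illustrative:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static int write_str(const char *path, const char *s)
    {
        int fd = open(path, O_WRONLY | O_APPEND);

        if (fd < 0)
            return -1;
        if (write(fd, s, strlen(s)) < 0) {
            close(fd);
            return -1;
        }
        return close(fd);
    }

    int main(void)
    {
        const char *path = "/sys/kernel/debug/tracing/synth_events";

        /* add: event name followed by "type name;" fields */
        if (write_str(path, "wakeup_latency u64 lat; pid_t pid\n"))
            perror("create synthetic event");

        /* remove: same line prefixed with '!' */
        if (write_str(path, "!wakeup_latency u64 lat; pid_t pid\n"))
            perror("remove synthetic event");
        return 0;
    }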
29574 +static u64 hist_field_timestamp(struct hist_field *hist_field,
29575 + struct tracing_map_elt *elt,
29576 + struct ring_buffer_event *rbe,
29579 + struct hist_trigger_data *hist_data = hist_field->hist_data;
29580 + struct trace_array *tr = hist_data->event_file->tr;
29582 + u64 ts = ring_buffer_event_time_stamp(rbe);
29584 + if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
29585 + ts = ns2usecs(ts);
29590 +static u64 hist_field_cpu(struct hist_field *hist_field,
29591 + struct tracing_map_elt *elt,
29592 + struct ring_buffer_event *rbe,
29595 + int cpu = smp_processor_id();
29600 +static struct hist_field *
29601 +check_field_for_var_ref(struct hist_field *hist_field,
29602 + struct hist_trigger_data *var_data,
29603 + unsigned int var_idx)
29605 + struct hist_field *found = NULL;
29607 + if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
29608 + if (hist_field->var.idx == var_idx &&
29609 + hist_field->var.hist_data == var_data) {
29610 + found = hist_field;
29617 +static struct hist_field *
29618 +check_field_for_var_refs(struct hist_trigger_data *hist_data,
29619 + struct hist_field *hist_field,
29620 + struct hist_trigger_data *var_data,
29621 + unsigned int var_idx,
29622 + unsigned int level)
29624 + struct hist_field *found = NULL;
29633 + found = check_field_for_var_ref(hist_field, var_data, var_idx);
29637 + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
29638 + struct hist_field *operand;
29640 + operand = hist_field->operands[i];
29641 + found = check_field_for_var_refs(hist_data, operand, var_data,
29642 + var_idx, level + 1);
29650 +static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
29651 + struct hist_trigger_data *var_data,
29652 + unsigned int var_idx)
29654 + struct hist_field *hist_field, *found = NULL;
29657 + for_each_hist_field(i, hist_data) {
29658 + hist_field = hist_data->fields[i];
29659 + found = check_field_for_var_refs(hist_data, hist_field,
29660 + var_data, var_idx, 0);
29665 + for (i = 0; i < hist_data->n_synth_var_refs; i++) {
29666 + hist_field = hist_data->synth_var_refs[i];
29667 + found = check_field_for_var_refs(hist_data, hist_field,
29668 + var_data, var_idx, 0);
29676 +static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
29677 + unsigned int var_idx)
29679 + struct trace_array *tr = hist_data->event_file->tr;
29680 + struct hist_field *found = NULL;
29681 + struct hist_var_data *var_data;
29683 + list_for_each_entry(var_data, &tr->hist_vars, list) {
29684 + if (var_data->hist_data == hist_data)
29686 + found = find_var_ref(var_data->hist_data, hist_data, var_idx);
29694 +static bool check_var_refs(struct hist_trigger_data *hist_data)
29696 + struct hist_field *field;
29697 + bool found = false;
29700 + for_each_hist_field(i, hist_data) {
29701 + field = hist_data->fields[i];
29702 + if (field && field->flags & HIST_FIELD_FL_VAR) {
29703 + if (find_any_var_ref(hist_data, field->var.idx)) {
29713 +static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data)
29715 + struct trace_array *tr = hist_data->event_file->tr;
29716 + struct hist_var_data *var_data, *found = NULL;
29718 + list_for_each_entry(var_data, &tr->hist_vars, list) {
29719 + if (var_data->hist_data == hist_data) {
29720 + found = var_data;
29728 +static bool field_has_hist_vars(struct hist_field *hist_field,
29729 + unsigned int level)
29739 + if (hist_field->flags & HIST_FIELD_FL_VAR ||
29740 + hist_field->flags & HIST_FIELD_FL_VAR_REF)
29743 + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
29744 + struct hist_field *operand;
29746 + operand = hist_field->operands[i];
29747 + if (field_has_hist_vars(operand, level + 1))
29754 +static bool has_hist_vars(struct hist_trigger_data *hist_data)
29756 + struct hist_field *hist_field;
29759 + for_each_hist_field(i, hist_data) {
29760 + hist_field = hist_data->fields[i];
29761 + if (field_has_hist_vars(hist_field, 0))
29768 +static int save_hist_vars(struct hist_trigger_data *hist_data)
29770 + struct trace_array *tr = hist_data->event_file->tr;
29771 + struct hist_var_data *var_data;
29773 + var_data = find_hist_vars(hist_data);
29777 + if (trace_array_get(tr) < 0)
29780 + var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
29782 + trace_array_put(tr);
29786 + var_data->hist_data = hist_data;
29787 + list_add(&var_data->list, &tr->hist_vars);
29792 +static void remove_hist_vars(struct hist_trigger_data *hist_data)
29794 + struct trace_array *tr = hist_data->event_file->tr;
29795 + struct hist_var_data *var_data;
29797 + var_data = find_hist_vars(hist_data);
29801 + if (WARN_ON(check_var_refs(hist_data)))
29804 + list_del(&var_data->list);
29808 + trace_array_put(tr);
29811 +static struct hist_field *find_var_field(struct hist_trigger_data *hist_data,
29812 + const char *var_name)
29814 + struct hist_field *hist_field, *found = NULL;
29817 + for_each_hist_field(i, hist_data) {
29818 + hist_field = hist_data->fields[i];
29819 + if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR &&
29820 + strcmp(hist_field->var.name, var_name) == 0) {
29821 + found = hist_field;
29829 +static struct hist_field *find_var(struct hist_trigger_data *hist_data,
29830 + struct trace_event_file *file,
29831 + const char *var_name)
29833 + struct hist_trigger_data *test_data;
29834 + struct event_trigger_data *test;
29835 + struct hist_field *hist_field;
29837 + hist_field = find_var_field(hist_data, var_name);
29839 + return hist_field;
29841 + list_for_each_entry_rcu(test, &file->triggers, list) {
29842 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
29843 + test_data = test->private_data;
29844 + hist_field = find_var_field(test_data, var_name);
29846 + return hist_field;
29853 +static struct trace_event_file *find_var_file(struct trace_array *tr,
29855 + char *event_name,
29858 + struct hist_trigger_data *var_hist_data;
29859 + struct hist_var_data *var_data;
29860 + struct trace_event_file *file, *found = NULL;
29863 + return find_event_file(tr, system, event_name);
29865 + list_for_each_entry(var_data, &tr->hist_vars, list) {
29866 + var_hist_data = var_data->hist_data;
29867 + file = var_hist_data->event_file;
29868 + if (file == found)
29871 + if (find_var_field(var_hist_data, var_name)) {
29873 + hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
29884 +static struct hist_field *find_file_var(struct trace_event_file *file,
29885 + const char *var_name)
29887 + struct hist_trigger_data *test_data;
29888 + struct event_trigger_data *test;
29889 + struct hist_field *hist_field;
29891 + list_for_each_entry_rcu(test, &file->triggers, list) {
29892 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
29893 + test_data = test->private_data;
29894 + hist_field = find_var_field(test_data, var_name);
29896 + return hist_field;
29903 +static struct hist_field *
29904 +find_match_var(struct hist_trigger_data *hist_data, char *var_name)
29906 + struct trace_array *tr = hist_data->event_file->tr;
29907 + struct hist_field *hist_field, *found = NULL;
29908 + struct trace_event_file *file;
29911 + for (i = 0; i < hist_data->n_actions; i++) {
29912 + struct action_data *data = hist_data->actions[i];
29914 + if (data->fn == action_trace) {
29915 + char *system = data->onmatch.match_event_system;
29916 + char *event_name = data->onmatch.match_event;
29918 + file = find_var_file(tr, system, event_name, var_name);
29921 + hist_field = find_file_var(file, var_name);
29922 + if (hist_field) {
29924 + hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
29925 + return ERR_PTR(-EINVAL);
29928 + found = hist_field;
29935 +static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
29937 + char *event_name,
29940 + struct trace_array *tr = hist_data->event_file->tr;
29941 + struct hist_field *hist_field = NULL;
29942 + struct trace_event_file *file;
29944 + if (!system || !event_name) {
29945 + hist_field = find_match_var(hist_data, var_name);
29946 + if (IS_ERR(hist_field))
29949 + return hist_field;
29952 + file = find_var_file(tr, system, event_name, var_name);
29956 + hist_field = find_file_var(file, var_name);
29958 + return hist_field;
29961 +struct hist_elt_data {
29963 + u64 *var_ref_vals;
29964 + char *field_var_str[SYNTH_FIELDS_MAX];
29967 +static u64 hist_field_var_ref(struct hist_field *hist_field,
29968 + struct tracing_map_elt *elt,
29969 + struct ring_buffer_event *rbe,
29972 + struct hist_elt_data *elt_data;
29975 + elt_data = elt->private_data;
29976 + var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
29981 +static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key,
29982 + u64 *var_ref_vals, bool self)
29984 + struct hist_trigger_data *var_data;
29985 + struct tracing_map_elt *var_elt;
29986 + struct hist_field *hist_field;
29987 + unsigned int i, var_idx;
29988 + bool resolved = true;
29991 + for (i = 0; i < hist_data->n_var_refs; i++) {
29992 + hist_field = hist_data->var_refs[i];
29993 + var_idx = hist_field->var.idx;
29994 + var_data = hist_field->var.hist_data;
29996 + if (var_data == NULL) {
29997 + resolved = false;
30001 + if ((self && var_data != hist_data) ||
30002 + (!self && var_data == hist_data))
30005 + var_elt = tracing_map_lookup(var_data->map, key);
30007 + resolved = false;
30011 + if (!tracing_map_var_set(var_elt, var_idx)) {
30012 + resolved = false;
30016 + if (self || !hist_field->read_once)
30017 + var_val = tracing_map_read_var(var_elt, var_idx);
30019 + var_val = tracing_map_read_var_once(var_elt, var_idx);
30021 + var_ref_vals[i] = var_val;
30027 +static const char *hist_field_name(struct hist_field *field,
30028 + unsigned int level)
30030 + const char *field_name = "";
30033 + return field_name;
30035 + if (field->field)
30036 + field_name = field->field->name;
30037 + else if (field->flags & HIST_FIELD_FL_LOG2 ||
30038 + field->flags & HIST_FIELD_FL_ALIAS)
30039 + field_name = hist_field_name(field->operands[0], ++level);
30040 + else if (field->flags & HIST_FIELD_FL_CPU)
30041 + field_name = "cpu";
30042 + else if (field->flags & HIST_FIELD_FL_EXPR ||
30043 + field->flags & HIST_FIELD_FL_VAR_REF) {
30044 + if (field->system) {
30045 + static char full_name[MAX_FILTER_STR_VAL];
30047 + strcat(full_name, field->system);
30048 + strcat(full_name, ".");
30049 + strcat(full_name, field->event_name);
30050 + strcat(full_name, ".");
30051 + strcat(full_name, field->name);
30052 + field_name = full_name;
30054 + field_name = field->name;
30055 + } else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
30056 + field_name = "common_timestamp";
30058 + if (field_name == NULL)
30061 + return field_name;
30064 +static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
30066 + hist_field_fn_t fn = NULL;
30068 + switch (field_size) {
30070 + if (field_is_signed)
30071 + fn = hist_field_s64;
30073 + fn = hist_field_u64;
30076 + if (field_is_signed)
30077 + fn = hist_field_s32;
30079 + fn = hist_field_u32;
30082 + if (field_is_signed)
30083 + fn = hist_field_s16;
30085 + fn = hist_field_u16;
30088 + if (field_is_signed)
30089 + fn = hist_field_s8;
30091 + fn = hist_field_u8;
30098 +static int parse_map_size(char *str)
30100 + unsigned long size, map_bits;
30103 + strsep(&str, "=");
30109 + ret = kstrtoul(str, 0, &size);
30113 + map_bits = ilog2(roundup_pow_of_two(size));
30114 + if (map_bits < TRACING_MAP_BITS_MIN ||
30115 + map_bits > TRACING_MAP_BITS_MAX)
30123 +static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
30130 + for (i = 0; i < attrs->n_assignments; i++)
30131 + kfree(attrs->assignment_str[i]);
30133 + for (i = 0; i < attrs->n_actions; i++)
30134 + kfree(attrs->action_str[i]);
30136 + kfree(attrs->name);
30137 + kfree(attrs->sort_key_str);
30138 + kfree(attrs->keys_str);
30139 + kfree(attrs->vals_str);
30140 + kfree(attrs->clock);
30144 +static int parse_action(char *str, struct hist_trigger_attrs *attrs)
30146 + int ret = -EINVAL;
30148 + if (attrs->n_actions >= HIST_ACTIONS_MAX)
30151 + if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
30152 + (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
30153 + attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
30154 + if (!attrs->action_str[attrs->n_actions]) {
30158 + attrs->n_actions++;
30165 +static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
30169 + if ((strncmp(str, "key=", strlen("key=")) == 0) ||
30170 + (strncmp(str, "keys=", strlen("keys=")) == 0)) {
30171 + attrs->keys_str = kstrdup(str, GFP_KERNEL);
30172 + if (!attrs->keys_str) {
30176 + } else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
30177 + (strncmp(str, "vals=", strlen("vals=")) == 0) ||
30178 + (strncmp(str, "values=", strlen("values=")) == 0)) {
30179 + attrs->vals_str = kstrdup(str, GFP_KERNEL);
30180 + if (!attrs->vals_str) {
30184 + } else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
30185 + attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
30186 + if (!attrs->sort_key_str) {
30190 + } else if (strncmp(str, "name=", strlen("name=")) == 0) {
30191 + attrs->name = kstrdup(str, GFP_KERNEL);
30192 + if (!attrs->name) {
30196 + } else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
30197 + strsep(&str, "=");
30203 + str = strstrip(str);
30204 + attrs->clock = kstrdup(str, GFP_KERNEL);
30205 + if (!attrs->clock) {
30209 + } else if (strncmp(str, "size=", strlen("size=")) == 0) {
30210 + int map_bits = parse_map_size(str);
30212 + if (map_bits < 0) {
30216 + attrs->map_bits = map_bits;
30218 + char *assignment;
30220 + if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
30221 + hist_err("Too many variables defined: ", str);
30226 + assignment = kstrdup(str, GFP_KERNEL);
30227 + if (!assignment) {
30232 + attrs->assignment_str[attrs->n_assignments++] = assignment;
30238 +static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
30240 + struct hist_trigger_attrs *attrs;
30243 + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
30245 + return ERR_PTR(-ENOMEM);
30247 + while (trigger_str) {
30248 + char *str = strsep(&trigger_str, ":");
30250 + if (strchr(str, '=')) {
30251 + ret = parse_assignment(str, attrs);
30254 + } else if (strcmp(str, "pause") == 0)
30255 + attrs->pause = true;
30256 + else if ((strcmp(str, "cont") == 0) ||
30257 + (strcmp(str, "continue") == 0))
30258 + attrs->cont = true;
30259 + else if (strcmp(str, "clear") == 0)
30260 + attrs->clear = true;
30262 + ret = parse_action(str, attrs);
30268 + if (!attrs->keys_str) {
30273 + if (!attrs->clock) {
30274 + attrs->clock = kstrdup("global", GFP_KERNEL);
30275 + if (!attrs->clock) {
30283 + destroy_hist_trigger_attrs(attrs);
30285 + return ERR_PTR(ret);
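Putting the attribute parsing together: tokens containing '=' go to parse_assignment() (keys/vals/sort/name/clock/size, or else a variable definition), and tokens starting with "onmatch(" or "onmax(" go to parse_action(). The strings below sketch what such triggers might look like; the event and field names, and the exact onmatch() action syntax, are assumptions for illustration rather than verbatim from this patch:

    #include <stdio.h>

    /* Written to .../events/sched/sched_wakeup/trigger (assumed):
     * saves a per-pid microsecond timestamp into variable ts0. */
    static const char ts_trigger[] =
        "hist:keys=pid:ts0=common_timestamp.usecs";

    /* Written to .../events/sched/sched_switch/trigger (assumed):
     * computes a latency from $ts0 and, on a key match with
     * sched_wakeup, generates the wakeup_latency synthetic event. */
    static const char latency_trigger[] =
        "hist:keys=next_pid:"
        "wakeup_lat=common_timestamp.usecs-$ts0:"
        "onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid)";

    int main(void)
    {
        printf("%s\n%s\n", ts_trigger, latency_trigger);
        return 0;
    }

The first trigger defines the variable the second one consumes via the $ts0 reference; the subtraction is handled by the hist_field_minus() expression support added earlier in this patch.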
30288 +static inline void save_comm(char *comm, struct task_struct *task)
30290 + if (!task->pid) {
30291 + strcpy(comm, "<idle>");
30295 + if (WARN_ON_ONCE(task->pid < 0)) {
30296 + strcpy(comm, "<XXX>");
30300 + memcpy(comm, task->comm, TASK_COMM_LEN);
30303 +static void hist_elt_data_free(struct hist_elt_data *elt_data)
30307 + for (i = 0; i < SYNTH_FIELDS_MAX; i++)
30308 + kfree(elt_data->field_var_str[i]);
30310 + kfree(elt_data->comm);
30314 +static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
30316 + struct hist_elt_data *elt_data = elt->private_data;
30318 + hist_elt_data_free(elt_data);
30321 +static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
30323 + struct hist_trigger_data *hist_data = elt->map->private_data;
30324 + unsigned int size = TASK_COMM_LEN;
30325 + struct hist_elt_data *elt_data;
30326 + struct hist_field *key_field;
30327 + unsigned int i, n_str;
30329 + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
30333 + for_each_hist_key_field(i, hist_data) {
30334 + key_field = hist_data->fields[i];
30336 + if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
30337 + elt_data->comm = kzalloc(size, GFP_KERNEL);
30338 + if (!elt_data->comm) {
30346 + n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
30348 + size = STR_VAR_LEN_MAX;
30350 + for (i = 0; i < n_str; i++) {
30351 + elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
30352 + if (!elt_data->field_var_str[i]) {
30353 + hist_elt_data_free(elt_data);
30358 + elt->private_data = elt_data;
30363 +static void hist_trigger_elt_data_init(struct tracing_map_elt *elt)
30365 + struct hist_elt_data *elt_data = elt->private_data;
30367 + if (elt_data->comm)
30368 + save_comm(elt_data->comm, current);
30371 +static const struct tracing_map_ops hist_trigger_elt_data_ops = {
30372 + .elt_alloc = hist_trigger_elt_data_alloc,
30373 + .elt_free = hist_trigger_elt_data_free,
30374 + .elt_init = hist_trigger_elt_data_init,
30377 +static const char *get_hist_field_flags(struct hist_field *hist_field)
30379 + const char *flags_str = NULL;
30381 + if (hist_field->flags & HIST_FIELD_FL_HEX)
30382 + flags_str = "hex";
30383 + else if (hist_field->flags & HIST_FIELD_FL_SYM)
30384 + flags_str = "sym";
30385 + else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
30386 + flags_str = "sym-offset";
30387 + else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
30388 + flags_str = "execname";
30389 + else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
30390 + flags_str = "syscall";
30391 + else if (hist_field->flags & HIST_FIELD_FL_LOG2)
30392 + flags_str = "log2";
30393 + else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
30394 + flags_str = "usecs";
30396 + return flags_str;
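+/*
+ * Round-trip illustration (field name hypothetical): a key given as
+ * "call_site.sym-offset" has HIST_FIELD_FL_SYM_OFFSET set when
+ * parsed, and get_hist_field_flags() maps that flag back to the
+ * "sym-offset" string when the trigger or expression is printed.
+ */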
30399 +static void expr_field_str(struct hist_field *field, char *expr)
30401 + if (field->flags & HIST_FIELD_FL_VAR_REF)
30402 + strcat(expr, "$");
30404 + strcat(expr, hist_field_name(field, 0));
30406 + if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
30407 + const char *flags_str = get_hist_field_flags(field);
30410 + strcat(expr, ".");
30411 + strcat(expr, flags_str);
30416 +static char *expr_str(struct hist_field *field, unsigned int level)
30423 + expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
30427 + if (!field->operands[0]) {
30428 + expr_field_str(field, expr);
30432 + if (field->operator == FIELD_OP_UNARY_MINUS) {
30435 + strcat(expr, "-(");
30436 + subexpr = expr_str(field->operands[0], ++level);
30441 + strcat(expr, subexpr);
30442 + strcat(expr, ")");
30449 + expr_field_str(field->operands[0], expr);
30451 + switch (field->operator) {
30452 + case FIELD_OP_MINUS:
30453 + strcat(expr, "-");
30455 + case FIELD_OP_PLUS:
30456 + strcat(expr, "+");
30463 + expr_field_str(field->operands[1], expr);
30468 +static int contains_operator(char *str)
30470 + enum field_op_id field_op = FIELD_OP_NONE;
30473 + op = strpbrk(str, "+-");
30475 + return FIELD_OP_NONE;
30480 + field_op = FIELD_OP_UNARY_MINUS;
30482 + field_op = FIELD_OP_MINUS;
30485 + field_op = FIELD_OP_PLUS;
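+/*
+ * Examples of the classification above (strings illustrative): in
+ * "a-b" the '-' found by strpbrk() is not at the start of the
+ * string, so it is FIELD_OP_MINUS; in "-(a)" the string itself
+ * begins with '-', giving FIELD_OP_UNARY_MINUS; "a+b" gives
+ * FIELD_OP_PLUS.
+ */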
30494 +static void destroy_hist_field(struct hist_field *hist_field,
30495 + unsigned int level)
30505 + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
30506 + destroy_hist_field(hist_field->operands[i], level + 1);
30508 + kfree(hist_field->var.name);
30509 + kfree(hist_field->name);
30510 + kfree(hist_field->type);
30512 + kfree(hist_field);
30515 +static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
30516 + struct ftrace_event_field *field,
30517 + unsigned long flags,
30520 + struct hist_field *hist_field;
30522 + if (field && is_function_field(field))
30525 + hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
30529 + hist_field->hist_data = hist_data;
30531 + if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
30532 + goto out; /* caller will populate */
30534 + if (flags & HIST_FIELD_FL_VAR_REF) {
30535 + hist_field->fn = hist_field_var_ref;
30539 + if (flags & HIST_FIELD_FL_HITCOUNT) {
30540 + hist_field->fn = hist_field_counter;
30541 + hist_field->size = sizeof(u64);
30542 + hist_field->type = kstrdup("u64", GFP_KERNEL);
30543 + if (!hist_field->type)
30548 + if (flags & HIST_FIELD_FL_STACKTRACE) {
30549 + hist_field->fn = hist_field_none;
30553 + if (flags & HIST_FIELD_FL_LOG2) {
30554 + unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
30555 + hist_field->fn = hist_field_log2;
30556 + hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
30557 + hist_field->size = hist_field->operands[0]->size;
30558 + hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
30559 + if (!hist_field->type)
30564 + if (flags & HIST_FIELD_FL_TIMESTAMP) {
30565 + hist_field->fn = hist_field_timestamp;
30566 + hist_field->size = sizeof(u64);
30567 + hist_field->type = kstrdup("u64", GFP_KERNEL);
30568 + if (!hist_field->type)
30573 + if (flags & HIST_FIELD_FL_CPU) {
30574 + hist_field->fn = hist_field_cpu;
30575 + hist_field->size = sizeof(int);
30576 + hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
30577 + if (!hist_field->type)
30582 + if (WARN_ON_ONCE(!field))
30585 + if (is_string_field(field)) {
30586 + flags |= HIST_FIELD_FL_STRING;
30588 + hist_field->size = MAX_FILTER_STR_VAL;
30589 + hist_field->type = kstrdup(field->type, GFP_KERNEL);
30590 + if (!hist_field->type)
30593 + if (field->filter_type == FILTER_STATIC_STRING)
30594 + hist_field->fn = hist_field_string;
30595 + else if (field->filter_type == FILTER_DYN_STRING)
30596 + hist_field->fn = hist_field_dynstring;
30598 + hist_field->fn = hist_field_pstring;
30600 + hist_field->size = field->size;
30601 + hist_field->is_signed = field->is_signed;
30602 + hist_field->type = kstrdup(field->type, GFP_KERNEL);
30603 + if (!hist_field->type)
30606 + hist_field->fn = select_value_fn(field->size,
30607 + field->is_signed);
30608 + if (!hist_field->fn) {
30609 + destroy_hist_field(hist_field, 0);
30614 + hist_field->field = field;
30615 + hist_field->flags = flags;
30618 + hist_field->var.name = kstrdup(var_name, GFP_KERNEL);
30619 + if (!hist_field->var.name)
30623 + return hist_field;
30625 + destroy_hist_field(hist_field, 0);
30629 +static void destroy_hist_fields(struct hist_trigger_data *hist_data)
30633 + for (i = 0; i < HIST_FIELDS_MAX; i++) {
30634 + if (hist_data->fields[i]) {
30635 + destroy_hist_field(hist_data->fields[i], 0);
30636 + hist_data->fields[i] = NULL;
30641 +static int init_var_ref(struct hist_field *ref_field,
30642 + struct hist_field *var_field,
30643 + char *system, char *event_name)
30647 + ref_field->var.idx = var_field->var.idx;
30648 + ref_field->var.hist_data = var_field->hist_data;
30649 + ref_field->size = var_field->size;
30650 + ref_field->is_signed = var_field->is_signed;
30651 + ref_field->flags |= var_field->flags &
30652 + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
30655 + ref_field->system = kstrdup(system, GFP_KERNEL);
30656 + if (!ref_field->system)
30660 + if (event_name) {
30661 + ref_field->event_name = kstrdup(event_name, GFP_KERNEL);
30662 + if (!ref_field->event_name) {
30668 + if (var_field->var.name) {
30669 + ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL);
30670 + if (!ref_field->name) {
30674 + } else if (var_field->name) {
30675 + ref_field->name = kstrdup(var_field->name, GFP_KERNEL);
30676 + if (!ref_field->name) {
30682 + ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
30683 + if (!ref_field->type) {
30690 + kfree(ref_field->system);
30691 + kfree(ref_field->event_name);
30692 + kfree(ref_field->name);
30697 +static struct hist_field *create_var_ref(struct hist_field *var_field,
30698 + char *system, char *event_name)
30700 + unsigned long flags = HIST_FIELD_FL_VAR_REF;
30701 + struct hist_field *ref_field;
30703 + ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
30705 + if (init_var_ref(ref_field, var_field, system, event_name)) {
30706 + destroy_hist_field(ref_field, 0);
30711 + return ref_field;
30714 +static bool is_var_ref(char *var_name)
30716 + if (!var_name || strlen(var_name) < 2 || var_name[0] != '$')
30722 +static char *field_name_from_var(struct hist_trigger_data *hist_data,
30725 + char *name, *field;
30728 + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
30729 + name = hist_data->attrs->var_defs.name[i];
30731 + if (strcmp(var_name, name) == 0) {
30732 + field = hist_data->attrs->var_defs.expr[i];
30733 + if (contains_operator(field) || is_var_ref(field))
30742 +static char *local_field_var_ref(struct hist_trigger_data *hist_data,
30743 + char *system, char *event_name,
30746 + struct trace_event_call *call;
30748 + if (system && event_name) {
30749 + call = hist_data->event_file->event_call;
30751 + if (strcmp(system, call->class->system) != 0)
30754 + if (strcmp(event_name, trace_event_name(call)) != 0)
30758 + if (!!system != !!event_name)
30761 + if (!is_var_ref(var_name))
30766 + return field_name_from_var(hist_data, var_name);
30769 +static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
30770 + char *system, char *event_name,
30773 + struct hist_field *var_field = NULL, *ref_field = NULL;
30775 + if (!is_var_ref(var_name))
30780 + var_field = find_event_var(hist_data, system, event_name, var_name);
30782 + ref_field = create_var_ref(var_field, system, event_name);
30785 + hist_err_event("Couldn't find variable: $",
30786 + system, event_name, var_name);
30788 + return ref_field;
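+/*
+ * A variable reference is a '$'-prefixed name (see is_var_ref()
+ * above); e.g., a variable defined as "lat=..." on some event can
+ * later be referenced as "$lat", which find_event_var() resolves to
+ * the defining hist_field so a read-only reference can be created.
+ * The "lat" name is illustrative.
+ */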
30791 +static struct ftrace_event_field *
30792 +parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
30793 + char *field_str, unsigned long *flags)
30795 + struct ftrace_event_field *field = NULL;
30796 + char *field_name, *modifier, *str;
30798 + modifier = str = kstrdup(field_str, GFP_KERNEL);
30800 + return ERR_PTR(-ENOMEM);
30802 + field_name = strsep(&modifier, ".");
30804 + if (strcmp(modifier, "hex") == 0)
30805 + *flags |= HIST_FIELD_FL_HEX;
30806 + else if (strcmp(modifier, "sym") == 0)
30807 + *flags |= HIST_FIELD_FL_SYM;
30808 + else if (strcmp(modifier, "sym-offset") == 0)
30809 + *flags |= HIST_FIELD_FL_SYM_OFFSET;
30810 + else if ((strcmp(modifier, "execname") == 0) &&
30811 + (strcmp(field_name, "common_pid") == 0))
30812 + *flags |= HIST_FIELD_FL_EXECNAME;
30813 + else if (strcmp(modifier, "syscall") == 0)
30814 + *flags |= HIST_FIELD_FL_SYSCALL;
30815 + else if (strcmp(modifier, "log2") == 0)
30816 + *flags |= HIST_FIELD_FL_LOG2;
30817 + else if (strcmp(modifier, "usecs") == 0)
30818 + *flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
30820 + hist_err("Invalid field modifier: ", modifier);
30821 + field = ERR_PTR(-EINVAL);
30826 + if (strcmp(field_name, "common_timestamp") == 0) {
30827 + *flags |= HIST_FIELD_FL_TIMESTAMP;
30828 + hist_data->enable_timestamps = true;
30829 + if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
30830 + hist_data->attrs->ts_in_usecs = true;
30831 + } else if (strcmp(field_name, "cpu") == 0)
30832 + *flags |= HIST_FIELD_FL_CPU;
30834 + field = trace_find_event_field(file->event_call, field_name);
30835 + if (!field || !field->size) {
30836 + hist_err("Couldn't find field: ", field_name);
30837 + field = ERR_PTR(-EINVAL);
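+/*
+ * Modifier parsing example: for "common_pid.execname", strsep(".")
+ * splits off field_name "common_pid" and modifier "execname"; note
+ * that execname is only accepted together with common_pid, as
+ * checked above.
+ */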
30847 +static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
30848 + struct hist_field *var_ref,
30851 + struct hist_field *alias = NULL;
30852 + unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR;
30854 + alias = create_hist_field(hist_data, NULL, flags, var_name);
30858 + alias->fn = var_ref->fn;
30859 + alias->operands[0] = var_ref;
30861 + if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
30862 + destroy_hist_field(alias, 0);
30869 +static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
30870 + struct trace_event_file *file, char *str,
30871 + unsigned long *flags, char *var_name)
30873 + char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
30874 + struct ftrace_event_field *field = NULL;
30875 + struct hist_field *hist_field = NULL;
30878 + s = strchr(str, '.');
30880 + s = strchr(++s, '.');
30882 + ref_system = strsep(&str, ".");
30887 + ref_event = strsep(&str, ".");
30896 + s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
30898 + hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
30899 + if (hist_field) {
30900 + hist_data->var_refs[hist_data->n_var_refs] = hist_field;
30901 + hist_field->var_ref_idx = hist_data->n_var_refs++;
30903 + hist_field = create_alias(hist_data, hist_field, var_name);
30904 + if (!hist_field) {
30909 + return hist_field;
30914 + field = parse_field(hist_data, file, str, flags);
30915 + if (IS_ERR(field)) {
30916 + ret = PTR_ERR(field);
30920 + hist_field = create_hist_field(hist_data, field, *flags, var_name);
30921 + if (!hist_field) {
30926 + return hist_field;
30928 + return ERR_PTR(ret);
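+/*
+ * Forms accepted by parse_atom() (names hypothetical):
+ *
+ *   "len"             - a field on this event
+ *   "len.hex"         - a field plus display modifier
+ *   "$lat"            - a variable reference
+ *   "sys.event.$lat"  - a fully qualified variable reference
+ *
+ * The system/event prefixes are only split off when the string
+ * contains two '.' separators, per the strchr() checks above.
+ */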
30931 +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
30932 + struct trace_event_file *file,
30933 + char *str, unsigned long flags,
30934 + char *var_name, unsigned int level);
30936 +static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
30937 + struct trace_event_file *file,
30938 + char *str, unsigned long flags,
30939 + char *var_name, unsigned int level)
30941 + struct hist_field *operand1, *expr = NULL;
30942 + unsigned long operand_flags;
30946 +	/* we support only -(xxx), i.e. explicit parens required */

30949 + hist_err("Too many subexpressions (3 max): ", str);
30954 +	str++; /* skip leading '-' */
30956 + s = strchr(str, '(');
30964 + s = strrchr(str, ')');
30968 +		ret = -EINVAL; /* no closing ')' */
30972 + flags |= HIST_FIELD_FL_EXPR;
30973 + expr = create_hist_field(hist_data, NULL, flags, var_name);
30979 + operand_flags = 0;
30980 + operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
30981 + if (IS_ERR(operand1)) {
30982 + ret = PTR_ERR(operand1);
30986 + expr->flags |= operand1->flags &
30987 + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
30988 + expr->fn = hist_field_unary_minus;
30989 + expr->operands[0] = operand1;
30990 + expr->operator = FIELD_OP_UNARY_MINUS;
30991 + expr->name = expr_str(expr, 0);
30992 + expr->type = kstrdup(operand1->type, GFP_KERNEL);
30993 + if (!expr->type) {
31000 + destroy_hist_field(expr, 0);
31001 + return ERR_PTR(ret);
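+/*
+ * As the comment above notes, only the parenthesized form is
+ * accepted here: "-($lat)" parses as unary minus applied to the
+ * subexpression "$lat", while "-$lat" fails for lack of a '('.
+ * The "$lat" operand is illustrative.
+ */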
31004 +static int check_expr_operands(struct hist_field *operand1,
31005 + struct hist_field *operand2)
31007 + unsigned long operand1_flags = operand1->flags;
31008 + unsigned long operand2_flags = operand2->flags;
31010 + if ((operand1_flags & HIST_FIELD_FL_VAR_REF) ||
31011 + (operand1_flags & HIST_FIELD_FL_ALIAS)) {
31012 + struct hist_field *var;
31014 + var = find_var_field(operand1->var.hist_data, operand1->name);
31017 + operand1_flags = var->flags;
31020 + if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
31021 + (operand2_flags & HIST_FIELD_FL_ALIAS)) {
31022 + struct hist_field *var;
31024 + var = find_var_field(operand2->var.hist_data, operand2->name);
31027 + operand2_flags = var->flags;
31030 + if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
31031 + (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
31032 + hist_err("Timestamp units in expression don't match", NULL);
31039 +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
31040 + struct trace_event_file *file,
31041 + char *str, unsigned long flags,
31042 + char *var_name, unsigned int level)
31044 + struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
31045 + unsigned long operand_flags;
31046 + int field_op, ret = -EINVAL;
31047 + char *sep, *operand1_str;
31050 + hist_err("Too many subexpressions (3 max): ", str);
31051 + return ERR_PTR(-EINVAL);
31054 + field_op = contains_operator(str);
31056 + if (field_op == FIELD_OP_NONE)
31057 + return parse_atom(hist_data, file, str, &flags, var_name);
31059 + if (field_op == FIELD_OP_UNARY_MINUS)
31060 + return parse_unary(hist_data, file, str, flags, var_name, ++level);
31062 + switch (field_op) {
31063 + case FIELD_OP_MINUS:
31066 + case FIELD_OP_PLUS:
31073 + operand1_str = strsep(&str, sep);
31074 + if (!operand1_str || !str)
31077 + operand_flags = 0;
31078 + operand1 = parse_atom(hist_data, file, operand1_str,
31079 + &operand_flags, NULL);
31080 + if (IS_ERR(operand1)) {
31081 + ret = PTR_ERR(operand1);
31086 +	/* rest of string could be another expression, e.g. b+c in a+b+c */
31087 + operand_flags = 0;
31088 + operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
31089 + if (IS_ERR(operand2)) {
31090 + ret = PTR_ERR(operand2);
31095 + ret = check_expr_operands(operand1, operand2);
31099 + flags |= HIST_FIELD_FL_EXPR;
31101 + flags |= operand1->flags &
31102 + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
31104 + expr = create_hist_field(hist_data, NULL, flags, var_name);
31110 + operand1->read_once = true;
31111 + operand2->read_once = true;
31113 + expr->operands[0] = operand1;
31114 + expr->operands[1] = operand2;
31115 + expr->operator = field_op;
31116 + expr->name = expr_str(expr, 0);
31117 + expr->type = kstrdup(operand1->type, GFP_KERNEL);
31118 + if (!expr->type) {
31123 + switch (field_op) {
31124 + case FIELD_OP_MINUS:
31125 + expr->fn = hist_field_minus;
31127 + case FIELD_OP_PLUS:
31128 + expr->fn = hist_field_plus;
31137 + destroy_hist_field(operand1, 0);
31138 + destroy_hist_field(operand2, 0);
31139 + destroy_hist_field(expr, 0);
31141 + return ERR_PTR(ret);
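+/*
+ * A worked example (operand names illustrative): for
+ * "common_timestamp.usecs-$ts0", operand1 is the atom
+ * "common_timestamp.usecs" and operand2 the reference "$ts0";
+ * check_expr_operands() then rejects the pair if only one side has
+ * HIST_FIELD_FL_TIMESTAMP_USECS set, and the resulting expression
+ * inherits operand1's type.
+ */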
31144 +static char *find_trigger_filter(struct hist_trigger_data *hist_data,
31145 + struct trace_event_file *file)
31147 + struct event_trigger_data *test;
31149 + list_for_each_entry_rcu(test, &file->triggers, list) {
31150 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
31151 + if (test->private_data == hist_data)
31152 + return test->filter_str;
31159 +static struct event_command trigger_hist_cmd;
31160 +static int event_hist_trigger_func(struct event_command *cmd_ops,
31161 + struct trace_event_file *file,
31162 + char *glob, char *cmd, char *param);
31164 +static bool compatible_keys(struct hist_trigger_data *target_hist_data,
31165 + struct hist_trigger_data *hist_data,
31166 + unsigned int n_keys)
31168 + struct hist_field *target_hist_field, *hist_field;
31169 + unsigned int n, i, j;
31171 + if (hist_data->n_fields - hist_data->n_vals != n_keys)
31174 + i = hist_data->n_vals;
31175 + j = target_hist_data->n_vals;
31177 + for (n = 0; n < n_keys; n++) {
31178 + hist_field = hist_data->fields[i + n];
31179 + target_hist_field = target_hist_data->fields[j + n];
31181 + if (strcmp(hist_field->type, target_hist_field->type) != 0)
31183 + if (hist_field->size != target_hist_field->size)
31185 + if (hist_field->is_signed != target_hist_field->is_signed)
31192 +static struct hist_trigger_data *
31193 +find_compatible_hist(struct hist_trigger_data *target_hist_data,
31194 + struct trace_event_file *file)
31196 + struct hist_trigger_data *hist_data;
31197 + struct event_trigger_data *test;
31198 + unsigned int n_keys;
31200 + n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
31202 + list_for_each_entry_rcu(test, &file->triggers, list) {
31203 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
31204 + hist_data = test->private_data;
31206 + if (compatible_keys(target_hist_data, hist_data, n_keys))
31207 + return hist_data;
31214 +static struct trace_event_file *event_file(struct trace_array *tr,
31215 + char *system, char *event_name)
31217 + struct trace_event_file *file;
31219 + file = find_event_file(tr, system, event_name);
31221 + return ERR_PTR(-EINVAL);
31226 +static struct hist_field *
31227 +find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
31228 + char *system, char *event_name, char *field_name)
31230 + struct hist_field *event_var;
31231 + char *synthetic_name;
31233 + synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
31234 + if (!synthetic_name)
31235 + return ERR_PTR(-ENOMEM);
31237 + strcpy(synthetic_name, "synthetic_");
31238 + strcat(synthetic_name, field_name);
31240 + event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
31242 + kfree(synthetic_name);
31244 + return event_var;
31248 + * create_field_var_hist - Automatically create a histogram and var for a field
31249 + * @target_hist_data: The target hist trigger
31250 + * @subsys_name: Optional subsystem name
31251 + * @event_name: Optional event name
31252 + * @field_name: The name of the field (and the resulting variable)
31254 + * Hist trigger actions fetch data from variables, not directly from
31255 + * events. However, for convenience, users are allowed to directly
31256 + * specify an event field in an action, which will be automatically
31257 + * converted into a variable on their behalf.
31259 + * If a user specifies a field on an event other than the one the
31260 + * histogram is currently being defined on (the target event
31261 + * histogram), the only way that can be accomplished is to create a
31262 + * new hist trigger on that event and define the field variable there.
31264 + * This function creates a new histogram compatible with the target
31265 + * event (meaning a histogram with the same key as the target
31266 + * histogram), and creates a variable for the specified field, but
31267 + * with 'synthetic_' prepended to the variable name in order to avoid
31268 + * collision with normal field variables.
31270 + * Return: The variable created for the field.
31272 +static struct hist_field *
31273 +create_field_var_hist(struct hist_trigger_data *target_hist_data,
31274 + char *subsys_name, char *event_name, char *field_name)
31276 + struct trace_array *tr = target_hist_data->event_file->tr;
31277 + struct hist_field *event_var = ERR_PTR(-EINVAL);
31278 + struct hist_trigger_data *hist_data;
31279 + unsigned int i, n, first = true;
31280 + struct field_var_hist *var_hist;
31281 + struct trace_event_file *file;
31282 + struct hist_field *key_field;
31283 + char *saved_filter;
31287 + if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
31288 + hist_err_event("onmatch: Too many field variables defined: ",
31289 + subsys_name, event_name, field_name);
31290 + return ERR_PTR(-EINVAL);
31293 + file = event_file(tr, subsys_name, event_name);
31295 + if (IS_ERR(file)) {
31296 + hist_err_event("onmatch: Event file not found: ",
31297 + subsys_name, event_name, field_name);
31298 + ret = PTR_ERR(file);
31299 + return ERR_PTR(ret);
31303 + * Look for a histogram compatible with target. We'll use the
31304 + * found histogram specification to create a new matching
31305 + * histogram with our variable on it. target_hist_data is not
31306 + * yet a registered histogram so we can't use that.
31308 + hist_data = find_compatible_hist(target_hist_data, file);
31309 + if (!hist_data) {
31310 + hist_err_event("onmatch: Matching event histogram not found: ",
31311 + subsys_name, event_name, field_name);
31312 + return ERR_PTR(-EINVAL);
31315 + /* See if a synthetic field variable has already been created */
31316 + event_var = find_synthetic_field_var(target_hist_data, subsys_name,
31317 + event_name, field_name);
31318 + if (!IS_ERR_OR_NULL(event_var))
31319 + return event_var;
31321 + var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL);
31323 + return ERR_PTR(-ENOMEM);
31325 + cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
31328 + return ERR_PTR(-ENOMEM);
31331 + /* Use the same keys as the compatible histogram */
31332 + strcat(cmd, "keys=");
31334 + for_each_hist_key_field(i, hist_data) {
31335 + key_field = hist_data->fields[i];
31337 + strcat(cmd, ",");
31338 + strcat(cmd, key_field->field->name);
31342 + /* Create the synthetic field variable specification */
31343 + strcat(cmd, ":synthetic_");
31344 + strcat(cmd, field_name);
31345 + strcat(cmd, "=");
31346 + strcat(cmd, field_name);
31348 + /* Use the same filter as the compatible histogram */
31349 + saved_filter = find_trigger_filter(hist_data, file);
31350 + if (saved_filter) {
31351 + strcat(cmd, " if ");
31352 + strcat(cmd, saved_filter);
31355 + var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
31356 + if (!var_hist->cmd) {
31359 + return ERR_PTR(-ENOMEM);
31362 + /* Save the compatible histogram information */
31363 + var_hist->hist_data = hist_data;
31365 + /* Create the new histogram with our variable */
31366 + ret = event_hist_trigger_func(&trigger_hist_cmd, file,
31367 + "", "hist", cmd);
31370 + kfree(var_hist->cmd);
31372 + hist_err_event("onmatch: Couldn't create histogram for field: ",
31373 + subsys_name, event_name, field_name);
31374 + return ERR_PTR(ret);
31379 + /* If we can't find the variable, something went wrong */
31380 + event_var = find_synthetic_field_var(target_hist_data, subsys_name,
31381 + event_name, field_name);
31382 + if (IS_ERR_OR_NULL(event_var)) {
31383 + kfree(var_hist->cmd);
31385 + hist_err_event("onmatch: Couldn't find synthetic variable: ",
31386 + subsys_name, event_name, field_name);
31387 + return ERR_PTR(-EINVAL);
31390 + n = target_hist_data->n_field_var_hists;
31391 + target_hist_data->field_var_hists[n] = var_hist;
31392 + target_hist_data->n_field_var_hists++;
31394 + return event_var;
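+/*
+ * To make the above concrete (all names hypothetical): resolving
+ * sched.sched_switch.prev_prio for a target histogram keyed on pid
+ * would build roughly
+ *
+ *   keys=pid:synthetic_prev_prio=prev_prio [if <saved filter>]
+ *
+ * and register it on sched_switch via event_hist_trigger_func(),
+ * after which the "synthetic_prev_prio" variable can be found.
+ */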
31397 +static struct hist_field *
31398 +find_target_event_var(struct hist_trigger_data *hist_data,
31399 + char *subsys_name, char *event_name, char *var_name)
31401 + struct trace_event_file *file = hist_data->event_file;
31402 + struct hist_field *hist_field = NULL;
31404 + if (subsys_name) {
31405 + struct trace_event_call *call;
31410 + call = file->event_call;
31412 + if (strcmp(subsys_name, call->class->system) != 0)
31415 + if (strcmp(event_name, trace_event_name(call)) != 0)
31419 + hist_field = find_var_field(hist_data, var_name);
31421 + return hist_field;
31424 +static inline void __update_field_vars(struct tracing_map_elt *elt,
31425 + struct ring_buffer_event *rbe,
31427 + struct field_var **field_vars,
31428 + unsigned int n_field_vars,
31429 + unsigned int field_var_str_start)
31431 + struct hist_elt_data *elt_data = elt->private_data;
31432 + unsigned int i, j, var_idx;
31435 + for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
31436 + struct field_var *field_var = field_vars[i];
31437 + struct hist_field *var = field_var->var;
31438 + struct hist_field *val = field_var->val;
31440 + var_val = val->fn(val, elt, rbe, rec);
31441 + var_idx = var->var.idx;
31443 + if (val->flags & HIST_FIELD_FL_STRING) {
31444 + char *str = elt_data->field_var_str[j++];
31445 + char *val_str = (char *)(uintptr_t)var_val;
31447 + strscpy(str, val_str, STR_VAR_LEN_MAX);
31448 + var_val = (u64)(uintptr_t)str;
31450 + tracing_map_set_var(elt, var_idx, var_val);
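+/*
+ * Note on the string case above: a string value can't be stored in
+ * the u64 variable slot directly, so it is copied into one of the
+ * per-element field_var_str[] buffers (allocated in
+ * hist_trigger_elt_data_alloc()) and the slot holds the buffer
+ * pointer instead.
+ */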
31454 +static void update_field_vars(struct hist_trigger_data *hist_data,
31455 + struct tracing_map_elt *elt,
31456 + struct ring_buffer_event *rbe,
31459 + __update_field_vars(elt, rbe, rec, hist_data->field_vars,
31460 + hist_data->n_field_vars, 0);
31463 +static void update_max_vars(struct hist_trigger_data *hist_data,
31464 + struct tracing_map_elt *elt,
31465 + struct ring_buffer_event *rbe,
31468 + __update_field_vars(elt, rbe, rec, hist_data->max_vars,
31469 + hist_data->n_max_vars, hist_data->n_field_var_str);
31472 +static struct hist_field *create_var(struct hist_trigger_data *hist_data,
31473 + struct trace_event_file *file,
31474 + char *name, int size, const char *type)
31476 + struct hist_field *var;
31479 + if (find_var(hist_data, file, name) && !hist_data->remove) {
31480 + var = ERR_PTR(-EINVAL);
31484 + var = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
31486 + var = ERR_PTR(-ENOMEM);
31490 + idx = tracing_map_add_var(hist_data->map);
31493 + var = ERR_PTR(-EINVAL);
31497 + var->flags = HIST_FIELD_FL_VAR;
31498 + var->var.idx = idx;
31499 + var->var.hist_data = var->hist_data = hist_data;
31500 + var->size = size;
31501 + var->var.name = kstrdup(name, GFP_KERNEL);
31502 + var->type = kstrdup(type, GFP_KERNEL);
31503 + if (!var->var.name || !var->type) {
31504 + kfree(var->var.name);
31505 + kfree(var->type);
31507 + var = ERR_PTR(-ENOMEM);
31513 +static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
31514 + struct trace_event_file *file,
31515 + char *field_name)
31517 + struct hist_field *val = NULL, *var = NULL;
31518 + unsigned long flags = HIST_FIELD_FL_VAR;
31519 + struct field_var *field_var;
31522 + if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
31523 + hist_err("Too many field variables defined: ", field_name);
31528 + val = parse_atom(hist_data, file, field_name, &flags, NULL);
31529 + if (IS_ERR(val)) {
31530 + hist_err("Couldn't parse field variable: ", field_name);
31531 + ret = PTR_ERR(val);
31535 + var = create_var(hist_data, file, field_name, val->size, val->type);
31536 + if (IS_ERR(var)) {
31537 + hist_err("Couldn't create or find variable: ", field_name);
31539 + ret = PTR_ERR(var);
31543 + field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
31544 + if (!field_var) {
31551 + field_var->var = var;
31552 + field_var->val = val;
31554 + return field_var;
31556 + field_var = ERR_PTR(ret);
31561 + * create_target_field_var - Automatically create a variable for a field
31562 + * @target_hist_data: The target hist trigger
31563 + * @subsys_name: Optional subsystem name
31564 + * @event_name: Optional event name
31565 + * @var_name: The name of the field (and the resulting variable)
31567 + * Hist trigger actions fetch data from variables, not directly from
31568 + * events. However, for convenience, users are allowed to directly
31569 + * specify an event field in an action, which will be automatically
31570 + * converted into a variable on their behalf.
31572 + * This function creates a field variable with the name var_name on
31573 + * the hist trigger currently being defined on the target event. If
31574 + * subsys_name and event_name are specified, this function simply
31575 + * verifies that they do in fact match the target event's subsystem and event name.
31578 + * Return: The variable created for the field.
31580 +static struct field_var *
31581 +create_target_field_var(struct hist_trigger_data *target_hist_data,
31582 + char *subsys_name, char *event_name, char *var_name)
31584 + struct trace_event_file *file = target_hist_data->event_file;
31586 + if (subsys_name) {
31587 + struct trace_event_call *call;
31592 + call = file->event_call;
31594 + if (strcmp(subsys_name, call->class->system) != 0)
31597 + if (strcmp(event_name, trace_event_name(call)) != 0)
31601 + return create_field_var(target_hist_data, file, var_name);
31604 +static void onmax_print(struct seq_file *m,
31605 + struct hist_trigger_data *hist_data,
31606 + struct tracing_map_elt *elt,
31607 + struct action_data *data)
31609 + unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;
31611 + seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));
31613 + for (i = 0; i < hist_data->n_max_vars; i++) {
31614 + struct hist_field *save_val = hist_data->max_vars[i]->val;
31615 + struct hist_field *save_var = hist_data->max_vars[i]->var;
31618 + save_var_idx = save_var->var.idx;
31620 + val = tracing_map_read_var(elt, save_var_idx);
31622 + if (save_val->flags & HIST_FIELD_FL_STRING) {
31623 + seq_printf(m, " %s: %-32s", save_var->var.name,
31624 + (char *)(uintptr_t)(val));
31626 + seq_printf(m, " %s: %10llu", save_var->var.name, val);
31630 +static void onmax_save(struct hist_trigger_data *hist_data,
31631 + struct tracing_map_elt *elt, void *rec,
31632 + struct ring_buffer_event *rbe,
31633 + struct action_data *data, u64 *var_ref_vals)
31635 + unsigned int max_idx = data->onmax.max_var->var.idx;
31636 + unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
31638 + u64 var_val, max_val;
31640 + var_val = var_ref_vals[max_var_ref_idx];
31641 + max_val = tracing_map_read_var(elt, max_idx);
31643 + if (var_val <= max_val)
31646 + tracing_map_set_var(elt, max_idx, var_val);
31648 + update_max_vars(hist_data, elt, rbe, rec);
31651 +static void onmax_destroy(struct action_data *data)
31655 + destroy_hist_field(data->onmax.max_var, 0);
31656 + destroy_hist_field(data->onmax.var, 0);
31658 + kfree(data->onmax.var_str);
31659 + kfree(data->onmax.fn_name);
31661 + for (i = 0; i < data->n_params; i++)
31662 + kfree(data->params[i]);
31667 +static int onmax_create(struct hist_trigger_data *hist_data,
31668 + struct action_data *data)
31670 + struct trace_event_file *file = hist_data->event_file;
31671 + struct hist_field *var_field, *ref_field, *max_var;
31672 + unsigned int var_ref_idx = hist_data->n_var_refs;
31673 + struct field_var *field_var;
31674 + char *onmax_var_str, *param;
31675 + unsigned long flags;
31679 + onmax_var_str = data->onmax.var_str;
31680 + if (onmax_var_str[0] != '$') {
31681 + hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
31686 + var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
31687 + if (!var_field) {
31688 + hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
31692 + flags = HIST_FIELD_FL_VAR_REF;
31693 + ref_field = create_hist_field(hist_data, NULL, flags, NULL);
31697 + if (init_var_ref(ref_field, var_field, NULL, NULL)) {
31698 + destroy_hist_field(ref_field, 0);
31702 + hist_data->var_refs[hist_data->n_var_refs] = ref_field;
31703 + ref_field->var_ref_idx = hist_data->n_var_refs++;
31704 + data->onmax.var = ref_field;
31706 + data->fn = onmax_save;
31707 + data->onmax.max_var_ref_idx = var_ref_idx;
31708 + max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
31709 + if (IS_ERR(max_var)) {
31710 + hist_err("onmax: Couldn't create onmax variable: ", "max");
31711 + ret = PTR_ERR(max_var);
31714 + data->onmax.max_var = max_var;
31716 + for (i = 0; i < data->n_params; i++) {
31717 + param = kstrdup(data->params[i], GFP_KERNEL);
31723 + field_var = create_target_field_var(hist_data, NULL, NULL, param);
31724 + if (IS_ERR(field_var)) {
31725 + hist_err("onmax: Couldn't create field variable: ", param);
31726 + ret = PTR_ERR(field_var);
31731 + hist_data->max_vars[hist_data->n_max_vars++] = field_var;
31732 + if (field_var->val->flags & HIST_FIELD_FL_STRING)
31733 + hist_data->n_max_var_str++;
31741 +static int parse_action_params(char *params, struct action_data *data)
31743 + char *param, *saved_param;
31747 + if (data->n_params >= SYNTH_FIELDS_MAX)
31750 +		param = strsep(&params, ",");
31756 + param = strstrip(param);
31757 + if (strlen(param) < 2) {
31758 + hist_err("Invalid action param: ", param);
31763 + saved_param = kstrdup(param, GFP_KERNEL);
31764 + if (!saved_param) {
31769 + data->params[data->n_params++] = saved_param;
31775 +static struct action_data *onmax_parse(char *str)
31777 + char *onmax_fn_name, *onmax_var_str;
31778 + struct action_data *data;
31779 + int ret = -EINVAL;
31781 + data = kzalloc(sizeof(*data), GFP_KERNEL);
31783 + return ERR_PTR(-ENOMEM);
31785 + onmax_var_str = strsep(&str, ")");
31786 + if (!onmax_var_str || !str) {
31791 + data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
31792 + if (!data->onmax.var_str) {
31797 + strsep(&str, ".");
31801 + onmax_fn_name = strsep(&str, "(");
31802 + if (!onmax_fn_name || !str)
31805 + if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
31806 + char *params = strsep(&str, ")");
31813 + ret = parse_action_params(params, data);
31819 + data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
31820 + if (!data->onmax.fn_name) {
31827 + onmax_destroy(data);
31828 + data = ERR_PTR(ret);
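+/*
+ * Grammar handled above, with illustrative names:
+ *
+ *   onmax($wakeup_lat).save(next_comm,prev_pid)
+ *
+ * strsep(")") isolates the tracked variable, strsep(".") skips the
+ * dot, strsep("(") yields the function name ("save"), and the
+ * comma-separated list is handed to parse_action_params().
+ */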
31832 +static void onmatch_destroy(struct action_data *data)
31836 + mutex_lock(&synth_event_mutex);
31838 + kfree(data->onmatch.match_event);
31839 + kfree(data->onmatch.match_event_system);
31840 + kfree(data->onmatch.synth_event_name);
31842 + for (i = 0; i < data->n_params; i++)
31843 + kfree(data->params[i]);
31845 + if (data->onmatch.synth_event)
31846 + data->onmatch.synth_event->ref--;
31850 + mutex_unlock(&synth_event_mutex);
31853 +static void destroy_field_var(struct field_var *field_var)
31858 + destroy_hist_field(field_var->var, 0);
31859 + destroy_hist_field(field_var->val, 0);
31861 + kfree(field_var);
31864 +static void destroy_field_vars(struct hist_trigger_data *hist_data)
31868 + for (i = 0; i < hist_data->n_field_vars; i++)
31869 + destroy_field_var(hist_data->field_vars[i]);
31872 +static void save_field_var(struct hist_trigger_data *hist_data,
31873 + struct field_var *field_var)
31875 + hist_data->field_vars[hist_data->n_field_vars++] = field_var;
31877 + if (field_var->val->flags & HIST_FIELD_FL_STRING)
31878 + hist_data->n_field_var_str++;
31882 +static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
31886 + for (i = 0; i < hist_data->n_synth_var_refs; i++)
31887 + destroy_hist_field(hist_data->synth_var_refs[i], 0);
31890 +static void save_synth_var_ref(struct hist_trigger_data *hist_data,
31891 + struct hist_field *var_ref)
31893 + hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
31895 + hist_data->var_refs[hist_data->n_var_refs] = var_ref;
31896 + var_ref->var_ref_idx = hist_data->n_var_refs++;
31899 +static int check_synth_field(struct synth_event *event,
31900 + struct hist_field *hist_field,
31901 + unsigned int field_pos)
31903 + struct synth_field *field;
31905 + if (field_pos >= event->n_fields)
31908 + field = event->fields[field_pos];
31910 + if (strcmp(field->type, hist_field->type) != 0)
31916 +static struct hist_field *
31917 +onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
31918 + char *system, char *event, char *var)
31920 + struct hist_field *hist_field;
31922 + var++; /* skip '$' */
31924 + hist_field = find_target_event_var(hist_data, system, event, var);
31925 + if (!hist_field) {
31927 + system = data->onmatch.match_event_system;
31928 + event = data->onmatch.match_event;
31931 + hist_field = find_event_var(hist_data, system, event, var);
31935 + hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);
31937 + return hist_field;
31940 +static struct hist_field *
31941 +onmatch_create_field_var(struct hist_trigger_data *hist_data,
31942 + struct action_data *data, char *system,
31943 + char *event, char *var)
31945 + struct hist_field *hist_field = NULL;
31946 + struct field_var *field_var;
31949 + * First try to create a field var on the target event (the one
31950 + * currently being defined). This will create a variable for
31951 + * unqualified fields on the target event, or if qualified,
31952 + * target fields that have qualified names matching the target.
31954 + field_var = create_target_field_var(hist_data, system, event, var);
31956 + if (field_var && !IS_ERR(field_var)) {
31957 + save_field_var(hist_data, field_var);
31958 + hist_field = field_var->var;
31960 + field_var = NULL;
31962 + * If no explicit system.event is specified, default to
31963 + * looking for fields on the onmatch(system.event.xxx) event.
31967 + system = data->onmatch.match_event_system;
31968 + event = data->onmatch.match_event;
31972 + * At this point, we're looking at a field on another
31973 + * event. Because we can't modify a hist trigger on
31974 + * another event to add a variable for a field, we need
31975 + * to create a new trigger on that event and create the
31976 + * variable at the same time.
31978 + hist_field = create_field_var_hist(hist_data, system, event, var);
31979 + if (IS_ERR(hist_field))
31983 + return hist_field;
31985 + destroy_field_var(field_var);
31986 + hist_field = NULL;
31990 +static int onmatch_create(struct hist_trigger_data *hist_data,
31991 + struct trace_event_file *file,
31992 + struct action_data *data)
31994 + char *event_name, *param, *system = NULL;
31995 + struct hist_field *hist_field, *var_ref;
31996 + unsigned int i, var_ref_idx;
31997 + unsigned int field_pos = 0;
31998 + struct synth_event *event;
32001 + mutex_lock(&synth_event_mutex);
32002 + event = find_synth_event(data->onmatch.synth_event_name);
32004 + hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
32005 + mutex_unlock(&synth_event_mutex);
32009 + mutex_unlock(&synth_event_mutex);
32011 + var_ref_idx = hist_data->n_var_refs;
32013 + for (i = 0; i < data->n_params; i++) {
32016 + p = param = kstrdup(data->params[i], GFP_KERNEL);
32022 +		system = strsep(&param, ".");
32024 + param = (char *)system;
32025 + system = event_name = NULL;
32027 +			event_name = strsep(&param, ".");
32035 + if (param[0] == '$')
32036 + hist_field = onmatch_find_var(hist_data, data, system,
32037 + event_name, param);
32039 + hist_field = onmatch_create_field_var(hist_data, data,
32044 + if (!hist_field) {
32050 + if (check_synth_field(event, hist_field, field_pos) == 0) {
32051 + var_ref = create_var_ref(hist_field, system, event_name);
32058 + save_synth_var_ref(hist_data, var_ref);
32064 + hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
32065 + system, event_name, param);
32071 + if (field_pos != event->n_fields) {
32072 + hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
32077 + data->fn = action_trace;
32078 + data->onmatch.synth_event = event;
32079 + data->onmatch.var_ref_idx = var_ref_idx;
32083 + mutex_lock(&synth_event_mutex);
32085 + mutex_unlock(&synth_event_mutex);
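+/*
+ * Sketch of the resolution above (names illustrative): for
+ * wakeup_latency($wakeup_lat,next_pid), each param is matched
+ * positionally against the synthetic event's fields; "$wakeup_lat"
+ * is looked up as a variable, "next_pid" is auto-converted into a
+ * field variable, and check_synth_field() rejects any type
+ * mismatch.
+ */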
32090 +static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
32092 + char *match_event, *match_event_system;
32093 + char *synth_event_name, *params;
32094 + struct action_data *data;
32095 + int ret = -EINVAL;
32097 + data = kzalloc(sizeof(*data), GFP_KERNEL);
32099 + return ERR_PTR(-ENOMEM);
32101 + match_event = strsep(&str, ")");
32102 + if (!match_event || !str) {
32103 + hist_err("onmatch: Missing closing paren: ", match_event);
32107 + match_event_system = strsep(&match_event, ".");
32108 + if (!match_event) {
32109 + hist_err("onmatch: Missing subsystem for match event: ", match_event_system);
32113 + if (IS_ERR(event_file(tr, match_event_system, match_event))) {
32114 + hist_err_event("onmatch: Invalid subsystem or event name: ",
32115 + match_event_system, match_event, NULL);
32119 - if (WARN_ON_ONCE(!field))
32121 + data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
32122 + if (!data->onmatch.match_event) {
32127 - if (is_string_field(field)) {
32128 - flags |= HIST_FIELD_FL_STRING;
32129 + data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
32130 + if (!data->onmatch.match_event_system) {
32135 - if (field->filter_type == FILTER_STATIC_STRING)
32136 - hist_field->fn = hist_field_string;
32137 - else if (field->filter_type == FILTER_DYN_STRING)
32138 - hist_field->fn = hist_field_dynstring;
32140 - hist_field->fn = hist_field_pstring;
32142 - hist_field->fn = select_value_fn(field->size,
32143 - field->is_signed);
32144 - if (!hist_field->fn) {
32145 - destroy_hist_field(hist_field);
32148 + strsep(&str, ".");
32150 + hist_err("onmatch: Missing . after onmatch(): ", str);
32154 - hist_field->field = field;
32155 - hist_field->flags = flags;
32157 - return hist_field;
32159 + synth_event_name = strsep(&str, "(");
32160 + if (!synth_event_name || !str) {
32161 + hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
32165 -static void destroy_hist_fields(struct hist_trigger_data *hist_data)
32168 + data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
32169 + if (!data->onmatch.synth_event_name) {
32174 - for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
32175 - if (hist_data->fields[i]) {
32176 - destroy_hist_field(hist_data->fields[i]);
32177 - hist_data->fields[i] = NULL;
32179 + params = strsep(&str, ")");
32180 + if (!params || !str || (str && strlen(str))) {
32181 + hist_err("onmatch: Missing closing paramlist paren: ", params);
32185 + ret = parse_action_params(params, data);
32191 + onmatch_destroy(data);
32192 + data = ERR_PTR(ret);
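+/*
+ * Grammar handled above, with illustrative names:
+ *
+ *   onmatch(sched.sched_switch).wakeup_latency($wakeup_lat,next_pid)
+ *
+ * The parenthesized system.event pair becomes match_event_system
+ * and match_event, "wakeup_latency" names the synthetic event to
+ * generate, and the params are resolved later by onmatch_create().
+ */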
32196 static int create_hitcount_val(struct hist_trigger_data *hist_data)
32198 hist_data->fields[HITCOUNT_IDX] =
32199 - create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
32200 + create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL);
32201 if (!hist_data->fields[HITCOUNT_IDX])
32204 hist_data->n_vals++;
32205 + hist_data->n_fields++;
32207 if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
32209 @@ -426,54 +3828,71 @@ static int create_hitcount_val(struct hist_trigger_data *hist_data)
32213 +static int __create_val_field(struct hist_trigger_data *hist_data,
32214 + unsigned int val_idx,
32215 + struct trace_event_file *file,
32216 + char *var_name, char *field_str,
32217 + unsigned long flags)
32219 + struct hist_field *hist_field;
32222 + hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
32223 + if (IS_ERR(hist_field)) {
32224 + ret = PTR_ERR(hist_field);
32228 + hist_data->fields[val_idx] = hist_field;
32230 + ++hist_data->n_vals;
32231 + ++hist_data->n_fields;
32233 + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
32239 static int create_val_field(struct hist_trigger_data *hist_data,
32240 unsigned int val_idx,
32241 struct trace_event_file *file,
32244 - struct ftrace_event_field *field = NULL;
32245 - unsigned long flags = 0;
32246 - char *field_name;
32249 if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
32252 - field_name = strsep(&field_str, ".");
32254 - if (strcmp(field_str, "hex") == 0)
32255 - flags |= HIST_FIELD_FL_HEX;
32261 + return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
32264 - field = trace_find_event_field(file->event_call, field_name);
32265 - if (!field || !field->size) {
32269 +static int create_var_field(struct hist_trigger_data *hist_data,
32270 + unsigned int val_idx,
32271 + struct trace_event_file *file,
32272 + char *var_name, char *expr_str)
32274 + unsigned long flags = 0;
32276 - hist_data->fields[val_idx] = create_hist_field(field, flags);
32277 - if (!hist_data->fields[val_idx]) {
32280 + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
32283 + if (find_var(hist_data, file, var_name) && !hist_data->remove) {
32284 + hist_err("Variable already defined: ", var_name);
32288 - ++hist_data->n_vals;
32289 + flags |= HIST_FIELD_FL_VAR;
32290 + hist_data->n_vars++;
32291 + if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
32294 - if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
32298 + return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
32301 static int create_val_fields(struct hist_trigger_data *hist_data,
32302 struct trace_event_file *file)
32304 char *fields_str, *field_str;
32305 - unsigned int i, j;
32306 + unsigned int i, j = 1;
32309 ret = create_hitcount_val(hist_data);
32310 @@ -493,12 +3912,15 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
32311 field_str = strsep(&fields_str, ",");
32315 if (strcmp(field_str, "hitcount") == 0)
32318 ret = create_val_field(hist_data, j++, file, field_str);
32323 if (fields_str && (strcmp(fields_str, "hitcount") != 0))
32326 @@ -511,12 +3933,13 @@ static int create_key_field(struct hist_trigger_data *hist_data,
32327 struct trace_event_file *file,
32330 - struct ftrace_event_field *field = NULL;
32331 + struct hist_field *hist_field = NULL;
32333 unsigned long flags = 0;
32334 unsigned int key_size;
32337 - if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
32338 + if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
32341 flags |= HIST_FIELD_FL_KEY;
32342 @@ -524,57 +3947,40 @@ static int create_key_field(struct hist_trigger_data *hist_data,
32343 if (strcmp(field_str, "stacktrace") == 0) {
32344 flags |= HIST_FIELD_FL_STACKTRACE;
32345 key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
32346 + hist_field = create_hist_field(hist_data, NULL, flags, NULL);
32348 - char *field_name = strsep(&field_str, ".");
32351 - if (strcmp(field_str, "hex") == 0)
32352 - flags |= HIST_FIELD_FL_HEX;
32353 - else if (strcmp(field_str, "sym") == 0)
32354 - flags |= HIST_FIELD_FL_SYM;
32355 - else if (strcmp(field_str, "sym-offset") == 0)
32356 - flags |= HIST_FIELD_FL_SYM_OFFSET;
32357 - else if ((strcmp(field_str, "execname") == 0) &&
32358 - (strcmp(field_name, "common_pid") == 0))
32359 - flags |= HIST_FIELD_FL_EXECNAME;
32360 - else if (strcmp(field_str, "syscall") == 0)
32361 - flags |= HIST_FIELD_FL_SYSCALL;
32362 - else if (strcmp(field_str, "log2") == 0)
32363 - flags |= HIST_FIELD_FL_LOG2;
32368 + hist_field = parse_expr(hist_data, file, field_str, flags,
32370 + if (IS_ERR(hist_field)) {
32371 + ret = PTR_ERR(hist_field);
32375 - field = trace_find_event_field(file->event_call, field_name);
32376 - if (!field || !field->size) {
32377 + if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
32378 + hist_err("Using variable references as keys not supported: ", field_str);
32379 + destroy_hist_field(hist_field, 0);
32384 - if (is_string_field(field))
32385 - key_size = MAX_FILTER_STR_VAL;
32387 - key_size = field->size;
32388 + key_size = hist_field->size;
32391 - hist_data->fields[key_idx] = create_hist_field(field, flags);
32392 - if (!hist_data->fields[key_idx]) {
32396 + hist_data->fields[key_idx] = hist_field;
32398 key_size = ALIGN(key_size, sizeof(u64));
32399 hist_data->fields[key_idx]->size = key_size;
32400 hist_data->fields[key_idx]->offset = key_offset;
32402 hist_data->key_size += key_size;
32404 if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
32409 hist_data->n_keys++;
32410 + hist_data->n_fields++;
32412 if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
32414 @@ -618,21 +4024,113 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
32418 +static int create_var_fields(struct hist_trigger_data *hist_data,
32419 + struct trace_event_file *file)
32421 + unsigned int i, j = hist_data->n_vals;
32424 + unsigned int n_vars = hist_data->attrs->var_defs.n_vars;
32426 + for (i = 0; i < n_vars; i++) {
32427 + char *var_name = hist_data->attrs->var_defs.name[i];
32428 + char *expr = hist_data->attrs->var_defs.expr[i];
32430 + ret = create_var_field(hist_data, j++, file, var_name, expr);
32438 +static void free_var_defs(struct hist_trigger_data *hist_data)
32442 + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
32443 + kfree(hist_data->attrs->var_defs.name[i]);
32444 + kfree(hist_data->attrs->var_defs.expr[i]);
32447 + hist_data->attrs->var_defs.n_vars = 0;
32450 +static int parse_var_defs(struct hist_trigger_data *hist_data)
32452 + char *s, *str, *var_name, *field_str;
32453 + unsigned int i, j, n_vars = 0;
32456 + for (i = 0; i < hist_data->attrs->n_assignments; i++) {
32457 + str = hist_data->attrs->assignment_str[i];
32458 + for (j = 0; j < TRACING_MAP_VARS_MAX; j++) {
32459 + field_str = strsep(&str, ",");
32463 + var_name = strsep(&field_str, "=");
32464 + if (!var_name || !field_str) {
32465 + hist_err("Malformed assignment: ", var_name);
32470 + if (n_vars == TRACING_MAP_VARS_MAX) {
32471 + hist_err("Too many variables defined: ", var_name);
32476 + s = kstrdup(var_name, GFP_KERNEL);
32481 + hist_data->attrs->var_defs.name[n_vars] = s;
32483 + s = kstrdup(field_str, GFP_KERNEL);
32485 + kfree(hist_data->attrs->var_defs.name[n_vars]);
32489 + hist_data->attrs->var_defs.expr[n_vars++] = s;
32491 + hist_data->attrs->var_defs.n_vars = n_vars;
32497 + free_var_defs(hist_data);
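+/*
+ * Example split performed above (names illustrative): the
+ * assignment "ts0=common_timestamp.usecs" yields var_defs.name
+ * "ts0" and var_defs.expr "common_timestamp.usecs"; the expression
+ * itself is only parsed later, when create_var_fields() calls down
+ * into parse_expr().
+ */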
32502 static int create_hist_fields(struct hist_trigger_data *hist_data,
32503 struct trace_event_file *file)
32507 + ret = parse_var_defs(hist_data);
32511 ret = create_val_fields(hist_data, file);
32515 - ret = create_key_fields(hist_data, file);
32516 + ret = create_var_fields(hist_data, file);
32520 - hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
32521 + ret = create_key_fields(hist_data, file);
32525 + free_var_defs(hist_data);
32530 @@ -653,10 +4151,9 @@ static int is_descending(const char *str)
32531 static int create_sort_keys(struct hist_trigger_data *hist_data)
32533 char *fields_str = hist_data->attrs->sort_key_str;
32534 - struct ftrace_event_field *field = NULL;
32535 struct tracing_map_sort_key *sort_key;
32536 int descending, ret = 0;
32537 - unsigned int i, j;
32538 + unsigned int i, j, k;
32540 hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
32542 @@ -670,7 +4167,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
32545 for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
32546 + struct hist_field *hist_field;
32547 char *field_str, *field_name;
32548 + const char *test_name;
32550 sort_key = &hist_data->sort_keys[i];
32552 @@ -702,10 +4201,19 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
32556 - for (j = 1; j < hist_data->n_fields; j++) {
32557 - field = hist_data->fields[j]->field;
32558 - if (field && (strcmp(field_name, field->name) == 0)) {
32559 - sort_key->field_idx = j;
32560 + for (j = 1, k = 1; j < hist_data->n_fields; j++) {
32561 + unsigned int idx;
32563 + hist_field = hist_data->fields[j];
32564 + if (hist_field->flags & HIST_FIELD_FL_VAR)
32569 + test_name = hist_field_name(hist_field, 0);
32571 + if (strcmp(field_name, test_name) == 0) {
32572 + sort_key->field_idx = idx;
32573 descending = is_descending(field_str);
32574 if (descending < 0) {
32576 @@ -720,16 +4228,230 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
32580 - hist_data->n_sort_keys = i;
32584 + hist_data->n_sort_keys = i;
32589 +static void destroy_actions(struct hist_trigger_data *hist_data)
32593 + for (i = 0; i < hist_data->n_actions; i++) {
32594 + struct action_data *data = hist_data->actions[i];
32596 + if (data->fn == action_trace)
32597 + onmatch_destroy(data);
32598 + else if (data->fn == onmax_save)
32599 + onmax_destroy(data);
32605 +static int parse_actions(struct hist_trigger_data *hist_data)
32607 + struct trace_array *tr = hist_data->event_file->tr;
32608 + struct action_data *data;
32613 + for (i = 0; i < hist_data->attrs->n_actions; i++) {
32614 + str = hist_data->attrs->action_str[i];
32616 + if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
32617 + char *action_str = str + strlen("onmatch(");
32619 + data = onmatch_parse(tr, action_str);
32620 + if (IS_ERR(data)) {
32621 + ret = PTR_ERR(data);
32624 + data->fn = action_trace;
32625 + } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
32626 + char *action_str = str + strlen("onmax(");
32628 + data = onmax_parse(action_str);
32629 + if (IS_ERR(data)) {
32630 + ret = PTR_ERR(data);
32633 + data->fn = onmax_save;
32639 + hist_data->actions[hist_data->n_actions++] = data;
32645 +static int create_actions(struct hist_trigger_data *hist_data,
32646 + struct trace_event_file *file)
32648 + struct action_data *data;
32652 + for (i = 0; i < hist_data->attrs->n_actions; i++) {
32653 + data = hist_data->actions[i];
32655 + if (data->fn == action_trace) {
32656 + ret = onmatch_create(hist_data, file, data);
32659 + } else if (data->fn == onmax_save) {
32660 + ret = onmax_create(hist_data, data);
32669 +static void print_actions(struct seq_file *m,
32670 + struct hist_trigger_data *hist_data,
32671 + struct tracing_map_elt *elt)
32675 + for (i = 0; i < hist_data->n_actions; i++) {
32676 + struct action_data *data = hist_data->actions[i];
32678 + if (data->fn == onmax_save)
32679 + onmax_print(m, hist_data, elt, data);
32683 +static void print_onmax_spec(struct seq_file *m,
32684 + struct hist_trigger_data *hist_data,
32685 + struct action_data *data)
32689 + seq_puts(m, ":onmax(");
32690 + seq_printf(m, "%s", data->onmax.var_str);
32691 + seq_printf(m, ").%s(", data->onmax.fn_name);
32693 + for (i = 0; i < hist_data->n_max_vars; i++) {
32694 + seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
32695 + if (i < hist_data->n_max_vars - 1)
32696 + seq_puts(m, ",");
32698 + seq_puts(m, ")");
32701 +static void print_onmatch_spec(struct seq_file *m,
32702 + struct hist_trigger_data *hist_data,
32703 + struct action_data *data)
32707 + seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
32708 + data->onmatch.match_event);
32710 + seq_printf(m, "%s(", data->onmatch.synth_event->name);
32712 + for (i = 0; i < data->n_params; i++) {
32714 + seq_puts(m, ",");
32715 + seq_printf(m, "%s", data->params[i]);
32718 + seq_puts(m, ")");
32721 +static bool actions_match(struct hist_trigger_data *hist_data,
32722 + struct hist_trigger_data *hist_data_test)
32724 + unsigned int i, j;
32726 + if (hist_data->n_actions != hist_data_test->n_actions)
32729 + for (i = 0; i < hist_data->n_actions; i++) {
32730 + struct action_data *data = hist_data->actions[i];
32731 + struct action_data *data_test = hist_data_test->actions[i];
32733 + if (data->fn != data_test->fn)
32736 + if (data->n_params != data_test->n_params)
32739 + for (j = 0; j < data->n_params; j++) {
32740 + if (strcmp(data->params[j], data_test->params[j]) != 0)
32744 + if (data->fn == action_trace) {
32745 + if (strcmp(data->onmatch.synth_event_name,
32746 + data_test->onmatch.synth_event_name) != 0)
32748 + if (strcmp(data->onmatch.match_event_system,
32749 + data_test->onmatch.match_event_system) != 0)
32751 + if (strcmp(data->onmatch.match_event,
32752 + data_test->onmatch.match_event) != 0)
32754 + } else if (data->fn == onmax_save) {
32755 + if (strcmp(data->onmax.var_str,
32756 + data_test->onmax.var_str) != 0)
32758 + if (strcmp(data->onmax.fn_name,
32759 + data_test->onmax.fn_name) != 0)
32768 +static void print_actions_spec(struct seq_file *m,
32769 + struct hist_trigger_data *hist_data)
32773 + for (i = 0; i < hist_data->n_actions; i++) {
32774 + struct action_data *data = hist_data->actions[i];
32776 + if (data->fn == action_trace)
32777 + print_onmatch_spec(m, hist_data, data);
32778 + else if (data->fn == onmax_save)
32779 + print_onmax_spec(m, hist_data, data);
32783 +static void destroy_field_var_hists(struct hist_trigger_data *hist_data)
32787 + for (i = 0; i < hist_data->n_field_var_hists; i++) {
32788 + kfree(hist_data->field_var_hists[i]->cmd);
32789 + kfree(hist_data->field_var_hists[i]);
32793 static void destroy_hist_data(struct hist_trigger_data *hist_data)
32798 destroy_hist_trigger_attrs(hist_data->attrs);
32799 destroy_hist_fields(hist_data);
32800 tracing_map_destroy(hist_data->map);
32802 + destroy_actions(hist_data);
32803 + destroy_field_vars(hist_data);
32804 + destroy_field_var_hists(hist_data);
32805 + destroy_synth_var_refs(hist_data);
32810 @@ -738,7 +4460,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
32811 struct tracing_map *map = hist_data->map;
32812 struct ftrace_event_field *field;
32813 struct hist_field *hist_field;
32817 for_each_hist_field(i, hist_data) {
32818 hist_field = hist_data->fields[i];
32819 @@ -749,6 +4471,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
32821 if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
32822 cmp_fn = tracing_map_cmp_none;
32824 + cmp_fn = tracing_map_cmp_num(hist_field->size,
32825 + hist_field->is_signed);
32826 else if (is_string_field(field))
32827 cmp_fn = tracing_map_cmp_string;
32829 @@ -757,36 +4482,29 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
32830 idx = tracing_map_add_key_field(map,
32831 hist_field->offset,
32835 + } else if (!(hist_field->flags & HIST_FIELD_FL_VAR))
32836 idx = tracing_map_add_sum_field(map);
32845 -static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
32847 - struct hist_field *key_field;
32850 - for_each_hist_key_field(i, hist_data) {
32851 - key_field = hist_data->fields[i];
32853 - if (key_field->flags & HIST_FIELD_FL_EXECNAME)
32855 + if (hist_field->flags & HIST_FIELD_FL_VAR) {
32856 + idx = tracing_map_add_var(map);
32859 + hist_field->var.idx = idx;
32860 + hist_field->var.hist_data = hist_data;
32868 static struct hist_trigger_data *
32869 create_hist_data(unsigned int map_bits,
32870 struct hist_trigger_attrs *attrs,
32871 - struct trace_event_file *file)
32872 + struct trace_event_file *file,
32875 const struct tracing_map_ops *map_ops = NULL;
32876 struct hist_trigger_data *hist_data;
32877 @@ -797,6 +4515,12 @@ create_hist_data(unsigned int map_bits,
32878 return ERR_PTR(-ENOMEM);
32880 hist_data->attrs = attrs;
32881 + hist_data->remove = remove;
32882 + hist_data->event_file = file;
32884 + ret = parse_actions(hist_data);
32888 ret = create_hist_fields(hist_data, file);
32890 @@ -806,8 +4530,7 @@ create_hist_data(unsigned int map_bits,
32894 - if (need_tracing_map_ops(hist_data))
32895 - map_ops = &hist_trigger_elt_comm_ops;
32896 + map_ops = &hist_trigger_elt_data_ops;
32898 hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
32899 map_ops, hist_data);
32900 @@ -820,12 +4543,6 @@ create_hist_data(unsigned int map_bits,
32901 ret = create_tracing_map_fields(hist_data);
32905 - ret = tracing_map_init(hist_data->map);
32909 - hist_data->event_file = file;
32913 @@ -839,18 +4556,39 @@ create_hist_data(unsigned int map_bits,
32916 static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
32917 - struct tracing_map_elt *elt,
32919 + struct tracing_map_elt *elt, void *rec,
32920 + struct ring_buffer_event *rbe,
32921 + u64 *var_ref_vals)
32923 + struct hist_elt_data *elt_data;
32924 struct hist_field *hist_field;
32926 + unsigned int i, var_idx;
32929 + elt_data = elt->private_data;
32930 + elt_data->var_ref_vals = var_ref_vals;
32932 for_each_hist_val_field(i, hist_data) {
32933 hist_field = hist_data->fields[i];
32934 - hist_val = hist_field->fn(hist_field, rec);
32935 + hist_val = hist_field->fn(hist_field, elt, rbe, rec);
32936 + if (hist_field->flags & HIST_FIELD_FL_VAR) {
32937 + var_idx = hist_field->var.idx;
32938 + tracing_map_set_var(elt, var_idx, hist_val);
32941 tracing_map_update_sum(elt, i, hist_val);
32944 + for_each_hist_key_field(i, hist_data) {
32945 + hist_field = hist_data->fields[i];
32946 + if (hist_field->flags & HIST_FIELD_FL_VAR) {
32947 + hist_val = hist_field->fn(hist_field, elt, rbe, rec);
32948 + var_idx = hist_field->var.idx;
32949 + tracing_map_set_var(elt, var_idx, hist_val);
32953 + update_field_vars(hist_data, elt, rbe, rec);
32956 static inline void add_to_key(char *compound_key, void *key,
32957 @@ -877,15 +4615,31 @@ static inline void add_to_key(char *compound_key, void *key,
32958 memcpy(compound_key + key_field->offset, key, size);
32961 -static void event_hist_trigger(struct event_trigger_data *data, void *rec)
32963 +hist_trigger_actions(struct hist_trigger_data *hist_data,
32964 + struct tracing_map_elt *elt, void *rec,
32965 + struct ring_buffer_event *rbe, u64 *var_ref_vals)
32967 + struct action_data *data;
32970 + for (i = 0; i < hist_data->n_actions; i++) {
32971 + data = hist_data->actions[i];
32972 + data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
32976 +static void event_hist_trigger(struct event_trigger_data *data, void *rec,
32977 + struct ring_buffer_event *rbe)
32979 struct hist_trigger_data *hist_data = data->private_data;
32980 bool use_compound_key = (hist_data->n_keys > 1);
32981 unsigned long entries[HIST_STACKTRACE_DEPTH];
32982 + u64 var_ref_vals[TRACING_MAP_VARS_MAX];
32983 char compound_key[HIST_KEY_SIZE_MAX];
32984 + struct tracing_map_elt *elt = NULL;
32985 struct stack_trace stacktrace;
32986 struct hist_field *key_field;
32987 - struct tracing_map_elt *elt;
32988 u64 field_contents;
32991 @@ -906,7 +4660,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
32995 - field_contents = key_field->fn(key_field, rec);
32996 + field_contents = key_field->fn(key_field, elt, rbe, rec);
32997 if (key_field->flags & HIST_FIELD_FL_STRING) {
32998 key = (void *)(unsigned long)field_contents;
32999 use_compound_key = true;
33000 @@ -921,9 +4675,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
33001 if (use_compound_key)
33002 key = compound_key;
33004 + if (hist_data->n_var_refs &&
33005 + !resolve_var_refs(hist_data, key, var_ref_vals, false))
33008 elt = tracing_map_insert(hist_data->map, key);
33010 - hist_trigger_elt_update(hist_data, elt, rec);
33014 + hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
33016 + if (resolve_var_refs(hist_data, key, var_ref_vals, true))
33017 + hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
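
Taken together, the flow above is: resolve any variable references
(bailing out before an entry is even inserted if they can't all be
resolved), insert/update the element, then resolve again and run the
attached actions. In trigger terms, a hypothetical inter-event latency
setup exercising this path could look like the following (assuming the
synthetic-event support and the common_timestamp.usecs modifier from
this series):

    # echo 'wakeup_latency u64 lat; pid_t pid' >> synthetic_events
    # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> \
          events/sched/sched_wakeup/trigger
    # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\
          onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid)' >> \
          events/sched/sched_switch/trigger
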
33020 static void hist_trigger_stacktrace_print(struct seq_file *m,
33021 @@ -952,6 +4715,7 @@ hist_trigger_entry_print(struct seq_file *m,
33022 struct hist_field *key_field;
33023 char str[KSYM_SYMBOL_LEN];
33024 bool multiline = false;
33025 + const char *field_name;
33029 @@ -963,26 +4727,33 @@ hist_trigger_entry_print(struct seq_file *m,
33030 if (i > hist_data->n_vals)
33033 + field_name = hist_field_name(key_field, 0);
33035 if (key_field->flags & HIST_FIELD_FL_HEX) {
33036 uval = *(u64 *)(key + key_field->offset);
33037 - seq_printf(m, "%s: %llx",
33038 - key_field->field->name, uval);
33039 + seq_printf(m, "%s: %llx", field_name, uval);
33040 } else if (key_field->flags & HIST_FIELD_FL_SYM) {
33041 uval = *(u64 *)(key + key_field->offset);
33042 sprint_symbol_no_offset(str, uval);
33043 - seq_printf(m, "%s: [%llx] %-45s",
33044 - key_field->field->name, uval, str);
33045 + seq_printf(m, "%s: [%llx] %-45s", field_name,
33047 } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
33048 uval = *(u64 *)(key + key_field->offset);
33049 sprint_symbol(str, uval);
33050 - seq_printf(m, "%s: [%llx] %-55s",
33051 - key_field->field->name, uval, str);
33052 + seq_printf(m, "%s: [%llx] %-55s", field_name,
33054 } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
33055 - char *comm = elt->private_data;
33056 + struct hist_elt_data *elt_data = elt->private_data;
33059 + if (WARN_ON_ONCE(!elt_data))
33062 + comm = elt_data->comm;
33064 uval = *(u64 *)(key + key_field->offset);
33065 - seq_printf(m, "%s: %-16s[%10llu]",
33066 - key_field->field->name, comm, uval);
33067 + seq_printf(m, "%s: %-16s[%10llu]", field_name,
33069 } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
33070 const char *syscall_name;
33072 @@ -991,8 +4762,8 @@ hist_trigger_entry_print(struct seq_file *m,
33074 syscall_name = "unknown_syscall";
33076 - seq_printf(m, "%s: %-30s[%3llu]",
33077 - key_field->field->name, syscall_name, uval);
33078 + seq_printf(m, "%s: %-30s[%3llu]", field_name,
33079 + syscall_name, uval);
33080 } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
33081 seq_puts(m, "stacktrace:\n");
33082 hist_trigger_stacktrace_print(m,
33083 @@ -1000,15 +4771,14 @@ hist_trigger_entry_print(struct seq_file *m,
33084 HIST_STACKTRACE_DEPTH);
33086 } else if (key_field->flags & HIST_FIELD_FL_LOG2) {
33087 - seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
33088 + seq_printf(m, "%s: ~ 2^%-2llu", field_name,
33089 *(u64 *)(key + key_field->offset));
33090 } else if (key_field->flags & HIST_FIELD_FL_STRING) {
33091 - seq_printf(m, "%s: %-50s", key_field->field->name,
33092 + seq_printf(m, "%s: %-50s", field_name,
33093 (char *)(key + key_field->offset));
33095 uval = *(u64 *)(key + key_field->offset);
33096 - seq_printf(m, "%s: %10llu", key_field->field->name,
33098 + seq_printf(m, "%s: %10llu", field_name, uval);
33102 @@ -1021,17 +4791,23 @@ hist_trigger_entry_print(struct seq_file *m,
33103 tracing_map_read_sum(elt, HITCOUNT_IDX));
33105 for (i = 1; i < hist_data->n_vals; i++) {
33106 + field_name = hist_field_name(hist_data->fields[i], 0);
33108 + if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
33109 + hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
33112 if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
33113 - seq_printf(m, " %s: %10llx",
33114 - hist_data->fields[i]->field->name,
33115 + seq_printf(m, " %s: %10llx", field_name,
33116 tracing_map_read_sum(elt, i));
33118 - seq_printf(m, " %s: %10llu",
33119 - hist_data->fields[i]->field->name,
33120 + seq_printf(m, " %s: %10llu", field_name,
33121 tracing_map_read_sum(elt, i));
33125 + print_actions(m, hist_data, elt);
33130 @@ -1102,6 +4878,11 @@ static int hist_show(struct seq_file *m, void *v)
33131 hist_trigger_show(m, data, n++);
33134 + if (have_hist_err()) {
33135 + seq_printf(m, "\nERROR: %s\n", hist_err_str);
33136 + seq_printf(m, " Last command: %s\n", last_hist_cmd);
33140 mutex_unlock(&event_mutex);
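
So a failed or mismatched command can be diagnosed after the fact by
reading the event's hist file. Hypothetically (the exact message text
depends on which hist_err() call fired):

    # cat events/sched/sched_switch/hist
    ...
    ERROR: Couldn't find variable: $ts0
      Last command: hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0
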
33142 @@ -1120,34 +4901,31 @@ const struct file_operations event_hist_fops = {
33143 .release = single_release,
33146 -static const char *get_hist_field_flags(struct hist_field *hist_field)
33147 +static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
33149 - const char *flags_str = NULL;
33150 + const char *field_name = hist_field_name(hist_field, 0);
33152 - if (hist_field->flags & HIST_FIELD_FL_HEX)
33153 - flags_str = "hex";
33154 - else if (hist_field->flags & HIST_FIELD_FL_SYM)
33155 - flags_str = "sym";
33156 - else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
33157 - flags_str = "sym-offset";
33158 - else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
33159 - flags_str = "execname";
33160 - else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
33161 - flags_str = "syscall";
33162 - else if (hist_field->flags & HIST_FIELD_FL_LOG2)
33163 - flags_str = "log2";
33164 + if (hist_field->var.name)
33165 + seq_printf(m, "%s=", hist_field->var.name);
33167 - return flags_str;
33169 + if (hist_field->flags & HIST_FIELD_FL_CPU)
33170 + seq_puts(m, "cpu");
33171 + else if (field_name) {
33172 + if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
33173 + hist_field->flags & HIST_FIELD_FL_ALIAS)
33174 + seq_putc(m, '$');
33175 + seq_printf(m, "%s", field_name);
33176 + } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
33177 + seq_puts(m, "common_timestamp");
33179 -static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
33181 - seq_printf(m, "%s", hist_field->field->name);
33182 if (hist_field->flags) {
33183 - const char *flags_str = get_hist_field_flags(hist_field);
33184 + if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
33185 + !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
33186 + const char *flags = get_hist_field_flags(hist_field);
33189 - seq_printf(m, ".%s", flags_str);
33191 + seq_printf(m, ".%s", flags);
33196 @@ -1156,7 +4934,8 @@ static int event_hist_trigger_print(struct seq_file *m,
33197 struct event_trigger_data *data)
33199 struct hist_trigger_data *hist_data = data->private_data;
33200 - struct hist_field *key_field;
33201 + struct hist_field *field;
33202 + bool have_var = false;
33205 seq_puts(m, "hist:");
33206 @@ -1167,25 +4946,47 @@ static int event_hist_trigger_print(struct seq_file *m,
33207 seq_puts(m, "keys=");
33209 for_each_hist_key_field(i, hist_data) {
33210 - key_field = hist_data->fields[i];
33211 + field = hist_data->fields[i];
33213 if (i > hist_data->n_vals)
33216 - if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
33217 + if (field->flags & HIST_FIELD_FL_STACKTRACE)
33218 seq_puts(m, "stacktrace");
33220 - hist_field_print(m, key_field);
33221 + hist_field_print(m, field);
33224 seq_puts(m, ":vals=");
33226 for_each_hist_val_field(i, hist_data) {
33227 + field = hist_data->fields[i];
33228 + if (field->flags & HIST_FIELD_FL_VAR) {
33233 if (i == HITCOUNT_IDX)
33234 seq_puts(m, "hitcount");
33237 - hist_field_print(m, hist_data->fields[i]);
33238 + hist_field_print(m, field);
33243 + unsigned int n = 0;
33245 + seq_puts(m, ":");
33247 + for_each_hist_val_field(i, hist_data) {
33248 + field = hist_data->fields[i];
33250 + if (field->flags & HIST_FIELD_FL_VAR) {
33252 + seq_puts(m, ",");
33253 + hist_field_print(m, field);
33258 @@ -1193,28 +4994,36 @@ static int event_hist_trigger_print(struct seq_file *m,
33260 for (i = 0; i < hist_data->n_sort_keys; i++) {
33261 struct tracing_map_sort_key *sort_key;
33262 + unsigned int idx, first_key_idx;
33264 + /* skip VAR vals */
33265 + first_key_idx = hist_data->n_vals - hist_data->n_vars;
33267 sort_key = &hist_data->sort_keys[i];
33268 + idx = sort_key->field_idx;
33270 + if (WARN_ON(idx >= HIST_FIELDS_MAX))
33276 - if (sort_key->field_idx == HITCOUNT_IDX)
33277 + if (idx == HITCOUNT_IDX)
33278 seq_puts(m, "hitcount");
33280 - unsigned int idx = sort_key->field_idx;
33282 - if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
33285 + if (idx >= first_key_idx)
33286 + idx += hist_data->n_vars;
33287 hist_field_print(m, hist_data->fields[idx]);
33290 if (sort_key->descending)
33291 seq_puts(m, ".descending");
33294 seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
33295 + if (hist_data->enable_timestamps)
33296 + seq_printf(m, ":clock=%s", hist_data->attrs->clock);
33298 + print_actions_spec(m, hist_data);
33300 if (data->filter_str)
33301 seq_printf(m, " if %s", data->filter_str);
33302 @@ -1242,6 +5051,21 @@ static int event_hist_trigger_init(struct event_trigger_ops *ops,
33306 +static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
33308 + struct trace_event_file *file;
33313 + for (i = 0; i < hist_data->n_field_var_hists; i++) {
33314 + file = hist_data->field_var_hists[i]->hist_data->event_file;
33315 + cmd = hist_data->field_var_hists[i]->cmd;
33316 + ret = event_hist_trigger_func(&trigger_hist_cmd, file,
33317 + "!hist", "hist", cmd);
33321 static void event_hist_trigger_free(struct event_trigger_ops *ops,
33322 struct event_trigger_data *data)
33324 @@ -1254,7 +5078,13 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
33327 del_named_trigger(data);
33329 trigger_data_free(data);
33331 + remove_hist_vars(hist_data);
33333 + unregister_field_var_hists(hist_data);
33335 destroy_hist_data(hist_data);
33338 @@ -1381,6 +5211,15 @@ static bool hist_trigger_match(struct event_trigger_data *data,
33340 if (key_field->offset != key_field_test->offset)
33342 + if (key_field->size != key_field_test->size)
33344 + if (key_field->is_signed != key_field_test->is_signed)
33346 + if (!!key_field->var.name != !!key_field_test->var.name)
33348 + if (key_field->var.name &&
33349 + strcmp(key_field->var.name, key_field_test->var.name) != 0)
33353 for (i = 0; i < hist_data->n_sort_keys; i++) {
33354 @@ -1396,6 +5235,9 @@ static bool hist_trigger_match(struct event_trigger_data *data,
33355 (strcmp(data->filter_str, data_test->filter_str) != 0))
33358 + if (!actions_match(hist_data, hist_data_test))
33364 @@ -1412,6 +5254,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
33366 if (!hist_trigger_match(data, named_data, named_data,
33368 + hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name);
33372 @@ -1431,13 +5274,16 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
33373 test->paused = false;
33374 else if (hist_data->attrs->clear)
33378 + hist_err("Hist trigger already exists", NULL);
33385 if (hist_data->attrs->cont || hist_data->attrs->clear) {
33386 + hist_err("Can't clear or continue a nonexistent hist trigger", NULL);
33390 @@ -1446,7 +5292,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
33391 data->paused = true;
33394 - destroy_hist_data(data->private_data);
33395 data->private_data = named_data->private_data;
33396 set_named_trigger_data(data, named_data);
33397 data->ops = &event_hist_trigger_named_ops;
33398 @@ -1458,8 +5303,32 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
33402 - list_add_rcu(&data->list, &file->triggers);
33403 + if (hist_data->enable_timestamps) {
33404 + char *clock = hist_data->attrs->clock;
33406 + ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
33408 + hist_err("Couldn't set trace_clock: ", clock);
33412 + tracing_set_time_stamp_abs(file->tr, true);
33416 + destroy_hist_data(hist_data);
33423 +static int hist_trigger_enable(struct event_trigger_data *data,
33424 + struct trace_event_file *file)
33428 + list_add_tail_rcu(&data->list, &file->triggers);
33430 update_cond_flag(file);
33432 @@ -1468,10 +5337,55 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
33433 update_cond_flag(file);
33441 +static bool have_hist_trigger_match(struct event_trigger_data *data,
33442 + struct trace_event_file *file)
33444 + struct hist_trigger_data *hist_data = data->private_data;
33445 + struct event_trigger_data *test, *named_data = NULL;
33446 + bool match = false;
33448 + if (hist_data->attrs->name)
33449 + named_data = find_named_trigger(hist_data->attrs->name);
33451 + list_for_each_entry_rcu(test, &file->triggers, list) {
33452 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33453 + if (hist_trigger_match(data, test, named_data, false)) {
33463 +static bool hist_trigger_check_refs(struct event_trigger_data *data,
33464 + struct trace_event_file *file)
33466 + struct hist_trigger_data *hist_data = data->private_data;
33467 + struct event_trigger_data *test, *named_data = NULL;
33469 + if (hist_data->attrs->name)
33470 + named_data = find_named_trigger(hist_data->attrs->name);
33472 + list_for_each_entry_rcu(test, &file->triggers, list) {
33473 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33474 + if (!hist_trigger_match(data, test, named_data, false))
33476 + hist_data = test->private_data;
33477 + if (check_var_refs(hist_data))
33486 static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
33487 struct event_trigger_data *data,
33488 struct trace_event_file *file)
33489 @@ -1497,17 +5411,55 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
33491 if (unregistered && test->ops->free)
33492 test->ops->free(test->ops, test);
33494 + if (hist_data->enable_timestamps) {
33495 + if (!hist_data->remove || unregistered)
33496 + tracing_set_time_stamp_abs(file->tr, false);
33500 +static bool hist_file_check_refs(struct trace_event_file *file)
33502 + struct hist_trigger_data *hist_data;
33503 + struct event_trigger_data *test;
33505 + list_for_each_entry_rcu(test, &file->triggers, list) {
33506 + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33507 + hist_data = test->private_data;
33508 + if (check_var_refs(hist_data))
33516 static void hist_unreg_all(struct trace_event_file *file)
33518 struct event_trigger_data *test, *n;
33519 + struct hist_trigger_data *hist_data;
33520 + struct synth_event *se;
33521 + const char *se_name;
33523 + if (hist_file_check_refs(file))
33526 list_for_each_entry_safe(test, n, &file->triggers, list) {
33527 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33528 + hist_data = test->private_data;
33529 list_del_rcu(&test->list);
33530 trace_event_trigger_enable_disable(file, 0);
33532 + mutex_lock(&synth_event_mutex);
33533 + se_name = trace_event_name(file->event_call);
33534 + se = find_synth_event(se_name);
33537 + mutex_unlock(&synth_event_mutex);
33539 update_cond_flag(file);
33540 + if (hist_data->enable_timestamps)
33541 + tracing_set_time_stamp_abs(file->tr, false);
33542 if (test->ops->free)
33543 test->ops->free(test->ops, test);
33545 @@ -1523,16 +5475,54 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
33546 struct hist_trigger_attrs *attrs;
33547 struct event_trigger_ops *trigger_ops;
33548 struct hist_trigger_data *hist_data;
33550 + struct synth_event *se;
33551 + const char *se_name;
33552 + bool remove = false;
33553 + char *trigger, *p;
33556 + if (glob && strlen(glob)) {
33557 + last_cmd_set(param);
33558 + hist_err_clear();
33564 - /* separate the trigger from the filter (k:v [if filter]) */
33565 - trigger = strsep(&param, " \t");
33568 + if (glob[0] == '!')
33572 + * separate the trigger from the filter (k:v [if filter])
33573 + * allowing for whitespace in the trigger
33575 + p = trigger = param;
33577 + p = strstr(p, "if");
33582 + if (*(p - 1) != ' ' && *(p - 1) != '\t') {
33586 + if (p >= param + strlen(param) - strlen("if") - 1)
33588 + if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
33599 + param = strstrip(p);
33600 + trigger = strstrip(trigger);
33603 attrs = parse_hist_trigger_attrs(trigger);
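
The separator scan earlier in this function looks for the first "if"
that stands alone between whitespace, so a trigger containing "if"
inside a variable or field name isn't split at the wrong point. A
standalone sketch of the same scan (simplified, hypothetical helper):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Find the first standalone "if" (whitespace on both sides). */
    static char *find_filter(char *cmd)
    {
            char *p = cmd;

            while ((p = strstr(p, "if")) != NULL) {
                    if (p > cmd && isspace((unsigned char)p[-1]) &&
                        isspace((unsigned char)p[2]))
                            return p;
                    p += 2;
            }
            return NULL;
    }

    int main(void)
    {
            char cmd[] = "hist:keys=pid:vals=hitcount if prio < 10";
            char *filter = find_filter(cmd);

            if (filter) {
                    *filter = '\0';         /* terminate the trigger part */
                    printf("trigger: '%s'\n", cmd);
                    printf("filter:  '%s'\n", filter + strlen("if "));
            }
            return 0;
    }
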
33605 @@ -1541,7 +5531,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
33606 if (attrs->map_bits)
33607 hist_trigger_bits = attrs->map_bits;
33609 - hist_data = create_hist_data(hist_trigger_bits, attrs, file);
33610 + hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove);
33611 if (IS_ERR(hist_data)) {
33612 destroy_hist_trigger_attrs(attrs);
33613 return PTR_ERR(hist_data);
33614 @@ -1549,10 +5539,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
33616 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
33619 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
33620 - if (!trigger_data)
33621 + if (!trigger_data) {
33626 trigger_data->count = -1;
33627 trigger_data->ops = trigger_ops;
33628 @@ -1570,8 +5561,24 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
33632 - if (glob[0] == '!') {
33634 + if (!have_hist_trigger_match(trigger_data, file))
33637 + if (hist_trigger_check_refs(trigger_data, file)) {
33642 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
33644 + mutex_lock(&synth_event_mutex);
33645 + se_name = trace_event_name(file->event_call);
33646 + se = find_synth_event(se_name);
33649 + mutex_unlock(&synth_event_mutex);
33654 @@ -1588,14 +5595,47 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
33656 } else if (ret < 0)
33659 + if (get_named_trigger_data(trigger_data))
33662 + if (has_hist_vars(hist_data))
33663 + save_hist_vars(hist_data);
33665 + ret = create_actions(hist_data, file);
33669 + ret = tracing_map_init(hist_data->map);
33673 + ret = hist_trigger_enable(trigger_data, file);
33677 + mutex_lock(&synth_event_mutex);
33678 + se_name = trace_event_name(file->event_call);
33679 + se = find_synth_event(se_name);
33682 + mutex_unlock(&synth_event_mutex);
33684 /* Just return zero, not the number of registered triggers */
33688 + hist_err_clear();
33692 + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
33694 if (cmd_ops->set_filter)
33695 cmd_ops->set_filter(NULL, trigger_data, NULL);
33697 + remove_hist_vars(hist_data);
33699 kfree(trigger_data);
33701 destroy_hist_data(hist_data);
33702 @@ -1625,7 +5665,8 @@ __init int register_trigger_hist_cmd(void)
33706 -hist_enable_trigger(struct event_trigger_data *data, void *rec)
33707 +hist_enable_trigger(struct event_trigger_data *data, void *rec,
33708 + struct ring_buffer_event *event)
33710 struct enable_trigger_data *enable_data = data->private_data;
33711 struct event_trigger_data *test;
33712 @@ -1641,7 +5682,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec)
33716 -hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
33717 +hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
33718 + struct ring_buffer_event *event)
33722 @@ -1649,7 +5691,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
33723 if (data->count != -1)
33726 - hist_enable_trigger(data, rec);
33727 + hist_enable_trigger(data, rec, event);
33730 static struct event_trigger_ops hist_enable_trigger_ops = {
33731 @@ -1754,3 +5796,31 @@ __init int register_trigger_hist_enable_disable_cmds(void)
33736 +static __init int trace_events_hist_init(void)
33738 + struct dentry *entry = NULL;
33739 + struct dentry *d_tracer;
33742 + d_tracer = tracing_init_dentry();
33743 + if (IS_ERR(d_tracer)) {
33744 + err = PTR_ERR(d_tracer);
33748 + entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
33749 + NULL, &synth_events_fops);
33757 + pr_warn("Could not create tracefs 'synthetic_events' entry\n");
33762 +fs_initcall(trace_events_hist_init);
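
The new synthetic_events file accepts event definitions as lines of the
form 'name field-type field-name; ...'; for example (paths assume a
debugfs-mounted tracefs):

    # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \
          /sys/kernel/debug/tracing/synthetic_events
    # cat /sys/kernel/debug/tracing/synthetic_events
    wakeup_latency u64 lat; pid_t pid; int prio
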
33763 diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
33764 index 43254c5e7e16..24d42350d738 100644
33765 --- a/kernel/trace/trace_events_trigger.c
33766 +++ b/kernel/trace/trace_events_trigger.c
33767 @@ -63,7 +63,8 @@ void trigger_data_free(struct event_trigger_data *data)
33768 * any trigger that should be deferred, ETT_NONE if nothing to defer.
33770 enum event_trigger_type
33771 -event_triggers_call(struct trace_event_file *file, void *rec)
33772 +event_triggers_call(struct trace_event_file *file, void *rec,
33773 + struct ring_buffer_event *event)
33775 struct event_trigger_data *data;
33776 enum event_trigger_type tt = ETT_NONE;
33777 @@ -76,7 +77,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
33781 - data->ops->func(data, rec);
33782 + data->ops->func(data, rec, event);
33785 filter = rcu_dereference_sched(data->filter);
33786 @@ -86,7 +87,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
33787 tt |= data->cmd_ops->trigger_type;
33790 - data->ops->func(data, rec);
33791 + data->ops->func(data, rec, event);
33795 @@ -108,7 +109,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
33797 event_triggers_post_call(struct trace_event_file *file,
33798 enum event_trigger_type tt,
33800 + void *rec, struct ring_buffer_event *event)
33802 struct event_trigger_data *data;
33804 @@ -116,7 +117,7 @@ event_triggers_post_call(struct trace_event_file *file,
33807 if (data->cmd_ops->trigger_type & tt)
33808 - data->ops->func(data, rec);
33809 + data->ops->func(data, rec, event);
33812 EXPORT_SYMBOL_GPL(event_triggers_post_call);
33813 @@ -914,8 +915,15 @@ void set_named_trigger_data(struct event_trigger_data *data,
33814 data->named_data = named_data;
33817 +struct event_trigger_data *
33818 +get_named_trigger_data(struct event_trigger_data *data)
33820 + return data->named_data;
33824 -traceon_trigger(struct event_trigger_data *data, void *rec)
33825 +traceon_trigger(struct event_trigger_data *data, void *rec,
33826 + struct ring_buffer_event *event)
33828 if (tracing_is_on())
33830 @@ -924,7 +932,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec)
33834 -traceon_count_trigger(struct event_trigger_data *data, void *rec)
33835 +traceon_count_trigger(struct event_trigger_data *data, void *rec,
33836 + struct ring_buffer_event *event)
33838 if (tracing_is_on())
33840 @@ -939,7 +948,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec)
33844 -traceoff_trigger(struct event_trigger_data *data, void *rec)
33845 +traceoff_trigger(struct event_trigger_data *data, void *rec,
33846 + struct ring_buffer_event *event)
33848 if (!tracing_is_on())
33850 @@ -948,7 +958,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec)
33854 -traceoff_count_trigger(struct event_trigger_data *data, void *rec)
33855 +traceoff_count_trigger(struct event_trigger_data *data, void *rec,
33856 + struct ring_buffer_event *event)
33858 if (!tracing_is_on())
33860 @@ -1045,7 +1056,8 @@ static struct event_command trigger_traceoff_cmd = {
33862 #ifdef CONFIG_TRACER_SNAPSHOT
33864 -snapshot_trigger(struct event_trigger_data *data, void *rec)
33865 +snapshot_trigger(struct event_trigger_data *data, void *rec,
33866 + struct ring_buffer_event *event)
33868 struct trace_event_file *file = data->private_data;
33870 @@ -1056,7 +1068,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec)
33874 -snapshot_count_trigger(struct event_trigger_data *data, void *rec)
33875 +snapshot_count_trigger(struct event_trigger_data *data, void *rec,
33876 + struct ring_buffer_event *event)
33880 @@ -1064,7 +1077,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec)
33881 if (data->count != -1)
33884 - snapshot_trigger(data, rec);
33885 + snapshot_trigger(data, rec, event);
33889 @@ -1143,13 +1156,15 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
33890 #define STACK_SKIP 3
33893 -stacktrace_trigger(struct event_trigger_data *data, void *rec)
33894 +stacktrace_trigger(struct event_trigger_data *data, void *rec,
33895 + struct ring_buffer_event *event)
33897 trace_dump_stack(STACK_SKIP);
33901 -stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
33902 +stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
33903 + struct ring_buffer_event *event)
33907 @@ -1157,7 +1172,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
33908 if (data->count != -1)
33911 - stacktrace_trigger(data, rec);
33912 + stacktrace_trigger(data, rec, event);
33916 @@ -1219,7 +1234,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
33920 -event_enable_trigger(struct event_trigger_data *data, void *rec)
33921 +event_enable_trigger(struct event_trigger_data *data, void *rec,
33922 + struct ring_buffer_event *event)
33924 struct enable_trigger_data *enable_data = data->private_data;
33926 @@ -1230,7 +1246,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec)
33930 -event_enable_count_trigger(struct event_trigger_data *data, void *rec)
33931 +event_enable_count_trigger(struct event_trigger_data *data, void *rec,
33932 + struct ring_buffer_event *event)
33934 struct enable_trigger_data *enable_data = data->private_data;
33936 @@ -1244,7 +1261,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec)
33937 if (data->count != -1)
33940 - event_enable_trigger(data, rec);
33941 + event_enable_trigger(data, rec, event);
33944 int event_enable_trigger_print(struct seq_file *m,
33945 diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
33946 index d7c8e4ec3d9d..518c61a1bceb 100644
33947 --- a/kernel/trace/trace_hwlat.c
33948 +++ b/kernel/trace/trace_hwlat.c
33949 @@ -279,7 +279,7 @@ static void move_to_next_cpu(void)
33950 * of this thread, then stop migrating for the duration
33951 * of the current test.
33953 - if (!cpumask_equal(current_mask, &current->cpus_allowed))
33954 + if (!cpumask_equal(current_mask, current->cpus_ptr))
33958 diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
33959 index ea20274a105a..3c40d4174052 100644
33960 --- a/kernel/trace/trace_kprobe.c
33961 +++ b/kernel/trace/trace_kprobe.c
33962 @@ -918,8 +918,8 @@ static int probes_open(struct inode *inode, struct file *file)
33963 static ssize_t probes_write(struct file *file, const char __user *buffer,
33964 size_t count, loff_t *ppos)
33966 - return traceprobe_probes_write(file, buffer, count, ppos,
33967 - create_trace_kprobe);
33968 + return trace_parse_run_command(file, buffer, count, ppos,
33969 + create_trace_kprobe);
33972 static const struct file_operations kprobe_events_ops = {
33973 @@ -1444,9 +1444,9 @@ static __init int kprobe_trace_self_tests_init(void)
33975 pr_info("Testing kprobe tracing: ");
33977 - ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
33978 - "$stack $stack0 +0($stack)",
33979 - create_trace_kprobe);
33980 + ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
33981 + "$stack $stack0 +0($stack)",
33982 + create_trace_kprobe);
33983 if (WARN_ON_ONCE(ret)) {
33984 pr_warn("error on probing function entry.\n");
33986 @@ -1466,8 +1466,8 @@ static __init int kprobe_trace_self_tests_init(void)
33990 - ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
33991 - "$retval", create_trace_kprobe);
33992 + ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
33993 + "$retval", create_trace_kprobe);
33994 if (WARN_ON_ONCE(ret)) {
33995 pr_warn("error on probing function return.\n");
33997 @@ -1537,13 +1537,13 @@ static __init int kprobe_trace_self_tests_init(void)
33998 disable_trace_kprobe(tk, file);
34001 - ret = traceprobe_command("-:testprobe", create_trace_kprobe);
34002 + ret = trace_run_command("-:testprobe", create_trace_kprobe);
34003 if (WARN_ON_ONCE(ret)) {
34004 pr_warn("error on deleting a probe.\n");
34008 - ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
34009 + ret = trace_run_command("-:testprobe2", create_trace_kprobe);
34010 if (WARN_ON_ONCE(ret)) {
34011 pr_warn("error on deleting a probe.\n");
34013 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
34014 index 4500b00e4e36..74a4bfc2c6b7 100644
34015 --- a/kernel/trace/trace_output.c
34016 +++ b/kernel/trace/trace_output.c
34017 @@ -447,6 +447,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
34021 + char need_resched_lazy;
34025 @@ -477,6 +478,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
34029 + need_resched_lazy =
34030 + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
34033 (nmi && hardirq) ? 'Z' :
34035 @@ -485,14 +489,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
34039 - trace_seq_printf(s, "%c%c%c",
34040 - irqs_off, need_resched, hardsoft_irq);
34041 + trace_seq_printf(s, "%c%c%c%c",
34042 + irqs_off, need_resched, need_resched_lazy,
34045 if (entry->preempt_count)
34046 trace_seq_printf(s, "%x", entry->preempt_count);
34048 trace_seq_putc(s, '.');
34050 + if (entry->preempt_lazy_count)
34051 + trace_seq_printf(s, "%x", entry->preempt_lazy_count);
34053 + trace_seq_putc(s, '.');
34055 + if (entry->migrate_disable)
34056 + trace_seq_printf(s, "%x", entry->migrate_disable);
34058 + trace_seq_putc(s, '.');
34060 return !trace_seq_has_overflowed(s);
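
With these changes the latency-format header grows from three flag
characters plus a preempt count to four flag characters ('L' marking a
pending lazy reschedule) followed by three counts — preempt depth,
lazy-preempt depth, and migrate-disable depth — each printed as '.'
when zero. A hypothetical entry:

    cyclictest-1234  [002] dNLh1.1  5123.456789: sched_switch: ...
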
34063 diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
34064 index fe4513330412..daf54bda4dc8 100644
34065 --- a/kernel/trace/trace_probe.c
34066 +++ b/kernel/trace/trace_probe.c
34067 @@ -621,92 +621,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
34071 -int traceprobe_command(const char *buf, int (*createfn)(int, char **))
34078 - argv = argv_split(GFP_KERNEL, buf, &argc);
34083 - ret = createfn(argc, argv);
34090 -#define WRITE_BUFSIZE 4096
34092 -ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
34093 - size_t count, loff_t *ppos,
34094 - int (*createfn)(int, char **))
34096 - char *kbuf, *buf, *tmp;
34101 - kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
34105 - while (done < count) {
34106 - size = count - done;
34108 - if (size >= WRITE_BUFSIZE)
34109 - size = WRITE_BUFSIZE - 1;
34111 - if (copy_from_user(kbuf, buffer + done, size)) {
34115 - kbuf[size] = '\0';
34118 - tmp = strchr(buf, '\n');
34121 - size = tmp - buf + 1;
34123 - size = strlen(buf);
34124 - if (done + size < count) {
34127 - /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
34128 - pr_warn("Line length is too long: Should be less than %d\n",
34129 - WRITE_BUFSIZE - 2);
34136 - /* Remove comments */
34137 - tmp = strchr(buf, '#');
34142 - ret = traceprobe_command(buf, createfn);
34147 - } while (done < count);
34157 static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
34160 diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
34161 index dc39472ca9e4..a0d750e3d17c 100644
34162 --- a/kernel/trace/trace_probe.h
34163 +++ b/kernel/trace/trace_probe.h
34166 #define MAX_TRACE_ARGS 128
34167 #define MAX_ARGSTR_LEN 63
34168 -#define MAX_EVENT_NAME_LEN 64
34169 #define MAX_STRING_SIZE PATH_MAX
34171 /* Reserved field names */
34172 @@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
34174 extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
34176 -extern ssize_t traceprobe_probes_write(struct file *file,
34177 - const char __user *buffer, size_t count, loff_t *ppos,
34178 - int (*createfn)(int, char**));
34180 -extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
34182 /* Sum up total data length for dynamic arraies (strings) */
34183 static nokprobe_inline int
34184 __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
34185 diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
34186 index ea0d90a31fc9..2ccfbb8efeb2 100644
34187 --- a/kernel/trace/trace_uprobe.c
34188 +++ b/kernel/trace/trace_uprobe.c
34189 @@ -647,7 +647,7 @@ static int probes_open(struct inode *inode, struct file *file)
34190 static ssize_t probes_write(struct file *file, const char __user *buffer,
34191 size_t count, loff_t *ppos)
34193 - return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
34194 + return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
34197 static const struct file_operations uprobe_events_ops = {
34198 diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
34199 index 305039b122fa..5cadb1b8b5fe 100644
34200 --- a/kernel/trace/tracing_map.c
34201 +++ b/kernel/trace/tracing_map.c
34202 @@ -66,6 +66,73 @@ u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i)
34203 return (u64)atomic64_read(&elt->fields[i].sum);
34207 + * tracing_map_set_var - Assign a tracing_map_elt's variable field
34208 + * @elt: The tracing_map_elt
34209 + * @i: The index of the given variable associated with the tracing_map_elt
34210 + * @n: The value to assign
34212 + * Assign n to variable i associated with the specified tracing_map_elt
34213 + * instance. The index i is the index returned by the call to
34214 + * tracing_map_add_var() when the tracing map was set up.
34216 +void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n)
34218 + atomic64_set(&elt->vars[i], n);
34219 + elt->var_set[i] = true;
34223 + * tracing_map_var_set - Return whether or not a variable has been set
34224 + * @elt: The tracing_map_elt
34225 + * @i: The index of the given variable associated with the tracing_map_elt
34227 + * Return true if the variable has been set, false otherwise. The
34228 + * index i is the index returned by the call to tracing_map_add_var()
34229 + * when the tracing map was set up.
34231 +bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i)
34233 + return elt->var_set[i];
34237 + * tracing_map_read_var - Return the value of a tracing_map_elt's variable field
34238 + * @elt: The tracing_map_elt
34239 + * @i: The index of the given variable associated with the tracing_map_elt
34241 + * Retrieve the value of the variable i associated with the specified
34242 + * tracing_map_elt instance. The index i is the index returned by the
34243 + * call to tracing_map_add_var() when the tracing map was set
34246 + * Return: The variable value associated with field i for elt.
34248 +u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i)
34250 + return (u64)atomic64_read(&elt->vars[i]);
34254 + * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field
34255 + * @elt: The tracing_map_elt
34256 + * @i: The index of the given variable associated with the tracing_map_elt
34258 + * Retrieve the value of the variable i associated with the specified
34259 + * tracing_map_elt instance, and reset the variable to the 'not set'
34260 + * state. The index i is the index returned by the call to
34261 + * tracing_map_add_var() when the tracing map was set up. The reset
34262 + * essentially makes the variable a read-once variable if it's only
34263 + * accessed using this function.
34265 + * Return: The variable value associated with field i for elt.
34267 +u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i)
34269 + elt->var_set[i] = false;
34270 + return (u64)atomic64_read(&elt->vars[i]);
34273 int tracing_map_cmp_string(void *val_a, void *val_b)
34276 @@ -170,6 +237,28 @@ int tracing_map_add_sum_field(struct tracing_map *map)
34277 return tracing_map_add_field(map, tracing_map_cmp_atomic64);
34281 + * tracing_map_add_var - Add a field describing a tracing_map var
34282 + * @map: The tracing_map
34284 + * Add a var to the map and return the index identifying it in the map
34285 + * and associated tracing_map_elts. This is the index used, for
34286 + * instance, to update a var for a particular tracing_map_elt using
34287 + * tracing_map_set_var() or to read it via tracing_map_read_var().
34289 + * Return: The index identifying the var in the map and associated
34290 + * tracing_map_elts, or -EINVAL on error.
34292 +int tracing_map_add_var(struct tracing_map *map)
34294 + int ret = -EINVAL;
34296 + if (map->n_vars < TRACING_MAP_VARS_MAX)
34297 + ret = map->n_vars++;
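
Setup code claims a var with tracing_map_add_var() and then uses the
returned index against individual elements with tracing_map_set_var()
and tracing_map_read_var_once(). A userspace mock of those semantics
(hypothetical; C11 atomics standing in for atomic64_t):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <inttypes.h>
    #include <stdio.h>

    #define NVARS 16

    struct elt {
            _Atomic uint64_t vars[NVARS];
            bool var_set[NVARS];
    };

    static void set_var(struct elt *e, unsigned int i, uint64_t n)
    {
            atomic_store(&e->vars[i], n);
            e->var_set[i] = true;
    }

    /* Read and consume: the slot reads as 'not set' afterwards. */
    static uint64_t read_var_once(struct elt *e, unsigned int i)
    {
            e->var_set[i] = false;
            return atomic_load(&e->vars[i]);
    }

    int main(void)
    {
            struct elt e = { 0 };
            unsigned int ts0 = 0;   /* index as handed out by an add_var() analogue */

            set_var(&e, ts0, 123456);
            printf("lat = %" PRIu64 "\n", 130000 - read_var_once(&e, ts0));
            printf("still set? %d\n", e.var_set[ts0]);
            return 0;
    }
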
34303 * tracing_map_add_key_field - Add a field describing a tracing_map key
34304 * @map: The tracing_map
34305 @@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct tracing_map_elt *elt)
34306 if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
34307 atomic64_set(&elt->fields[i].sum, 0);
34309 + for (i = 0; i < elt->map->n_vars; i++) {
34310 + atomic64_set(&elt->vars[i], 0);
34311 + elt->var_set[i] = false;
34314 if (elt->map->ops && elt->map->ops->elt_clear)
34315 elt->map->ops->elt_clear(elt);
34317 @@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
34318 if (elt->map->ops && elt->map->ops->elt_free)
34319 elt->map->ops->elt_free(elt);
34320 kfree(elt->fields);
34321 + kfree(elt->vars);
34322 + kfree(elt->var_set);
34326 @@ -333,6 +429,18 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
34330 + elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
34331 + if (!elt->vars) {
34336 + elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL);
34337 + if (!elt->var_set) {
34342 tracing_map_elt_init_fields(elt);
34344 if (map->ops && map->ops->elt_alloc) {
34345 @@ -414,7 +522,9 @@ static inline struct tracing_map_elt *
34346 __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
34348 u32 idx, key_hash, test_key;
34350 struct tracing_map_entry *entry;
34351 + struct tracing_map_elt *val;
34353 key_hash = jhash(key, map->key_size, 0);
34355 @@ -426,10 +536,33 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
34356 entry = TRACING_MAP_ENTRY(map->map, idx);
34357 test_key = entry->key;
34359 - if (test_key && test_key == key_hash && entry->val &&
34360 - keys_match(key, entry->val->key, map->key_size)) {
34361 - atomic64_inc(&map->hits);
34362 - return entry->val;
34363 + if (test_key && test_key == key_hash) {
34364 + val = READ_ONCE(entry->val);
34366 + keys_match(key, val->key, map->key_size)) {
34367 + if (!lookup_only)
34368 + atomic64_inc(&map->hits);
34370 + } else if (unlikely(!val)) {
34372 + * The key is present, but val (the pointer to the elt
34373 + * struct) is still NULL, which means some other
34374 + * thread is in the process of inserting an element.
34377 + * On top of that, its key_hash is the same as the
34378 + * one being inserted right now, so it's
34379 + * possible that the element has the same key as well.
34384 + if (dup_try > map->map_size) {
34385 + atomic64_inc(&map->drops);
34393 @@ -451,6 +584,13 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
34394 atomic64_inc(&map->hits);
34399 + * cmpxchg() failed. Loop around once
34400 + * more to check what key was inserted.
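
A vastly simplified sketch of the lockless-insert pattern used here:
claim a slot's key with a compare-and-swap; on failure, or on seeing a
claimed key whose value is still being filled in, loop around and
re-examine the slot. (The real map also publishes the value pointer
separately, which is why the NULL-val re-check above is needed; this
mock uses nonzero ints as keys and no hashing.)

    #include <stdatomic.h>
    #include <stdio.h>

    #define NSLOTS 8

    static _Atomic int slot_key[NSLOTS];    /* 0 means empty */

    static int insert(int key)
    {
            unsigned int i = 0, tries = 0;

            while (tries <= NSLOTS) {
                    int expected = 0;

                    if (atomic_load(&slot_key[i]) == key)
                            return i;       /* key already claimed */
                    if (atomic_compare_exchange_strong(&slot_key[i],
                                                       &expected, key))
                            return i;       /* claimed the empty slot */
                    if (expected == key)
                            continue;       /* raced on same key: re-check */
                    i = (i + 1) % NSLOTS;   /* occupied: probe onward */
                    tries++;
            }
            return -1;                      /* table full: drop */
    }

    int main(void)
    {
            printf("%d %d %d\n", insert(42), insert(7), insert(42));
            return 0;
    }
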
34407 @@ -815,67 +955,15 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
34411 -static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
34413 - struct tracing_map_elt *dup_elt;
34416 - dup_elt = tracing_map_elt_alloc(elt->map);
34417 - if (IS_ERR(dup_elt))
34420 - if (elt->map->ops && elt->map->ops->elt_copy)
34421 - elt->map->ops->elt_copy(dup_elt, elt);
34423 - dup_elt->private_data = elt->private_data;
34424 - memcpy(dup_elt->key, elt->key, elt->map->key_size);
34426 - for (i = 0; i < elt->map->n_fields; i++) {
34427 - atomic64_set(&dup_elt->fields[i].sum,
34428 - atomic64_read(&elt->fields[i].sum));
34429 - dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
34435 -static int merge_dup(struct tracing_map_sort_entry **sort_entries,
34436 - unsigned int target, unsigned int dup)
34438 - struct tracing_map_elt *target_elt, *elt;
34439 - bool first_dup = (target - dup) == 1;
34443 - elt = sort_entries[target]->elt;
34444 - target_elt = copy_elt(elt);
34447 - sort_entries[target]->elt = target_elt;
34448 - sort_entries[target]->elt_copied = true;
34450 - target_elt = sort_entries[target]->elt;
34452 - elt = sort_entries[dup]->elt;
34454 - for (i = 0; i < elt->map->n_fields; i++)
34455 - atomic64_add(atomic64_read(&elt->fields[i].sum),
34456 - &target_elt->fields[i].sum);
34458 - sort_entries[dup]->dup = true;
34463 -static int merge_dups(struct tracing_map_sort_entry **sort_entries,
34464 +static void detect_dups(struct tracing_map_sort_entry **sort_entries,
34465 int n_entries, unsigned int key_size)
34467 unsigned int dups = 0, total_dups = 0;
34473 - return total_dups;
34476 sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
34477 (int (*)(const void *, const void *))cmp_entries_dup, NULL);
34478 @@ -884,30 +972,14 @@ static int merge_dups(struct tracing_map_sort_entry **sort_entries,
34479 for (i = 1; i < n_entries; i++) {
34480 if (!memcmp(sort_entries[i]->key, key, key_size)) {
34481 dups++; total_dups++;
34482 - err = merge_dup(sort_entries, i - dups, i);
34487 key = sort_entries[i]->key;
34492 - return total_dups;
34494 - for (i = 0, j = 0; i < n_entries; i++) {
34495 - if (!sort_entries[i]->dup) {
34496 - sort_entries[j] = sort_entries[i];
34498 - sort_entries[i] = NULL;
34500 - destroy_sort_entry(sort_entries[i]);
34501 - sort_entries[i] = NULL;
34505 - return total_dups;
34506 + WARN_ONCE(total_dups > 0,
34507 + "Duplicates detected: %d\n", total_dups);
34510 static bool is_key(struct tracing_map *map, unsigned int field_idx)
34511 @@ -1033,10 +1105,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
34515 - ret = merge_dups(entries, n_entries, map->key_size);
34518 - n_entries -= ret;
34519 + detect_dups(entries, n_entries, map->key_size);
34521 if (is_key(map, sort_keys[0].field_idx))
34522 cmp_entries_fn = cmp_entries_key;
34523 diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
34524 index ab0ca77331d0..053eb92b2d31 100644
34525 --- a/kernel/trace/tracing_map.h
34526 +++ b/kernel/trace/tracing_map.h
34528 #define TRACING_MAP_BITS_MAX 17
34529 #define TRACING_MAP_BITS_MIN 7
34531 -#define TRACING_MAP_KEYS_MAX 2
34532 +#define TRACING_MAP_KEYS_MAX 3
34533 #define TRACING_MAP_VALS_MAX 3
34534 #define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
34535 TRACING_MAP_VALS_MAX)
34536 +#define TRACING_MAP_VARS_MAX 16
34537 #define TRACING_MAP_SORT_KEYS_MAX 2
34539 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
34540 @@ -137,6 +138,8 @@ struct tracing_map_field {
34541 struct tracing_map_elt {
34542 struct tracing_map *map;
34543 struct tracing_map_field *fields;
34544 + atomic64_t *vars;
34547 void *private_data;
34549 @@ -192,6 +195,7 @@ struct tracing_map {
34550 int key_idx[TRACING_MAP_KEYS_MAX];
34551 unsigned int n_keys;
34552 struct tracing_map_sort_key sort_key;
34553 + unsigned int n_vars;
34557 @@ -215,11 +219,6 @@ struct tracing_map {
34558 * Element allocation occurs before tracing begins, when the
34559 * tracing_map_init() call is made by client code.
34561 - * @elt_copy: At certain points in the lifetime of an element, it may
34562 - * need to be copied. The copy should include a copy of the
34563 - * client-allocated data, which can be copied into the 'to'
34564 - * element from the 'from' element.
34566 * @elt_free: When a tracing_map_elt is freed, this function is called
34567 * and allows client-allocated per-element data to be freed.
34569 @@ -233,8 +232,6 @@ struct tracing_map {
34571 struct tracing_map_ops {
34572 int (*elt_alloc)(struct tracing_map_elt *elt);
34573 - void (*elt_copy)(struct tracing_map_elt *to,
34574 - struct tracing_map_elt *from);
34575 void (*elt_free)(struct tracing_map_elt *elt);
34576 void (*elt_clear)(struct tracing_map_elt *elt);
34577 void (*elt_init)(struct tracing_map_elt *elt);
34578 @@ -248,6 +245,7 @@ tracing_map_create(unsigned int map_bits,
34579 extern int tracing_map_init(struct tracing_map *map);
34581 extern int tracing_map_add_sum_field(struct tracing_map *map);
34582 +extern int tracing_map_add_var(struct tracing_map *map);
34583 extern int tracing_map_add_key_field(struct tracing_map *map,
34584 unsigned int offset,
34585 tracing_map_cmp_fn_t cmp_fn);
34586 @@ -267,7 +265,13 @@ extern int tracing_map_cmp_none(void *val_a, void *val_b);
34588 extern void tracing_map_update_sum(struct tracing_map_elt *elt,
34589 unsigned int i, u64 n);
34590 +extern void tracing_map_set_var(struct tracing_map_elt *elt,
34591 + unsigned int i, u64 n);
34592 +extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i);
34593 extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
34594 +extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
34595 +extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
34597 extern void tracing_map_set_field_descr(struct tracing_map *map,
34599 unsigned int key_offset,
34600 diff --git a/kernel/user.c b/kernel/user.c
34601 index 00281add65b2..f4cf1841f2fd 100644
34602 --- a/kernel/user.c
34603 +++ b/kernel/user.c
34604 @@ -162,11 +162,11 @@ void free_uid(struct user_struct *up)
34608 - local_irq_save(flags);
34609 + local_irq_save_nort(flags);
34610 if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
34611 free_user(up, flags);
34613 - local_irq_restore(flags);
34614 + local_irq_restore_nort(flags);
34617 struct user_struct *alloc_uid(kuid_t uid)
34618 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
34619 index 087994b23f8b..ea4c09109ce4 100644
34620 --- a/kernel/watchdog.c
34621 +++ b/kernel/watchdog.c
34622 @@ -462,7 +462,7 @@ static void watchdog_enable(unsigned int cpu)
34623 * Start the timer first to prevent the NMI watchdog triggering
34624 * before the timer has a chance to fire.
34626 - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
34627 + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
34628 hrtimer->function = watchdog_timer_fn;
34629 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
34630 HRTIMER_MODE_REL_PINNED);
34631 diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
34632 index 4ece6028007a..210dccc57c04 100644
34633 --- a/kernel/watchdog_hld.c
34634 +++ b/kernel/watchdog_hld.c
34635 @@ -24,6 +24,8 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
34636 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
34637 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
34638 static DEFINE_PER_CPU(struct perf_event *, dead_event);
34639 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
34641 static struct cpumask dead_events_mask;
34643 static unsigned long hardlockup_allcpu_dumped;
34644 @@ -134,6 +136,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
34645 /* only print hardlockups once */
34646 if (__this_cpu_read(hard_watchdog_warn) == true)
34649 + * If early-printk is enabled then make sure we do not
34650 + * lock up in printk() and kill console logging:
34654 + raw_spin_lock(&watchdog_output_lock);
34656 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
34658 @@ -151,6 +160,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
34659 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
34660 trigger_allbutself_cpu_backtrace();
34662 + raw_spin_unlock(&watchdog_output_lock);
34663 if (hardlockup_panic)
34664 nmi_panic(regs, "Hard LOCKUP");
34666 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
34667 index 08bc551976b2..76297cce5602 100644
34668 --- a/kernel/workqueue.c
34669 +++ b/kernel/workqueue.c
34671 #include <linux/moduleparam.h>
34672 #include <linux/uaccess.h>
34673 #include <linux/nmi.h>
34674 +#include <linux/locallock.h>
34675 +#include <linux/delay.h>
34677 #include "workqueue_internal.h"
34679 @@ -123,11 +125,16 @@ enum {
34680 * cpu or grabbing pool->lock is enough for read access. If
34681 * POOL_DISASSOCIATED is set, it's identical to L.
34683 + * On RT we need the extra protection via rt_lock_idle_list() for
34684 + * the list manipulations against read access from
34685 + * wq_worker_sleeping(). All other places are nicely serialized via pool->lock.
34688 * A: pool->attach_mutex protected.
34690 * PL: wq_pool_mutex protected.
34692 - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
34693 + * PR: wq_pool_mutex protected for writes. RCU protected for reads.
34695 * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
34697 @@ -136,7 +143,7 @@ enum {
34699 * WQ: wq->mutex protected.
34701 - * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
34702 + * WR: wq->mutex protected for writes. RCU protected for reads.
34704 * MD: wq_mayday_lock protected.
34706 @@ -186,7 +193,7 @@ struct worker_pool {
34707 atomic_t nr_running ____cacheline_aligned_in_smp;
34710 - * Destruction of pool is sched-RCU protected to allow dereferences
34711 + * Destruction of pool is RCU protected to allow dereferences
34712 * from get_work_pool().
34714 struct rcu_head rcu;
34715 @@ -215,7 +222,7 @@ struct pool_workqueue {
34717 * Release of unbound pwq is punted to system_wq. See put_pwq()
34718 * and pwq_unbound_release_workfn() for details. pool_workqueue
34719 - * itself is also sched-RCU protected so that the first pwq can be
34720 + * itself is also RCU protected so that the first pwq can be
34721 * determined without grabbing wq->mutex.
34723 struct work_struct unbound_release_work;
34724 @@ -352,6 +359,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
34725 struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
34726 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
34728 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
34730 static int worker_thread(void *__worker);
34731 static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34733 @@ -359,20 +368,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34734 #include <trace/events/workqueue.h>
34736 #define assert_rcu_or_pool_mutex() \
34737 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
34738 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
34739 !lockdep_is_held(&wq_pool_mutex), \
34740 - "sched RCU or wq_pool_mutex should be held")
34741 + "RCU or wq_pool_mutex should be held")
34743 #define assert_rcu_or_wq_mutex(wq) \
34744 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
34745 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
34746 !lockdep_is_held(&wq->mutex), \
34747 - "sched RCU or wq->mutex should be held")
34748 + "RCU or wq->mutex should be held")
34750 #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
34751 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
34752 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
34753 !lockdep_is_held(&wq->mutex) && \
34754 !lockdep_is_held(&wq_pool_mutex), \
34755 - "sched RCU, wq->mutex or wq_pool_mutex should be held")
34756 + "RCU, wq->mutex or wq_pool_mutex should be held")
34758 #define for_each_cpu_worker_pool(pool, cpu) \
34759 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
34760 @@ -384,7 +393,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34761 * @pool: iteration cursor
34762 * @pi: integer used for iteration
34764 - * This must be called either with wq_pool_mutex held or sched RCU read
34765 + * This must be called either with wq_pool_mutex held or RCU read
34766 * locked. If the pool needs to be used beyond the locking in effect, the
34767 * caller is responsible for guaranteeing that the pool stays online.
34769 @@ -416,7 +425,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34770 * @pwq: iteration cursor
34771 * @wq: the target workqueue
34773 - * This must be called either with wq->mutex held or sched RCU read locked.
34774 + * This must be called either with wq->mutex held or RCU read locked.
34775 * If the pwq needs to be used beyond the locking in effect, the caller is
34776 * responsible for guaranteeing that the pwq stays online.
34778 @@ -428,6 +437,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34779 if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
34782 +#ifdef CONFIG_PREEMPT_RT_BASE
34783 +static inline void rt_lock_idle_list(struct worker_pool *pool)
34785 + preempt_disable();
34787 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
34789 + preempt_enable();
34791 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
34792 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
34794 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
34795 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
34796 +static inline void sched_lock_idle_list(struct worker_pool *pool)
34798 + spin_lock_irq(&pool->lock);
34800 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
34802 + spin_unlock_irq(&pool->lock);
34807 #ifdef CONFIG_DEBUG_OBJECTS_WORK
34809 static struct debug_obj_descr work_debug_descr;
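
The helpers above encode an asymmetric protection scheme for pool->idle_list. On RT, the writers (which already hold pool->lock) additionally disable preemption around the list operation, while the schedule()-time reader relies on the same trick inside wake_up_worker(); on !RT, the reader takes pool->lock instead and the writer-side helpers are no-ops. Restated as a sketch of the hunks that follow:

	/* writer side (worker_enter_idle(), worker_leave_idle(), destroy_worker());
	 * pool->lock is already held, RT adds preempt_disable() */
	rt_lock_idle_list(pool);
	list_add(&worker->entry, &pool->idle_list);
	rt_unlock_idle_list(pool);

	/* reader side (wq_worker_sleeping()), which does not hold pool->lock;
	 * !RT takes it here, on RT wake_up_worker()'s rt_lock_idle_list()
	 * covers the lockless peek at the first idle worker */
	sched_lock_idle_list(pool);
	wake_up_worker(pool);
	sched_unlock_idle_list(pool);
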
34810 @@ -552,7 +586,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
34811 * @wq: the target workqueue
34812 * @node: the node ID
34814 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
34815 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
34817 * If the pwq needs to be used beyond the locking in effect, the caller is
34818 * responsible for guaranteeing that the pwq stays online.
34819 @@ -696,8 +730,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
34820 * @work: the work item of interest
34822 * Pools are created and destroyed under wq_pool_mutex, and allows read
34823 - * access under sched-RCU read lock. As such, this function should be
34824 - * called under wq_pool_mutex or with preemption disabled.
34825 + * access under RCU read lock. As such, this function should be
34826 + * called under wq_pool_mutex or inside an rcu_read_lock() region.
34828 * All fields of the returned pool are accessible as long as the above
34829 * mentioned locking is in effect. If the returned pool needs to be used
34830 @@ -834,50 +868,45 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
34832 static void wake_up_worker(struct worker_pool *pool)
34834 - struct worker *worker = first_idle_worker(pool);
34835 + struct worker *worker;
34837 + rt_lock_idle_list(pool);
34839 + worker = first_idle_worker(pool);
34841 if (likely(worker))
34842 wake_up_process(worker->task);
34844 + rt_unlock_idle_list(pool);
34848 - * wq_worker_waking_up - a worker is waking up
34849 + * wq_worker_running - a worker is running again
34850 * @task: task waking up
34851 - * @cpu: CPU @task is waking up to
34853 - * This function is called during try_to_wake_up() when a worker is
34857 - * spin_lock_irq(rq->lock)
34858 + * This function is called when a worker returns from schedule()
34860 -void wq_worker_waking_up(struct task_struct *task, int cpu)
34861 +void wq_worker_running(struct task_struct *task)
34863 struct worker *worker = kthread_data(task);
34865 - if (!(worker->flags & WORKER_NOT_RUNNING)) {
34866 - WARN_ON_ONCE(worker->pool->cpu != cpu);
34867 + if (!worker->sleeping)
34869 + if (!(worker->flags & WORKER_NOT_RUNNING))
34870 atomic_inc(&worker->pool->nr_running);
34872 + worker->sleeping = 0;
34876 * wq_worker_sleeping - a worker is going to sleep
34877 * @task: task going to sleep
34879 - * This function is called during schedule() when a busy worker is
34880 - * going to sleep. Worker on the same cpu can be woken up by
34881 - * returning pointer to its task.
34884 - * spin_lock_irq(rq->lock)
34887 - * Worker task on @cpu to wake up, %NULL if none.
34888 + * This function is called from schedule() when a busy worker is
34889 + * going to sleep.
34891 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
34892 +void wq_worker_sleeping(struct task_struct *task)
34894 - struct worker *worker = kthread_data(task), *to_wakeup = NULL;
34895 + struct worker *worker = kthread_data(task);
34896 struct worker_pool *pool;
34899 @@ -886,29 +915,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
34900 * checking NOT_RUNNING.
34902 if (worker->flags & WORKER_NOT_RUNNING)
34906 pool = worker->pool;
34908 - /* this can only happen on the local cpu */
34909 - if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
34911 + if (WARN_ON_ONCE(worker->sleeping))
34914 + worker->sleeping = 1;
34917 * The counterpart of the following dec_and_test, implied mb,
34918 * worklist not empty test sequence is in insert_work().
34919 * Please read comment there.
34921 - * NOT_RUNNING is clear. This means that we're bound to and
34922 - * running on the local cpu w/ rq lock held and preemption
34923 - * disabled, which in turn means that none else could be
34924 - * manipulating idle_list, so dereferencing idle_list without pool
34927 if (atomic_dec_and_test(&pool->nr_running) &&
34928 - !list_empty(&pool->worklist))
34929 - to_wakeup = first_idle_worker(pool);
34930 - return to_wakeup ? to_wakeup->task : NULL;
34931 + !list_empty(&pool->worklist)) {
34932 + sched_lock_idle_list(pool);
34933 + wake_up_worker(pool);
34934 + sched_unlock_idle_list(pool);
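
The reworked hooks drop the old contract (called with the runqueue lock held, returning a task for the scheduler to wake) in favour of plain calls made before and after __schedule(), with worker->sleeping guarding against double accounting. A sketch of the call sites this assumes in kernel/sched/core.c (the helper names here are illustrative; the actual placement is a separate patch in the series):

	static inline void sched_submit_work(struct task_struct *tsk)
	{
		if (tsk->flags & PF_WQ_WORKER)
			wq_worker_sleeping(tsk);	/* preemptible, no rq->lock */
	}

	static inline void sched_update_worker(struct task_struct *tsk)
	{
		if (tsk->flags & PF_WQ_WORKER)
			wq_worker_running(tsk);		/* after __schedule() returns */
	}
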
34939 @@ -1102,12 +1128,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
34943 - * As both pwqs and pools are sched-RCU protected, the
34944 + * As both pwqs and pools are RCU protected, the
34945 * following lock operations are safe.
34947 - spin_lock_irq(&pwq->pool->lock);
34949 + local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
34951 - spin_unlock_irq(&pwq->pool->lock);
34952 + local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
34953 + rcu_read_unlock();
34957 @@ -1211,7 +1239,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
34958 struct worker_pool *pool;
34959 struct pool_workqueue *pwq;
34961 - local_irq_save(*flags);
34962 + local_lock_irqsave(pendingb_lock, *flags);
34964 /* try to steal the timer if it exists */
34966 @@ -1230,6 +1258,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
34967 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
34972 * The queueing is in progress, or it is already queued. Try to
34973 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
34974 @@ -1268,14 +1297,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
34975 set_work_pool_and_keep_pending(work, pool->id);
34977 spin_unlock(&pool->lock);
34978 + rcu_read_unlock();
34981 spin_unlock(&pool->lock);
34983 - local_irq_restore(*flags);
34984 + rcu_read_unlock();
34985 + local_unlock_irqrestore(pendingb_lock, *flags);
34986 if (work_is_canceling(work))
34993 @@ -1377,7 +1408,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
34994 * queued or lose PENDING. Grabbing PENDING and queueing should
34995 * happen with IRQ disabled.
34997 - WARN_ON_ONCE(!irqs_disabled());
34998 + WARN_ON_ONCE_NONRT(!irqs_disabled());
35000 debug_work_activate(work);
35002 @@ -1385,6 +1416,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
35003 if (unlikely(wq->flags & __WQ_DRAINING) &&
35004 WARN_ON_ONCE(!is_chained_work(wq)))
35008 if (req_cpu == WORK_CPU_UNBOUND)
35009 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
35010 @@ -1441,10 +1473,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
35011 /* pwq determined, queue */
35012 trace_workqueue_queue_work(req_cpu, pwq, work);
35014 - if (WARN_ON(!list_empty(&work->entry))) {
35015 - spin_unlock(&pwq->pool->lock);
35018 + if (WARN_ON(!list_empty(&work->entry)))
35021 pwq->nr_in_flight[pwq->work_color]++;
35022 work_flags = work_color_to_flags(pwq->work_color);
35023 @@ -1462,7 +1492,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
35025 insert_work(pwq, work, worklist, work_flags);
35028 spin_unlock(&pwq->pool->lock);
35029 + rcu_read_unlock();
35033 @@ -1482,14 +1514,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
35035 unsigned long flags;
35037 - local_irq_save(flags);
35038 + local_lock_irqsave(pendingb_lock, flags);
35040 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
35041 __queue_work(cpu, wq, work);
35045 - local_irq_restore(flags);
35046 + local_unlock_irqrestore(pendingb_lock, flags);
35049 EXPORT_SYMBOL(queue_work_on);
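
pendingb_lock replaces the bare IRQ-off sections that used to make the PENDING-bit manipulation per-CPU-atomic. The locallock semantics assumed here (linux/locallock.h, added earlier in this series): on !RT the *_irqsave forms collapse to local_irq_save(), on RT they take a per-CPU spinlock with migration disabled, so the section remains exclusive on its CPU yet preemptible. Minimal usage sketch with a hypothetical lock:

	static DEFINE_LOCAL_IRQ_LOCK(my_pending_lock);

	unsigned long flags;

	local_lock_irqsave(my_pending_lock, flags);	/* !RT: local_irq_save() */
	/* manipulate this_cpu state shared with timer/irq context */
	local_unlock_irqrestore(my_pending_lock, flags);
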
35050 @@ -1498,8 +1530,11 @@ void delayed_work_timer_fn(unsigned long __data)
35052 struct delayed_work *dwork = (struct delayed_work *)__data;
35055 + /* local_lock(pendingb_lock); */
35056 /* should have been called from irqsafe timer with irq already off */
35057 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
35058 + /* local_unlock(pendingb_lock); */
35060 EXPORT_SYMBOL(delayed_work_timer_fn);
35062 @@ -1555,14 +1590,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
35063 unsigned long flags;
35065 /* read the comment in __queue_work() */
35066 - local_irq_save(flags);
35067 + local_lock_irqsave(pendingb_lock, flags);
35069 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
35070 __queue_delayed_work(cpu, wq, dwork, delay);
35074 - local_irq_restore(flags);
35075 + local_unlock_irqrestore(pendingb_lock, flags);
35078 EXPORT_SYMBOL(queue_delayed_work_on);
35079 @@ -1597,7 +1632,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
35081 if (likely(ret >= 0)) {
35082 __queue_delayed_work(cpu, wq, dwork, delay);
35083 - local_irq_restore(flags);
35084 + local_unlock_irqrestore(pendingb_lock, flags);
35087 /* -ENOENT from try_to_grab_pending() becomes %true */
35088 @@ -1630,7 +1665,9 @@ static void worker_enter_idle(struct worker *worker)
35089 worker->last_active = jiffies;
35091 /* idle_list is LIFO */
35092 + rt_lock_idle_list(pool);
35093 list_add(&worker->entry, &pool->idle_list);
35094 + rt_unlock_idle_list(pool);
35096 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
35097 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
35098 @@ -1663,7 +1700,9 @@ static void worker_leave_idle(struct worker *worker)
35100 worker_clr_flags(worker, WORKER_IDLE);
35102 + rt_lock_idle_list(pool);
35103 list_del_init(&worker->entry);
35104 + rt_unlock_idle_list(pool);
35107 static struct worker *alloc_worker(int node)
35108 @@ -1829,7 +1868,9 @@ static void destroy_worker(struct worker *worker)
35109 pool->nr_workers--;
35112 + rt_lock_idle_list(pool);
35113 list_del_init(&worker->entry);
35114 + rt_unlock_idle_list(pool);
35115 worker->flags |= WORKER_DIE;
35116 wake_up_process(worker->task);
35118 @@ -2815,14 +2856,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
35122 - local_irq_disable();
35124 pool = get_work_pool(work);
35126 - local_irq_enable();
35127 + rcu_read_unlock();
35131 - spin_lock(&pool->lock);
35132 + spin_lock_irq(&pool->lock);
35133 /* see the comment in try_to_grab_pending() with the same code */
35134 pwq = get_work_pwq(work);
35136 @@ -2853,10 +2894,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
35137 lock_map_acquire(&pwq->wq->lockdep_map);
35138 lock_map_release(&pwq->wq->lockdep_map);
35141 + rcu_read_unlock();
35144 spin_unlock_irq(&pool->lock);
35145 + rcu_read_unlock();
35149 @@ -2946,7 +2988,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
35151 /* tell other tasks trying to grab @work to back off */
35152 mark_work_canceling(work);
35153 - local_irq_restore(flags);
35154 + local_unlock_irqrestore(pendingb_lock, flags);
35157 * This allows canceling during early boot. We know that @work
35158 @@ -3007,10 +3049,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
35160 bool flush_delayed_work(struct delayed_work *dwork)
35162 - local_irq_disable();
35163 + local_lock_irq(pendingb_lock);
35164 if (del_timer_sync(&dwork->timer))
35165 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
35166 - local_irq_enable();
35167 + local_unlock_irq(pendingb_lock);
35168 return flush_work(&dwork->work);
35170 EXPORT_SYMBOL(flush_delayed_work);
35171 @@ -3028,7 +3070,7 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
35174 set_work_pool_and_clear_pending(work, get_work_pool_id(work));
35175 - local_irq_restore(flags);
35176 + local_unlock_irqrestore(pendingb_lock, flags);
35180 @@ -3284,7 +3326,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
35181 * put_unbound_pool - put a worker_pool
35182 * @pool: worker_pool to put
35184 - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
35185 + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
35186 * safe manner. get_unbound_pool() calls this function on its failure path
35187 * and this function should be able to release pools which went through,
35188 * successfully or not, init_worker_pool().
35189 @@ -3338,8 +3380,8 @@ static void put_unbound_pool(struct worker_pool *pool)
35190 del_timer_sync(&pool->idle_timer);
35191 del_timer_sync(&pool->mayday_timer);
35193 - /* sched-RCU protected to allow dereferences from get_work_pool() */
35194 - call_rcu_sched(&pool->rcu, rcu_free_pool);
35195 + /* RCU protected to allow dereferences from get_work_pool() */
35196 + call_rcu(&pool->rcu, rcu_free_pool);
35200 @@ -3446,14 +3488,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
35201 put_unbound_pool(pool);
35202 mutex_unlock(&wq_pool_mutex);
35204 - call_rcu_sched(&pwq->rcu, rcu_free_pwq);
35205 + call_rcu(&pwq->rcu, rcu_free_pwq);
35208 * If we're the last pwq going away, @wq is already dead and no one
35209 * is gonna access it anymore. Schedule RCU free.
35212 - call_rcu_sched(&wq->rcu, rcu_free_wq);
35213 + call_rcu(&wq->rcu, rcu_free_wq);
35217 @@ -4128,7 +4170,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
35218 * The base ref is never dropped on per-cpu pwqs. Directly
35219 * schedule RCU free.
35221 - call_rcu_sched(&wq->rcu, rcu_free_wq);
35222 + call_rcu(&wq->rcu, rcu_free_wq);
35225 * We're the sole accessor of @wq at this point. Directly
35226 @@ -4238,7 +4280,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
35227 struct pool_workqueue *pwq;
35230 - rcu_read_lock_sched();
35232 + preempt_disable();
35234 if (cpu == WORK_CPU_UNBOUND)
35235 cpu = smp_processor_id();
35236 @@ -4249,7 +4292,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
35237 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
35239 ret = !list_empty(&pwq->delayed_works);
35240 - rcu_read_unlock_sched();
35241 + preempt_enable();
35242 + rcu_read_unlock();
35246 @@ -4275,15 +4319,15 @@ unsigned int work_busy(struct work_struct *work)
35247 if (work_pending(work))
35248 ret |= WORK_BUSY_PENDING;
35250 - local_irq_save(flags);
35252 pool = get_work_pool(work);
35254 - spin_lock(&pool->lock);
35255 + spin_lock_irqsave(&pool->lock, flags);
35256 if (find_worker_executing_work(pool, work))
35257 ret |= WORK_BUSY_RUNNING;
35258 - spin_unlock(&pool->lock);
35259 + spin_unlock_irqrestore(&pool->lock, flags);
35261 - local_irq_restore(flags);
35262 + rcu_read_unlock();
35266 @@ -4472,7 +4516,7 @@ void show_workqueue_state(void)
35267 unsigned long flags;
35270 - rcu_read_lock_sched();
35273 pr_info("Showing busy workqueues and worker pools:\n");
35275 @@ -4537,7 +4581,7 @@ void show_workqueue_state(void)
35276 touch_nmi_watchdog();
35279 - rcu_read_unlock_sched();
35280 + rcu_read_unlock();
35284 @@ -4898,16 +4942,16 @@ bool freeze_workqueues_busy(void)
35285 * nr_active is monotonically decreasing. It's safe
35286 * to peek without lock.
35288 - rcu_read_lock_sched();
35290 for_each_pwq(pwq, wq) {
35291 WARN_ON_ONCE(pwq->nr_active < 0);
35292 if (pwq->nr_active) {
35294 - rcu_read_unlock_sched();
35295 + rcu_read_unlock();
35299 - rcu_read_unlock_sched();
35300 + rcu_read_unlock();
35303 mutex_unlock(&wq_pool_mutex);
35304 @@ -5097,7 +5141,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
35305 const char *delim = "";
35306 int node, written = 0;
35308 - rcu_read_lock_sched();
35309 + get_online_cpus();
35311 for_each_node(node) {
35312 written += scnprintf(buf + written, PAGE_SIZE - written,
35313 "%s%d:%d", delim, node,
35314 @@ -5105,7 +5150,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
35317 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
35318 - rcu_read_unlock_sched();
35319 + rcu_read_unlock();
35320 + put_online_cpus();
35324 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
35325 index d390d1be3748..2dbcfe9bc364 100644
35326 --- a/kernel/workqueue_internal.h
35327 +++ b/kernel/workqueue_internal.h
35328 @@ -45,6 +45,7 @@ struct worker {
35329 unsigned long last_active; /* L: last active timestamp */
35330 unsigned int flags; /* X: flags */
35331 int id; /* I: worker id */
35332 + int sleeping; /* None */
35335 * Opaque string set with work_set_desc(). Printed out with task
35336 @@ -70,7 +71,7 @@ static inline struct worker *current_wq_worker(void)
35337 * Scheduler hooks for concurrency managed workqueue. Only to be used from
35338 * sched/core.c and workqueue.c.
35340 -void wq_worker_waking_up(struct task_struct *task, int cpu);
35341 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
35342 +void wq_worker_running(struct task_struct *task);
35343 +void wq_worker_sleeping(struct task_struct *task);
35345 #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
35346 diff --git a/lib/Kconfig b/lib/Kconfig
35347 index b1445b22a6de..9ab51b78991a 100644
35350 @@ -428,6 +428,7 @@ config CHECK_SIGNATURE
35352 config CPUMASK_OFFSTACK
35353 bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
35354 + depends on !PREEMPT_RT_FULL
35356 Use dynamic allocation for cpumask_var_t, instead of putting
35357 them on the stack. This is a bit more expensive, but avoids
35358 diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
35359 index 62d0e25c054c..401b7ed164b5 100644
35360 --- a/lib/Kconfig.debug
35361 +++ b/lib/Kconfig.debug
35362 @@ -1197,7 +1197,7 @@ config DEBUG_ATOMIC_SLEEP
35364 config DEBUG_LOCKING_API_SELFTESTS
35365 bool "Locking API boot-time self-tests"
35366 - depends on DEBUG_KERNEL
35367 + depends on DEBUG_KERNEL && !PREEMPT_RT_FULL
35369 Say Y here if you want the kernel to run a short self-test during
35370 bootup. The self-test checks whether common types of locking bugs
35371 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
35372 index 99308479b1c8..161da6c6e173 100644
35373 --- a/lib/debugobjects.c
35374 +++ b/lib/debugobjects.c
35375 @@ -339,7 +339,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
35376 struct debug_obj *obj;
35377 unsigned long flags;
35380 +#ifdef CONFIG_PREEMPT_RT_FULL
35381 + if (preempt_count() == 0 && !irqs_disabled())
35385 db = get_bucket((unsigned long) addr);
35387 diff --git a/lib/irq_poll.c b/lib/irq_poll.c
35388 index 86a709954f5a..9c069ef83d6d 100644
35389 --- a/lib/irq_poll.c
35390 +++ b/lib/irq_poll.c
35391 @@ -37,6 +37,7 @@ void irq_poll_sched(struct irq_poll *iop)
35392 list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
35393 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
35394 local_irq_restore(flags);
35395 + preempt_check_resched_rt();
35397 EXPORT_SYMBOL(irq_poll_sched);
35399 @@ -72,6 +73,7 @@ void irq_poll_complete(struct irq_poll *iop)
35400 local_irq_save(flags);
35401 __irq_poll_complete(iop);
35402 local_irq_restore(flags);
35403 + preempt_check_resched_rt();
35405 EXPORT_SYMBOL(irq_poll_complete);
35407 @@ -96,6 +98,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
35410 local_irq_enable();
35411 + preempt_check_resched_rt();
35413 /* Even though interrupts have been re-enabled, this
35414 * access is safe because interrupts can only add new
35415 @@ -133,6 +136,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
35416 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
35418 local_irq_enable();
35419 + preempt_check_resched_rt();
35423 @@ -196,6 +200,7 @@ static int irq_poll_cpu_dead(unsigned int cpu)
35424 this_cpu_ptr(&blk_cpu_iopoll));
35425 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
35426 local_irq_enable();
35427 + preempt_check_resched_rt();
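
Every point where irq_poll re-enables interrupts grows a preempt_check_resched_rt(): on RT, raising a softirq wakes the per-CPU softirq thread, and if that wakeup happened with interrupts off the missed reschedule must be folded once they are back on; on !RT the helper is empty. Assumed definition (from the series' preempt.h changes, quoted from memory):

	#ifdef CONFIG_PREEMPT_RT_BASE
	# define preempt_check_resched_rt()	preempt_check_resched()
	#else
	# define preempt_check_resched_rt()	barrier()
	#endif
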
35431 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
35432 index b5c1293ce147..075e225f4111 100644
35433 --- a/lib/locking-selftest.c
35434 +++ b/lib/locking-selftest.c
35435 @@ -742,6 +742,8 @@ GENERATE_TESTCASE(init_held_rtmutex);
35436 #include "locking-selftest-spin-hardirq.h"
35437 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
35439 +#ifndef CONFIG_PREEMPT_RT_FULL
35441 #include "locking-selftest-rlock-hardirq.h"
35442 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
35444 @@ -757,9 +759,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
35445 #include "locking-selftest-wlock-softirq.h"
35446 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
35453 +#ifndef CONFIG_PREEMPT_RT_FULL
35455 * Enabling hardirqs with a softirq-safe lock held:
35457 @@ -792,6 +797,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
35464 * Enabling irqs with an irq-safe lock held:
35466 @@ -815,6 +822,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
35467 #include "locking-selftest-spin-hardirq.h"
35468 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
35470 +#ifndef CONFIG_PREEMPT_RT_FULL
35472 #include "locking-selftest-rlock-hardirq.h"
35473 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
35475 @@ -830,6 +839,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
35476 #include "locking-selftest-wlock-softirq.h"
35477 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
35484 @@ -861,6 +872,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
35485 #include "locking-selftest-spin-hardirq.h"
35486 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
35488 +#ifndef CONFIG_PREEMPT_RT_FULL
35490 #include "locking-selftest-rlock-hardirq.h"
35491 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
35493 @@ -876,6 +889,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
35494 #include "locking-selftest-wlock-softirq.h"
35495 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
35502 @@ -909,6 +924,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
35503 #include "locking-selftest-spin-hardirq.h"
35504 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
35506 +#ifndef CONFIG_PREEMPT_RT_FULL
35508 #include "locking-selftest-rlock-hardirq.h"
35509 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
35511 @@ -924,10 +941,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
35512 #include "locking-selftest-wlock-softirq.h"
35513 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
35521 +#ifndef CONFIG_PREEMPT_RT_FULL
35524 * read-lock / write-lock irq inversion.
35526 @@ -990,6 +1011,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
35532 +#ifndef CONFIG_PREEMPT_RT_FULL
35535 * read-lock / write-lock recursion that is actually safe.
35537 @@ -1028,6 +1053,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
35544 * read-lock / write-lock recursion that is unsafe.
35546 @@ -2057,6 +2084,7 @@ void locking_selftest(void)
35548 printk(" --------------------------------------------------------------------------\n");
35550 +#ifndef CONFIG_PREEMPT_RT_FULL
35552 * irq-context testcases:
35554 @@ -2069,6 +2097,28 @@ void locking_selftest(void)
35556 DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
35557 // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
35559 + /* On -rt, we only do hardirq context test for raw spinlock */
35560 + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
35561 + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
35563 + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
35564 + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
35566 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
35567 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
35568 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
35569 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
35570 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
35571 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
35573 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
35574 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
35575 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
35576 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
35577 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
35578 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
35583 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
35584 index 6016f1deb1f5..cdd43086b55b 100644
35585 --- a/lib/percpu_ida.c
35586 +++ b/lib/percpu_ida.c
35588 #include <linux/string.h>
35589 #include <linux/spinlock.h>
35590 #include <linux/percpu_ida.h>
35591 +#include <linux/locallock.h>
35593 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
35595 struct percpu_ida_cpu {
35597 @@ -149,13 +152,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
35598 unsigned long flags;
35601 - local_irq_save(flags);
35602 + local_lock_irqsave(irq_off_lock, flags);
35603 tags = this_cpu_ptr(pool->tag_cpu);
35606 tag = alloc_local_tag(tags);
35607 if (likely(tag >= 0)) {
35608 - local_irq_restore(flags);
35609 + local_unlock_irqrestore(irq_off_lock, flags);
35613 @@ -174,6 +177,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
35615 if (!tags->nr_free)
35616 alloc_global_tags(pool, tags);
35618 if (!tags->nr_free)
35619 steal_tags(pool, tags);
35621 @@ -185,7 +189,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
35624 spin_unlock(&pool->lock);
35625 - local_irq_restore(flags);
35626 + local_unlock_irqrestore(irq_off_lock, flags);
35628 if (tag >= 0 || state == TASK_RUNNING)
35630 @@ -197,7 +201,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
35634 - local_irq_save(flags);
35635 + local_lock_irqsave(irq_off_lock, flags);
35636 tags = this_cpu_ptr(pool->tag_cpu);
35638 if (state != TASK_RUNNING)
35639 @@ -222,7 +226,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
35641 BUG_ON(tag >= pool->nr_tags);
35643 - local_irq_save(flags);
35644 + local_lock_irqsave(irq_off_lock, flags);
35645 tags = this_cpu_ptr(pool->tag_cpu);
35647 spin_lock(&tags->lock);
35648 @@ -254,7 +258,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
35649 spin_unlock(&pool->lock);
35652 - local_irq_restore(flags);
35653 + local_unlock_irqrestore(irq_off_lock, flags);
35655 EXPORT_SYMBOL_GPL(percpu_ida_free);
35657 @@ -346,7 +350,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
35658 struct percpu_ida_cpu *remote;
35659 unsigned cpu, i, err = 0;
35661 - local_irq_save(flags);
35662 + local_lock_irqsave(irq_off_lock, flags);
35663 for_each_possible_cpu(cpu) {
35664 remote = per_cpu_ptr(pool->tag_cpu, cpu);
35665 spin_lock(&remote->lock);
35666 @@ -368,7 +372,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
35668 spin_unlock(&pool->lock);
35670 - local_irq_restore(flags);
35671 + local_unlock_irqrestore(irq_off_lock, flags);
35674 EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
35675 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
35676 index d172f0341b80..c1da1109a107 100644
35677 --- a/lib/radix-tree.c
35678 +++ b/lib/radix-tree.c
35680 #include <linux/rcupdate.h>
35681 #include <linux/slab.h>
35682 #include <linux/string.h>
35684 +#include <linux/locallock.h>
35686 /* Number of nodes in fully populated tree of given height */
35687 static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
35688 @@ -86,6 +86,7 @@ struct radix_tree_preload {
35689 struct radix_tree_node *nodes;
35691 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
35692 +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
35694 static inline struct radix_tree_node *entry_to_node(void *ptr)
35696 @@ -404,12 +405,13 @@ radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
35697 * succeed in getting a node here (and never reach
35698 * kmem_cache_alloc)
35700 - rtp = this_cpu_ptr(&radix_tree_preloads);
35701 + rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
35704 rtp->nodes = ret->parent;
35707 + put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
35709 * Update the allocation stack trace as this is more useful
35711 @@ -475,14 +477,14 @@ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
35713 gfp_mask &= ~__GFP_ACCOUNT;
35715 - preempt_disable();
35716 + local_lock(radix_tree_preloads_lock);
35717 rtp = this_cpu_ptr(&radix_tree_preloads);
35718 while (rtp->nr < nr) {
35719 - preempt_enable();
35720 + local_unlock(radix_tree_preloads_lock);
35721 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
35724 - preempt_disable();
35725 + local_lock(radix_tree_preloads_lock);
35726 rtp = this_cpu_ptr(&radix_tree_preloads);
35727 if (rtp->nr < nr) {
35728 node->parent = rtp->nodes;
35729 @@ -524,7 +526,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
35730 if (gfpflags_allow_blocking(gfp_mask))
35731 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
35732 /* Preloading doesn't help anything with this gfp mask, skip it */
35733 - preempt_disable();
35734 + local_lock(radix_tree_preloads_lock);
35737 EXPORT_SYMBOL(radix_tree_maybe_preload);
35738 @@ -562,7 +564,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
35740 /* Preloading doesn't help anything with this gfp mask, skip it */
35741 if (!gfpflags_allow_blocking(gfp_mask)) {
35742 - preempt_disable();
35743 + local_lock(radix_tree_preloads_lock);
35747 @@ -596,6 +598,12 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
35748 return __radix_tree_preload(gfp_mask, nr_nodes);
35751 +void radix_tree_preload_end(void)
35753 + local_unlock(radix_tree_preloads_lock);
35755 +EXPORT_SYMBOL(radix_tree_preload_end);
35757 static unsigned radix_tree_load_root(const struct radix_tree_root *root,
35758 struct radix_tree_node **nodep, unsigned long *maxindex)
35760 @@ -2105,10 +2113,16 @@ EXPORT_SYMBOL(radix_tree_tagged);
35761 void idr_preload(gfp_t gfp_mask)
35763 if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
35764 - preempt_disable();
35765 + local_lock(radix_tree_preloads_lock);
35767 EXPORT_SYMBOL(idr_preload);
35769 +void idr_preload_end(void)
35771 + local_unlock(radix_tree_preloads_lock);
35773 +EXPORT_SYMBOL(idr_preload_end);
35776 * ida_pre_get - reserve resources for ida allocation
35778 @@ -2125,7 +2139,7 @@ int ida_pre_get(struct ida *ida, gfp_t gfp)
35779 * to return to the ida_pre_get() step.
35781 if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE))
35782 - preempt_enable();
35783 + local_unlock(radix_tree_preloads_lock);
35785 if (!this_cpu_read(ida_bitmap)) {
35786 struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
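
Because __radix_tree_preload() no longer pins the CPU via preempt_disable() on RT, the preload window is bracketed by radix_tree_preloads_lock instead, and radix_tree_preload_end()/idr_preload_end() turn into real exported functions that drop it. The caller-visible pattern is unchanged; a sketch assuming a caller-owned IDR and spinlock (myidr and mylock are placeholders):

	int id;

	idr_preload(GFP_KERNEL);		/* takes radix_tree_preloads_lock */
	spin_lock(&mylock);
	id = idr_alloc(&myidr, ptr, 0, 0, GFP_NOWAIT);
	spin_unlock(&mylock);
	idr_preload_end();			/* drops it again */
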
35787 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
35788 index be7b4dd6b68d..d06c15d3d186 100644
35789 --- a/lib/scatterlist.c
35790 +++ b/lib/scatterlist.c
35791 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
35792 flush_kernel_dcache_page(miter->page);
35794 if (miter->__flags & SG_MITER_ATOMIC) {
35795 - WARN_ON_ONCE(preemptible());
35796 + WARN_ON_ONCE(!pagefault_disabled());
35797 kunmap_atomic(miter->addr);
35799 kunmap(miter->page);
35800 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
35801 index 835cc6df2776..6f4a4ae881c8 100644
35802 --- a/lib/smp_processor_id.c
35803 +++ b/lib/smp_processor_id.c
35804 @@ -23,7 +23,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
35805 * Kernel threads bound to a single CPU can safely use
35806 * smp_processor_id():
35808 - if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu)))
35809 + if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
35813 diff --git a/lib/timerqueue.c b/lib/timerqueue.c
35814 index 4a720ed4fdaf..0d54bcbc8170 100644
35815 --- a/lib/timerqueue.c
35816 +++ b/lib/timerqueue.c
35818 * @head: head of timerqueue
35819 * @node: timer node to be added
35821 - * Adds the timer node to the timerqueue, sorted by the
35822 - * node's expires value.
35823 + * Adds the timer node to the timerqueue, sorted by the node's expires
35824 + * value. Returns true if the newly added timer is the first expiring timer in the queue.
35827 bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
35829 @@ -70,7 +71,8 @@ EXPORT_SYMBOL_GPL(timerqueue_add);
35830 * @head: head of timerqueue
35831 * @node: timer node to be removed
35833 - * Removes the timer node from the timerqueue.
35834 + * Removes the timer node from the timerqueue. Returns true if the queue is
35835 + * not empty after the remove.
35837 bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
35839 diff --git a/localversion-rt b/localversion-rt
35840 new file mode 100644
35841 index 000000000000..8a777ac42aab
35843 +++ b/localversion-rt
35846 diff --git a/mm/Kconfig b/mm/Kconfig
35847 index 59efbd3337e0..3df123c0bc3f 100644
35850 @@ -385,7 +385,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
35852 config TRANSPARENT_HUGEPAGE
35853 bool "Transparent Hugepage Support"
35854 - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
35855 + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
35857 select RADIX_TREE_MULTIORDER
35859 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
35860 index 9386c98dac12..5e9d804c37cb 100644
35861 --- a/mm/backing-dev.c
35862 +++ b/mm/backing-dev.c
35863 @@ -470,9 +470,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
35865 unsigned long flags;
35867 - local_irq_save(flags);
35868 + local_irq_save_nort(flags);
35869 if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
35870 - local_irq_restore(flags);
35871 + local_irq_restore_nort(flags);
35875 diff --git a/mm/compaction.c b/mm/compaction.c
35876 index 85395dc6eb13..d6c8ed009e93 100644
35877 --- a/mm/compaction.c
35878 +++ b/mm/compaction.c
35879 @@ -1634,10 +1634,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
35880 block_start_pfn(cc->migrate_pfn, cc->order);
35882 if (cc->last_migrated_pfn < current_block_start) {
35884 + cpu = get_cpu_light();
35885 + local_lock_irq(swapvec_lock);
35886 lru_add_drain_cpu(cpu);
35887 + local_unlock_irq(swapvec_lock);
35888 drain_local_pages(zone);
35891 /* No more flushing until we migrate again */
35892 cc->last_migrated_pfn = 0;
35894 diff --git a/mm/filemap.c b/mm/filemap.c
35895 index e2e738cc08b1..c47070dae8b9 100644
35898 @@ -110,6 +110,7 @@
35900 * ->tasklist_lock (memory_failure, collect_procs_ao)
35902 +DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
35904 static int page_cache_tree_insert(struct address_space *mapping,
35905 struct page *page, void **shadowp)
35906 @@ -133,8 +134,10 @@ static int page_cache_tree_insert(struct address_space *mapping,
35910 + local_lock(shadow_nodes_lock);
35911 __radix_tree_replace(&mapping->page_tree, node, slot, page,
35912 - workingset_update_node, mapping);
35913 + __workingset_update_node, mapping);
35914 + local_unlock(shadow_nodes_lock);
35915 mapping->nrpages++;
35918 @@ -151,6 +154,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
35919 VM_BUG_ON_PAGE(PageTail(page), page);
35920 VM_BUG_ON_PAGE(nr != 1 && shadow, page);
35922 + local_lock(shadow_nodes_lock);
35923 for (i = 0; i < nr; i++) {
35924 struct radix_tree_node *node;
35926 @@ -162,8 +166,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
35928 radix_tree_clear_tags(&mapping->page_tree, node, slot);
35929 __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
35930 - workingset_update_node, mapping);
35931 + __workingset_update_node, mapping);
35933 + local_unlock(shadow_nodes_lock);
35936 mapping->nrexceptional += nr;
35937 diff --git a/mm/highmem.c b/mm/highmem.c
35938 index 59db3223a5d6..22aa3ddbd87b 100644
35941 @@ -30,10 +30,11 @@
35942 #include <linux/kgdb.h>
35943 #include <asm/tlbflush.h>
35946 +#ifndef CONFIG_PREEMPT_RT_FULL
35947 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
35948 DEFINE_PER_CPU(int, __kmap_atomic_idx);
35953 * Virtual_count is not a pure "count".
35954 @@ -108,8 +109,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
35955 unsigned long totalhigh_pages __read_mostly;
35956 EXPORT_SYMBOL(totalhigh_pages);
35959 +#ifndef CONFIG_PREEMPT_RT_FULL
35960 EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
35963 unsigned int nr_free_highpages (void)
35965 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
35966 index 6a9a7e1066ef..3cc297730103 100644
35967 --- a/mm/memcontrol.c
35968 +++ b/mm/memcontrol.c
35970 #include <net/sock.h>
35971 #include <net/ip.h>
35973 +#include <linux/locallock.h>
35975 #include <linux/uaccess.h>
35977 @@ -94,6 +95,8 @@ int do_swap_account __read_mostly;
35978 #define do_swap_account 0
35981 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
35983 /* Whether legacy memory+swap accounting is active */
35984 static bool do_memsw_account(void)
35986 @@ -1831,7 +1834,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
35987 * as well as workers from this path always operate on the local
35988 * per-cpu data. CPU up doesn't touch memcg_stock at all.
35990 - curcpu = get_cpu();
35991 + curcpu = get_cpu_light();
35992 for_each_online_cpu(cpu) {
35993 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
35994 struct mem_cgroup *memcg;
35995 @@ -1851,7 +1854,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
35997 css_put(&memcg->css);
36001 mutex_unlock(&percpu_charge_mutex);
36004 @@ -4631,12 +4634,12 @@ static int mem_cgroup_move_account(struct page *page,
36008 - local_irq_disable();
36009 + local_lock_irq(event_lock);
36010 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
36011 memcg_check_events(to, page);
36012 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
36013 memcg_check_events(from, page);
36014 - local_irq_enable();
36015 + local_unlock_irq(event_lock);
36019 @@ -5579,10 +5582,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
36021 commit_charge(page, memcg, lrucare);
36023 - local_irq_disable();
36024 + local_lock_irq(event_lock);
36025 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
36026 memcg_check_events(memcg, page);
36027 - local_irq_enable();
36028 + local_unlock_irq(event_lock);
36030 if (do_memsw_account() && PageSwapCache(page)) {
36031 swp_entry_t entry = { .val = page_private(page) };
36032 @@ -5651,7 +5654,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
36033 memcg_oom_recover(ug->memcg);
36036 - local_irq_save(flags);
36037 + local_lock_irqsave(event_lock, flags);
36038 __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
36039 __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
36040 __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
36041 @@ -5659,7 +5662,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
36042 __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
36043 __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
36044 memcg_check_events(ug->memcg, ug->dummy_page);
36045 - local_irq_restore(flags);
36046 + local_unlock_irqrestore(event_lock, flags);
36048 if (!mem_cgroup_is_root(ug->memcg))
36049 css_put_many(&ug->memcg->css, nr_pages);
36050 @@ -5822,10 +5825,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
36052 commit_charge(newpage, memcg, false);
36054 - local_irq_save(flags);
36055 + local_lock_irqsave(event_lock, flags);
36056 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
36057 memcg_check_events(memcg, newpage);
36058 - local_irq_restore(flags);
36059 + local_unlock_irqrestore(event_lock, flags);
36062 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
36063 @@ -6017,6 +6020,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
36064 struct mem_cgroup *memcg, *swap_memcg;
36065 unsigned int nr_entries;
36066 unsigned short oldid;
36067 + unsigned long flags;
36069 VM_BUG_ON_PAGE(PageLRU(page), page);
36070 VM_BUG_ON_PAGE(page_count(page), page);
36071 @@ -6062,13 +6066,17 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
36072 * important here to have the interrupts disabled because it is the
36073 * only synchronisation we have for updating the per-CPU variables.
36075 + local_lock_irqsave(event_lock, flags);
36076 +#ifndef CONFIG_PREEMPT_RT_BASE
36077 VM_BUG_ON(!irqs_disabled());
36079 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
36081 memcg_check_events(memcg, page);
36083 if (!mem_cgroup_is_root(memcg))
36084 css_put_many(&memcg->css, nr_entries);
36085 + local_unlock_irqrestore(event_lock, flags);
36089 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
36090 index 3e612ae748e9..d0ccc070979f 100644
36091 --- a/mm/mmu_context.c
36092 +++ b/mm/mmu_context.c
36093 @@ -25,6 +25,7 @@ void use_mm(struct mm_struct *mm)
36094 struct task_struct *tsk = current;
36097 + preempt_disable_rt();
36098 active_mm = tsk->active_mm;
36099 if (active_mm != mm) {
36101 @@ -32,6 +33,7 @@ void use_mm(struct mm_struct *mm)
36104 switch_mm(active_mm, mm, tsk);
36105 + preempt_enable_rt();
36107 #ifdef finish_arch_post_lock_switch
36108 finish_arch_post_lock_switch();
36109 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
36110 index a604b5da6755..525a6f2d5144 100644
36111 --- a/mm/page_alloc.c
36112 +++ b/mm/page_alloc.c
36114 #include <linux/hugetlb.h>
36115 #include <linux/sched/rt.h>
36116 #include <linux/sched/mm.h>
36117 +#include <linux/locallock.h>
36118 #include <linux/page_owner.h>
36119 #include <linux/kthread.h>
36120 #include <linux/memcontrol.h>
36121 @@ -286,6 +287,18 @@ EXPORT_SYMBOL(nr_node_ids);
36122 EXPORT_SYMBOL(nr_online_nodes);
36125 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
36127 +#ifdef CONFIG_PREEMPT_RT_BASE
36128 +# define cpu_lock_irqsave(cpu, flags) \
36129 + local_lock_irqsave_on(pa_lock, flags, cpu)
36130 +# define cpu_unlock_irqrestore(cpu, flags) \
36131 + local_unlock_irqrestore_on(pa_lock, flags, cpu)
36133 +# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
36134 +# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
36137 int page_group_by_mobility_disabled __read_mostly;
36139 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
36140 @@ -1094,7 +1107,7 @@ static bool bulkfree_pcp_prepare(struct page *page)
36141 #endif /* CONFIG_DEBUG_VM */
36144 - * Frees a number of pages from the PCP lists
36145 + * Frees a number of pages which have been collected from the pcp lists.
36146 * Assumes all pages on list are in same zone, and of same order.
36147 * count is the number of pages to free.
36149 @@ -1105,15 +1118,53 @@ static bool bulkfree_pcp_prepare(struct page *page)
36150 * pinned" detection logic.
36152 static void free_pcppages_bulk(struct zone *zone, int count,
36153 - struct per_cpu_pages *pcp)
36154 + struct list_head *list)
36156 - int migratetype = 0;
36157 - int batch_free = 0;
36158 bool isolated_pageblocks;
36159 + unsigned long flags;
36161 - spin_lock(&zone->lock);
36162 + spin_lock_irqsave(&zone->lock, flags);
36163 isolated_pageblocks = has_isolate_pageblock(zone);
36165 + while (!list_empty(list)) {
36166 + struct page *page;
36167 + int mt; /* migratetype of the to-be-freed page */
36169 + page = list_first_entry(list, struct page, lru);
36170 + /* must delete as __free_one_page list manipulates */
36171 + list_del(&page->lru);
36173 + mt = get_pcppage_migratetype(page);
36174 + /* MIGRATE_ISOLATE page should not go to pcplists */
36175 + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
36176 + /* Pageblock could have been isolated meanwhile */
36177 + if (unlikely(isolated_pageblocks))
36178 + mt = get_pageblock_migratetype(page);
36180 + if (bulkfree_pcp_prepare(page))
36183 + __free_one_page(page, page_to_pfn(page), zone, 0, mt);
36184 + trace_mm_page_pcpu_drain(page, 0, mt);
36187 + WARN_ON(count != 0);
36188 + spin_unlock_irqrestore(&zone->lock, flags);
36192 + * Moves a number of pages from the PCP lists to free list which
36193 + * is freed outside of the locked region.
36195 + * Assumes all pages on list are in same zone, and of same order.
36196 + * count is the number of pages to free.
36198 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
36199 + struct list_head *dst)
36201 + int migratetype = 0;
36202 + int batch_free = 0;
36206 struct list_head *list;
36207 @@ -1129,7 +1180,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
36209 if (++migratetype == MIGRATE_PCPTYPES)
36211 - list = &pcp->lists[migratetype];
36212 + list = &src->lists[migratetype];
36213 } while (list_empty(list));
36215 /* This is the only non-empty list. Free them all. */
36216 @@ -1137,27 +1188,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
36217 batch_free = count;
36220 - int mt; /* migratetype of the to-be-freed page */
36222 page = list_last_entry(list, struct page, lru);
36223 - /* must delete as __free_one_page list manipulates */
36224 list_del(&page->lru);
36226 - mt = get_pcppage_migratetype(page);
36227 - /* MIGRATE_ISOLATE page should not go to pcplists */
36228 - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
36229 - /* Pageblock could have been isolated meanwhile */
36230 - if (unlikely(isolated_pageblocks))
36231 - mt = get_pageblock_migratetype(page);
36233 - if (bulkfree_pcp_prepare(page))
36236 - __free_one_page(page, page_to_pfn(page), zone, 0, mt);
36237 - trace_mm_page_pcpu_drain(page, 0, mt);
36238 + list_add(&page->lru, dst);
36239 } while (--count && --batch_free && !list_empty(list));
36241 - spin_unlock(&zone->lock);
36244 static void free_one_page(struct zone *zone,
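
The split above separates unlinking pages from the pcp lists (isolate_pcp_pages(), under pa_lock) from handing them back to the buddy (free_pcppages_bulk(), which now takes zone->lock with irqsave itself). Every caller follows the same shape, as in the drain_zone_pages() hunk further down:

	LIST_HEAD(dst);
	unsigned long flags;

	local_lock_irqsave(pa_lock, flags);
	isolate_pcp_pages(to_drain, pcp, &dst);	/* only list moves, cheap */
	pcp->count -= to_drain;
	local_unlock_irqrestore(pa_lock, flags);
	free_pcppages_bulk(zone, to_drain, &dst);	/* zone->lock taken inside */
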
36245 @@ -1165,13 +1201,15 @@ static void free_one_page(struct zone *zone,
36246 unsigned int order,
36249 - spin_lock(&zone->lock);
36250 + unsigned long flags;
36252 + spin_lock_irqsave(&zone->lock, flags);
36253 if (unlikely(has_isolate_pageblock(zone) ||
36254 is_migrate_isolate(migratetype))) {
36255 migratetype = get_pfnblock_migratetype(page, pfn);
36257 __free_one_page(page, pfn, zone, order, migratetype);
36258 - spin_unlock(&zone->lock);
36259 + spin_unlock_irqrestore(&zone->lock, flags);
36262 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
36263 @@ -1257,10 +1295,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
36266 migratetype = get_pfnblock_migratetype(page, pfn);
36267 - local_irq_save(flags);
36268 + local_lock_irqsave(pa_lock, flags);
36269 __count_vm_events(PGFREE, 1 << order);
36270 free_one_page(page_zone(page), page, pfn, order, migratetype);
36271 - local_irq_restore(flags);
36272 + local_unlock_irqrestore(pa_lock, flags);
36275 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
36276 @@ -2378,16 +2416,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
36277 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
36279 unsigned long flags;
36281 int to_drain, batch;
36283 - local_irq_save(flags);
36284 + local_lock_irqsave(pa_lock, flags);
36285 batch = READ_ONCE(pcp->batch);
36286 to_drain = min(pcp->count, batch);
36287 if (to_drain > 0) {
36288 - free_pcppages_bulk(zone, to_drain, pcp);
36289 + isolate_pcp_pages(to_drain, pcp, &dst);
36290 pcp->count -= to_drain;
36292 - local_irq_restore(flags);
36293 + local_unlock_irqrestore(pa_lock, flags);
36294 + free_pcppages_bulk(zone, to_drain, &dst);
36298 @@ -2403,16 +2443,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
36299 unsigned long flags;
36300 struct per_cpu_pageset *pset;
36301 struct per_cpu_pages *pcp;
36305 - local_irq_save(flags);
36306 + cpu_lock_irqsave(cpu, flags);
36307 pset = per_cpu_ptr(zone->pageset, cpu);
36310 - if (pcp->count) {
36311 - free_pcppages_bulk(zone, pcp->count, pcp);
36312 + count = pcp->count;
36314 + isolate_pcp_pages(count, pcp, &dst);
36317 - local_irq_restore(flags);
36318 + cpu_unlock_irqrestore(cpu, flags);
36320 + free_pcppages_bulk(zone, count, &dst);
36324 @@ -2447,6 +2492,7 @@ void drain_local_pages(struct zone *zone)
36328 +#ifndef CONFIG_PREEMPT_RT_BASE
36329 static void drain_local_pages_wq(struct work_struct *work)
36332 @@ -2460,6 +2506,7 @@ static void drain_local_pages_wq(struct work_struct *work)
36333 drain_local_pages(NULL);
36339 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
36340 @@ -2526,7 +2573,14 @@ void drain_all_pages(struct zone *zone)
36342 cpumask_clear_cpu(cpu, &cpus_with_pcps);
36345 +#ifdef CONFIG_PREEMPT_RT_BASE
36346 + for_each_cpu(cpu, &cpus_with_pcps) {
36348 + drain_pages_zone(cpu, zone);
36350 + drain_pages(cpu);
36353 for_each_cpu(cpu, &cpus_with_pcps) {
36354 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
36355 INIT_WORK(work, drain_local_pages_wq);
36356 @@ -2534,6 +2588,7 @@ void drain_all_pages(struct zone *zone)
36358 for_each_cpu(cpu, &cpus_with_pcps)
36359 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
36362 mutex_unlock(&pcpu_drain_mutex);
36364 @@ -2610,7 +2665,7 @@ void free_hot_cold_page(struct page *page, bool cold)
36366 migratetype = get_pfnblock_migratetype(page, pfn);
36367 set_pcppage_migratetype(page, migratetype);
36368 - local_irq_save(flags);
36369 + local_lock_irqsave(pa_lock, flags);
36370 __count_vm_event(PGFREE);
36373 @@ -2636,12 +2691,17 @@ void free_hot_cold_page(struct page *page, bool cold)
36375 if (pcp->count >= pcp->high) {
36376 unsigned long batch = READ_ONCE(pcp->batch);
36377 - free_pcppages_bulk(zone, batch, pcp);
36380 + isolate_pcp_pages(batch, pcp, &dst);
36381 pcp->count -= batch;
36382 + local_unlock_irqrestore(pa_lock, flags);
36383 + free_pcppages_bulk(zone, batch, &dst);
36388 - local_irq_restore(flags);
36389 + local_unlock_irqrestore(pa_lock, flags);
36393 @@ -2789,7 +2849,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
36395 unsigned long flags;
36397 - local_irq_save(flags);
36398 + local_lock_irqsave(pa_lock, flags);
36399 pcp = &this_cpu_ptr(zone->pageset)->pcp;
36400 list = &pcp->lists[migratetype];
36401 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
36402 @@ -2797,7 +2857,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
36403 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
36404 zone_statistics(preferred_zone, zone);
36406 - local_irq_restore(flags);
36407 + local_unlock_irqrestore(pa_lock, flags);
36411 @@ -2824,7 +2884,7 @@ struct page *rmqueue(struct zone *preferred_zone,
36412 * allocate greater than order-1 page units with __GFP_NOFAIL.
36414 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
36415 - spin_lock_irqsave(&zone->lock, flags);
36416 + local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
36420 @@ -2844,14 +2904,14 @@ struct page *rmqueue(struct zone *preferred_zone,
36422 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
36423 zone_statistics(preferred_zone, zone);
36424 - local_irq_restore(flags);
36425 + local_unlock_irqrestore(pa_lock, flags);
36428 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
36432 - local_irq_restore(flags);
36433 + local_unlock_irqrestore(pa_lock, flags);
36437 @@ -6785,8 +6845,9 @@ void __init free_area_init(unsigned long *zones_size)
36439 static int page_alloc_cpu_dead(unsigned int cpu)
36442 + local_lock_irq_on(swapvec_lock, cpu);
36443 lru_add_drain_cpu(cpu);
36444 + local_unlock_irq_on(swapvec_lock, cpu);
36448 @@ -7690,7 +7751,7 @@ void zone_pcp_reset(struct zone *zone)
36449 struct per_cpu_pageset *pset;
36451 /* avoid races with drain_pages() */
36452 - local_irq_save(flags);
36453 + local_lock_irqsave(pa_lock, flags);
36454 if (zone->pageset != &boot_pageset) {
36455 for_each_online_cpu(cpu) {
36456 pset = per_cpu_ptr(zone->pageset, cpu);
36457 @@ -7699,7 +7760,7 @@ void zone_pcp_reset(struct zone *zone)
36458 free_percpu(zone->pageset);
36459 zone->pageset = &boot_pageset;
36461 - local_irq_restore(flags);
36462 + local_unlock_irqrestore(pa_lock, flags);
36465 #ifdef CONFIG_MEMORY_HOTREMOVE
36466 diff --git a/mm/slab.h b/mm/slab.h
36467 index 485d9fbb8802..f3b06c48bf39 100644
36470 @@ -451,7 +451,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
36471 * The slab lists for all objects.
36473 struct kmem_cache_node {
36474 +#ifdef CONFIG_SLUB
36475 + raw_spinlock_t list_lock;
36477 spinlock_t list_lock;
36481 struct list_head slabs_partial; /* partial list first, better asm code */
36482 diff --git a/mm/slub.c b/mm/slub.c
36483 index 220d42e592ef..9b337c28dd1f 100644
36486 @@ -1179,7 +1179,7 @@ static noinline int free_debug_processing(
36487 unsigned long uninitialized_var(flags);
36490 - spin_lock_irqsave(&n->list_lock, flags);
36491 + raw_spin_lock_irqsave(&n->list_lock, flags);
36494 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
36495 @@ -1214,7 +1214,7 @@ static noinline int free_debug_processing(
36499 - spin_unlock_irqrestore(&n->list_lock, flags);
36500 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
36502 slab_fix(s, "Object at 0x%p not freed", object);
36504 @@ -1342,6 +1342,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
36506 #endif /* CONFIG_SLUB_DEBUG */
36508 +struct slub_free_list {
36509 + raw_spinlock_t lock;
36510 + struct list_head list;
36512 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
36515 * Hooks for other subsystems that check memory allocations. In a typical
36516 * production configuration these hooks all should produce no code at all.
36517 @@ -1561,10 +1567,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
36521 + bool enableirqs = false;
36523 flags &= gfp_allowed_mask;
36525 if (gfpflags_allow_blocking(flags))
36526 + enableirqs = true;
36527 +#ifdef CONFIG_PREEMPT_RT_FULL
36528 + if (system_state > SYSTEM_BOOTING)
36529 + enableirqs = true;
36532 local_irq_enable();
36534 flags |= s->allocflags;
36535 @@ -1623,7 +1636,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
36539 - if (gfpflags_allow_blocking(flags))
36541 local_irq_disable();
36544 @@ -1681,6 +1694,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
36545 __free_pages(page, order);
36548 +static void free_delayed(struct list_head *h)
36550 + while(!list_empty(h)) {
36551 + struct page *page = list_first_entry(h, struct page, lru);
36553 + list_del(&page->lru);
36554 + __free_slab(page->slab_cache, page);
36558 #define need_reserve_slab_rcu \
36559 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
36561 @@ -1712,6 +1735,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
36564 call_rcu(head, rcu_free_slab);
36565 + } else if (irqs_disabled()) {
36566 + struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
36568 + raw_spin_lock(&f->lock);
36569 + list_add(&page->lru, &f->list);
36570 + raw_spin_unlock(&f->lock);
36572 __free_slab(s, page);
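
Two RT adaptations are visible here: kmem_cache_node->list_lock becomes a raw_spinlock_t, since it nests in contexts that must not sleep even on RT, and free_slab() must not call into the page allocator while interrupts are hard-disabled, so such slabs are parked on the per-CPU slub_free_list and released later through free_delayed(). The flush idiom the later hunks use (sketch):

	struct slub_free_list *f;
	LIST_HEAD(tofree);

	f = this_cpu_ptr(&slub_free_list);
	raw_spin_lock_irq(&f->lock);
	list_splice_init(&f->list, &tofree);
	raw_spin_unlock_irq(&f->lock);
	free_delayed(&tofree);			/* __free_slab() with irqs on */
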
36574 @@ -1819,7 +1848,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
36575 if (!n || !n->nr_partial)
36578 - spin_lock(&n->list_lock);
36579 + raw_spin_lock(&n->list_lock);
36580 list_for_each_entry_safe(page, page2, &n->partial, lru) {
36583 @@ -1844,7 +1873,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
36587 - spin_unlock(&n->list_lock);
36588 + raw_spin_unlock(&n->list_lock);
36592 @@ -2090,7 +2119,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
36593 * that acquire_slab() will see a slab page that
36596 - spin_lock(&n->list_lock);
36597 + raw_spin_lock(&n->list_lock);
36601 @@ -2101,7 +2130,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
36602 * slabs from diagnostic functions will not see
36603 * any frozen slabs.
36605 - spin_lock(&n->list_lock);
36606 + raw_spin_lock(&n->list_lock);
36610 @@ -2136,7 +2165,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
36614 - spin_unlock(&n->list_lock);
36615 + raw_spin_unlock(&n->list_lock);
36618 stat(s, DEACTIVATE_EMPTY);
36619 @@ -2171,10 +2200,10 @@ static void unfreeze_partials(struct kmem_cache *s,
36620 n2 = get_node(s, page_to_nid(page));
36623 - spin_unlock(&n->list_lock);
36624 + raw_spin_unlock(&n->list_lock);
36627 - spin_lock(&n->list_lock);
36628 + raw_spin_lock(&n->list_lock);
36632 @@ -2203,7 +2232,7 @@ static void unfreeze_partials(struct kmem_cache *s,
36636 - spin_unlock(&n->list_lock);
36637 + raw_spin_unlock(&n->list_lock);
36639 while (discard_page) {
36640 page = discard_page;
36641 @@ -2242,14 +2271,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
36642 pobjects = oldpage->pobjects;
36643 pages = oldpage->pages;
36644 if (drain && pobjects > s->cpu_partial) {
36645 + struct slub_free_list *f;
36646 unsigned long flags;
36647 + LIST_HEAD(tofree);
36649 * partial array is full. Move the existing
36650 * set to the per node partial list.
36652 local_irq_save(flags);
36653 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
36654 + f = this_cpu_ptr(&slub_free_list);
36655 + raw_spin_lock(&f->lock);
36656 + list_splice_init(&f->list, &tofree);
36657 + raw_spin_unlock(&f->lock);
36658 local_irq_restore(flags);
36659 + free_delayed(&tofree);
36663 @@ -2319,7 +2355,22 @@ static bool has_cpu_slab(int cpu, void *info)
36665 static void flush_all(struct kmem_cache *s)
36667 + LIST_HEAD(tofree);
36668 + int cpu;
36670 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
36671 + for_each_online_cpu(cpu) {
36672 + struct slub_free_list *f;
36674 + if (!has_cpu_slab(cpu, s))
36675 + continue;
36677 + f = &per_cpu(slub_free_list, cpu);
36678 + raw_spin_lock_irq(&f->lock);
36679 + list_splice_init(&f->list, &tofree);
36680 + raw_spin_unlock_irq(&f->lock);
36681 + free_delayed(&tofree);
36682 + }
36686 @@ -2374,10 +2425,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
36687 unsigned long x = 0;
36690 - spin_lock_irqsave(&n->list_lock, flags);
36691 + raw_spin_lock_irqsave(&n->list_lock, flags);
36692 list_for_each_entry(page, &n->partial, lru)
36693 x += get_count(page);
36694 - spin_unlock_irqrestore(&n->list_lock, flags);
36695 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
36698 #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
36699 @@ -2515,8 +2566,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
36700 * already disabled (which is the case for bulk allocation).
36702 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36703 - unsigned long addr, struct kmem_cache_cpu *c)
36704 + unsigned long addr, struct kmem_cache_cpu *c,
36705 + struct list_head *to_free)
36707 + struct slub_free_list *f;
36711 @@ -2572,6 +2625,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36712 VM_BUG_ON(!c->page->frozen);
36713 c->freelist = get_freepointer(s, freelist);
36714 c->tid = next_tid(c->tid);
36716 +out:
36717 + f = this_cpu_ptr(&slub_free_list);
36718 + raw_spin_lock(&f->lock);
36719 + list_splice_init(&f->list, to_free);
36720 + raw_spin_unlock(&f->lock);
36725 @@ -2587,7 +2647,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36727 if (unlikely(!freelist)) {
36728 slab_out_of_memory(s, gfpflags, node);
36734 @@ -2600,7 +2660,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36735 goto new_slab; /* Slab failed checks. Next slab needed */
36737 deactivate_slab(s, page, get_freepointer(s, freelist), c);
36743 @@ -2612,6 +2672,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36746 unsigned long flags;
36747 + LIST_HEAD(tofree);
36749 local_irq_save(flags);
36750 #ifdef CONFIG_PREEMPT
36751 @@ -2623,8 +2684,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36752 c = this_cpu_ptr(s->cpu_slab);
36755 - p = ___slab_alloc(s, gfpflags, node, addr, c);
36756 + p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
36757 local_irq_restore(flags);
36758 + free_delayed(&tofree);
36762 @@ -2810,7 +2872,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
36766 - spin_unlock_irqrestore(&n->list_lock, flags);
36767 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
36770 prior = page->freelist;
36771 @@ -2842,7 +2904,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
36772 * Otherwise the list_lock will synchronize with
36773 * other processors updating the list of slabs.
36775 - spin_lock_irqsave(&n->list_lock, flags);
36776 + raw_spin_lock_irqsave(&n->list_lock, flags);
36780 @@ -2884,7 +2946,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
36781 add_partial(n, page, DEACTIVATE_TO_TAIL);
36782 stat(s, FREE_ADD_PARTIAL);
36784 - spin_unlock_irqrestore(&n->list_lock, flags);
36785 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
36789 @@ -2899,7 +2961,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
36790 remove_full(s, n, page);
36793 - spin_unlock_irqrestore(&n->list_lock, flags);
36794 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
36795 stat(s, FREE_SLAB);
36796 discard_slab(s, page);
36798 @@ -3104,6 +3166,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
36801 struct kmem_cache_cpu *c;
36802 + LIST_HEAD(to_free);
36805 /* memcg and kmem_cache debug support */
36806 @@ -3127,7 +3190,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
36807 * of re-populating per CPU c->freelist
36809 p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
36811 + _RET_IP_, c, &to_free);
36812 if (unlikely(!p[i]))
36815 @@ -3139,6 +3202,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
36817 c->tid = next_tid(c->tid);
36818 local_irq_enable();
36819 + free_delayed(&to_free);
36821 /* Clear memory outside IRQ disabled fastpath loop */
36822 if (unlikely(flags & __GFP_ZERO)) {
36823 @@ -3153,6 +3217,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
36826 local_irq_enable();
36827 + free_delayed(&to_free);
36828 slab_post_alloc_hook(s, flags, i, p);
36829 __kmem_cache_free_bulk(s, i, p);
36831 @@ -3286,7 +3351,7 @@ static void
36832 init_kmem_cache_node(struct kmem_cache_node *n)
36835 - spin_lock_init(&n->list_lock);
36836 + raw_spin_lock_init(&n->list_lock);
36837 INIT_LIST_HEAD(&n->partial);
36838 #ifdef CONFIG_SLUB_DEBUG
36839 atomic_long_set(&n->nr_slabs, 0);
36840 @@ -3640,6 +3705,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
36843 #ifdef CONFIG_SLUB_DEBUG
36844 +#ifdef CONFIG_PREEMPT_RT_BASE
36845 + /* XXX move out of irq-off section */
36846 + slab_err(s, page, text, s->name);
36848 void *addr = page_address(page);
36850 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
36851 @@ -3660,6 +3729,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
36859 @@ -3673,7 +3743,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
36860 struct page *page, *h;
36862 BUG_ON(irqs_disabled());
36863 - spin_lock_irq(&n->list_lock);
36864 + raw_spin_lock_irq(&n->list_lock);
36865 list_for_each_entry_safe(page, h, &n->partial, lru) {
36866 if (!page->inuse) {
36867 remove_partial(n, page);
36868 @@ -3683,7 +3753,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
36869 "Objects remaining in %s on __kmem_cache_shutdown()");
36872 - spin_unlock_irq(&n->list_lock);
36873 + raw_spin_unlock_irq(&n->list_lock);
36875 list_for_each_entry_safe(page, h, &discard, lru)
36876 discard_slab(s, page);
36877 @@ -3927,7 +3997,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
36878 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
36879 INIT_LIST_HEAD(promote + i);
36881 - spin_lock_irqsave(&n->list_lock, flags);
36882 + raw_spin_lock_irqsave(&n->list_lock, flags);
36885 * Build lists of slabs to discard or promote.
36886 @@ -3958,7 +4028,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
36887 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
36888 list_splice(promote + i, &n->partial);
36890 - spin_unlock_irqrestore(&n->list_lock, flags);
36891 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
36893 /* Release empty slabs */
36894 list_for_each_entry_safe(page, t, &discard, lru)
36895 @@ -4171,6 +4241,12 @@ void __init kmem_cache_init(void)
36897 static __initdata struct kmem_cache boot_kmem_cache,
36898 boot_kmem_cache_node;
36901 + for_each_possible_cpu(cpu) {
36902 + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
36903 + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
36906 if (debug_guardpage_minorder())
36907 slub_max_order = 0;
36908 @@ -4379,7 +4455,7 @@ static int validate_slab_node(struct kmem_cache *s,
36910 unsigned long flags;
36912 - spin_lock_irqsave(&n->list_lock, flags);
36913 + raw_spin_lock_irqsave(&n->list_lock, flags);
36915 list_for_each_entry(page, &n->partial, lru) {
36916 validate_slab_slab(s, page, map);
36917 @@ -4401,7 +4477,7 @@ static int validate_slab_node(struct kmem_cache *s,
36918 s->name, count, atomic_long_read(&n->nr_slabs));
36921 - spin_unlock_irqrestore(&n->list_lock, flags);
36922 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
36926 @@ -4589,12 +4665,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
36927 if (!atomic_long_read(&n->nr_slabs))
36930 - spin_lock_irqsave(&n->list_lock, flags);
36931 + raw_spin_lock_irqsave(&n->list_lock, flags);
36932 list_for_each_entry(page, &n->partial, lru)
36933 process_slab(&t, s, page, alloc, map);
36934 list_for_each_entry(page, &n->full, lru)
36935 process_slab(&t, s, page, alloc, map);
36936 - spin_unlock_irqrestore(&n->list_lock, flags);
36937 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
36940 for (i = 0; i < t.count; i++) {
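Taken together, the slub.c hunks make two changes: kmem_cache_node.list_lock becomes a raw_spinlock_t so it can still be taken with interrupts off on RT, and slab pages that become free while interrupts are disabled are parked on a per-CPU slub_free_list, to be released by free_delayed() once the caller is back in a context that may sleep. A compact userspace model of that defer-then-drain pattern, assuming only POSIX threads and libc (names loosely mirror the patch):

    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node {
            struct node *next;
    };

    struct free_list {
            pthread_mutex_t lock;   /* raw_spin_lock(&f->lock) in the patch */
            struct node *head;
    };

    static struct free_list slub_free_list = { PTHREAD_MUTEX_INITIALIZER, NULL };

    /* Called where free() itself would be unsafe: just queue the object. */
    static void defer_free(struct node *n)
    {
            pthread_mutex_lock(&slub_free_list.lock);
            n->next = slub_free_list.head;
            slub_free_list.head = n;
            pthread_mutex_unlock(&slub_free_list.lock);
    }

    /* Called later from a safe context: splice under the lock, free outside. */
    static void free_delayed(void)
    {
            struct node *h;

            pthread_mutex_lock(&slub_free_list.lock);
            h = slub_free_list.head;
            slub_free_list.head = NULL;
            pthread_mutex_unlock(&slub_free_list.lock);

            while (h) {
                    struct node *next = h->next;
                    free(h);
                    h = next;
            }
    }

    int main(void)
    {
            for (int i = 0; i < 2; i++) {
                    struct node *n = malloc(sizeof(*n));
                    assert(n && "allocation failed");
                    defer_free(n);
            }
            free_delayed();
            puts("deferred frees drained");
            return 0;
    }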
36941 diff --git a/mm/swap.c b/mm/swap.c
36942 index a77d68f2c1b6..30d62efe001b 100644
36946 #include <linux/memcontrol.h>
36947 #include <linux/gfp.h>
36948 #include <linux/uio.h>
36949 +#include <linux/locallock.h>
36950 #include <linux/hugetlb.h>
36951 #include <linux/page_idle.h>
36953 @@ -50,6 +51,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
36955 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
36957 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
36958 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
36961 * This path almost never happens for VM activity - pages are normally
36962 @@ -252,11 +255,11 @@ void rotate_reclaimable_page(struct page *page)
36963 unsigned long flags;
36966 - local_irq_save(flags);
36967 + local_lock_irqsave(rotate_lock, flags);
36968 pvec = this_cpu_ptr(&lru_rotate_pvecs);
36969 if (!pagevec_add(pvec, page) || PageCompound(page))
36970 pagevec_move_tail(pvec);
36971 - local_irq_restore(flags);
36972 + local_unlock_irqrestore(rotate_lock, flags);
36976 @@ -306,12 +309,13 @@ void activate_page(struct page *page)
36978 page = compound_head(page);
36979 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
36980 - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
36981 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
36982 + activate_page_pvecs);
36985 if (!pagevec_add(pvec, page) || PageCompound(page))
36986 pagevec_lru_move_fn(pvec, __activate_page, NULL);
36987 - put_cpu_var(activate_page_pvecs);
36988 + put_locked_var(swapvec_lock, activate_page_pvecs);
36992 @@ -338,7 +342,7 @@ void activate_page(struct page *page)
36994 static void __lru_cache_activate_page(struct page *page)
36996 - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
36997 + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
37001 @@ -360,7 +364,7 @@ static void __lru_cache_activate_page(struct page *page)
37005 - put_cpu_var(lru_add_pvec);
37006 + put_locked_var(swapvec_lock, lru_add_pvec);
37010 @@ -402,12 +406,12 @@ EXPORT_SYMBOL(mark_page_accessed);
37012 static void __lru_cache_add(struct page *page)
37014 - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
37015 + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
37018 if (!pagevec_add(pvec, page) || PageCompound(page))
37019 __pagevec_lru_add(pvec);
37020 - put_cpu_var(lru_add_pvec);
37021 + put_locked_var(swapvec_lock, lru_add_pvec);
37025 @@ -613,9 +617,15 @@ void lru_add_drain_cpu(int cpu)
37026 unsigned long flags;
37028 /* No harm done if a racing interrupt already did this */
37029 - local_irq_save(flags);
37030 +#ifdef CONFIG_PREEMPT_RT_BASE
37031 + local_lock_irqsave_on(rotate_lock, flags, cpu);
37032 pagevec_move_tail(pvec);
37033 - local_irq_restore(flags);
37034 + local_unlock_irqrestore_on(rotate_lock, flags, cpu);
37036 + local_lock_irqsave(rotate_lock, flags);
37037 + pagevec_move_tail(pvec);
37038 + local_unlock_irqrestore(rotate_lock, flags);
37042 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
37043 @@ -647,11 +657,12 @@ void deactivate_file_page(struct page *page)
37046 if (likely(get_page_unless_zero(page))) {
37047 - struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
37048 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
37049 + lru_deactivate_file_pvecs);
37051 if (!pagevec_add(pvec, page) || PageCompound(page))
37052 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
37053 - put_cpu_var(lru_deactivate_file_pvecs);
37054 + put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
37058 @@ -666,21 +677,32 @@ void mark_page_lazyfree(struct page *page)
37060 if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
37061 !PageSwapCache(page) && !PageUnevictable(page)) {
37062 - struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
37063 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
37064 + lru_lazyfree_pvecs);
37067 if (!pagevec_add(pvec, page) || PageCompound(page))
37068 pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
37069 - put_cpu_var(lru_lazyfree_pvecs);
37070 + put_locked_var(swapvec_lock, lru_lazyfree_pvecs);
37074 void lru_add_drain(void)
37076 - lru_add_drain_cpu(get_cpu());
37078 + lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
37079 + local_unlock_cpu(swapvec_lock);
37082 +#ifdef CONFIG_PREEMPT_RT_BASE
37083 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
37085 + local_lock_on(swapvec_lock, cpu);
37086 + lru_add_drain_cpu(cpu);
37087 + local_unlock_on(swapvec_lock, cpu);
37092 static void lru_add_drain_per_cpu(struct work_struct *dummy)
37095 @@ -688,6 +710,16 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
37097 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
37099 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
37101 + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
37103 + INIT_WORK(work, lru_add_drain_per_cpu);
37104 + queue_work_on(cpu, mm_percpu_wq, work);
37105 + cpumask_set_cpu(cpu, has_work);
37109 void lru_add_drain_all_cpuslocked(void)
37111 static DEFINE_MUTEX(lock);
37112 @@ -705,21 +737,19 @@ void lru_add_drain_all_cpuslocked(void)
37113 cpumask_clear(&has_work);
37115 for_each_online_cpu(cpu) {
37116 - struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
37118 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
37119 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
37120 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
37121 pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
37122 - need_activate_page_drain(cpu)) {
37123 - INIT_WORK(work, lru_add_drain_per_cpu);
37124 - queue_work_on(cpu, mm_percpu_wq, work);
37125 - cpumask_set_cpu(cpu, &has_work);
37127 + need_activate_page_drain(cpu))
37128 + remote_lru_add_drain(cpu, &has_work);
37131 +#ifndef CONFIG_PREEMPT_RT_BASE
37132 for_each_cpu(cpu, &has_work)
37133 flush_work(&per_cpu(lru_add_drain_work, cpu));
37136 mutex_unlock(&lock);
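In swap.c every get_cpu_var()/put_cpu_var() pair around the pagevecs becomes get_locked_var()/put_locked_var() on swapvec_lock. Non-RT behavior is unchanged; the point is that on RT the per-CPU access is protected by a lock instead of by implicit preemption-disable. Roughly, the RT series' locallock.h defines the pair like this (abbreviated sketch, not the literal header):

    #define get_locked_var(lvar, var)               \
            (*({                                    \
                    local_lock(lvar);               \
                    this_cpu_ptr(&var);             \
            }))

    #define put_locked_var(lvar, var)  local_unlock(lvar)

The separate rotate_lock for lru_rotate_pvecs exists presumably because rotate_reclaimable_page() can be reached from interrupt context, so it needs the irqsave lock variants while the remaining pagevec users share swapvec_lock.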
37138 diff --git a/mm/truncate.c b/mm/truncate.c
37139 index 2330223841fb..d0c8e6c8fef5 100644
37140 --- a/mm/truncate.c
37141 +++ b/mm/truncate.c
37142 @@ -41,8 +41,10 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
37144 if (*slot != entry)
37146 + local_lock(shadow_nodes_lock);
37147 __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
37148 - workingset_update_node, mapping);
37149 + __workingset_update_node, mapping);
37150 + local_unlock(shadow_nodes_lock);
37151 mapping->nrexceptional--;
37153 spin_unlock_irq(&mapping->tree_lock);
37154 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
37155 index 9ff21a12ea00..95c83b291548 100644
37158 @@ -865,7 +865,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
37159 struct vmap_block *vb;
37160 struct vmap_area *va;
37161 unsigned long vb_idx;
37163 + int node, err, cpu;
37166 node = numa_node_id();
37167 @@ -908,11 +908,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
37169 radix_tree_preload_end();
37171 - vbq = &get_cpu_var(vmap_block_queue);
37172 + cpu = get_cpu_light();
37173 + vbq = this_cpu_ptr(&vmap_block_queue);
37174 spin_lock(&vbq->lock);
37175 list_add_tail_rcu(&vb->free_list, &vbq->free);
37176 spin_unlock(&vbq->lock);
37177 - put_cpu_var(vmap_block_queue);
37182 @@ -981,6 +982,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
37183 struct vmap_block *vb;
37184 void *vaddr = NULL;
37185 unsigned int order;
37188 BUG_ON(offset_in_page(size));
37189 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
37190 @@ -995,7 +997,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
37191 order = get_order(size);
37194 - vbq = &get_cpu_var(vmap_block_queue);
37195 + cpu = get_cpu_light();
37196 + vbq = this_cpu_ptr(&vmap_block_queue);
37197 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
37198 unsigned long pages_off;
37200 @@ -1018,7 +1021,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
37204 - put_cpu_var(vmap_block_queue);
37208 /* Allocate new block if nothing was found */
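vmalloc.c's per-CPU vmap block queue only needs a stable CPU number plus the existing vbq->lock spinlock, so get_cpu_var(), which disables preemption, is relaxed to get_cpu_light(). On RT that is approximately the following (sketch of the helper the series adds; non-RT builds keep plain get_cpu()):

    /* RT: pin the task to its CPU without disabling preemption */
    #define get_cpu_light()  ({ migrate_disable(); smp_processor_id(); })
    #define put_cpu_light()  migrate_enable()

Migration is disabled, so this_cpu_ptr(&vmap_block_queue) keeps pointing at the same queue, but the task remains preemptible and may take vbq->lock, which is a sleeping lock on RT.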
37209 diff --git a/mm/vmstat.c b/mm/vmstat.c
37210 index 527ae727d547..ae6446b054d3 100644
37213 @@ -249,6 +249,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
37217 + preempt_disable_rt();
37218 x = delta + __this_cpu_read(*p);
37220 t = __this_cpu_read(pcp->stat_threshold);
37221 @@ -258,6 +259,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
37224 __this_cpu_write(*p, x);
37225 + preempt_enable_rt();
37227 EXPORT_SYMBOL(__mod_zone_page_state);
37229 @@ -269,6 +271,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
37233 + preempt_disable_rt();
37234 x = delta + __this_cpu_read(*p);
37236 t = __this_cpu_read(pcp->stat_threshold);
37237 @@ -278,6 +281,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
37240 __this_cpu_write(*p, x);
37241 + preempt_enable_rt();
37243 EXPORT_SYMBOL(__mod_node_page_state);
37245 @@ -310,6 +314,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
37246 s8 __percpu *p = pcp->vm_stat_diff + item;
37249 + preempt_disable_rt();
37250 v = __this_cpu_inc_return(*p);
37251 t = __this_cpu_read(pcp->stat_threshold);
37252 if (unlikely(v > t)) {
37253 @@ -318,6 +323,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
37254 zone_page_state_add(v + overstep, zone, item);
37255 __this_cpu_write(*p, -overstep);
37257 + preempt_enable_rt();
37260 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37261 @@ -326,6 +332,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37262 s8 __percpu *p = pcp->vm_node_stat_diff + item;
37265 + preempt_disable_rt();
37266 v = __this_cpu_inc_return(*p);
37267 t = __this_cpu_read(pcp->stat_threshold);
37268 if (unlikely(v > t)) {
37269 @@ -334,6 +341,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37270 node_page_state_add(v + overstep, pgdat, item);
37271 __this_cpu_write(*p, -overstep);
37273 + preempt_enable_rt();
37276 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
37277 @@ -354,6 +362,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
37278 s8 __percpu *p = pcp->vm_stat_diff + item;
37281 + preempt_disable_rt();
37282 v = __this_cpu_dec_return(*p);
37283 t = __this_cpu_read(pcp->stat_threshold);
37284 if (unlikely(v < - t)) {
37285 @@ -362,6 +371,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
37286 zone_page_state_add(v - overstep, zone, item);
37287 __this_cpu_write(*p, overstep);
37289 + preempt_enable_rt();
37292 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37293 @@ -370,6 +380,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37294 s8 __percpu *p = pcp->vm_node_stat_diff + item;
37297 + preempt_disable_rt();
37298 v = __this_cpu_dec_return(*p);
37299 t = __this_cpu_read(pcp->stat_threshold);
37300 if (unlikely(v < - t)) {
37301 @@ -378,6 +389,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37302 node_page_state_add(v - overstep, pgdat, item);
37303 __this_cpu_write(*p, overstep);
37305 + preempt_enable_rt();
37308 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
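The vmstat counters are updated with non-atomic __this_cpu operations and normally rely on callers having interrupts or preemption disabled. On RT many of those callers become preemptible, so each read-modify-write sequence above is bracketed with preempt_disable_rt()/preempt_enable_rt(). These compile to nothing unless PREEMPT_RT is configured, roughly:

    #ifdef CONFIG_PREEMPT_RT_BASE
    # define preempt_disable_rt()  preempt_disable()
    # define preempt_enable_rt()   preempt_enable()
    #else
    # define preempt_disable_rt()  barrier()
    # define preempt_enable_rt()   barrier()
    #endif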
37309 diff --git a/mm/workingset.c b/mm/workingset.c
37310 index b997c9de28f6..e252cc69a3d4 100644
37311 --- a/mm/workingset.c
37312 +++ b/mm/workingset.c
37313 @@ -338,9 +338,10 @@ void workingset_activation(struct page *page)
37314 * point where they would still be useful.
37317 -static struct list_lru shadow_nodes;
37318 +static struct list_lru __shadow_nodes;
37319 +DEFINE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
37321 -void workingset_update_node(struct radix_tree_node *node, void *private)
37322 +void __workingset_update_node(struct radix_tree_node *node, void *private)
37324 struct address_space *mapping = private;
37326 @@ -358,10 +359,10 @@ void workingset_update_node(struct radix_tree_node *node, void *private)
37328 if (node->count && node->count == node->exceptional) {
37329 if (list_empty(&node->private_list))
37330 - list_lru_add(&shadow_nodes, &node->private_list);
37331 + list_lru_add(&__shadow_nodes, &node->private_list);
37333 if (!list_empty(&node->private_list))
37334 - list_lru_del(&shadow_nodes, &node->private_list);
37335 + list_lru_del(&__shadow_nodes, &node->private_list);
37339 @@ -373,9 +374,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
37340 unsigned long cache;
37342 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
37343 - local_irq_disable();
37344 - nodes = list_lru_shrink_count(&shadow_nodes, sc);
37345 - local_irq_enable();
37346 + local_lock_irq(shadow_nodes_lock);
37347 + nodes = list_lru_shrink_count(&__shadow_nodes, sc);
37348 + local_unlock_irq(shadow_nodes_lock);
37351 * Approximate a reasonable limit for the radix tree nodes
37352 @@ -475,15 +476,15 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
37354 inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
37355 __radix_tree_delete_node(&mapping->page_tree, node,
37356 - workingset_update_node, mapping);
37357 + __workingset_update_node, mapping);
37360 spin_unlock(&mapping->tree_lock);
37361 ret = LRU_REMOVED_RETRY;
37363 - local_irq_enable();
37364 + local_unlock_irq(shadow_nodes_lock);
37366 - local_irq_disable();
37367 + local_lock_irq(shadow_nodes_lock);
37368 spin_lock(lru_lock);
37371 @@ -494,9 +495,9 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
37374 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
37375 - local_irq_disable();
37376 - ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
37377 - local_irq_enable();
37378 + local_lock_irq(shadow_nodes_lock);
37379 + ret = list_lru_shrink_walk(&__shadow_nodes, sc, shadow_lru_isolate, NULL);
37380 + local_unlock_irq(shadow_nodes_lock);
37384 @@ -534,7 +535,7 @@ static int __init workingset_init(void)
37385 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
37386 timestamp_bits, max_order, bucket_order);
37388 - ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
37389 + ret = __list_lru_init(&__shadow_nodes, true, &shadow_nodes_key);
37392 ret = register_shrinker(&workingset_shadow_shrinker);
37393 @@ -542,7 +543,7 @@ static int __init workingset_init(void)
37397 - list_lru_destroy(&shadow_nodes);
37398 + list_lru_destroy(&__shadow_nodes);
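mm/truncate.c and mm/workingset.c are two halves of one change: the shadow-node LRU is renamed to __shadow_nodes, the update callback gets a __ prefix to signal "caller holds the lock", and every path that used bare local_irq_disable()/local_irq_enable() around it now takes shadow_nodes_lock. Non-RT builds lose nothing, since the local-lock IRQ operations fall back to the plain IRQ ones (sketch of the non-RT mapping):

    #ifndef CONFIG_PREEMPT_RT_BASE
    # define local_lock_irq(lvar)               local_irq_disable()
    # define local_unlock_irq(lvar)             local_irq_enable()
    # define local_lock_irqsave(lvar, flags)    local_irq_save(flags)
    # define local_unlock_irqrestore(lvar, f)   local_irq_restore(f)
    #endif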
37402 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
37403 index 685049a9048d..8d1489fd1dbc 100644
37404 --- a/mm/zsmalloc.c
37405 +++ b/mm/zsmalloc.c
37407 #include <linux/mount.h>
37408 #include <linux/migrate.h>
37409 #include <linux/pagemap.h>
37410 +#include <linux/locallock.h>
37412 #define ZSPAGE_MAGIC 0x58
37416 #define ZS_MAX_ZSPAGE_ORDER 2
37417 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
37419 #define ZS_HANDLE_SIZE (sizeof(unsigned long))
37421 +#ifdef CONFIG_PREEMPT_RT_FULL
37423 +struct zsmalloc_handle {
37424 + unsigned long addr;
37425 + struct mutex lock;
37426 +};
37428 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
37430 +#else
37432 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
37433 +#endif
37436 * Object location (<PFN>, <obj_idx>) is encoded as
37437 * as single (unsigned long) handle value.
37438 @@ -320,7 +334,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
37440 static int create_cache(struct zs_pool *pool)
37442 - pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
37443 + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
37445 if (!pool->handle_cachep)
37447 @@ -344,10 +358,27 @@ static void destroy_cache(struct zs_pool *pool)
37449 static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
37451 - return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
37452 - gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
37455 + p = kmem_cache_alloc(pool->handle_cachep,
37456 + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
37457 +#ifdef CONFIG_PREEMPT_RT_FULL
37459 + struct zsmalloc_handle *zh = p;
37461 + mutex_init(&zh->lock);
37464 + return (unsigned long)p;
37467 +#ifdef CONFIG_PREEMPT_RT_FULL
37468 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
37469 +{
37470 + return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
37471 +}
37472 +#endif
37474 static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
37476 kmem_cache_free(pool->handle_cachep, (void *)handle);
37477 @@ -366,12 +397,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
37479 static void record_obj(unsigned long handle, unsigned long obj)
37481 +#ifdef CONFIG_PREEMPT_RT_FULL
37482 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37484 + WRITE_ONCE(zh->addr, obj);
37487 * lsb of @obj represents handle lock while other bits
37488 * represent object value the handle is pointing so
37489 * updating shouldn't do store tearing.
37491 WRITE_ONCE(*(unsigned long *)handle, obj);
37496 @@ -460,6 +497,7 @@ MODULE_ALIAS("zpool-zsmalloc");
37498 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
37499 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
37500 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
37502 static bool is_zspage_isolated(struct zspage *zspage)
37504 @@ -898,7 +936,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
37506 static unsigned long handle_to_obj(unsigned long handle)
37508 +#ifdef CONFIG_PREEMPT_RT_FULL
37509 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37513 return *(unsigned long *)handle;
37517 static unsigned long obj_to_head(struct page *page, void *obj)
37518 @@ -912,22 +956,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
37520 static inline int testpin_tag(unsigned long handle)
37522 +#ifdef CONFIG_PREEMPT_RT_FULL
37523 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37525 + return mutex_is_locked(&zh->lock);
37527 return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
37531 static inline int trypin_tag(unsigned long handle)
37533 +#ifdef CONFIG_PREEMPT_RT_FULL
37534 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37536 + return mutex_trylock(&zh->lock);
37538 return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
37542 static void pin_tag(unsigned long handle)
37544 +#ifdef CONFIG_PREEMPT_RT_FULL
37545 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37547 + return mutex_lock(&zh->lock);
37549 bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
37553 static void unpin_tag(unsigned long handle)
37555 +#ifdef CONFIG_PREEMPT_RT_FULL
37556 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37558 + return mutex_unlock(&zh->lock);
37560 bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
37564 static void reset_page(struct page *page)
37565 @@ -1365,7 +1433,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
37566 class = pool->size_class[class_idx];
37567 off = (class->size * obj_idx) & ~PAGE_MASK;
37569 - area = &get_cpu_var(zs_map_area);
37570 + area = &get_locked_var(zs_map_area_lock, zs_map_area);
37572 if (off + class->size <= PAGE_SIZE) {
37573 /* this object is contained entirely within a page */
37574 @@ -1419,7 +1487,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
37576 __zs_unmap_object(area, pages, off, class->size);
37578 - put_cpu_var(zs_map_area);
37579 + put_locked_var(zs_map_area_lock, zs_map_area);
37581 migrate_read_unlock(zspage);
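zsmalloc normally pins an object by bit-spinning on HANDLE_PIN_BIT inside the handle word itself, a busy-wait RT cannot tolerate. With PREEMPT_RT_FULL the handle is therefore allocated as a small struct carrying the value plus a mutex, and pin/unpin become mutex operations. A userspace model of the substitution (illustrative only):

    #include <pthread.h>
    #include <stdio.h>

    struct zs_handle {
            unsigned long addr;     /* the value the old code kept in one word */
            pthread_mutex_t lock;   /* replaces spinning on HANDLE_PIN_BIT */
    };

    static void pin_tag(struct zs_handle *zh)
    {
            pthread_mutex_lock(&zh->lock);      /* may sleep: fine on RT */
    }

    static void unpin_tag(struct zs_handle *zh)
    {
            pthread_mutex_unlock(&zh->lock);
    }

    int main(void)
    {
            struct zs_handle zh = {
                    .addr = 0x1000,
                    .lock = PTHREAD_MUTEX_INITIALIZER,
            };

            pin_tag(&zh);
            printf("pinned object at %#lx\n", zh.addr);
            unpin_tag(&zh);
            return 0;
    }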
37583 diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
37584 index c10bdf63eae7..84a49f2bcfbc 100644
37585 --- a/net/9p/trans_xen.c
37586 +++ b/net/9p/trans_xen.c
37589 #include <linux/module.h>
37590 #include <linux/spinlock.h>
37591 -#include <linux/rwlock.h>
37592 #include <net/9p/9p.h>
37593 #include <net/9p/client.h>
37594 #include <net/9p/transport.h>
37595 diff --git a/net/Kconfig b/net/Kconfig
37596 index 9dba2715919d..9c7b38379c09 100644
37599 @@ -272,7 +272,7 @@ config CGROUP_NET_CLASSID
37601 config NET_RX_BUSY_POLL
37604 + default y if !PREEMPT_RT_FULL
37608 diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
37609 index 65d734c165bd..923e9a271872 100644
37610 --- a/net/bluetooth/hci_sock.c
37611 +++ b/net/bluetooth/hci_sock.c
37612 @@ -251,15 +251,13 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
37615 /* Send frame to sockets with specific channel */
37616 -void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37617 - int flag, struct sock *skip_sk)
37618 +static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37619 + int flag, struct sock *skip_sk)
37623 BT_DBG("channel %u len %d", channel, skb->len);
37625 - read_lock(&hci_sk_list.lock);
37627 sk_for_each(sk, &hci_sk_list.head) {
37628 struct sk_buff *nskb;
37630 @@ -285,6 +283,13 @@ void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37636 +void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37637 + int flag, struct sock *skip_sk)
37638 +{
37639 + read_lock(&hci_sk_list.lock);
37640 + __hci_send_to_channel(channel, skb, flag, skip_sk);
37641 read_unlock(&hci_sk_list.lock);
37644 @@ -388,8 +393,8 @@ void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
37645 hdr->index = index;
37646 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
37648 - hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
37649 - HCI_SOCK_TRUSTED, NULL);
37650 + __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
37651 + HCI_SOCK_TRUSTED, NULL);
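The hci_sock.c change is a plain lock-scope refactor so that hci_send_monitor_ctrl_event(), which already holds hci_sk_list.lock, does not take it a second time: the body moves into a __ variant that assumes the lock is held, and the public function becomes a lock-taking wrapper. The shape of the refactor in miniature (userspace, illustrative names):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;

    /* Caller must hold list_lock: the former function body lands here. */
    static void __send_to_channel(int channel)
    {
            printf("delivering on channel %d\n", channel);
    }

    /* Public entry point keeps the original locking behavior. */
    static void send_to_channel(int channel)
    {
            pthread_rwlock_rdlock(&list_lock);
            __send_to_channel(channel);
            pthread_rwlock_unlock(&list_lock);
    }

    /* A path that already holds the lock calls the __ variant directly. */
    static void send_while_locked(int channel)
    {
            pthread_rwlock_rdlock(&list_lock);
            __send_to_channel(channel);     /* no recursive rdlock needed */
            pthread_rwlock_unlock(&list_lock);
    }

    int main(void)
    {
            send_to_channel(1);
            send_while_locked(2);
            return 0;
    }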
37655 diff --git a/net/can/bcm.c b/net/can/bcm.c
37656 index 13690334efa3..9cc67ac257f1 100644
37657 --- a/net/can/bcm.c
37658 +++ b/net/can/bcm.c
37659 @@ -102,7 +102,6 @@ struct bcm_op {
37660 unsigned long frames_abs, frames_filtered;
37661 struct bcm_timeval ival1, ival2;
37662 struct hrtimer timer, thrtimer;
37663 - struct tasklet_struct tsklet, thrtsklet;
37664 ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
37667 @@ -364,25 +363,34 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
37671 -static void bcm_tx_start_timer(struct bcm_op *op)
37672 +static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt)
37673 +{
37674 + ktime_t ival;
37676 if (op->kt_ival1 && op->count)
37677 - hrtimer_start(&op->timer,
37678 - ktime_add(ktime_get(), op->kt_ival1),
37679 - HRTIMER_MODE_ABS);
37680 + ival = op->kt_ival1;
37681 else if (op->kt_ival2)
37682 - hrtimer_start(&op->timer,
37683 - ktime_add(ktime_get(), op->kt_ival2),
37684 - HRTIMER_MODE_ABS);
37685 + ival = op->kt_ival2;
37686 + else
37687 + return false;
37689 + hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival));
37690 + return true;
37691 +}
37693 -static void bcm_tx_timeout_tsklet(unsigned long data)
37694 +static void bcm_tx_start_timer(struct bcm_op *op)
37696 - struct bcm_op *op = (struct bcm_op *)data;
37697 + if (bcm_tx_set_expiry(op, &op->timer))
37698 + hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT);
37701 +/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */
37702 +static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
37704 + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37705 struct bcm_msg_head msg_head;
37707 if (op->kt_ival1 && (op->count > 0)) {
37710 if (!op->count && (op->flags & TX_COUNTEVT)) {
37712 @@ -399,22 +407,12 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
37716 - } else if (op->kt_ival2)
37717 + } else if (op->kt_ival2) {
37721 - bcm_tx_start_timer(op);
37725 - * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions
37727 -static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
37729 - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37731 - tasklet_schedule(&op->tsklet);
37733 - return HRTIMER_NORESTART;
37734 + return bcm_tx_set_expiry(op, &op->timer) ?
37735 + HRTIMER_RESTART : HRTIMER_NORESTART;
37739 @@ -480,7 +478,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op,
37740 /* do not send the saved data - only start throttle timer */
37741 hrtimer_start(&op->thrtimer,
37742 ktime_add(op->kt_lastmsg, op->kt_ival2),
37743 - HRTIMER_MODE_ABS);
37744 + HRTIMER_MODE_ABS_SOFT);
37748 @@ -539,14 +537,21 @@ static void bcm_rx_starttimer(struct bcm_op *op)
37752 - hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
37753 + hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT);
37756 -static void bcm_rx_timeout_tsklet(unsigned long data)
37757 +/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */
37758 +static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
37760 - struct bcm_op *op = (struct bcm_op *)data;
37761 + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37762 struct bcm_msg_head msg_head;
37764 + /* if user wants to be informed, when cyclic CAN-Messages come back */
37765 + if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
37766 + /* clear received CAN frames to indicate 'nothing received' */
37767 + memset(op->last_frames, 0, op->nframes * op->cfsiz);
37770 /* create notification to user */
37771 msg_head.opcode = RX_TIMEOUT;
37772 msg_head.flags = op->flags;
37773 @@ -557,25 +562,6 @@ static void bcm_rx_timeout_tsklet(unsigned long data)
37774 msg_head.nframes = 0;
37776 bcm_send_to_user(op, &msg_head, NULL, 0);
37780 - * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out
37782 -static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
37784 - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37786 - /* schedule before NET_RX_SOFTIRQ */
37787 - tasklet_hi_schedule(&op->tsklet);
37789 - /* no restart of the timer is done here! */
37791 - /* if user wants to be informed, when cyclic CAN-Messages come back */
37792 - if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
37793 - /* clear received CAN frames to indicate 'nothing received' */
37794 - memset(op->last_frames, 0, op->nframes * op->cfsiz);
37797 return HRTIMER_NORESTART;
37799 @@ -583,14 +569,12 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
37801 * bcm_rx_do_flush - helper for bcm_rx_thr_flush
37803 -static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
37804 - unsigned int index)
37805 +static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index)
37807 struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
37809 if ((op->last_frames) && (lcf->flags & RX_THR)) {
37810 - if (update)
37811 - bcm_rx_changed(op, lcf);
37812 + bcm_rx_changed(op, lcf);
37816 @@ -598,11 +582,8 @@ static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
37819 * bcm_rx_thr_flush - Check for throttled data and send it to the userspace
37821 - * update == 0 : just check if throttled data is available (any irq context)
37822 - * update == 1 : check and send throttled data to userspace (soft_irq context)
37824 -static int bcm_rx_thr_flush(struct bcm_op *op, int update)
37825 +static int bcm_rx_thr_flush(struct bcm_op *op)
37829 @@ -611,24 +592,16 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update)
37831 /* for MUX filter we start at index 1 */
37832 for (i = 1; i < op->nframes; i++)
37833 - updated += bcm_rx_do_flush(op, update, i);
37834 + updated += bcm_rx_do_flush(op, i);
37837 /* for RX_FILTER_ID and simple filter */
37838 - updated += bcm_rx_do_flush(op, update, 0);
37839 + updated += bcm_rx_do_flush(op, 0);
37845 -static void bcm_rx_thr_tsklet(unsigned long data)
37847 - struct bcm_op *op = (struct bcm_op *)data;
37849 - /* push the changed data to the userspace */
37850 - bcm_rx_thr_flush(op, 1);
37854 * bcm_rx_thr_handler - the time for blocked content updates is over now:
37855 * Check for throttled data and send it to the userspace
37856 @@ -637,9 +610,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
37858 struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);
37860 - tasklet_schedule(&op->thrtsklet);
37862 - if (bcm_rx_thr_flush(op, 0)) {
37863 + if (bcm_rx_thr_flush(op)) {
37864 hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2);
37865 return HRTIMER_RESTART;
37867 @@ -735,23 +706,8 @@ static struct bcm_op *bcm_find_op(struct list_head *ops,
37869 static void bcm_remove_op(struct bcm_op *op)
37871 - if (op->tsklet.func) {
37872 - while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
37873 - test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
37874 - hrtimer_active(&op->timer)) {
37875 - hrtimer_cancel(&op->timer);
37876 - tasklet_kill(&op->tsklet);
37880 - if (op->thrtsklet.func) {
37881 - while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
37882 - test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
37883 - hrtimer_active(&op->thrtimer)) {
37884 - hrtimer_cancel(&op->thrtimer);
37885 - tasklet_kill(&op->thrtsklet);
37888 + hrtimer_cancel(&op->timer);
37889 + hrtimer_cancel(&op->thrtimer);
37891 if ((op->frames) && (op->frames != &op->sframe))
37893 @@ -979,15 +935,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
37894 op->ifindex = ifindex;
37896 /* initialize uninitialized (kzalloc) structure */
37897 - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37898 + hrtimer_init(&op->timer, CLOCK_MONOTONIC,
37899 + HRTIMER_MODE_REL_SOFT);
37900 op->timer.function = bcm_tx_timeout_handler;
37902 - /* initialize tasklet for tx countevent notification */
37903 - tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet,
37904 - (unsigned long) op);
37906 /* currently unused in tx_ops */
37907 - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37908 + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
37909 + HRTIMER_MODE_REL_SOFT);
37911 /* add this bcm_op to the list of the tx_ops */
37912 list_add(&op->list, &bo->tx_ops);
37913 @@ -1150,20 +1104,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
37914 op->rx_ifindex = ifindex;
37916 /* initialize uninitialized (kzalloc) structure */
37917 - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37918 + hrtimer_init(&op->timer, CLOCK_MONOTONIC,
37919 + HRTIMER_MODE_REL_SOFT);
37920 op->timer.function = bcm_rx_timeout_handler;
37922 - /* initialize tasklet for rx timeout notification */
37923 - tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet,
37924 - (unsigned long) op);
37926 - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37927 + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
37928 + HRTIMER_MODE_REL_SOFT);
37929 op->thrtimer.function = bcm_rx_thr_handler;
37931 - /* initialize tasklet for rx throttle handling */
37932 - tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet,
37933 - (unsigned long) op);
37935 /* add this bcm_op to the list of the rx_ops */
37936 list_add(&op->list, &bo->rx_ops);
37938 @@ -1209,12 +1157,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
37940 op->kt_lastmsg = 0;
37941 hrtimer_cancel(&op->thrtimer);
37942 - bcm_rx_thr_flush(op, 1);
37943 + bcm_rx_thr_flush(op);
37946 if ((op->flags & STARTTIMER) && op->kt_ival1)
37947 hrtimer_start(&op->timer, op->kt_ival1,
37948 - HRTIMER_MODE_REL);
37949 + HRTIMER_MODE_REL_SOFT);
37952 /* now we can register for can_ids, if we added a new bcm_op */
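In bcm.c both deferred-work tasklets disappear: the hrtimers are switched to the _SOFT modes, whose callbacks already run in softirq context, so the former tasklet bodies execute directly in the timer handlers, and cyclic rearming is done by setting a new expiry and returning HRTIMER_RESTART instead of re-calling hrtimer_start(). The resulting handler shape, sketched with hypothetical helpers (my_op, do_cyclic_work and more_work are not from the patch; only the control flow is):

    struct my_op {
            struct hrtimer timer;
            ktime_t interval;
    };

    static enum hrtimer_restart my_timeout_handler(struct hrtimer *t)
    {
            struct my_op *op = container_of(t, struct my_op, timer);

            do_cyclic_work(op);             /* the former tasklet body */

            if (!more_work(op))
                    return HRTIMER_NORESTART;

            /* rearm in place instead of tasklet_schedule() + hrtimer_start() */
            hrtimer_set_expires(t, ktime_add(ktime_get(), op->interval));
            return HRTIMER_RESTART;
    }

    /* setup: the _SOFT mode makes the callback run in softirq context */
    /* hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); */
    /* op->timer.function = my_timeout_handler; */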
37953 diff --git a/net/core/dev.c b/net/core/dev.c
37954 index e8a66ad6d07c..fa9642bb0482 100644
37955 --- a/net/core/dev.c
37956 +++ b/net/core/dev.c
37957 @@ -195,6 +195,7 @@ static unsigned int napi_gen_id = NR_CPUS;
37958 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
37960 static seqcount_t devnet_rename_seq;
37961 +static DEFINE_MUTEX(devnet_rename_mutex);
37963 static inline void dev_base_seq_inc(struct net *net)
37965 @@ -217,14 +218,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
37966 static inline void rps_lock(struct softnet_data *sd)
37969 - spin_lock(&sd->input_pkt_queue.lock);
37970 + raw_spin_lock(&sd->input_pkt_queue.raw_lock);
37974 static inline void rps_unlock(struct softnet_data *sd)
37977 - spin_unlock(&sd->input_pkt_queue.lock);
37978 + raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
37982 @@ -920,7 +921,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
37983 strcpy(name, dev->name);
37985 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
37987 + mutex_lock(&devnet_rename_mutex);
37988 + mutex_unlock(&devnet_rename_mutex);
37992 @@ -1189,20 +1191,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
37993 if (dev->flags & IFF_UP)
37996 - write_seqcount_begin(&devnet_rename_seq);
37997 + mutex_lock(&devnet_rename_mutex);
37998 + __raw_write_seqcount_begin(&devnet_rename_seq);
38000 - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
38001 - write_seqcount_end(&devnet_rename_seq);
38004 + if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
38007 memcpy(oldname, dev->name, IFNAMSIZ);
38009 err = dev_get_valid_name(net, dev, newname);
38011 - write_seqcount_end(&devnet_rename_seq);
38017 if (oldname[0] && !strchr(oldname, '%'))
38018 netdev_info(dev, "renamed from %s\n", oldname);
38019 @@ -1215,11 +1214,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
38021 memcpy(dev->name, oldname, IFNAMSIZ);
38022 dev->name_assign_type = old_assign_type;
38023 - write_seqcount_end(&devnet_rename_seq);
38029 - write_seqcount_end(&devnet_rename_seq);
38030 + __raw_write_seqcount_end(&devnet_rename_seq);
38031 + mutex_unlock(&devnet_rename_mutex);
38033 netdev_adjacent_rename_links(dev, oldname);
38035 @@ -1240,7 +1240,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
38036 /* err >= 0 after dev_alloc_name() or stores the first errno */
38039 - write_seqcount_begin(&devnet_rename_seq);
38040 + mutex_lock(&devnet_rename_mutex);
38041 + __raw_write_seqcount_begin(&devnet_rename_seq);
38042 memcpy(dev->name, oldname, IFNAMSIZ);
38043 memcpy(oldname, newname, IFNAMSIZ);
38044 dev->name_assign_type = old_assign_type;
38045 @@ -1253,6 +1254,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
38051 + __raw_write_seqcount_end(&devnet_rename_seq);
38052 + mutex_unlock(&devnet_rename_mutex);
38057 @@ -2460,6 +2466,7 @@ static void __netif_reschedule(struct Qdisc *q)
38058 sd->output_queue_tailp = &q->next_sched;
38059 raise_softirq_irqoff(NET_TX_SOFTIRQ);
38060 local_irq_restore(flags);
38061 + preempt_check_resched_rt();
38064 void __netif_schedule(struct Qdisc *q)
38065 @@ -2522,6 +2529,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
38066 __this_cpu_write(softnet_data.completion_queue, skb);
38067 raise_softirq_irqoff(NET_TX_SOFTIRQ);
38068 local_irq_restore(flags);
38069 + preempt_check_resched_rt();
38071 EXPORT_SYMBOL(__dev_kfree_skb_irq);
38073 @@ -3197,7 +3205,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
38074 * This permits qdisc->running owner to get the lock more
38075 * often and dequeue packets faster.
38077 +#ifdef CONFIG_PREEMPT_RT_FULL
38078 + contended = true;
38080 contended = qdisc_is_running(q);
38082 if (unlikely(contended))
38083 spin_lock(&q->busylock);
38085 @@ -3268,8 +3280,10 @@ static void skb_update_prio(struct sk_buff *skb)
38086 #define skb_update_prio(skb)
38089 +#ifndef CONFIG_PREEMPT_RT_FULL
38090 DEFINE_PER_CPU(int, xmit_recursion);
38091 EXPORT_SYMBOL(xmit_recursion);
38095 * dev_loopback_xmit - loop back @skb
38096 @@ -3509,9 +3523,12 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
38097 if (dev->flags & IFF_UP) {
38098 int cpu = smp_processor_id(); /* ok because BHs are off */
38100 +#ifdef CONFIG_PREEMPT_RT_FULL
38101 + if (txq->xmit_lock_owner != current) {
38103 if (txq->xmit_lock_owner != cpu) {
38104 - if (unlikely(__this_cpu_read(xmit_recursion) >
38105 - XMIT_RECURSION_LIMIT))
38107 + if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
38108 goto recursion_alert;
38110 skb = validate_xmit_skb(skb, dev);
38111 @@ -3521,9 +3538,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
38112 HARD_TX_LOCK(dev, txq, cpu);
38114 if (!netif_xmit_stopped(txq)) {
38115 - __this_cpu_inc(xmit_recursion);
38117 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
38118 - __this_cpu_dec(xmit_recursion);
38120 if (dev_xmit_complete(rc)) {
38121 HARD_TX_UNLOCK(dev, txq);
38123 @@ -3904,6 +3921,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
38126 local_irq_restore(flags);
38127 + preempt_check_resched_rt();
38129 atomic_long_inc(&skb->dev->rx_dropped);
38131 @@ -4056,7 +4074,7 @@ static int netif_rx_internal(struct sk_buff *skb)
38132 struct rps_dev_flow voidflow, *rflow = &voidflow;
38135 - preempt_disable();
38136 + migrate_disable();
38139 cpu = get_rps_cpu(skb->dev, skb, &rflow);
38140 @@ -4066,14 +4084,14 @@ static int netif_rx_internal(struct sk_buff *skb)
38141 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
38144 - preempt_enable();
38145 + migrate_enable();
38149 unsigned int qtail;
38151 - ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
38153 + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
38158 @@ -4107,11 +4125,9 @@ int netif_rx_ni(struct sk_buff *skb)
38160 trace_netif_rx_ni_entry(skb);
38162 - preempt_disable();
38163 + local_bh_disable();
38164 err = netif_rx_internal(skb);
38165 - if (local_softirq_pending())
38167 - preempt_enable();
38168 + local_bh_enable();
38172 @@ -4629,7 +4645,7 @@ static void flush_backlog(struct work_struct *work)
38173 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
38174 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
38175 __skb_unlink(skb, &sd->input_pkt_queue);
38177 + __skb_queue_tail(&sd->tofree_queue, skb);
38178 input_queue_head_incr(sd);
38181 @@ -4639,11 +4655,14 @@ static void flush_backlog(struct work_struct *work)
38182 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
38183 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
38184 __skb_unlink(skb, &sd->process_queue);
38186 + __skb_queue_tail(&sd->tofree_queue, skb);
38187 input_queue_head_incr(sd);
38190 + if (!skb_queue_empty(&sd->tofree_queue))
38191 + raise_softirq_irqoff(NET_RX_SOFTIRQ);
38196 static void flush_all_backlogs(void)
38197 @@ -5153,12 +5172,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
38198 sd->rps_ipi_list = NULL;
38200 local_irq_enable();
38201 + preempt_check_resched_rt();
38203 /* Send pending IPI's to kick RPS processing on remote cpus. */
38204 net_rps_send_ipi(remsd);
38207 local_irq_enable();
38208 + preempt_check_resched_rt();
38211 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
38212 @@ -5188,7 +5209,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
38214 struct sk_buff *skb;
38216 + local_irq_disable();
38217 while ((skb = __skb_dequeue(&sd->process_queue))) {
38218 + local_irq_enable();
38220 __netif_receive_skb(skb);
38222 @@ -5196,9 +5219,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
38223 if (++work >= quota)
38226 + local_irq_disable();
38229 - local_irq_disable();
38231 if (skb_queue_empty(&sd->input_pkt_queue)) {
38233 @@ -5236,6 +5259,7 @@ void __napi_schedule(struct napi_struct *n)
38234 local_irq_save(flags);
38235 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
38236 local_irq_restore(flags);
38237 + preempt_check_resched_rt();
38239 EXPORT_SYMBOL(__napi_schedule);
38241 @@ -5272,6 +5296,7 @@ bool napi_schedule_prep(struct napi_struct *n)
38243 EXPORT_SYMBOL(napi_schedule_prep);
38245 +#ifndef CONFIG_PREEMPT_RT_FULL
38247 * __napi_schedule_irqoff - schedule for receive
38248 * @n: entry to schedule
38249 @@ -5283,6 +5308,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
38250 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
38252 EXPORT_SYMBOL(__napi_schedule_irqoff);
38255 bool napi_complete_done(struct napi_struct *n, int work_done)
38257 @@ -5637,13 +5663,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
38258 unsigned long time_limit = jiffies +
38259 usecs_to_jiffies(netdev_budget_usecs);
38260 int budget = netdev_budget;
38261 + struct sk_buff_head tofree_q;
38262 + struct sk_buff *skb;
38266 + __skb_queue_head_init(&tofree_q);
38268 local_irq_disable();
38269 + skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
38270 list_splice_init(&sd->poll_list, &list);
38271 local_irq_enable();
38273 + while ((skb = __skb_dequeue(&tofree_q)))
38277 struct napi_struct *n;
38279 @@ -5673,7 +5707,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
38280 list_splice_tail(&repoll, &list);
38281 list_splice(&list, &sd->poll_list);
38282 if (!list_empty(&sd->poll_list))
38283 - __raise_softirq_irqoff(NET_RX_SOFTIRQ);
38284 + __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
38286 net_rps_action_and_irq_enable(sd);
38288 @@ -7502,7 +7536,7 @@ static void netdev_init_one_queue(struct net_device *dev,
38289 /* Initialize queue lock */
38290 spin_lock_init(&queue->_xmit_lock);
38291 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
38292 - queue->xmit_lock_owner = -1;
38293 + netdev_queue_clear_owner(queue);
38294 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
38297 @@ -8442,6 +8476,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
38299 raise_softirq_irqoff(NET_TX_SOFTIRQ);
38300 local_irq_enable();
38301 + preempt_check_resched_rt();
38304 remsd = oldsd->rps_ipi_list;
38305 @@ -8455,10 +8490,13 @@ static int dev_cpu_dead(unsigned int oldcpu)
38307 input_queue_head_incr(oldsd);
38309 - while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
38310 + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
38312 input_queue_head_incr(oldsd);
38314 + while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
38320 @@ -8762,8 +8800,9 @@ static int __init net_dev_init(void)
38322 INIT_WORK(flush, flush_backlog);
38324 - skb_queue_head_init(&sd->input_pkt_queue);
38325 - skb_queue_head_init(&sd->process_queue);
38326 + skb_queue_head_init_raw(&sd->input_pkt_queue);
38327 + skb_queue_head_init_raw(&sd->process_queue);
38328 + skb_queue_head_init_raw(&sd->tofree_queue);
38329 INIT_LIST_HEAD(&sd->poll_list);
38330 sd->output_queue_tailp = &sd->output_queue;
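The dev.c changes follow a few recurring RT themes: the softnet queues become raw-lock based and gain a tofree_queue that net_rx_action() drains outside the IRQ-off region; devnet_rename_seq gains a companion mutex so the seqcount write side cannot be preempted mid-update (readers wait on the mutex instead of busy-looping on a stale sequence); and the xmit recursion guard moves from a per-CPU counter to a per-task one, since an RT task in the transmit path may be preempted and migrated. The per-task side is roughly (sketch of the helpers the series adds in netdevice.h, with a new task_struct field):

    #ifdef CONFIG_PREEMPT_RT_FULL
    static inline int  xmit_rec_read(void) { return current->xmit_recursion; }
    static inline void xmit_rec_inc(void)  { current->xmit_recursion++; }
    static inline void xmit_rec_dec(void)  { current->xmit_recursion--; }
    #else
    /* non-RT keeps the per-CPU counter behind the same helper names */
    #endif

This is also why __dev_queue_xmit() compares txq->xmit_lock_owner against current rather than against a CPU number on RT.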
38332 diff --git a/net/core/filter.c b/net/core/filter.c
38333 index d5158a10ac8f..ad96ec78f7b8 100644
38334 --- a/net/core/filter.c
38335 +++ b/net/core/filter.c
38336 @@ -1696,7 +1696,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
38340 - if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
38341 + if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
38342 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
38345 @@ -1704,9 +1704,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
38349 - __this_cpu_inc(xmit_recursion);
38351 ret = dev_queue_xmit(skb);
38352 - __this_cpu_dec(xmit_recursion);
38357 diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
38358 index 7f980bd7426e..7250106015ef 100644
38359 --- a/net/core/gen_estimator.c
38360 +++ b/net/core/gen_estimator.c
38362 struct net_rate_estimator {
38363 struct gnet_stats_basic_packed *bstats;
38364 spinlock_t *stats_lock;
38365 - seqcount_t *running;
38366 + net_seqlock_t *running;
38367 struct gnet_stats_basic_cpu __percpu *cpu_bstats;
38369 u8 intvl_log; /* period : (250ms << intvl_log) */
38370 @@ -129,7 +129,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
38371 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
38372 struct net_rate_estimator __rcu **rate_est,
38373 spinlock_t *stats_lock,
38374 - seqcount_t *running,
38375 + net_seqlock_t *running,
38376 struct nlattr *opt)
38378 struct gnet_estimator *parm = nla_data(opt);
38379 @@ -222,7 +222,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
38380 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
38381 struct net_rate_estimator __rcu **rate_est,
38382 spinlock_t *stats_lock,
38383 - seqcount_t *running, struct nlattr *opt)
38384 + net_seqlock_t *running, struct nlattr *opt)
38386 return gen_new_estimator(bstats, cpu_bstats, rate_est,
38387 stats_lock, running, opt);
38388 diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
38389 index 441c04adedba..07f9a6a1f8e4 100644
38390 --- a/net/core/gen_stats.c
38391 +++ b/net/core/gen_stats.c
38392 @@ -142,7 +142,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
38396 -__gnet_stats_copy_basic(const seqcount_t *running,
38397 +__gnet_stats_copy_basic(net_seqlock_t *running,
38398 struct gnet_stats_basic_packed *bstats,
38399 struct gnet_stats_basic_cpu __percpu *cpu,
38400 struct gnet_stats_basic_packed *b)
38401 @@ -155,10 +155,10 @@ __gnet_stats_copy_basic(const seqcount_t *running,
38405 - seq = read_seqcount_begin(running);
38406 + seq = net_seq_begin(running);
38407 bstats->bytes = b->bytes;
38408 bstats->packets = b->packets;
38409 - } while (running && read_seqcount_retry(running, seq));
38410 + } while (running && net_seq_retry(running, seq));
38412 EXPORT_SYMBOL(__gnet_stats_copy_basic);
38414 @@ -176,7 +176,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
38415 * if the room in the socket buffer was not sufficient.
38418 -gnet_stats_copy_basic(const seqcount_t *running,
38419 +gnet_stats_copy_basic(net_seqlock_t *running,
38420 struct gnet_dump *d,
38421 struct gnet_stats_basic_cpu __percpu *cpu,
38422 struct gnet_stats_basic_packed *b)
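gen_estimator.c and gen_stats.c switch the qdisc statistics sequence counter over to net_seqlock_t. On non-RT it is still a bare seqcount; on RT it becomes a full seqlock, so a reader can block until a preempted writer finishes instead of spinning forever on an odd sequence. The type and accessors are roughly (sketch after the net_seq_lock.h header the series introduces):

    #ifdef CONFIG_PREEMPT_RT_BASE
    # define net_seqlock_t          seqlock_t
    # define net_seq_begin(l)       read_seqbegin(l)
    # define net_seq_retry(l, s)    read_seqretry(l, s)
    #else
    # define net_seqlock_t          seqcount_t
    # define net_seq_begin(l)       read_seqcount_begin(l)
    # define net_seq_retry(l, s)    read_seqcount_retry(l, s)
    #endif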
38423 diff --git a/net/core/pktgen.c b/net/core/pktgen.c
38424 index 6e1e10ff433a..c1ae4075e0ed 100644
38425 --- a/net/core/pktgen.c
38426 +++ b/net/core/pktgen.c
38427 @@ -2252,7 +2252,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
38429 struct hrtimer_sleeper t;
38431 - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
38432 + hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS,
38433 + current);
38434 hrtimer_set_expires(&t.timer, spin_until);
38436 remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
38437 @@ -2267,7 +2268,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
38438 } while (ktime_compare(end_time, spin_until) < 0);
38440 /* see do_nanosleep */
38441 - hrtimer_init_sleeper(&t, current);
38443 set_current_state(TASK_INTERRUPTIBLE);
38444 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
38445 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
38446 index 9f80b947f53b..c0f23b8dcfc6 100644
38447 --- a/net/core/skbuff.c
38448 +++ b/net/core/skbuff.c
38450 #include <linux/errqueue.h>
38451 #include <linux/prefetch.h>
38452 #include <linux/if_vlan.h>
38453 +#include <linux/locallock.h>
38455 #include <net/protocol.h>
38456 #include <net/dst.h>
38457 @@ -330,6 +331,8 @@ struct napi_alloc_cache {
38459 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
38460 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
38461 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
38462 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
38464 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
38466 @@ -337,10 +340,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
38467 unsigned long flags;
38470 - local_irq_save(flags);
38471 + local_lock_irqsave(netdev_alloc_lock, flags);
38472 nc = this_cpu_ptr(&netdev_alloc_cache);
38473 data = page_frag_alloc(nc, fragsz, gfp_mask);
38474 - local_irq_restore(flags);
38475 + local_unlock_irqrestore(netdev_alloc_lock, flags);
38479 @@ -359,9 +362,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
38481 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
38483 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38484 + struct napi_alloc_cache *nc;
38485 + void *data;
38487 - return page_frag_alloc(&nc->page, fragsz, gfp_mask);
38488 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38489 + data = page_frag_alloc(&nc->page, fragsz, gfp_mask);
38490 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38491 + return data;
38494 void *napi_alloc_frag(unsigned int fragsz)
38495 @@ -408,13 +415,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
38496 if (sk_memalloc_socks())
38497 gfp_mask |= __GFP_MEMALLOC;
38499 - local_irq_save(flags);
38500 + local_lock_irqsave(netdev_alloc_lock, flags);
38502 nc = this_cpu_ptr(&netdev_alloc_cache);
38503 data = page_frag_alloc(nc, len, gfp_mask);
38504 pfmemalloc = nc->pfmemalloc;
38506 - local_irq_restore(flags);
38507 + local_unlock_irqrestore(netdev_alloc_lock, flags);
38509 if (unlikely(!data))
38511 @@ -455,9 +462,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
38512 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
38515 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38516 + struct napi_alloc_cache *nc;
38517 struct sk_buff *skb;
38518 void *data;
38519 + bool pfmemalloc;
38521 len += NET_SKB_PAD + NET_IP_ALIGN;
38523 @@ -475,7 +483,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
38524 if (sk_memalloc_socks())
38525 gfp_mask |= __GFP_MEMALLOC;
38527 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38528 data = page_frag_alloc(&nc->page, len, gfp_mask);
38529 + pfmemalloc = nc->page.pfmemalloc;
38530 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38531 if (unlikely(!data))
38534 @@ -486,7 +497,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
38537 /* use OR instead of assignment to avoid clearing of bits in mask */
38538 - if (nc->page.pfmemalloc)
38539 + if (pfmemalloc)
38540 skb->pfmemalloc = 1;
38541 skb->head_frag = 1;
38543 @@ -718,23 +729,26 @@ void __consume_stateless_skb(struct sk_buff *skb)
38545 void __kfree_skb_flush(void)
38547 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38548 + struct napi_alloc_cache *nc;
38550 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38551 /* flush skb_cache if containing objects */
38552 if (nc->skb_count) {
38553 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
38557 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38560 static inline void _kfree_skb_defer(struct sk_buff *skb)
38562 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38563 + struct napi_alloc_cache *nc;
38565 /* drop skb->head and call any destructors for packet */
38566 skb_release_all(skb);
38568 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38569 /* record skb to CPU local list */
38570 nc->skb_cache[nc->skb_count++] = skb;
38572 @@ -749,6 +763,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
38576 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38578 void __kfree_skb_defer(struct sk_buff *skb)
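The local_lock_irqsave()/get_locked_var() calls introduced in this file come from <linux/locallock.h>. On !PREEMPT_RT the API is expected to compile down to exactly the code it replaces, so these hunks are a no-op for mainline configurations; a sketch of the assumed fallback definitions:

    #ifndef CONFIG_PREEMPT_RT_BASE
    # define local_lock(lvar)                      preempt_disable()
    # define local_unlock(lvar)                    preempt_enable()
    # define local_lock_irqsave(lvar, flags)       local_irq_save(flags)
    # define local_unlock_irqrestore(lvar, flags)  local_irq_restore(flags)
    # define get_locked_var(lvar, var)             get_cpu_var(var)
    # define put_locked_var(lvar, var)             put_cpu_var(var)
    #endif

On RT each DEFINE_LOCAL_IRQ_LOCK instance becomes a per-CPU sleeping spinlock, which serializes access to the per-CPU frag caches while keeping the section preemptible.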
38580 diff --git a/net/core/sock.c b/net/core/sock.c
38581 index 68d08ed5521e..ee242ff5d4b1 100644
38582 --- a/net/core/sock.c
38583 +++ b/net/core/sock.c
38584 @@ -2757,12 +2757,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
38585 if (sk->sk_lock.owned)
38587 sk->sk_lock.owned = 1;
38588 - spin_unlock(&sk->sk_lock.slock);
38589 + spin_unlock_bh(&sk->sk_lock.slock);
38591 * The sk_lock has mutex_lock() semantics here:
38593 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
38594 - local_bh_enable();
38596 EXPORT_SYMBOL(lock_sock_nested);
38598 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
38599 index 3c1570d3e22f..0310ea93f877 100644
38600 --- a/net/ipv4/icmp.c
38601 +++ b/net/ipv4/icmp.c
38603 #include <linux/string.h>
38604 #include <linux/netfilter_ipv4.h>
38605 #include <linux/slab.h>
38606 +#include <linux/locallock.h>
38607 #include <net/snmp.h>
38608 #include <net/ip.h>
38609 #include <net/route.h>
38610 @@ -204,6 +205,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
38612 * On SMP we have one ICMP socket per-cpu.
38614 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
38616 static struct sock *icmp_sk(struct net *net)
38618 return *this_cpu_ptr(net->ipv4.icmp_sk);
38619 @@ -214,12 +217,16 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
38623 + if (!local_trylock(icmp_sk_lock))
38624 + return NULL;
38628 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
38629 /* This can happen if the output path signals a
38630 * dst_link_failure() for an outgoing ICMP packet.
38632 + local_unlock(icmp_sk_lock);
38636 @@ -228,6 +235,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
38637 static inline void icmp_xmit_unlock(struct sock *sk)
38639 spin_unlock(&sk->sk_lock.slock);
38640 + local_unlock(icmp_sk_lock);
38643 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
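local_trylock() follows the same scheme: on !RT disabling preemption is already sufficient, so the trylock can unconditionally succeed, while on RT it attempts the per-CPU lock and lets icmp_xmit_lock() bail out on contention. Assumed fallback, as a sketch:

    #ifndef CONFIG_PREEMPT_RT_BASE
    # define local_trylock(lvar) \
            ({ preempt_disable(); 1; })
    #endif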
38644 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
38645 index 31b34c0c2d5f..851f241e70b5 100644
38646 --- a/net/ipv4/tcp_ipv4.c
38647 +++ b/net/ipv4/tcp_ipv4.c
38649 #include <linux/init.h>
38650 #include <linux/times.h>
38651 #include <linux/slab.h>
38652 +#include <linux/locallock.h>
38654 #include <net/net_namespace.h>
38655 #include <net/icmp.h>
38656 @@ -580,6 +581,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
38658 EXPORT_SYMBOL(tcp_v4_send_check);
38660 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
38662 * This routine will send an RST to the other tcp.
38664 @@ -710,6 +712,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
38665 arg.tos = ip_hdr(skb)->tos;
38666 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
38667 local_bh_disable();
38668 + local_lock(tcp_sk_lock);
38669 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
38670 skb, &TCP_SKB_CB(skb)->header.h4.opt,
38671 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
38672 @@ -717,6 +720,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
38674 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
38675 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
38676 + local_unlock(tcp_sk_lock);
38679 #ifdef CONFIG_TCP_MD5SIG
38680 @@ -796,12 +800,14 @@ static void tcp_v4_send_ack(const struct sock *sk,
38682 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
38683 local_bh_disable();
38684 + local_lock(tcp_sk_lock);
38685 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
38686 skb, &TCP_SKB_CB(skb)->header.h4.opt,
38687 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
38688 &arg, arg.iov[0].iov_len);
38690 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
38691 + local_unlock(tcp_sk_lock);
38695 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
38696 index dddd498e1338..8f39b8162df8 100644
38697 --- a/net/mac80211/rx.c
38698 +++ b/net/mac80211/rx.c
38699 @@ -4252,7 +4252,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
38700 struct ieee80211_supported_band *sband;
38701 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
38703 - WARN_ON_ONCE(softirq_count() == 0);
38704 + WARN_ON_ONCE_NONRT(softirq_count() == 0);
38706 if (WARN_ON(status->band >= NUM_NL80211_BANDS))
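WARN_ON_ONCE_NONRT() compiles away on RT, where mac80211 RX legitimately runs in thread context with softirq_count() == 0. The definitions are assumed to follow the other *_NONRT helpers in this series, roughly:

    #ifdef CONFIG_PREEMPT_RT_BASE
    # define WARN_ON_NONRT(condition)       do { } while (0)
    # define WARN_ON_ONCE_NONRT(condition)  do { } while (0)
    #else
    # define WARN_ON_NONRT(condition)       WARN_ON(condition)
    # define WARN_ON_ONCE_NONRT(condition)  WARN_ON_ONCE(condition)
    #endif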
38708 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
38709 index 52cd2901a097..c63e937b6676 100644
38710 --- a/net/netfilter/core.c
38711 +++ b/net/netfilter/core.c
38713 #include <linux/inetdevice.h>
38714 #include <linux/proc_fs.h>
38715 #include <linux/mutex.h>
38716 +#include <linux/locallock.h>
38717 #include <linux/mm.h>
38718 #include <linux/rcupdate.h>
38719 #include <net/net_namespace.h>
38722 #include "nf_internals.h"
38724 +#ifdef CONFIG_PREEMPT_RT_BASE
38725 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
38726 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
38727 +#endif
38729 static DEFINE_MUTEX(afinfo_mutex);
38731 const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
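xt_write_lock is only defined and exported here; its users are the xt_write_recseq sections in <linux/netfilter/x_tables.h>, which on RT must take the per-CPU lock so that the sequence counter update cannot be preempted halfway. A sketch of the assumed caller side (the seqcount body mirrors the existing mainline code):

    DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);

    static inline unsigned int xt_write_recseq_begin(void)
    {
            unsigned int addend;

            /* RT: serialize writers instead of relying on BH exclusion */
            local_lock(xt_write_lock);

            addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1;
            __this_cpu_add(xt_recseq.sequence, addend);
            smp_wmb();

            return addend;
    }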
38732 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
38733 index 8d1a7c900393..f1f56be3b061 100644
38734 --- a/net/packet/af_packet.c
38735 +++ b/net/packet/af_packet.c
38737 #include <linux/if_packet.h>
38738 #include <linux/wireless.h>
38739 #include <linux/kernel.h>
38740 +#include <linux/delay.h>
38741 #include <linux/kmod.h>
38742 #include <linux/slab.h>
38743 #include <linux/vmalloc.h>
38744 @@ -707,7 +708,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
38745 if (BLOCK_NUM_PKTS(pbd)) {
38746 while (atomic_read(&pkc->blk_fill_in_prog)) {
38747 /* Waiting for skb_copy_bits to finish... */
38748 - cpu_relax();
38749 + cpu_chill();
38753 @@ -969,7 +970,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
38754 if (!(status & TP_STATUS_BLK_TMO)) {
38755 while (atomic_read(&pkc->blk_fill_in_prog)) {
38756 /* Waiting for skb_copy_bits to finish... */
38757 - cpu_relax();
38758 + cpu_chill();
38761 prb_close_block(pkc, pbd, po, status);
38762 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
38763 index 9a3c54e659e9..2a95f1d587ac 100644
38764 --- a/net/rds/ib_rdma.c
38765 +++ b/net/rds/ib_rdma.c
38767 #include <linux/slab.h>
38768 #include <linux/rculist.h>
38769 #include <linux/llist.h>
38770 +#include <linux/delay.h>
38772 #include "rds_single_path.h"
38774 @@ -210,7 +211,7 @@ static inline void wait_clean_list_grace(void)
38775 for_each_online_cpu(cpu) {
38776 flag = &per_cpu(clean_list_grace, cpu);
38777 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
38778 - cpu_relax();
38779 + cpu_chill();
38783 diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
38784 index e9f428351293..c4479afe8ae7 100644
38785 --- a/net/rxrpc/security.c
38786 +++ b/net/rxrpc/security.c
38788 #include <keys/rxrpc-type.h>
38789 #include "ar-internal.h"
38791 -static LIST_HEAD(rxrpc_security_methods);
38792 -static DECLARE_RWSEM(rxrpc_security_sem);
38794 static const struct rxrpc_security *rxrpc_security_types[] = {
38795 [RXRPC_SECURITY_NONE] = &rxrpc_no_security,
38796 #ifdef CONFIG_RXKAD
38797 diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
38798 index cd69aa067543..73348ac5019f 100644
38799 --- a/net/sched/sch_api.c
38800 +++ b/net/sched/sch_api.c
38801 @@ -1081,7 +1081,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
38802 rcu_assign_pointer(sch->stab, stab);
38804 if (tca[TCA_RATE]) {
38805 - seqcount_t *running;
38806 + net_seqlock_t *running;
38809 if (sch->flags & TCQ_F_MQROOT)
38810 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
38811 index 79549baf5804..341f7895659c 100644
38812 --- a/net/sched/sch_generic.c
38813 +++ b/net/sched/sch_generic.c
38814 @@ -429,7 +429,11 @@ struct Qdisc noop_qdisc = {
38815 .ops = &noop_qdisc_ops,
38816 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
38817 .dev_queue = &noop_netdev_queue,
38818 +#ifdef CONFIG_PREEMPT_RT_BASE
38819 + .running = __SEQLOCK_UNLOCKED(noop_qdisc.running),
38820 +#else
38821 .running = SEQCNT_ZERO(noop_qdisc.running),
38822 +#endif
38823 .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
38825 EXPORT_SYMBOL(noop_qdisc);
38826 @@ -628,9 +632,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
38827 lockdep_set_class(&sch->busylock,
38828 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
38830 +#ifdef CONFIG_PREEMPT_RT_BASE
38831 + seqlock_init(&sch->running);
38832 + lockdep_set_class(&sch->running.seqcount,
38833 + dev->qdisc_running_key ?: &qdisc_running_key);
38834 + lockdep_set_class(&sch->running.lock,
38835 + dev->qdisc_running_key ?: &qdisc_running_key);
38836 +#else
38837 seqcount_init(&sch->running);
38838 lockdep_set_class(&sch->running,
38839 dev->qdisc_running_key ?: &qdisc_running_key);
38840 +#endif
38843 sch->enqueue = ops->enqueue;
38844 @@ -933,7 +945,7 @@ void dev_deactivate_many(struct list_head *head)
38845 /* Wait for outstanding qdisc_run calls. */
38846 list_for_each_entry(dev, head, close_list) {
38847 while (some_qdisc_is_busy(dev))
38848 - yield();
38849 + msleep(1);
38850 /* The new qdisc is assigned at this point so we can safely
38851 * unwind stale skb lists and qdisc statistics
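With sch->running switched to net_seqlock_t, qdisc_run_begin()/qdisc_run_end() in <net/sch_generic.h> are expected to use the seqlock's write side on RT, e.g. via a try_write_seqlock() helper added by this series (a sketch; the helper name is an assumption):

    static inline bool qdisc_run_begin(struct Qdisc *qdisc)
    {
    #ifdef CONFIG_PREEMPT_RT_BASE
            return try_write_seqlock(&qdisc->running);
    #else
            if (qdisc_is_running(qdisc))
                    return false;
            raw_write_seqcount_begin(&qdisc->running);
            return true;
    #endif
    }

Taking a real lock here is what allows dev_deactivate_many() above to sleep instead of busy-yielding against a preempted writer.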
38853 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
38854 index d16a8b423c20..cedaf909eb97 100644
38855 --- a/net/sunrpc/svc_xprt.c
38856 +++ b/net/sunrpc/svc_xprt.c
38857 @@ -396,7 +396,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
38861 - cpu = get_cpu();
38862 + cpu = get_cpu_light();
38863 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
38865 atomic_long_inc(&pool->sp_stats.packets);
38866 @@ -432,7 +432,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
38868 atomic_long_inc(&pool->sp_stats.threads_woken);
38869 wake_up_process(rqstp->rq_task);
38870 - put_cpu();
38871 + put_cpu_light();
38875 @@ -453,7 +453,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
38880 - put_cpu();
38881 + put_cpu_light();
38882 trace_svc_xprt_do_enqueue(xprt, rqstp);
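get_cpu_light()/put_cpu_light() pin the task to its current CPU without disabling preemption on RT; on mainline configurations they are expected to degrade to plain get_cpu()/put_cpu(). Assumed shape:

    #ifdef CONFIG_PREEMPT_RT_FULL
    # define get_cpu_light()  ({ migrate_disable(); smp_processor_id(); })
    # define put_cpu_light()  migrate_enable()
    #else
    # define get_cpu_light()  get_cpu()
    # define put_cpu_light()  put_cpu()
    #endif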
38884 diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
38885 index 6c4ec69e11a0..77f52dc790ec 100644
38886 --- a/net/xfrm/xfrm_state.c
38887 +++ b/net/xfrm/xfrm_state.c
38888 @@ -427,7 +427,7 @@ static void xfrm_put_mode(struct xfrm_mode *mode)
38890 static void xfrm_state_gc_destroy(struct xfrm_state *x)
38892 - tasklet_hrtimer_cancel(&x->mtimer);
38893 + hrtimer_cancel(&x->mtimer);
38894 del_timer_sync(&x->rtimer);
38897 @@ -472,8 +472,8 @@ static void xfrm_state_gc_task(struct work_struct *work)
38899 static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
38901 - struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
38902 - struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
38903 + struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer);
38904 + enum hrtimer_restart ret = HRTIMER_NORESTART;
38905 unsigned long now = get_seconds();
38906 long next = LONG_MAX;
38908 @@ -537,7 +537,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
38909 km_state_expired(x, 0, 0);
38911 if (next != LONG_MAX) {
38912 - tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
38913 + hrtimer_forward_now(&x->mtimer, ktime_set(next, 0));
38914 + ret = HRTIMER_RESTART;
38918 @@ -554,7 +555,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
38921 spin_unlock(&x->lock);
38922 - return HRTIMER_NORESTART;
38926 static void xfrm_replay_timer_handler(unsigned long data);
38927 @@ -573,8 +574,8 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
38928 INIT_HLIST_NODE(&x->bydst);
38929 INIT_HLIST_NODE(&x->bysrc);
38930 INIT_HLIST_NODE(&x->byspi);
38931 - tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
38932 - CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
38933 + hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT);
38934 + x->mtimer.function = xfrm_timer_handler;
38935 setup_timer(&x->rtimer, xfrm_replay_timer_handler,
38937 x->curlft.add_time = get_seconds();
38938 @@ -1031,7 +1032,9 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
38939 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
38941 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
38942 - tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
38943 + hrtimer_start(&x->mtimer,
38944 + ktime_set(net->xfrm.sysctl_acq_expires, 0),
38945 + HRTIMER_MODE_REL_SOFT);
38946 net->xfrm.state_num++;
38947 xfrm_hash_grow_check(net, x->bydst.next != NULL);
38948 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
38949 @@ -1142,7 +1145,7 @@ static void __xfrm_state_insert(struct xfrm_state *x)
38950 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
38953 - tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
38954 + hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
38955 if (x->replay_maxage)
38956 mod_timer(&x->rtimer, jiffies + x->replay_maxage);
38958 @@ -1246,7 +1249,9 @@ static struct xfrm_state *__find_acq_core(struct net *net,
38960 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
38961 xfrm_state_hold(x);
38962 - tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
38963 + hrtimer_start(&x->mtimer,
38964 + ktime_set(net->xfrm.sysctl_acq_expires, 0),
38965 + HRTIMER_MODE_REL_SOFT);
38966 list_add(&x->km.all, &net->xfrm.state_all);
38967 hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
38968 h = xfrm_src_hash(net, daddr, saddr, family);
38969 @@ -1546,7 +1551,8 @@ int xfrm_state_update(struct xfrm_state *x)
38970 memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
38973 - tasklet_hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
38974 + hrtimer_start(&x1->mtimer, ktime_set(1, 0),
38975 + HRTIMER_MODE_REL_SOFT);
38976 if (x1->curlft.use_time)
38977 xfrm_state_check_expire(x1);
38979 @@ -1570,7 +1576,7 @@ int xfrm_state_check_expire(struct xfrm_state *x)
38980 if (x->curlft.bytes >= x->lft.hard_byte_limit ||
38981 x->curlft.packets >= x->lft.hard_packet_limit) {
38982 x->km.state = XFRM_STATE_EXPIRED;
38983 - tasklet_hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL);
38984 + hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT);
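The xfrm changes above follow the standard tasklet_hrtimer replacement pattern used throughout this series: the timer is initialized with a *_SOFT mode so its callback runs from the hrtimer softirq, and rearming is done by returning HRTIMER_RESTART after hrtimer_forward_now() rather than by restarting the timer from within the handler. In outline:

    hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT);
    x->mtimer.function = xfrm_timer_handler;   /* runs in softirq context */

    /* in xfrm_timer_handler(), instead of tasklet_hrtimer_start(): */
    hrtimer_forward_now(&x->mtimer, ktime_set(next, 0));
    return HRTIMER_RESTART;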
38988 diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
38989 index 5522692100ba..8b4be8e1802a 100644
38990 --- a/samples/trace_events/trace-events-sample.c
38991 +++ b/samples/trace_events/trace-events-sample.c
38992 @@ -33,7 +33,7 @@ static void simple_thread_func(int cnt)
38994 /* Silly tracepoints */
38995 trace_foo_bar("hello", cnt, array, random_strings[len],
38996 - &current->cpus_allowed);
38997 + current->cpus_ptr);
38999 trace_foo_with_template_simple("HELLO", cnt);
39001 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
39002 index 959199c3147e..3e68004ed345 100755
39003 --- a/scripts/mkcompile_h
39004 +++ b/scripts/mkcompile_h
39005 @@ -5,7 +5,8 @@ TARGET=$1
39013 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
39015 @@ -58,6 +59,7 @@ UTS_VERSION="#$VERSION"
39017 if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
39018 if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
39019 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
39020 UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
39022 # Truncate to maximum length
39023 diff --git a/security/apparmor/include/path.h b/security/apparmor/include/path.h
39024 index 05fb3305671e..b26c16b02662 100644
39025 --- a/security/apparmor/include/path.h
39026 +++ b/security/apparmor/include/path.h
39027 @@ -39,9 +39,10 @@ struct aa_buffers {
39030 #include <linux/percpu.h>
39031 -#include <linux/preempt.h>
39032 +#include <linux/locallock.h>
39034 DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
39035 +DECLARE_LOCAL_IRQ_LOCK(aa_buffers_lock);
39037 #define COUNT_ARGS(X...) COUNT_ARGS_HELPER(, ##X, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
39038 #define COUNT_ARGS_HELPER(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, X...) n
39039 @@ -55,12 +56,24 @@ DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
39041 #define for_each_cpu_buffer(I) for ((I) = 0; (I) < MAX_PATH_BUFFERS; (I)++)
39043 -#ifdef CONFIG_DEBUG_PREEMPT
39044 +#ifdef CONFIG_PREEMPT_RT_BASE
39046 +static inline void AA_BUG_PREEMPT_ENABLED(const char *s)
39048 + struct local_irq_lock *lv;
39050 + lv = this_cpu_ptr(&aa_buffers_lock);
39051 + WARN_ONCE(lv->owner != current,
39052 + "__get_buffer without aa_buffers_lock\n");
39055 +#elif defined(CONFIG_DEBUG_PREEMPT)
39056 #define AA_BUG_PREEMPT_ENABLED(X) AA_BUG(preempt_count() <= 0, X)
39057 #else
39058 #define AA_BUG_PREEMPT_ENABLED(X) /* nop */
39059 #endif
39062 #define __get_buffer(N) ({ \
39063 struct aa_buffers *__cpu_var; \
39064 AA_BUG_PREEMPT_ENABLED("__get_buffer without preempt disabled"); \
39065 @@ -73,14 +86,14 @@ DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
39067 #define get_buffers(X...) \
39069 - preempt_disable(); \
39070 + local_lock(aa_buffers_lock); \
39071 __get_buffers(X); \
39074 #define put_buffers(X, Y...) \
39076 __put_buffers(X, Y); \
39077 - preempt_enable(); \
39078 + local_unlock(aa_buffers_lock); \
39081 #endif /* __AA_PATH_H */
39082 diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
39083 index 1346ee5be04f..aa7e4dee107b 100644
39084 --- a/security/apparmor/lsm.c
39085 +++ b/security/apparmor/lsm.c
39087 int apparmor_initialized;
39089 DEFINE_PER_CPU(struct aa_buffers, aa_buffers);
39091 +DEFINE_LOCAL_IRQ_LOCK(aa_buffers_lock);
39094 * LSM hook functions
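A caller of the reworked buffer macros is unchanged; only the underlying protection moves from preemption control to the per-CPU lock:

    char *buffer;

    get_buffers(buffer);    /* RT: takes aa_buffers_lock */
    /* ... use the per-CPU path buffer ... */
    put_buffers(buffer);    /* RT: drops aa_buffers_lock */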
39095 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
39096 index ab3bf36786b6..f0bb7c9aa4be 100644
39097 --- a/sound/core/pcm_native.c
39098 +++ b/sound/core/pcm_native.c
39099 @@ -148,7 +148,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
39100 void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
39102 if (!substream->pcm->nonatomic)
39103 - local_irq_disable();
39104 + local_irq_disable_nort();
39105 snd_pcm_stream_lock(substream);
39107 EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
39108 @@ -163,7 +163,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
39110 snd_pcm_stream_unlock(substream);
39111 if (!substream->pcm->nonatomic)
39112 - local_irq_enable();
39113 + local_irq_enable_nort();
39115 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
39117 @@ -171,7 +171,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
39119 unsigned long flags = 0;
39120 if (!substream->pcm->nonatomic)
39121 - local_irq_save(flags);
39122 + local_irq_save_nort(flags);
39123 snd_pcm_stream_lock(substream);
39126 @@ -189,7 +189,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
39128 snd_pcm_stream_unlock(substream);
39129 if (!substream->pcm->nonatomic)
39130 - local_irq_restore(flags);
39131 + local_irq_restore_nort(flags);
39133 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
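The local_irq_*_nort() variants touch the interrupt state only on !RT kernels; on RT the spinlock taken inside snd_pcm_stream_lock() already provides the required exclusion and interrupts must stay enabled. Assumed definitions, as a sketch:

    #ifdef CONFIG_PREEMPT_RT_FULL
    # define local_irq_disable_nort()       do { } while (0)
    # define local_irq_enable_nort()        do { } while (0)
    # define local_irq_save_nort(flags)     local_save_flags(flags)
    # define local_irq_restore_nort(flags)  (void)(flags)
    #else
    # define local_irq_disable_nort()       local_irq_disable()
    # define local_irq_enable_nort()        local_irq_enable()
    # define local_irq_save_nort(flags)     local_irq_save(flags)
    # define local_irq_restore_nort(flags)  local_irq_restore(flags)
    #endif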
39135 diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c
39136 index c0939a0164a6..549e014ecc0d 100644
39137 --- a/sound/drivers/dummy.c
39138 +++ b/sound/drivers/dummy.c
39139 @@ -376,17 +376,9 @@ struct dummy_hrtimer_pcm {
39140 ktime_t period_time;
39142 struct hrtimer timer;
39143 - struct tasklet_struct tasklet;
39144 struct snd_pcm_substream *substream;
39147 -static void dummy_hrtimer_pcm_elapsed(unsigned long priv)
39149 - struct dummy_hrtimer_pcm *dpcm = (struct dummy_hrtimer_pcm *)priv;
39150 - if (atomic_read(&dpcm->running))
39151 - snd_pcm_period_elapsed(dpcm->substream);
39154 static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer)
39156 struct dummy_hrtimer_pcm *dpcm;
39157 @@ -394,7 +386,14 @@ static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer)
39158 dpcm = container_of(timer, struct dummy_hrtimer_pcm, timer);
39159 if (!atomic_read(&dpcm->running))
39160 return HRTIMER_NORESTART;
39161 - tasklet_schedule(&dpcm->tasklet);
39162 + /*
39163 + * In cases of XRUN and draining, this calls .trigger to stop PCM
39164 + * substream.
39165 + */
39166 + snd_pcm_period_elapsed(dpcm->substream);
39167 + if (!atomic_read(&dpcm->running))
39168 + return HRTIMER_NORESTART;
39170 hrtimer_forward_now(timer, dpcm->period_time);
39171 return HRTIMER_RESTART;
39173 @@ -404,7 +403,7 @@ static int dummy_hrtimer_start(struct snd_pcm_substream *substream)
39174 struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
39176 dpcm->base_time = hrtimer_cb_get_time(&dpcm->timer);
39177 - hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL);
39178 + hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL_SOFT);
39179 atomic_set(&dpcm->running, 1);
39182 @@ -414,14 +413,14 @@ static int dummy_hrtimer_stop(struct snd_pcm_substream *substream)
39183 struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
39185 atomic_set(&dpcm->running, 0);
39186 - hrtimer_cancel(&dpcm->timer);
39187 + if (!hrtimer_callback_running(&dpcm->timer))
39188 + hrtimer_cancel(&dpcm->timer);
39192 static inline void dummy_hrtimer_sync(struct dummy_hrtimer_pcm *dpcm)
39194 hrtimer_cancel(&dpcm->timer);
39195 - tasklet_kill(&dpcm->tasklet);
39198 static snd_pcm_uframes_t
39199 @@ -466,12 +465,10 @@ static int dummy_hrtimer_create(struct snd_pcm_substream *substream)
39202 substream->runtime->private_data = dpcm;
39203 - hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
39204 + hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
39205 dpcm->timer.function = dummy_hrtimer_callback;
39206 dpcm->substream = substream;
39207 atomic_set(&dpcm->running, 0);
39208 - tasklet_init(&dpcm->tasklet, dummy_hrtimer_pcm_elapsed,
39209 - (unsigned long)dpcm);
39213 diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
39214 index 6a4982d029bf..843c2b0d948e 100644
39215 --- a/tools/testing/selftests/ftrace/test.d/functions
39216 +++ b/tools/testing/selftests/ftrace/test.d/functions
39217 @@ -70,6 +70,13 @@ disable_events() {
39218 echo 0 > events/enable
39221 +clear_synthetic_events() { # reset all current synthetic events
39222 + grep -v ^# synthetic_events |
39223 + while read line; do
39224 + echo "!$line" >> synthetic_events
39228 initialize_ftrace() { # Reset ftrace to initial-state
39229 # As the initial state, ftrace will be set to nop tracer,
39230 # no events, no triggers, no filters, no function filters,
39231 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc
39232 new file mode 100644
39233 index 000000000000..786dce7e48be
39235 +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc
39238 +# description: event trigger - test extended error support
39253 +if [ ! -f set_event ]; then
39254 + echo "event tracing is not supported"
39258 +if [ ! -f synthetic_events ]; then
39259 + echo "synthetic event is not supported"
39266 +echo "Test extended error support"
39267 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
39268 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger &>/dev/null
39269 +if ! grep -q "ERROR:" events/sched/sched_wakeup/hist; then
39270 + fail "Failed to generate extended error in histogram"
39276 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
39277 new file mode 100644
39278 index 000000000000..7fd5b4a8f060
39280 +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
39283 +# description: event trigger - test field variable support
39297 +if [ ! -f set_event ]; then
39298 + echo "event tracing is not supported"
39302 +if [ ! -f synthetic_events ]; then
39303 + echo "synthetic event is not supported"
39307 +clear_synthetic_events
39311 +echo "Test field variable support"
39313 +echo 'wakeup_latency u64 lat; pid_t pid; int prio; char comm[16]' > synthetic_events
39314 +echo 'hist:keys=comm:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
39315 +echo 'hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
39316 +echo 'hist:keys=pid,prio,comm:vals=lat:sort=pid,prio' > events/synthetic/wakeup_latency/trigger
39318 +ping localhost -c 3
39319 +if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
39320 + fail "Failed to create inter-event histogram"
39323 +if ! grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
39324 + fail "Failed to create histogram with field variable"
39327 +echo '!hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
39329 +if grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
39330 + fail "Failed to remove histogram with field variable"
39336 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
39337 new file mode 100644
39338 index 000000000000..c93dbe38b5df
39340 +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
39343 +# description: event trigger - test inter-event combined histogram trigger
39357 +if [ ! -f set_event ]; then
39358 + echo "event tracing is not supported"
39362 +if [ ! -f synthetic_events ]; then
39363 + echo "synthetic event is not supported"
39369 +clear_synthetic_events
39371 +echo "Test create synthetic event"
39373 +echo 'waking_latency u64 lat pid_t pid' > synthetic_events
39374 +if [ ! -d events/synthetic/waking_latency ]; then
39375 + fail "Failed to create waking_latency synthetic event"
39378 +echo "Test combined histogram"
39380 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
39381 +echo 'hist:keys=pid:waking_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).waking_latency($waking_lat,pid) if comm=="ping"' > events/sched/sched_wakeup/trigger
39382 +echo 'hist:keys=pid,lat:sort=pid,lat' > events/synthetic/waking_latency/trigger
39384 +echo 'wakeup_latency u64 lat pid_t pid' >> synthetic_events
39385 +echo 'hist:keys=pid:ts1=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger
39386 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts1:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid) if next_comm=="ping"' > events/sched/sched_switch/trigger
39388 +echo 'waking+wakeup_latency u64 lat; pid_t pid' >> synthetic_events
39389 +echo 'hist:keys=pid,lat:sort=pid,lat:ww_lat=$waking_lat+$wakeup_lat:onmatch(synthetic.wakeup_latency).waking+wakeup_latency($ww_lat,pid)' >> events/synthetic/wakeup_latency/trigger
39390 +echo 'hist:keys=pid,lat:sort=pid,lat' >> events/synthetic/waking+wakeup_latency/trigger
39392 +ping localhost -c 3
39393 +if ! grep -q "pid:" events/synthetic/waking+wakeup_latency/hist; then
39394 + fail "Failed to create combined histogram"
39400 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
39401 new file mode 100644
39402 index 000000000000..e84e7d048566
39404 +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
39407 +# description: event trigger - test inter-event histogram trigger onmatch action
39421 +if [ ! -f set_event ]; then
39422 + echo "event tracing is not supported"
39426 +if [ ! -f synthetic_events ]; then
39427 + echo "synthetic event is not supported"
39431 +clear_synthetic_events
39435 +echo "Test create synthetic event"
39437 +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39438 +if [ ! -d events/synthetic/wakeup_latency ]; then
39439 + fail "Failed to create wakeup_latency synthetic event"
39442 +echo "Test create histogram for synthetic event"
39443 +echo "Test histogram variables,simple expression support and onmatch action"
39445 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
39446 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
39447 +echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
39448 +ping localhost -c 5
39449 +if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
39450 + fail "Failed to create onmatch action inter-event histogram"
39456 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
39457 new file mode 100644
39458 index 000000000000..7907d8aacde3
39460 +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
39463 +# description: event trigger - test inter-event histogram trigger onmatch-onmax action
39477 +if [ ! -f set_event ]; then
39478 + echo "event tracing is not supported"
39482 +if [ ! -f synthetic_events ]; then
39483 + echo "synthetic event is not supported"
39487 +clear_synthetic_events
39491 +echo "Test create synthetic event"
39493 +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39494 +if [ ! -d events/synthetic/wakeup_latency ]; then
39495 + fail "Failed to create wakeup_latency synthetic event"
39498 +echo "Test create histogram for synthetic event"
39499 +echo "Test histogram variables,simple expression support and onmatch-onmax action"
39501 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
39502 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm):onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
39503 +echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
39504 +ping localhost -c 5
39505 +if [ ! grep -q "ping" events/synthetic/wakeup_latency/hist -o ! grep -q "max:" events/sched/sched_switch/hist]; then
39506 + fail "Failed to create onmatch-onmax action inter-event histogram"
39512 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
39513 new file mode 100644
39514 index 000000000000..38b7ed6242b2
39516 +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
39519 +# description: event trigger - test inter-event histogram trigger onmax action
39533 +if [ ! -f set_event ]; then
39534 + echo "event tracing is not supported"
39538 +if [ ! -f synthetic_events ]; then
39539 + echo "synthetic event is not supported"
39543 +clear_synthetic_events
39547 +echo "Test create synthetic event"
39549 +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39550 +if [ ! -d events/synthetic/wakeup_latency ]; then
39551 + fail "Failed to create wakeup_latency synthetic event"
39554 +echo "Test onmax action"
39556 +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_waking/trigger
39557 +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
39558 +ping localhost -c 3
39559 +if ! grep -q "max:" events/sched/sched_switch/hist; then
39560 + fail "Failed to create onmax action inter-event histogram"
39566 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
39567 new file mode 100644
39568 index 000000000000..cef11377dcbd
39570 +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
39573 +# description: event trigger - test synthetic event create remove
39586 +if [ ! -f set_event ]; then
39587 + echo "event tracing is not supported"
39591 +if [ ! -f synthetic_events ]; then
39592 + echo "synthetic event is not supported"
39596 +clear_synthetic_events
39600 +echo "Test create synthetic event"
39602 +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39603 +if [ ! -d events/synthetic/wakeup_latency ]; then
39604 + fail "Failed to create wakeup_latency synthetic event"
39609 +echo "Test create synthetic event with an error"
39610 +echo 'wakeup_latency u64 lat pid_t pid char' > synthetic_events > /dev/null
39611 +if [ -d events/synthetic/wakeup_latency ]; then
39612 + fail "Created wakeup_latency synthetic event with an invalid format"
39617 +echo "Test remove synthetic event"
39618 +echo '!wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39619 +if [ -d events/synthetic/wakeup_latency ]; then
39620 + fail "Failed to delete wakeup_latency synthetic event"
39626 diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
39627 index d5f1d8364571..c09e04130bfe 100644
39628 --- a/virt/kvm/arm/arm.c
39629 +++ b/virt/kvm/arm/arm.c
39630 @@ -69,7 +69,6 @@ static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
39632 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
39634 - BUG_ON(preemptible());
39635 __this_cpu_write(kvm_arm_running_vcpu, vcpu);
39638 @@ -79,7 +78,6 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
39640 struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
39642 - BUG_ON(preemptible());
39643 return __this_cpu_read(kvm_arm_running_vcpu);
39646 @@ -653,7 +651,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
39647 * involves poking the GIC, which must be done in a
39648 * non-preemptible context.
39650 - preempt_disable();
39651 + migrate_disable();
39653 kvm_pmu_flush_hwstate(vcpu);
39655 @@ -690,7 +688,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
39656 kvm_pmu_sync_hwstate(vcpu);
39657 kvm_timer_sync_hwstate(vcpu);
39658 kvm_vgic_sync_hwstate(vcpu);
39659 - preempt_enable();
39660 + migrate_enable();
39664 @@ -745,7 +743,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
39666 kvm_vgic_sync_hwstate(vcpu);
39668 - preempt_enable();
39669 + migrate_enable();
39671 ret = handle_exit(vcpu, run, ret);
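migrate_disable()/migrate_enable() keep the vCPU thread on its current CPU, so the per-CPU running-vcpu pointer and the GIC poking stay valid, while leaving the section preemptible; that is also why the preemptible() assertions can be dropped above. On kernels without the RT machinery the pair is assumed to fall back to plain preemption control:

    #ifndef CONFIG_PREEMPT_RT_FULL
    # define migrate_disable()  preempt_disable()
    # define migrate_enable()   preempt_enable()
    #endif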