1diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
2index 2cc08d4a326e..e28f7f29f2b3 100644
3--- a/Documentation/trace/events.txt
4+++ b/Documentation/trace/events.txt
5@@ -517,1550 +517,4 @@ The following commands are supported:
6 totals derived from one or more trace event format fields and/or
7 event counts (hitcount).
8
9- The format of a hist trigger is as follows:
10-
11- hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
12- [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
13- [:clear][:name=histname1] [if <filter>]
14-
15- When a matching event is hit, an entry is added to a hash table
16- using the key(s) and value(s) named. Keys and values correspond to
17- fields in the event's format description. Values must correspond to
18- numeric fields - on an event hit, the value(s) will be added to a
19- sum kept for that field. The special string 'hitcount' can be used
20- in place of an explicit value field - this is simply a count of
21- event hits. If 'values' isn't specified, an implicit 'hitcount'
22- value will be automatically created and used as the only value.
23- Keys can be any field, or the special string 'stacktrace', which
24- will use the event's kernel stacktrace as the key. The keywords
25- 'keys' or 'key' can be used to specify keys, and the keywords
26- 'values', 'vals', or 'val' can be used to specify values. Compound
27- keys consisting of up to two fields can be specified by the 'keys'
28- keyword. Hashing a compound key produces a unique entry in the
29- table for each unique combination of component keys, and can be
30- useful for providing more fine-grained summaries of event data.
31- Additionally, sort keys consisting of up to two fields can be
32- specified by the 'sort' keyword. If more than one field is
33- specified, the result will be a 'sort within a sort': the first key
34- is taken to be the primary sort key and the second the secondary
35- key. If a hist trigger is given a name using the 'name' parameter,
36- its histogram data will be shared with other triggers of the same
37- name, and trigger hits will update this common data. Only triggers
38- with 'compatible' fields can be combined in this way; triggers are
39-  'compatible' if the fields named in the triggers share the same
40-  number and type, and those fields also have the same names.
41- Note that any two events always share the compatible 'hitcount' and
42- 'stacktrace' fields and can therefore be combined using those
43- fields, however pointless that may be.
44-
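-  As a brief illustration of named triggers (hypothetical commands, not
-  taken from a captured session; the name 'foo' is arbitrary), the same
-  name can be attached to two events whose named fields are compatible;
-  here both events share the common_pid field and the implicit hitcount
-  value:
-
-    # echo 'hist:name=foo:keys=common_pid.execname' > \
-          /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
-
-    # echo 'hist:name=foo:keys=common_pid.execname' > \
-          /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
-
-  Reading the 'hist' file of either event would then show the same shared
-  table.
-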
45- 'hist' triggers add a 'hist' file to each event's subdirectory.
46- Reading the 'hist' file for the event will dump the hash table in
47- its entirety to stdout. If there are multiple hist triggers
48- attached to an event, there will be a table for each trigger in the
49- output. The table displayed for a named trigger will be the same as
50- any other instance having the same name. Each printed hash table
51- entry is a simple list of the keys and values comprising the entry;
52- keys are printed first and are delineated by curly braces, and are
53- followed by the set of value fields for the entry. By default,
54- numeric fields are displayed as base-10 integers. This can be
55- modified by appending any of the following modifiers to the field
56- name:
57-
58- .hex display a number as a hex value
59- .sym display an address as a symbol
60- .sym-offset display an address as a symbol and offset
61- .syscall display a syscall id as a system call name
62- .execname display a common_pid as a program name
63-
64- Note that in general the semantics of a given field aren't
65- interpreted when applying a modifier to it, but there are some
66- restrictions to be aware of in this regard:
67-
68- - only the 'hex' modifier can be used for values (because values
69- are essentially sums, and the other modifiers don't make sense
70- in that context).
71- - the 'execname' modifier can only be used on a 'common_pid'. The
72- reason for this is that the execname is simply the 'comm' value
73- saved for the 'current' process when an event was triggered,
74- which is the same as the common_pid value saved by the event
75- tracing code. Trying to apply that comm value to other pid
76- values wouldn't be correct, and typically events that care save
77- pid-specific comm fields in the event itself.
78-
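-  As a minimal sketch of the 'execname' restriction (a hypothetical
-  command, not captured output), the modifier is applied to the
-  common_pid field of an event, with the implicit hitcount used as the
-  only value:
-
-    # echo 'hist:keys=common_pid.execname' > \
-          /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
-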
79- A typical usage scenario would be the following to enable a hist
80- trigger, read its current contents, and then turn it off:
81-
82- # echo 'hist:keys=skbaddr.hex:vals=len' > \
83- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
84-
85- # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
86-
87- # echo '!hist:keys=skbaddr.hex:vals=len' > \
88- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
89-
90- The trigger file itself can be read to show the details of the
91- currently attached hist trigger. This information is also displayed
92- at the top of the 'hist' file when read.
93-
94- By default, the size of the hash table is 2048 entries. The 'size'
95- parameter can be used to specify more or fewer than that. The units
96- are in terms of hashtable entries - if a run uses more entries than
97- specified, the results will show the number of 'drops', the number
98- of hits that were ignored. The size should be a power of 2 between
99-  128 and 131072 (any non-power-of-2 number specified will be rounded
100- up).
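-
-  For instance, a run expected to need many unique entries could be given
-  a larger table up front (an illustrative command reusing the kmalloc
-  event from the examples below; 8192 is simply an arbitrary power of 2):
-
-    # echo 'hist:keys=call_site:vals=bytes_req:size=8192' > \
-          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger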
101-
102- The 'sort' parameter can be used to specify a value field to sort
103- on. The default if unspecified is 'hitcount' and the default sort
104- order is 'ascending'. To sort in the opposite direction, append
105-  '.descending' to the sort key.
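-
-  For example, to sort the kmalloc table used in the examples below by
-  bytes requested, largest first (an illustrative command, not captured
-  output):
-
-    # echo 'hist:keys=call_site:vals=bytes_req:sort=bytes_req.descending' > \
-          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger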
106-
107- The 'pause' parameter can be used to pause an existing hist trigger
108- or to start a hist trigger but not log any events until told to do
109- so. 'continue' or 'cont' can be used to start or restart a paused
110- hist trigger.
111-
112- The 'clear' parameter will clear the contents of a running hist
113-  trigger and leave its current paused/active state unchanged.
114-
115- Note that the 'pause', 'cont', and 'clear' parameters should be
116-  applied using the 'append' shell operator ('>>') if applied to an
117- existing trigger, rather than via the '>' operator, which will cause
118- the trigger to be removed through truncation.
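-
-  For example, an existing kmalloc trigger could be paused and later
-  resumed with the following illustrative commands; note the '>>'
-  operator and the repetition of the trigger's original key/value
-  specification:
-
-    # echo 'hist:keys=call_site:vals=bytes_req:pause' >> \
-          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
-
-    # echo 'hist:keys=call_site:vals=bytes_req:cont' >> \
-          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger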
119-
120-- enable_hist/disable_hist
121-
122- The enable_hist and disable_hist triggers can be used to have one
123- event conditionally start and stop another event's already-attached
124- hist trigger. Any number of enable_hist and disable_hist triggers
125- can be attached to a given event, allowing that event to kick off
126- and stop aggregations on a host of other events.
127-
128- The format is very similar to the enable/disable_event triggers:
129-
130- enable_hist:<system>:<event>[:count]
131- disable_hist:<system>:<event>[:count]
132-
133- Instead of enabling or disabling the tracing of the target event
134- into the trace buffer as the enable/disable_event triggers do, the
135- enable/disable_hist triggers enable or disable the aggregation of
136- the target event into a hash table.
137-
138- A typical usage scenario for the enable_hist/disable_hist triggers
139- would be to first set up a paused hist trigger on some event,
140- followed by an enable_hist/disable_hist pair that turns the hist
141- aggregation on and off when conditions of interest are hit:
142-
143- # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
144- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
145-
146- # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
147- /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
148-
149- # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
150- /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
151-
152- The above sets up an initially paused hist trigger which is unpaused
153- and starts aggregating events when a given program is executed, and
154- which stops aggregating when the process exits and the hist trigger
155- is paused again.
156-
157- The examples below provide a more concrete illustration of the
158- concepts and typical usage patterns discussed above.
159-
160-
161-6.2 'hist' trigger examples
162----------------------------
163-
164- The first set of examples creates aggregations using the kmalloc
165- event. The fields that can be used for the hist trigger are listed
166- in the kmalloc event's format file:
167-
168- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
169- name: kmalloc
170- ID: 374
171- format:
172- field:unsigned short common_type; offset:0; size:2; signed:0;
173- field:unsigned char common_flags; offset:2; size:1; signed:0;
174- field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
175- field:int common_pid; offset:4; size:4; signed:1;
176-
177- field:unsigned long call_site; offset:8; size:8; signed:0;
178- field:const void * ptr; offset:16; size:8; signed:0;
179- field:size_t bytes_req; offset:24; size:8; signed:0;
180- field:size_t bytes_alloc; offset:32; size:8; signed:0;
181- field:gfp_t gfp_flags; offset:40; size:4; signed:0;
182-
183- We'll start by creating a hist trigger that generates a simple table
184- that lists the total number of bytes requested for each function in
185- the kernel that made one or more calls to kmalloc:
186-
187- # echo 'hist:key=call_site:val=bytes_req' > \
188- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
189-
190- This tells the tracing system to create a 'hist' trigger using the
191- call_site field of the kmalloc event as the key for the table, which
192- just means that each unique call_site address will have an entry
193- created for it in the table. The 'val=bytes_req' parameter tells
194- the hist trigger that for each unique entry (call_site) in the
195- table, it should keep a running total of the number of bytes
196- requested by that call_site.
197-
198-  We'll let it run for a while and then dump the contents of the 'hist'
199- file in the kmalloc event's subdirectory (for readability, a number
200- of entries have been omitted):
201-
202- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
203- # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
204-
205- { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
206- { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
207- { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
208- { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
209- { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
210- { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
211- { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
212- { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
213- { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
214- { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
215- .
216- .
217- .
218- { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
219- { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
220- { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
221- { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
222- { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
223- { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
224- { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
225- { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
226- { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
227- { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
228- { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
229- { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
230-
231- Totals:
232- Hits: 4610
233- Entries: 45
234- Dropped: 0
235-
236- The output displays a line for each entry, beginning with the key
237- specified in the trigger, followed by the value(s) also specified in
238- the trigger. At the beginning of the output is a line that displays
239- the trigger info, which can also be displayed by reading the
240- 'trigger' file:
241-
242- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
243- hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
244-
245- At the end of the output are a few lines that display the overall
246- totals for the run. The 'Hits' field shows the total number of
247- times the event trigger was hit, the 'Entries' field shows the total
248- number of used entries in the hash table, and the 'Dropped' field
249- shows the number of hits that were dropped because the number of
250- used entries for the run exceeded the maximum number of entries
251-  allowed for the table (normally 0, but if not, a hint that you may
252-  want to increase the size of the table using the 'size' parameter).
253-
254- Notice in the above output that there's an extra field, 'hitcount',
255- which wasn't specified in the trigger. Also notice that in the
256- trigger info output, there's a parameter, 'sort=hitcount', which
257- wasn't specified in the trigger either. The reason for that is that
258- every trigger implicitly keeps a count of the total number of hits
259- attributed to a given entry, called the 'hitcount'. That hitcount
260- information is explicitly displayed in the output, and in the
261- absence of a user-specified sort parameter, is used as the default
262- sort field.
263-
264- The value 'hitcount' can be used in place of an explicit value in
265- the 'values' parameter if you don't really need to have any
266- particular field summed and are mainly interested in hit
267- frequencies.
268-
269- To turn the hist trigger off, simply call up the trigger in the
270- command history and re-execute it with a '!' prepended:
271-
272- # echo '!hist:key=call_site:val=bytes_req' > \
273- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
274-
275- Finally, notice that the call_site as displayed in the output above
276- isn't really very useful. It's an address, but normally addresses
277- are displayed in hex. To have a numeric field displayed as a hex
278- value, simply append '.hex' to the field name in the trigger:
279-
280- # echo 'hist:key=call_site.hex:val=bytes_req' > \
281- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
282-
283- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
284- # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
285-
286- { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
287- { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
288- { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
289- { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
290- { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
291- { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
292- { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
293- { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
294- { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
295- { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
296- { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
297- { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
298- .
299- .
300- .
301- { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
302- { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
303- { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
304- { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
305- { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
306- { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
307- { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
308- { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
309- { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
310- { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
311- { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
312-
313- Totals:
314- Hits: 4775
315- Entries: 46
316- Dropped: 0
317-
318- Even that's only marginally more useful - while hex values do look
319- more like addresses, what users are typically more interested in
320- when looking at text addresses are the corresponding symbols
321-  instead. To have an address displayed as a symbolic value instead,
322- simply append '.sym' or '.sym-offset' to the field name in the
323- trigger:
324-
325- # echo 'hist:key=call_site.sym:val=bytes_req' > \
326- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
327-
328- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
329- # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
330-
331- { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
332- { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
333- { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
334- { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
335- { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
336- { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
337- { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
338- { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
339- { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
340- { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
341- { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
342- { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
343- { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
344- { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
345- .
346- .
347- .
348- { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
349- { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
350- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
351- { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
352- { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
353- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
354- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
355- { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
356- { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
357- { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
358- { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
359- { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
360- { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
361-
362- Totals:
363- Hits: 109928
364- Entries: 71
365- Dropped: 0
366-
367-  Because the default sort key above is 'hitcount', the above shows the
368-  list of call_sites by increasing hitcount, so that at the bottom
369-  we see the functions that made the most kmalloc calls during the
370-  run. If instead we wanted to see the top kmalloc callers in
371- terms of the number of bytes requested rather than the number of
372- calls, and we wanted the top caller to appear at the top, we can use
373- the 'sort' parameter, along with the 'descending' modifier:
374-
375- # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
376- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
377-
378- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
379- # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
380-
381- { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
382- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
383- { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
384- { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
385- { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
386- { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
387- { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
388- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
389- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
390- { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
391- { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
392- { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
393- { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
394- .
395- .
396- .
397- { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
398- { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
399- { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
400- { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
401- { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
402- { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
403- { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
404- { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
405- { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
406- { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
407-
408- Totals:
409- Hits: 32133
410- Entries: 81
411- Dropped: 0
412-
413- To display the offset and size information in addition to the symbol
414- name, just use 'sym-offset' instead:
415-
416- # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
417- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
418-
419- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
420- # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
421-
422- { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
423- { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
424- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
425- { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
426- { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
427- { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
428- { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
429- { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
430- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
431- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
432- { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
433- { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
434- .
435- .
436- .
437- { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
438- { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
439- { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
440- { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
441- { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
442- { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
443- { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
444-
445- Totals:
446- Hits: 26098
447- Entries: 64
448- Dropped: 0
449-
450- We can also add multiple fields to the 'values' parameter. For
451- example, we might want to see the total number of bytes allocated
452- alongside bytes requested, and display the result sorted by bytes
453-  allocated in descending order:
454-
455- # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
456- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
457-
458- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
459- # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
460-
461- { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
462- { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
463- { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
464- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
465- { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
466- { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
467- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
468- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
469- { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
470- { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
471- { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
472- { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
473- .
474- .
475- .
476- { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
477- { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
478- { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
479- { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
480- { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
481- { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
482- { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
483- { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
484- { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
485- { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
486-
487- Totals:
488- Hits: 66598
489- Entries: 65
490- Dropped: 0
491-
492- Finally, to finish off our kmalloc example, instead of simply having
493- the hist trigger display symbolic call_sites, we can have the hist
494- trigger additionally display the complete set of kernel stack traces
495- that led to each call_site. To do that, we simply use the special
496- value 'stacktrace' for the key parameter:
497-
498- # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
499- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
500-
501- The above trigger will use the kernel stack trace in effect when an
502- event is triggered as the key for the hash table. This allows the
503- enumeration of every kernel callpath that led up to a particular
504- event, along with a running total of any of the event fields for
505- that event. Here we tally bytes requested and bytes allocated for
506- every callpath in the system that led up to a kmalloc (in this case
507- every callpath to a kmalloc for a kernel compile):
508-
509- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
510- # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
511-
512- { stacktrace:
513- __kmalloc_track_caller+0x10b/0x1a0
514- kmemdup+0x20/0x50
515- hidraw_report_event+0x8a/0x120 [hid]
516- hid_report_raw_event+0x3ea/0x440 [hid]
517- hid_input_report+0x112/0x190 [hid]
518- hid_irq_in+0xc2/0x260 [usbhid]
519- __usb_hcd_giveback_urb+0x72/0x120
520- usb_giveback_urb_bh+0x9e/0xe0
521- tasklet_hi_action+0xf8/0x100
522- __do_softirq+0x114/0x2c0
523- irq_exit+0xa5/0xb0
524- do_IRQ+0x5a/0xf0
525- ret_from_intr+0x0/0x30
526- cpuidle_enter+0x17/0x20
527- cpu_startup_entry+0x315/0x3e0
528- rest_init+0x7c/0x80
529- } hitcount: 3 bytes_req: 21 bytes_alloc: 24
530- { stacktrace:
531- __kmalloc_track_caller+0x10b/0x1a0
532- kmemdup+0x20/0x50
533- hidraw_report_event+0x8a/0x120 [hid]
534- hid_report_raw_event+0x3ea/0x440 [hid]
535- hid_input_report+0x112/0x190 [hid]
536- hid_irq_in+0xc2/0x260 [usbhid]
537- __usb_hcd_giveback_urb+0x72/0x120
538- usb_giveback_urb_bh+0x9e/0xe0
539- tasklet_hi_action+0xf8/0x100
540- __do_softirq+0x114/0x2c0
541- irq_exit+0xa5/0xb0
542- do_IRQ+0x5a/0xf0
543- ret_from_intr+0x0/0x30
544- } hitcount: 3 bytes_req: 21 bytes_alloc: 24
545- { stacktrace:
546- kmem_cache_alloc_trace+0xeb/0x150
547- aa_alloc_task_context+0x27/0x40
548- apparmor_cred_prepare+0x1f/0x50
549- security_prepare_creds+0x16/0x20
550- prepare_creds+0xdf/0x1a0
551- SyS_capset+0xb5/0x200
552- system_call_fastpath+0x12/0x6a
553- } hitcount: 1 bytes_req: 32 bytes_alloc: 32
554- .
555- .
556- .
557- { stacktrace:
558- __kmalloc+0x11b/0x1b0
559- i915_gem_execbuffer2+0x6c/0x2c0 [i915]
560- drm_ioctl+0x349/0x670 [drm]
561- do_vfs_ioctl+0x2f0/0x4f0
562- SyS_ioctl+0x81/0xa0
563- system_call_fastpath+0x12/0x6a
564- } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
565- { stacktrace:
566- __kmalloc+0x11b/0x1b0
567- load_elf_phdrs+0x76/0xa0
568- load_elf_binary+0x102/0x1650
569- search_binary_handler+0x97/0x1d0
570- do_execveat_common.isra.34+0x551/0x6e0
571- SyS_execve+0x3a/0x50
572- return_from_execve+0x0/0x23
573- } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
574- { stacktrace:
575- kmem_cache_alloc_trace+0xeb/0x150
576- apparmor_file_alloc_security+0x27/0x40
577- security_file_alloc+0x16/0x20
578- get_empty_filp+0x93/0x1c0
579- path_openat+0x31/0x5f0
580- do_filp_open+0x3a/0x90
581- do_sys_open+0x128/0x220
582- SyS_open+0x1e/0x20
583- system_call_fastpath+0x12/0x6a
584- } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
585- { stacktrace:
586- __kmalloc+0x11b/0x1b0
587- seq_buf_alloc+0x1b/0x50
588- seq_read+0x2cc/0x370
589- proc_reg_read+0x3d/0x80
590- __vfs_read+0x28/0xe0
591- vfs_read+0x86/0x140
592- SyS_read+0x46/0xb0
593- system_call_fastpath+0x12/0x6a
594- } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
595-
596- Totals:
597- Hits: 6085872
598- Entries: 253
599- Dropped: 0
600-
601-  If you key a hist trigger on common_pid, for example in order to
602-  gather and display sorted totals for each process, you can use the
603- special .execname modifier to display the executable names for the
604- processes in the table rather than raw pids. The example below
605- keeps a per-process sum of total bytes read:
606-
607- # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
608- /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
609-
610- # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
611- # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
612-
613- { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
614- { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
615- { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
616- { common_pid: bash [ 8710] } hitcount: 3 count: 66369
617- { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
618- { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
619- { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
620- { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
621- { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
622- { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
623- { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
624- .
625- .
626- .
627- { common_pid: postgres [ 1892] } hitcount: 2 count: 32
628- { common_pid: postgres [ 1891] } hitcount: 2 count: 32
629- { common_pid: gmain [ 8704] } hitcount: 2 count: 32
630- { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
631- { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
632- { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
633- { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
634- { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
635- { common_pid: init [ 1] } hitcount: 2 count: 2
636-
637- Totals:
638- Hits: 2116
639- Entries: 51
640- Dropped: 0
641-
642- Similarly, if you key a hist trigger on syscall id, for example to
643- gather and display a list of systemwide syscall hits, you can use
644- the special .syscall modifier to display the syscall names rather
645- than raw ids. The example below keeps a running total of syscall
646- counts for the system during the run:
647-
648- # echo 'hist:key=id.syscall:val=hitcount' > \
649- /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
650-
651- # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
652- # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
653-
654- { id: sys_fsync [ 74] } hitcount: 1
655- { id: sys_newuname [ 63] } hitcount: 1
656- { id: sys_prctl [157] } hitcount: 1
657- { id: sys_statfs [137] } hitcount: 1
658- { id: sys_symlink [ 88] } hitcount: 1
659- { id: sys_sendmmsg [307] } hitcount: 1
660- { id: sys_semctl [ 66] } hitcount: 1
661- { id: sys_readlink [ 89] } hitcount: 3
662- { id: sys_bind [ 49] } hitcount: 3
663- { id: sys_getsockname [ 51] } hitcount: 3
664- { id: sys_unlink [ 87] } hitcount: 3
665- { id: sys_rename [ 82] } hitcount: 4
666- { id: unknown_syscall [ 58] } hitcount: 4
667- { id: sys_connect [ 42] } hitcount: 4
668- { id: sys_getpid [ 39] } hitcount: 4
669- .
670- .
671- .
672- { id: sys_rt_sigprocmask [ 14] } hitcount: 952
673- { id: sys_futex [202] } hitcount: 1534
674- { id: sys_write [ 1] } hitcount: 2689
675- { id: sys_setitimer [ 38] } hitcount: 2797
676- { id: sys_read [ 0] } hitcount: 3202
677- { id: sys_select [ 23] } hitcount: 3773
678- { id: sys_writev [ 20] } hitcount: 4531
679- { id: sys_poll [ 7] } hitcount: 8314
680- { id: sys_recvmsg [ 47] } hitcount: 13738
681- { id: sys_ioctl [ 16] } hitcount: 21843
682-
683- Totals:
684- Hits: 67612
685- Entries: 72
686- Dropped: 0
687-
688- The syscall counts above provide a rough overall picture of system
689- call activity on the system; we can see for example that the most
690- popular system call on this system was the 'sys_ioctl' system call.
691-
692- We can use 'compound' keys to refine that number and provide some
693- further insight as to which processes exactly contribute to the
694- overall ioctl count.
695-
696- The command below keeps a hitcount for every unique combination of
697- system call id and pid - the end result is essentially a table
698- that keeps a per-pid sum of system call hits. The results are
699- sorted using the system call id as the primary key, and the
700- hitcount sum as the secondary key:
701-
702- # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
703- /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
704-
705- # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
706- # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
707-
708- { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
709- { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
710- { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
711- { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
712- { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
713- { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
714- { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
715- { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
716- { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
717- { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
718- .
719- .
720- .
721- { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
722- { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
723- { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
724- { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
725- { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
726- .
727- .
728- .
729- { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
730- { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
731- { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
732- { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
733- { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
734- { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
735- { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
736- { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
737- { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
738- { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
739- { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
740-
741- Totals:
742- Hits: 31536
743- Entries: 323
744- Dropped: 0
745-
746- The above list does give us a breakdown of the ioctl syscall by
747- pid, but it also gives us quite a bit more than that, which we
748- don't really care about at the moment. Since we know the syscall
749- id for sys_ioctl (16, displayed next to the sys_ioctl name), we
750- can use that to filter out all the other syscalls:
751-
752- # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
753- /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
754-
755- # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
756- # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
757-
758- { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
759- { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
760- { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
761- { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
762- { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
763- { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
764- { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
765- { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
766- { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
767- .
768- .
769- .
770- { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
771- { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
772- { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
773- { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
774- { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
775- { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
776-
777- Totals:
778- Hits: 101162
779- Entries: 103
780- Dropped: 0
781-
782- The above output shows that 'compiz' and 'Xorg' are far and away
783- the heaviest ioctl callers (which might lead to questions about
784- whether they really need to be making all those calls and to
785-  possible avenues for further investigation).
786-
787- The compound key examples used a key and a sum value (hitcount) to
788- sort the output, but we can just as easily use two keys instead.
789-  Here's an example where we use a compound key composed of the
790- common_pid and size event fields. Sorting with pid as the primary
791- key and 'size' as the secondary key allows us to display an
792- ordered summary of the recvfrom sizes, with counts, received by
793- each process:
794-
795- # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
796- /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
797-
798- # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
799- # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
800-
801- { common_pid: smbd [ 784], size: 4 } hitcount: 1
802- { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
803- { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
804- { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
805- { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
806- { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
807- { common_pid: compiz [ 2994], size: 8 } hitcount: 1
808- { common_pid: compiz [ 2994], size: 20 } hitcount: 11
809- { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
810- { common_pid: firefox [ 8817], size: 4 } hitcount: 1
811- { common_pid: firefox [ 8817], size: 8 } hitcount: 5
812- { common_pid: firefox [ 8817], size: 588 } hitcount: 2
813- { common_pid: firefox [ 8817], size: 628 } hitcount: 1
814- { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
815- { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
816- { common_pid: firefox [ 8822], size: 8 } hitcount: 2
817- { common_pid: firefox [ 8822], size: 160 } hitcount: 2
818- { common_pid: firefox [ 8822], size: 320 } hitcount: 2
819- { common_pid: firefox [ 8822], size: 352 } hitcount: 1
820- .
821- .
822- .
823- { common_pid: pool [ 8923], size: 1960 } hitcount: 10
824- { common_pid: pool [ 8923], size: 2048 } hitcount: 10
825- { common_pid: pool [ 8924], size: 1960 } hitcount: 10
826- { common_pid: pool [ 8924], size: 2048 } hitcount: 10
827- { common_pid: pool [ 8928], size: 1964 } hitcount: 4
828- { common_pid: pool [ 8928], size: 1965 } hitcount: 2
829- { common_pid: pool [ 8928], size: 2048 } hitcount: 6
830- { common_pid: pool [ 8929], size: 1982 } hitcount: 1
831- { common_pid: pool [ 8929], size: 2048 } hitcount: 1
832-
833- Totals:
834- Hits: 2016
835- Entries: 224
836- Dropped: 0
837-
838- The above example also illustrates the fact that although a compound
839- key is treated as a single entity for hashing purposes, the sub-keys
840- it's composed of can be accessed independently.
841-
842- The next example uses a string field as the hash key and
843- demonstrates how you can manually pause and continue a hist trigger.
844- In this example, we'll aggregate fork counts and don't expect a
845- large number of entries in the hash table, so we'll drop it to a
846- much smaller number, say 256:
847-
848- # echo 'hist:key=child_comm:val=hitcount:size=256' > \
849- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
850-
851- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
852- # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
853-
854- { child_comm: dconf worker } hitcount: 1
855- { child_comm: ibus-daemon } hitcount: 1
856- { child_comm: whoopsie } hitcount: 1
857- { child_comm: smbd } hitcount: 1
858- { child_comm: gdbus } hitcount: 1
859- { child_comm: kthreadd } hitcount: 1
860- { child_comm: dconf worker } hitcount: 1
861- { child_comm: evolution-alarm } hitcount: 2
862- { child_comm: Socket Thread } hitcount: 2
863- { child_comm: postgres } hitcount: 2
864- { child_comm: bash } hitcount: 3
865- { child_comm: compiz } hitcount: 3
866- { child_comm: evolution-sourc } hitcount: 4
867- { child_comm: dhclient } hitcount: 4
868- { child_comm: pool } hitcount: 5
869- { child_comm: nm-dispatcher.a } hitcount: 8
870- { child_comm: firefox } hitcount: 8
871- { child_comm: dbus-daemon } hitcount: 8
872- { child_comm: glib-pacrunner } hitcount: 10
873- { child_comm: evolution } hitcount: 23
874-
875- Totals:
876- Hits: 89
877- Entries: 20
878- Dropped: 0
879-
880- If we want to pause the hist trigger, we can simply append :pause to
881- the command that started the trigger. Notice that the trigger info
882- displays as [paused]:
883-
884- # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
885- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
886-
887- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
888- # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
889-
890- { child_comm: dconf worker } hitcount: 1
891- { child_comm: kthreadd } hitcount: 1
892- { child_comm: dconf worker } hitcount: 1
893- { child_comm: gdbus } hitcount: 1
894- { child_comm: ibus-daemon } hitcount: 1
895- { child_comm: Socket Thread } hitcount: 2
896- { child_comm: evolution-alarm } hitcount: 2
897- { child_comm: smbd } hitcount: 2
898- { child_comm: bash } hitcount: 3
899- { child_comm: whoopsie } hitcount: 3
900- { child_comm: compiz } hitcount: 3
901- { child_comm: evolution-sourc } hitcount: 4
902- { child_comm: pool } hitcount: 5
903- { child_comm: postgres } hitcount: 6
904- { child_comm: firefox } hitcount: 8
905- { child_comm: dhclient } hitcount: 10
906- { child_comm: emacs } hitcount: 12
907- { child_comm: dbus-daemon } hitcount: 20
908- { child_comm: nm-dispatcher.a } hitcount: 20
909- { child_comm: evolution } hitcount: 35
910- { child_comm: glib-pacrunner } hitcount: 59
911-
912- Totals:
913- Hits: 199
914- Entries: 21
915- Dropped: 0
916-
917- To manually continue having the trigger aggregate events, append
918- :cont instead. Notice that the trigger info displays as [active]
919- again, and the data has changed:
920-
921- # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
922- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
923-
924- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
925- # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
926-
927- { child_comm: dconf worker } hitcount: 1
928- { child_comm: dconf worker } hitcount: 1
929- { child_comm: kthreadd } hitcount: 1
930- { child_comm: gdbus } hitcount: 1
931- { child_comm: ibus-daemon } hitcount: 1
932- { child_comm: Socket Thread } hitcount: 2
933- { child_comm: evolution-alarm } hitcount: 2
934- { child_comm: smbd } hitcount: 2
935- { child_comm: whoopsie } hitcount: 3
936- { child_comm: compiz } hitcount: 3
937- { child_comm: evolution-sourc } hitcount: 4
938- { child_comm: bash } hitcount: 5
939- { child_comm: pool } hitcount: 5
940- { child_comm: postgres } hitcount: 6
941- { child_comm: firefox } hitcount: 8
942- { child_comm: dhclient } hitcount: 11
943- { child_comm: emacs } hitcount: 12
944- { child_comm: dbus-daemon } hitcount: 22
945- { child_comm: nm-dispatcher.a } hitcount: 22
946- { child_comm: evolution } hitcount: 35
947- { child_comm: glib-pacrunner } hitcount: 59
948-
949- Totals:
950- Hits: 206
951- Entries: 21
952- Dropped: 0
953-
954- The previous example showed how to start and stop a hist trigger by
955- appending 'pause' and 'continue' to the hist trigger command. A
956- hist trigger can also be started in a paused state by initially
957- starting the trigger with ':pause' appended. This allows you to
958- start the trigger only when you're ready to start collecting data
959- and not before. For example, you could start the trigger in a
960- paused state, then unpause it and do something you want to measure,
961- then pause the trigger again when done.
962-
963- Of course, doing this manually can be difficult and error-prone, but
964- it is possible to automatically start and stop a hist trigger based
965- on some condition, via the enable_hist and disable_hist triggers.
966-
967- For example, suppose we wanted to take a look at the relative
968- weights in terms of skb length for each callpath that leads to a
969-  netif_receive_skb event when downloading a decent-sized file using
970- wget.
971-
972- First we set up an initially paused stacktrace trigger on the
973- netif_receive_skb event:
974-
975- # echo 'hist:key=stacktrace:vals=len:pause' > \
976- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
977-
978- Next, we set up an 'enable_hist' trigger on the sched_process_exec
979- event, with an 'if filename==/usr/bin/wget' filter. The effect of
980- this new trigger is that it will 'unpause' the hist trigger we just
981- set up on netif_receive_skb if and only if it sees a
982- sched_process_exec event with a filename of '/usr/bin/wget'. When
983- that happens, all netif_receive_skb events are aggregated into a
984- hash table keyed on stacktrace:
985-
986- # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
987- /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
988-
989-  The aggregation continues until the netif_receive_skb hist trigger is
990-  paused again, which is what the following disable_hist event does by
991- creating a similar setup on the sched_process_exit event, using the
992- filter 'comm==wget':
993-
994- # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
995- /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
996-
997- Whenever a process exits and the comm field of the disable_hist
998- trigger filter matches 'comm==wget', the netif_receive_skb hist
999- trigger is disabled.
1000-
1001- The overall effect is that netif_receive_skb events are aggregated
1002- into the hash table for only the duration of the wget. Executing a
1003- wget command and then listing the 'hist' file will display the
1004- output generated by the wget command:
1005-
1006- $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
1007-
1008- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1009- # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
1010-
1011- { stacktrace:
1012- __netif_receive_skb_core+0x46d/0x990
1013- __netif_receive_skb+0x18/0x60
1014- netif_receive_skb_internal+0x23/0x90
1015- napi_gro_receive+0xc8/0x100
1016- ieee80211_deliver_skb+0xd6/0x270 [mac80211]
1017- ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
1018- ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
1019- ieee80211_rx+0x31d/0x900 [mac80211]
1020- iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
1021- iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
1022- iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
1023- irq_thread_fn+0x20/0x50
1024- irq_thread+0x11f/0x150
1025- kthread+0xd2/0xf0
1026- ret_from_fork+0x42/0x70
1027- } hitcount: 85 len: 28884
1028- { stacktrace:
1029- __netif_receive_skb_core+0x46d/0x990
1030- __netif_receive_skb+0x18/0x60
1031- netif_receive_skb_internal+0x23/0x90
1032- napi_gro_complete+0xa4/0xe0
1033- dev_gro_receive+0x23a/0x360
1034- napi_gro_receive+0x30/0x100
1035- ieee80211_deliver_skb+0xd6/0x270 [mac80211]
1036- ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
1037- ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
1038- ieee80211_rx+0x31d/0x900 [mac80211]
1039- iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
1040- iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
1041- iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
1042- irq_thread_fn+0x20/0x50
1043- irq_thread+0x11f/0x150
1044- kthread+0xd2/0xf0
1045- } hitcount: 98 len: 664329
1046- { stacktrace:
1047- __netif_receive_skb_core+0x46d/0x990
1048- __netif_receive_skb+0x18/0x60
1049- process_backlog+0xa8/0x150
1050- net_rx_action+0x15d/0x340
1051- __do_softirq+0x114/0x2c0
1052- do_softirq_own_stack+0x1c/0x30
1053- do_softirq+0x65/0x70
1054- __local_bh_enable_ip+0xb5/0xc0
1055- ip_finish_output+0x1f4/0x840
1056- ip_output+0x6b/0xc0
1057- ip_local_out_sk+0x31/0x40
1058- ip_send_skb+0x1a/0x50
1059- udp_send_skb+0x173/0x2a0
1060- udp_sendmsg+0x2bf/0x9f0
1061- inet_sendmsg+0x64/0xa0
1062- sock_sendmsg+0x3d/0x50
1063- } hitcount: 115 len: 13030
1064- { stacktrace:
1065- __netif_receive_skb_core+0x46d/0x990
1066- __netif_receive_skb+0x18/0x60
1067- netif_receive_skb_internal+0x23/0x90
1068- napi_gro_complete+0xa4/0xe0
1069- napi_gro_flush+0x6d/0x90
1070- iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
1071- irq_thread_fn+0x20/0x50
1072- irq_thread+0x11f/0x150
1073- kthread+0xd2/0xf0
1074- ret_from_fork+0x42/0x70
1075- } hitcount: 934 len: 5512212
1076-
1077- Totals:
1078- Hits: 1232
1079- Entries: 4
1080- Dropped: 0
1081-
1082- The above shows all the netif_receive_skb callpaths and their total
1083- lengths for the duration of the wget command.
1084-
1085- The 'clear' hist trigger param can be used to clear the hash table.
1086- Suppose we wanted to try another run of the previous example but
1087- this time also wanted to see the complete list of events that went
1088- into the histogram. In order to avoid having to set everything up
1089- again, we can just clear the histogram first:
1090-
1091- # echo 'hist:key=stacktrace:vals=len:clear' >> \
1092- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1093-
1094- Just to verify that it is in fact cleared, here's what we now see in
1095- the hist file:
1096-
1097- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1098- # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
1099-
1100- Totals:
1101- Hits: 0
1102- Entries: 0
1103- Dropped: 0
1104-
1105- Since we want to see the detailed list of every netif_receive_skb
1106- event occurring during the new run, which are in fact the same
1107-  events being aggregated into the hash table, we add some additional
1108-  'enable_event' and 'disable_event' triggers to the triggering
1109-  sched_process_exec and sched_process_exit events as follows:
1110-
1111- # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
1112- /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1113-
1114- # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
1115- /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1116-
1117- If you read the trigger files for the sched_process_exec and
1118- sched_process_exit triggers, you should see two triggers for each:
1119- one enabling/disabling the hist aggregation and the other
1120- enabling/disabling the logging of events:
1121-
1122- # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1123- enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
1124- enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
1125-
1126- # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1127-  disable_event:net:netif_receive_skb:unlimited if comm==wget
1128- disable_hist:net:netif_receive_skb:unlimited if comm==wget
1129-
1130- In other words, whenever either of the sched_process_exec or
1131- sched_process_exit events is hit and matches 'wget', it enables or
1132- disables both the histogram and the event log, and what you end up
1133- with is a hash table and set of events just covering the specified
1134- duration. Run the wget command again:
1135-
1136- $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
1137-
1138- Displaying the 'hist' file should show something similar to what you
1139- saw in the last run, but this time you should also see the
1140- individual events in the trace file:
1141-
1142- # cat /sys/kernel/debug/tracing/trace
1143-
1144- # tracer: nop
1145- #
1146- # entries-in-buffer/entries-written: 183/1426 #P:4
1147- #
1148- # _-----=> irqs-off
1149- # / _----=> need-resched
1150- # | / _---=> hardirq/softirq
1151- # || / _--=> preempt-depth
1152- # ||| / delay
1153- # TASK-PID CPU# |||| TIMESTAMP FUNCTION
1154- # | | | |||| | |
1155- wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
1156- wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
1157- dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
1158- dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
1159- ##### CPU 2 buffer started ####
1160- irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
1161- irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
1162- irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
1163- irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
1164- irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
1165- .
1166- .
1167- .
1168-
1169- The following example demonstrates how multiple hist triggers can be
1170- attached to a given event. This capability can be useful for
1171- creating a set of different summaries derived from the same set of
1172- events, or for comparing the effects of different filters, among
1173- other things.
1174-
1175- # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
1176- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1177- # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
1178- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1179- # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
1180- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1181- # echo 'hist:keys=skbaddr.hex:vals=len' >> \
1182- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1183- # echo 'hist:keys=len:vals=common_preempt_count' >> \
1184- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1185-
1186- The above set of commands creates four triggers differing only in
1187- their filters, along with a completely different though fairly
1188- nonsensical trigger. Note that in order to append multiple hist
1189- triggers to the same file, you should use the '>>' operator to
1190- append them ('>' will also add the new hist trigger, but will remove
1191- any existing hist triggers beforehand).
1192-
1193- Displaying the contents of the 'hist' file for the event shows the
1194- contents of all five histograms:
1195-
1196- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1197-
1198- # event histogram
1199- #
1200- # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
1201- #
1202-
1203- { len: 176 } hitcount: 1 common_preempt_count: 0
1204- { len: 223 } hitcount: 1 common_preempt_count: 0
1205- { len: 4854 } hitcount: 1 common_preempt_count: 0
1206- { len: 395 } hitcount: 1 common_preempt_count: 0
1207- { len: 177 } hitcount: 1 common_preempt_count: 0
1208- { len: 446 } hitcount: 1 common_preempt_count: 0
1209- { len: 1601 } hitcount: 1 common_preempt_count: 0
1210- .
1211- .
1212- .
1213- { len: 1280 } hitcount: 66 common_preempt_count: 0
1214- { len: 116 } hitcount: 81 common_preempt_count: 40
1215- { len: 708 } hitcount: 112 common_preempt_count: 0
1216- { len: 46 } hitcount: 221 common_preempt_count: 0
1217- { len: 1264 } hitcount: 458 common_preempt_count: 0
1218-
1219- Totals:
1220- Hits: 1428
1221- Entries: 147
1222- Dropped: 0
1223-
1224-
1225- # event histogram
1226- #
1227- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1228- #
1229-
1230- { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
1231- { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
1232- { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
1233- { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
1234- { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
1235- { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
1236- { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
1237- { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
1238- { skbaddr: ffff880100065900 } hitcount: 1 len: 46
1239- { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
1240- { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
1241- { skbaddr: ffff880100064700 } hitcount: 1 len: 365
1242- { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
1243- .
1244- .
1245- .
1246- { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
1247- { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
1248- { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
1249- { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
1250- { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
1251- { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
1252- { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
1253- { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
1254- { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
1255-
1256- Totals:
1257- Hits: 1451
1258- Entries: 318
1259- Dropped: 0
1260-
1261-
1262- # event histogram
1263- #
1264- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
1265- #
1266-
1267-
1268- Totals:
1269- Hits: 0
1270- Entries: 0
1271- Dropped: 0
1272-
1273-
1274- # event histogram
1275- #
1276- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
1277- #
1278-
1279- { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
1280- { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
1281- { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
1282- { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
1283- { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
1284- { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
1285- { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
1286- { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
1287- { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
1288- { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
1289- { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
1290- { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
1291-
1292- Totals:
1293- Hits: 14
1294- Entries: 12
1295- Dropped: 0
1296-
1297-
1298- # event histogram
1299- #
1300- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
1301- #
1302-
1303-
1304- Totals:
1305- Hits: 0
1306- Entries: 0
1307- Dropped: 0
1308-
1309- Named triggers can be used to have triggers share a common set of
1310- histogram data. This capability is mostly useful for combining the
1311- output of events generated by tracepoints contained inside inline
1312- functions, but names can be used in a hist trigger on any event.
1313- For example, these two triggers when hit will update the same 'len'
1314- field in the shared 'foo' histogram data:
1315-
1316- # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
1317- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1318- # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
1319- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1320-
1321- You can see that they're updating common histogram data by reading
1322- each event's hist files at the same time:
1323-
1324- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
1325- cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1326-
1327- # event histogram
1328- #
1329- # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1330- #
1331-
1332- { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
1333- { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
1334- { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
1335- { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
1336- { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
1337- { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
1338- { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
1339- { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
1340- { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
1341- { skbaddr: ffff880064505000 } hitcount: 1 len: 46
1342- { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
1343- { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
1344- { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
1345- { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
1346- { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
1347- { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
1348- { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
1349- { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
1350- { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
1351- { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
1352- { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
1353- { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
1354- { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
1355- { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
1356- { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
1357- { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
1358- { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
1359- { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
1360- { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
1361- { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
1362- { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
1363- { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
1364- { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
1365- { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
1366- { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
1367- { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
1368- { skbaddr: ffff880064504400 } hitcount: 4 len: 184
1369- { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
1370- { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
1371- { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
1372- { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
1373- { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
1374-
1375- Totals:
1376- Hits: 81
1377- Entries: 42
1378- Dropped: 0
1379- # event histogram
1380- #
1381- # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1382- #
1383-
1384- { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
1385- { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
1386- { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
1387- { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
1388- { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
1389- { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
1390- { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
1391- { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
1392- { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
1393- { skbaddr: ffff880064505000 } hitcount: 1 len: 46
1394- { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
1395- { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
1396- { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
1397- { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
1398- { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
1399- { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
1400- { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
1401- { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
1402- { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
1403- { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
1404- { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
1405- { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
1406- { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
1407- { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
1408- { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
1409- { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
1410- { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
1411- { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
1412- { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
1413- { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
1414- { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
1415- { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
1416- { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
1417- { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
1418- { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
1419- { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
1420- { skbaddr: ffff880064504400 } hitcount: 4 len: 184
1421- { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
1422- { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
1423- { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
1424- { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
1425- { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
1426-
1427- Totals:
1428- Hits: 81
1429- Entries: 42
1430- Dropped: 0
1431-
1432- And here's an example that shows how to combine histogram data from
1433- any two events even if they don't share any 'compatible' fields
1434- other than 'hitcount' and 'stacktrace'. These commands create a
1435- couple of triggers named 'bar' using those fields:
1436-
1437- # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
1438- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
1439- # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
1440- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1441-
1442- And displaying the output of either shows some interesting if
1443- somewhat confusing output:
1444-
1445- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
1446- # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1447-
1448- # event histogram
1449- #
1450- # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
1451- #
1452-
1453- { stacktrace:
1454- _do_fork+0x18e/0x330
1455- kernel_thread+0x29/0x30
1456- kthreadd+0x154/0x1b0
1457- ret_from_fork+0x3f/0x70
1458- } hitcount: 1
1459- { stacktrace:
1460- netif_rx_internal+0xb2/0xd0
1461- netif_rx_ni+0x20/0x70
1462- dev_loopback_xmit+0xaa/0xd0
1463- ip_mc_output+0x126/0x240
1464- ip_local_out_sk+0x31/0x40
1465- igmp_send_report+0x1e9/0x230
1466- igmp_timer_expire+0xe9/0x120
1467- call_timer_fn+0x39/0xf0
1468- run_timer_softirq+0x1e1/0x290
1469- __do_softirq+0xfd/0x290
1470- irq_exit+0x98/0xb0
1471- smp_apic_timer_interrupt+0x4a/0x60
1472- apic_timer_interrupt+0x6d/0x80
1473- cpuidle_enter+0x17/0x20
1474- call_cpuidle+0x3b/0x60
1475- cpu_startup_entry+0x22d/0x310
1476- } hitcount: 1
1477- { stacktrace:
1478- netif_rx_internal+0xb2/0xd0
1479- netif_rx_ni+0x20/0x70
1480- dev_loopback_xmit+0xaa/0xd0
1481- ip_mc_output+0x17f/0x240
1482- ip_local_out_sk+0x31/0x40
1483- ip_send_skb+0x1a/0x50
1484- udp_send_skb+0x13e/0x270
1485- udp_sendmsg+0x2bf/0x980
1486- inet_sendmsg+0x67/0xa0
1487- sock_sendmsg+0x38/0x50
1488- SYSC_sendto+0xef/0x170
1489- SyS_sendto+0xe/0x10
1490- entry_SYSCALL_64_fastpath+0x12/0x6a
1491- } hitcount: 2
1492- { stacktrace:
1493- netif_rx_internal+0xb2/0xd0
1494- netif_rx+0x1c/0x60
1495- loopback_xmit+0x6c/0xb0
1496- dev_hard_start_xmit+0x219/0x3a0
1497- __dev_queue_xmit+0x415/0x4f0
1498- dev_queue_xmit_sk+0x13/0x20
1499- ip_finish_output2+0x237/0x340
1500- ip_finish_output+0x113/0x1d0
1501- ip_output+0x66/0xc0
1502- ip_local_out_sk+0x31/0x40
1503- ip_send_skb+0x1a/0x50
1504- udp_send_skb+0x16d/0x270
1505- udp_sendmsg+0x2bf/0x980
1506- inet_sendmsg+0x67/0xa0
1507- sock_sendmsg+0x38/0x50
1508- ___sys_sendmsg+0x14e/0x270
1509- } hitcount: 76
1510- { stacktrace:
1511- netif_rx_internal+0xb2/0xd0
1512- netif_rx+0x1c/0x60
1513- loopback_xmit+0x6c/0xb0
1514- dev_hard_start_xmit+0x219/0x3a0
1515- __dev_queue_xmit+0x415/0x4f0
1516- dev_queue_xmit_sk+0x13/0x20
1517- ip_finish_output2+0x237/0x340
1518- ip_finish_output+0x113/0x1d0
1519- ip_output+0x66/0xc0
1520- ip_local_out_sk+0x31/0x40
1521- ip_send_skb+0x1a/0x50
1522- udp_send_skb+0x16d/0x270
1523- udp_sendmsg+0x2bf/0x980
1524- inet_sendmsg+0x67/0xa0
1525- sock_sendmsg+0x38/0x50
1526- ___sys_sendmsg+0x269/0x270
1527- } hitcount: 77
1528- { stacktrace:
1529- netif_rx_internal+0xb2/0xd0
1530- netif_rx+0x1c/0x60
1531- loopback_xmit+0x6c/0xb0
1532- dev_hard_start_xmit+0x219/0x3a0
1533- __dev_queue_xmit+0x415/0x4f0
1534- dev_queue_xmit_sk+0x13/0x20
1535- ip_finish_output2+0x237/0x340
1536- ip_finish_output+0x113/0x1d0
1537- ip_output+0x66/0xc0
1538- ip_local_out_sk+0x31/0x40
1539- ip_send_skb+0x1a/0x50
1540- udp_send_skb+0x16d/0x270
1541- udp_sendmsg+0x2bf/0x980
1542- inet_sendmsg+0x67/0xa0
1543- sock_sendmsg+0x38/0x50
1544- SYSC_sendto+0xef/0x170
1545- } hitcount: 88
1546- { stacktrace:
1547- _do_fork+0x18e/0x330
1548- SyS_clone+0x19/0x20
1549- entry_SYSCALL_64_fastpath+0x12/0x6a
1550- } hitcount: 244
1551-
1552- Totals:
1553- Hits: 489
1554- Entries: 7
1555- Dropped: 0
1556+ See Documentation/trace/histogram.txt for details and examples.
1557diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
1558index d4601df6e72e..54213e5c23f6 100644
1559--- a/Documentation/trace/ftrace.txt
1560+++ b/Documentation/trace/ftrace.txt
1561@@ -539,6 +539,30 @@ of ftrace. Here is a list of some of the key files:
1562
1563 See events.txt for more information.
1564
1565+ timestamp_mode:
1566+
1567+ Certain tracers may change the timestamp mode used when
1568+ logging trace events into the event buffer. Events with
1569+ different modes can coexist within a buffer, but the mode in
1570+ effect when an event is logged determines which timestamp mode
1571+ is used for that event. The default timestamp mode is
1572+ 'delta'.
1573+
1574+ Usual timestamp modes for tracing:
1575+
1576+ # cat timestamp_mode
1577+ [delta] absolute
1578+
1579+ The timestamp mode with the square brackets around it is the
1580+ one in effect.
1581+
1582+ delta: Default timestamp mode - timestamp is a delta against
1583+ a per-buffer timestamp.
1584+
1585+ absolute: The timestamp is a full timestamp, not a delta
1586+ against some other value. As such it takes up more
1587+ space and is less efficient.
1588+
1589 hwlat_detector:
1590
1591 Directory for the Hardware Latency Detector.
1592diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt
1593new file mode 100644
1594index 000000000000..6e05510afc28
1595--- /dev/null
1596+++ b/Documentation/trace/histogram.txt
1597@@ -0,0 +1,1995 @@
1598+ Event Histograms
1599+
1600+ Documentation written by Tom Zanussi
1601+
1602+1. Introduction
1603+===============
1604+
1605+ Histogram triggers are special event triggers that can be used to
1606+ aggregate trace event data into histograms. For information on
1607+ trace events and event triggers, see Documentation/trace/events.txt.
1608+
1609+
1610+2. Histogram Trigger Command
1611+============================
1612+
1613+ A histogram trigger command is an event trigger command that
1614+ aggregates event hits into a hash table keyed on one or more trace
1615+ event format fields (or stacktrace) and a set of running totals
1616+ derived from one or more trace event format fields and/or event
1617+ counts (hitcount).
1618+
1619+ The format of a hist trigger is as follows:
1620+
1621+ hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
1622+ [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
1623+ [:clear][:name=histname1] [if <filter>]
1624+
1625+ When a matching event is hit, an entry is added to a hash table
1626+ using the key(s) and value(s) named. Keys and values correspond to
1627+ fields in the event's format description. Values must correspond to
1628+ numeric fields - on an event hit, the value(s) will be added to a
1629+ sum kept for that field. The special string 'hitcount' can be used
1630+ in place of an explicit value field - this is simply a count of
1631+ event hits. If 'values' isn't specified, an implicit 'hitcount'
1632+ value will be automatically created and used as the only value.
1633+ Keys can be any field, or the special string 'stacktrace', which
1634+ will use the event's kernel stacktrace as the key. The keywords
1635+ 'keys' or 'key' can be used to specify keys, and the keywords
1636+ 'values', 'vals', or 'val' can be used to specify values. Compound
1637+ keys consisting of up to two fields can be specified by the 'keys'
1638+ keyword. Hashing a compound key produces a unique entry in the
1639+ table for each unique combination of component keys, and can be
1640+ useful for providing more fine-grained summaries of event data.
1641+ Additionally, sort keys consisting of up to two fields can be
1642+ specified by the 'sort' keyword. If more than one field is
1643+ specified, the result will be a 'sort within a sort': the first key
1644+ is taken to be the primary sort key and the second the secondary
1645+ key. If a hist trigger is given a name using the 'name' parameter,
1646+ its histogram data will be shared with other triggers of the same
1647+ name, and trigger hits will update this common data. Only triggers
1648+ with 'compatible' fields can be combined in this way; triggers are
1649+ 'compatible' if the fields named in the trigger share the same
1650+ number and type of fields and those fields also have the same names.
1651+ Note that any two events always share the compatible 'hitcount' and
1652+ 'stacktrace' fields and can therefore be combined using those
1653+ fields, however pointless that may be.
1654+
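+ As a quick, hypothetical illustration of compound keys and two-field
+ sorting (assuming the raw_syscalls/sys_enter event is available on
+ the system), the following would aggregate hits per unique
+ (common_pid, syscall id) pair, sorted first by id and then by
+ hitcount:
+
+   # echo 'hist:keys=common_pid,id:vals=hitcount:sort=id,hitcount' > \
+         /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
+
+ A fuller, worked version of this kind of compound-key trigger appears
+ in the examples section below.
+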
1655+ 'hist' triggers add a 'hist' file to each event's subdirectory.
1656+ Reading the 'hist' file for the event will dump the hash table in
1657+ its entirety to stdout. If there are multiple hist triggers
1658+ attached to an event, there will be a table for each trigger in the
1659+ output. The table displayed for a named trigger will be the same as
1660+ any other instance having the same name. Each printed hash table
1661+ entry is a simple list of the keys and values comprising the entry;
1662+ keys are printed first and are delineated by curly braces, and are
1663+ followed by the set of value fields for the entry. By default,
1664+ numeric fields are displayed as base-10 integers. This can be
1665+ modified by appending any of the following modifiers to the field
1666+ name:
1667+
1668+ .hex display a number as a hex value
1669+ .sym display an address as a symbol
1670+ .sym-offset display an address as a symbol and offset
1671+ .syscall display a syscall id as a system call name
1672+ .execname display a common_pid as a program name
1673+ .log2 display log2 value rather than raw number
1674+ .usecs display a common_timestamp in microseconds
1675+
1676+ Note that in general the semantics of a given field aren't
1677+ interpreted when applying a modifier to it, but there are some
1678+ restrictions to be aware of in this regard:
1679+
1680+ - only the 'hex' modifier can be used for values (because values
1681+ are essentially sums, and the other modifiers don't make sense
1682+ in that context).
1683+ - the 'execname' modifier can only be used on a 'common_pid'. The
1684+ reason for this is that the execname is simply the 'comm' value
1685+ saved for the 'current' process when an event was triggered,
1686+ which is the same as the common_pid value saved by the event
1687+ tracing code. Trying to apply that comm value to other pid
1688+ values wouldn't be correct, and typically events that care save
1689+ pid-specific comm fields in the event itself.
1690+
1691+ A typical usage scenario would be the following to enable a hist
1692+ trigger, read its current contents, and then turn it off:
1693+
1694+ # echo 'hist:keys=skbaddr.hex:vals=len' > \
1695+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1696+
1697+ # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1698+
1699+ # echo '!hist:keys=skbaddr.hex:vals=len' > \
1700+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1701+
1702+ The trigger file itself can be read to show the details of the
1703+ currently attached hist trigger. This information is also displayed
1704+ at the top of the 'hist' file when read.
1705+
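+ For example, with the netif_rx trigger above attached, reading its
+ trigger file shows the full trigger specification, including the
+ defaults that were filled in (the output below is illustrative):
+
+   # cat /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+   hist:keys=skbaddr.hex:vals=len:sort=hitcount:size=2048 [active]
+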
1706+ By default, the size of the hash table is 2048 entries. The 'size'
1707+ parameter can be used to specify more or fewer than that. The units
1708+ are in terms of hashtable entries - if a run uses more entries than
1709+ specified, the results will show the number of 'drops', the number
1710+ of hits that were ignored. The size should be a power of 2 between
1711+ 128 and 131072 (any non-power-of-2 number specified will be rounded
1712+ up).
1713+
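+ For example, to allow up to 8192 entries rather than the default 2048
+ (an illustrative value, reusing the netif_rx trigger from above):
+
+   # echo 'hist:keys=skbaddr.hex:vals=len:size=8192' > \
+         /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+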
1714+ The 'sort' parameter can be used to specify a value field to sort
1715+ on. The default if unspecified is 'hitcount' and the default sort
1716+ order is 'ascending'. To sort in the opposite direction, append
1717+ '.descending' to the sort key.
1718+
1719+ The 'pause' parameter can be used to pause an existing hist trigger
1720+ or to start a hist trigger but not log any events until told to do
1721+ so. 'continue' or 'cont' can be used to start or restart a paused
1722+ hist trigger.
1723+
1724+ The 'clear' parameter will clear the contents of a running hist
1725+ trigger and leave its current paused/active state unchanged.
1726+
1727+ Note that the 'pause', 'cont', and 'clear' parameters should be
1728+ applied to an existing trigger using the 'append' shell operator
1729+ ('>>') rather than via the '>' operator, which will cause the
1730+ trigger to be removed through truncation.
1731+
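+ For instance, assuming a 'hist:keys=skbaddr.hex:vals=len' trigger is
+ currently attached to the netif_rx event, the following sequence
+ would pause it, clear its accumulated data, and then resume it (note
+ the '>>' in each command):
+
+   # echo 'hist:keys=skbaddr.hex:vals=len:pause' >> \
+         /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+   # echo 'hist:keys=skbaddr.hex:vals=len:clear' >> \
+         /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+   # echo 'hist:keys=skbaddr.hex:vals=len:cont' >> \
+         /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+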
1732+- enable_hist/disable_hist
1733+
1734+ The enable_hist and disable_hist triggers can be used to have one
1735+ event conditionally start and stop another event's already-attached
1736+ hist trigger. Any number of enable_hist and disable_hist triggers
1737+ can be attached to a given event, allowing that event to kick off
1738+ and stop aggregations on a host of other events.
1739+
1740+ The format is very similar to the enable/disable_event triggers:
1741+
1742+ enable_hist:<system>:<event>[:count]
1743+ disable_hist:<system>:<event>[:count]
1744+
1745+ Instead of enabling or disabling the tracing of the target event
1746+ into the trace buffer as the enable/disable_event triggers do, the
1747+ enable/disable_hist triggers enable or disable the aggregation of
1748+ the target event into a hash table.
1749+
1750+ A typical usage scenario for the enable_hist/disable_hist triggers
1751+ would be to first set up a paused hist trigger on some event,
1752+ followed by an enable_hist/disable_hist pair that turns the hist
1753+ aggregation on and off when conditions of interest are hit:
1754+
1755+ # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
1756+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1757+
1758+ # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
1759+ /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1760+
1761+ # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
1762+ /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1763+
1764+ The above sets up an initially paused hist trigger which is unpaused
1765+ and starts aggregating events when a given program is executed, and
1766+ which stops aggregating when the process exits and the hist trigger
1767+ is paused again.
1768+
1769+ The examples below provide a more concrete illustration of the
1770+ concepts and typical usage patterns discussed above.
1771+
1772+ 'special' event fields
1773+ ------------------------
1774+
1775+ There are a number of 'special event fields' available for use as
1776+ keys or values in a hist trigger. These look like and behave as if
1777+ they were actual event fields, but aren't really part of the event's
1778+ field definition or format file. They are however available for any
1779+ event, and can be used anywhere an actual event field could be.
1780+ They are:
1781+
1782+ common_timestamp u64 - timestamp (from ring buffer) associated
1783+ with the event, in nanoseconds. May be
1784+ modified by .usecs to have timestamps
1785+ interpreted as microseconds.
1786+ cpu int - the cpu on which the event occurred.
1787+
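+ For example, the 'cpu' special field could be used as a key to count
+ events per CPU (a minimal sketch, assuming the sched_switch event is
+ available):
+
+   # echo 'hist:keys=cpu:vals=hitcount' > \
+         /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
+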
1788+ Extended error information
1789+ --------------------------
1790+
1791+ For some error conditions encountered when invoking a hist trigger
1792+ command, extended error information is available via the
1793+ corresponding event's 'hist' file. Reading the hist file after an
1794+ error will display more detailed information about what went wrong,
1795+ if information is available. This extended error information will
1796+ be available until the next hist trigger command for that event.
1797+
1798+ If available for a given error condition, the extended error
1799+ information and usage takes the following form:
1800+
1801+ # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
1802+ echo: write error: Invalid argument
1803+
1804+ # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist
1805+ ERROR: Couldn't yyy: zzz
1806+ Last command: xxx
1807+
1808+6.2 'hist' trigger examples
1809+---------------------------
1810+
1811+ The first set of examples creates aggregations using the kmalloc
1812+ event. The fields that can be used for the hist trigger are listed
1813+ in the kmalloc event's format file:
1814+
1815+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
1816+ name: kmalloc
1817+ ID: 374
1818+ format:
1819+ field:unsigned short common_type; offset:0; size:2; signed:0;
1820+ field:unsigned char common_flags; offset:2; size:1; signed:0;
1821+ field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
1822+ field:int common_pid; offset:4; size:4; signed:1;
1823+
1824+ field:unsigned long call_site; offset:8; size:8; signed:0;
1825+ field:const void * ptr; offset:16; size:8; signed:0;
1826+ field:size_t bytes_req; offset:24; size:8; signed:0;
1827+ field:size_t bytes_alloc; offset:32; size:8; signed:0;
1828+ field:gfp_t gfp_flags; offset:40; size:4; signed:0;
1829+
1830+ We'll start by creating a hist trigger that generates a simple table
1831+ that lists the total number of bytes requested for each function in
1832+ the kernel that made one or more calls to kmalloc:
1833+
1834+ # echo 'hist:key=call_site:val=bytes_req' > \
1835+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1836+
1837+ This tells the tracing system to create a 'hist' trigger using the
1838+ call_site field of the kmalloc event as the key for the table, which
1839+ just means that each unique call_site address will have an entry
1840+ created for it in the table. The 'val=bytes_req' parameter tells
1841+ the hist trigger that for each unique entry (call_site) in the
1842+ table, it should keep a running total of the number of bytes
1843+ requested by that call_site.
1844+
1845+ We'll let it run for a while and then dump the contents of the 'hist'
1846+ file in the kmalloc event's subdirectory (for readability, a number
1847+ of entries have been omitted):
1848+
1849+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
1850+ # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
1851+
1852+ { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
1853+ { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
1854+ { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
1855+ { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
1856+ { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
1857+ { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
1858+ { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
1859+ { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
1860+ { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
1861+ { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
1862+ .
1863+ .
1864+ .
1865+ { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
1866+ { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
1867+ { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
1868+ { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
1869+ { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
1870+ { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
1871+ { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
1872+ { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
1873+ { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
1874+ { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
1875+ { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
1876+ { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
1877+
1878+ Totals:
1879+ Hits: 4610
1880+ Entries: 45
1881+ Dropped: 0
1882+
1883+ The output displays a line for each entry, beginning with the key
1884+ specified in the trigger, followed by the value(s) also specified in
1885+ the trigger. At the beginning of the output is a line that displays
1886+ the trigger info, which can also be displayed by reading the
1887+ 'trigger' file:
1888+
1889+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1890+ hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
1891+
1892+ At the end of the output are a few lines that display the overall
1893+ totals for the run. The 'Hits' field shows the total number of
1894+ times the event trigger was hit, the 'Entries' field shows the total
1895+ number of used entries in the hash table, and the 'Dropped' field
1896+ shows the number of hits that were dropped because the number of
1897+ used entries for the run exceeded the maximum number of entries
1898+ allowed for the table (normally 0, but if not a hint that you may
1899+ want to increase the size of the table using the 'size' parameter).
1900+
1901+ Notice in the above output that there's an extra field, 'hitcount',
1902+ which wasn't specified in the trigger. Also notice that in the
1903+ trigger info output, there's a parameter, 'sort=hitcount', which
1904+ wasn't specified in the trigger either. The reason for that is that
1905+ every trigger implicitly keeps a count of the total number of hits
1906+ attributed to a given entry, called the 'hitcount'. That hitcount
1907+ information is explicitly displayed in the output, and in the
1908+ absence of a user-specified sort parameter, is used as the default
1909+ sort field.
1910+
1911+ The value 'hitcount' can be used in place of an explicit value in
1912+ the 'values' parameter if you don't really need to have any
1913+ particular field summed and are mainly interested in hit
1914+ frequencies.
1915+
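+ For example, the trigger above could equally have been set up to
+ track only hit frequencies, with no summed field (an illustrative
+ variant, not used for the output shown here):
+
+   # echo 'hist:keys=call_site:vals=hitcount' > \
+         /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+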
1916+ To turn the hist trigger off, simply call up the trigger in the
1917+ command history and re-execute it with a '!' prepended:
1918+
1919+ # echo '!hist:key=call_site:val=bytes_req' > \
1920+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1921+
1922+ Finally, notice that the call_site as displayed in the output above
1923+ isn't really very useful. It's an address, but normally addresses
1924+ are displayed in hex. To have a numeric field displayed as a hex
1925+ value, simply append '.hex' to the field name in the trigger:
1926+
1927+ # echo 'hist:key=call_site.hex:val=bytes_req' > \
1928+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1929+
1930+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
1931+ # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
1932+
1933+ { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
1934+ { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
1935+ { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
1936+ { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
1937+ { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
1938+ { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
1939+ { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
1940+ { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
1941+ { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
1942+ { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
1943+ { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
1944+ { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
1945+ .
1946+ .
1947+ .
1948+ { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
1949+ { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
1950+ { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
1951+ { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
1952+ { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
1953+ { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
1954+ { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
1955+ { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
1956+ { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
1957+ { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
1958+ { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
1959+
1960+ Totals:
1961+ Hits: 4775
1962+ Entries: 46
1963+ Dropped: 0
1964+
1965+ Even that's only marginally more useful - while hex values do look
1966+ more like addresses, what users are typically more interested in
1967+ when looking at text addresses are the corresponding symbols
1968+ instead. To have an address displayed as a symbolic value,
1969+ simply append '.sym' or '.sym-offset' to the field name in the
1970+ trigger:
1971+
1972+ # echo 'hist:key=call_site.sym:val=bytes_req' > \
1973+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1974+
1975+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
1976+ # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
1977+
1978+ { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
1979+ { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
1980+ { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
1981+ { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
1982+ { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
1983+ { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
1984+ { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
1985+ { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
1986+ { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
1987+ { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
1988+ { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
1989+ { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
1990+ { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
1991+ { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
1992+ .
1993+ .
1994+ .
1995+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
1996+ { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
1997+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
1998+ { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
1999+ { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
2000+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
2001+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
2002+ { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
2003+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
2004+ { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
2005+ { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
2006+ { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
2007+ { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
2008+
2009+ Totals:
2010+ Hits: 109928
2011+ Entries: 71
2012+ Dropped: 0
2013+
2014+ Because the default sort key above is 'hitcount', the above shows
2015+ the list of call_sites by increasing hitcount, so that at the bottom
2016+ we see the functions that made the most kmalloc calls during the
2017+ run. If instead we wanted to see the top kmalloc callers in
2018+ terms of the number of bytes requested rather than the number of
2019+ calls, and we wanted the top caller to appear at the top, we can use
2020+ the 'sort' parameter, along with the 'descending' modifier:
2021+
2022+ # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
2023+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2024+
2025+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2026+ # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
2027+
2028+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
2029+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
2030+ { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
2031+ { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
2032+ { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
2033+ { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
2034+ { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
2035+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
2036+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
2037+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
2038+ { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
2039+ { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
2040+ { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
2041+ .
2042+ .
2043+ .
2044+ { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
2045+ { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
2046+ { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
2047+ { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
2048+ { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
2049+ { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
2050+ { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
2051+ { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
2052+ { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
2053+ { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
2054+
2055+ Totals:
2056+ Hits: 32133
2057+ Entries: 81
2058+ Dropped: 0
2059+
2060+ To display the offset and size information in addition to the symbol
2061+ name, just use 'sym-offset' instead:
2062+
2063+ # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
2064+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2065+
2066+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2067+ # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
2068+
2069+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
2070+ { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
2071+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
2072+ { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
2073+ { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
2074+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
2075+ { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
2076+ { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
2077+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
2078+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
2079+ { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
2080+ { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
2081+ .
2082+ .
2083+ .
2084+ { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
2085+ { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
2086+ { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
2087+ { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
2088+ { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
2089+ { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
2090+ { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
2091+
2092+ Totals:
2093+ Hits: 26098
2094+ Entries: 64
2095+ Dropped: 0
2096+
2097+ We can also add multiple fields to the 'values' parameter. For
2098+ example, we might want to see the total number of bytes allocated
2099+ alongside bytes requested, and display the result sorted by bytes
2100+ allocated in descending order:
2101+
2102+ # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
2103+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2104+
2105+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2106+ # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
2107+
2108+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
2109+ { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
2110+ { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
2111+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
2112+ { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
2113+ { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
2114+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
2115+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
2116+ { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
2117+ { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
2118+ { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
2119+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
2120+ .
2121+ .
2122+ .
2123+ { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
2124+ { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2125+ { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2126+ { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2127+ { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2128+ { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
2129+ { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
2130+ { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
2131+ { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
2132+ { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
2133+
2134+ Totals:
2135+ Hits: 66598
2136+ Entries: 65
2137+ Dropped: 0
2138+
2139+ Finally, to finish off our kmalloc example, instead of simply having
2140+ the hist trigger display symbolic call_sites, we can have the hist
2141+ trigger additionally display the complete set of kernel stack traces
2142+ that led to each call_site. To do that, we simply use the special
2143+ value 'stacktrace' for the key parameter:
2144+
2145+ # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
2146+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2147+
2148+ The above trigger will use the kernel stack trace in effect when an
2149+ event is triggered as the key for the hash table. This allows the
2150+ enumeration of every kernel callpath that led up to a particular
2151+ event, along with a running total of any of the event fields for
2152+ that event. Here we tally bytes requested and bytes allocated for
2153+ every callpath in the system that led up to a kmalloc (in this case
2154+ every callpath to a kmalloc for a kernel compile):
2155+
2156+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2157+ # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
2158+
2159+ { stacktrace:
2160+ __kmalloc_track_caller+0x10b/0x1a0
2161+ kmemdup+0x20/0x50
2162+ hidraw_report_event+0x8a/0x120 [hid]
2163+ hid_report_raw_event+0x3ea/0x440 [hid]
2164+ hid_input_report+0x112/0x190 [hid]
2165+ hid_irq_in+0xc2/0x260 [usbhid]
2166+ __usb_hcd_giveback_urb+0x72/0x120
2167+ usb_giveback_urb_bh+0x9e/0xe0
2168+ tasklet_hi_action+0xf8/0x100
2169+ __do_softirq+0x114/0x2c0
2170+ irq_exit+0xa5/0xb0
2171+ do_IRQ+0x5a/0xf0
2172+ ret_from_intr+0x0/0x30
2173+ cpuidle_enter+0x17/0x20
2174+ cpu_startup_entry+0x315/0x3e0
2175+ rest_init+0x7c/0x80
2176+ } hitcount: 3 bytes_req: 21 bytes_alloc: 24
2177+ { stacktrace:
2178+ __kmalloc_track_caller+0x10b/0x1a0
2179+ kmemdup+0x20/0x50
2180+ hidraw_report_event+0x8a/0x120 [hid]
2181+ hid_report_raw_event+0x3ea/0x440 [hid]
2182+ hid_input_report+0x112/0x190 [hid]
2183+ hid_irq_in+0xc2/0x260 [usbhid]
2184+ __usb_hcd_giveback_urb+0x72/0x120
2185+ usb_giveback_urb_bh+0x9e/0xe0
2186+ tasklet_hi_action+0xf8/0x100
2187+ __do_softirq+0x114/0x2c0
2188+ irq_exit+0xa5/0xb0
2189+ do_IRQ+0x5a/0xf0
2190+ ret_from_intr+0x0/0x30
2191+ } hitcount: 3 bytes_req: 21 bytes_alloc: 24
2192+ { stacktrace:
2193+ kmem_cache_alloc_trace+0xeb/0x150
2194+ aa_alloc_task_context+0x27/0x40
2195+ apparmor_cred_prepare+0x1f/0x50
2196+ security_prepare_creds+0x16/0x20
2197+ prepare_creds+0xdf/0x1a0
2198+ SyS_capset+0xb5/0x200
2199+ system_call_fastpath+0x12/0x6a
2200+ } hitcount: 1 bytes_req: 32 bytes_alloc: 32
2201+ .
2202+ .
2203+ .
2204+ { stacktrace:
2205+ __kmalloc+0x11b/0x1b0
2206+ i915_gem_execbuffer2+0x6c/0x2c0 [i915]
2207+ drm_ioctl+0x349/0x670 [drm]
2208+ do_vfs_ioctl+0x2f0/0x4f0
2209+ SyS_ioctl+0x81/0xa0
2210+ system_call_fastpath+0x12/0x6a
2211+ } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
2212+ { stacktrace:
2213+ __kmalloc+0x11b/0x1b0
2214+ load_elf_phdrs+0x76/0xa0
2215+ load_elf_binary+0x102/0x1650
2216+ search_binary_handler+0x97/0x1d0
2217+ do_execveat_common.isra.34+0x551/0x6e0
2218+ SyS_execve+0x3a/0x50
2219+ return_from_execve+0x0/0x23
2220+ } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
2221+ { stacktrace:
2222+ kmem_cache_alloc_trace+0xeb/0x150
2223+ apparmor_file_alloc_security+0x27/0x40
2224+ security_file_alloc+0x16/0x20
2225+ get_empty_filp+0x93/0x1c0
2226+ path_openat+0x31/0x5f0
2227+ do_filp_open+0x3a/0x90
2228+ do_sys_open+0x128/0x220
2229+ SyS_open+0x1e/0x20
2230+ system_call_fastpath+0x12/0x6a
2231+ } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
2232+ { stacktrace:
2233+ __kmalloc+0x11b/0x1b0
2234+ seq_buf_alloc+0x1b/0x50
2235+ seq_read+0x2cc/0x370
2236+ proc_reg_read+0x3d/0x80
2237+ __vfs_read+0x28/0xe0
2238+ vfs_read+0x86/0x140
2239+ SyS_read+0x46/0xb0
2240+ system_call_fastpath+0x12/0x6a
2241+ } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
2242+
2243+ Totals:
2244+ Hits: 6085872
2245+ Entries: 253
2246+ Dropped: 0
2247+
2248+ If you key a hist trigger on common_pid, in order for example to
2249+ gather and display sorted totals for each process, you can use the
2250+ special .execname modifier to display the executable names for the
2251+ processes in the table rather than raw pids. The example below
2252+ keeps a per-process sum of total bytes read:
2253+
2254+ # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
2255+ /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
2256+
2257+ # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
2258+ # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
2259+
2260+ { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
2261+ { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
2262+ { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
2263+ { common_pid: bash [ 8710] } hitcount: 3 count: 66369
2264+ { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
2265+ { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
2266+ { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
2267+ { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
2268+ { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
2269+ { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
2270+ { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
2271+ .
2272+ .
2273+ .
2274+ { common_pid: postgres [ 1892] } hitcount: 2 count: 32
2275+ { common_pid: postgres [ 1891] } hitcount: 2 count: 32
2276+ { common_pid: gmain [ 8704] } hitcount: 2 count: 32
2277+ { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
2278+ { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
2279+ { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
2280+ { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
2281+ { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
2282+ { common_pid: init [ 1] } hitcount: 2 count: 2
2283+
2284+ Totals:
2285+ Hits: 2116
2286+ Entries: 51
2287+ Dropped: 0
2288+
2289+ Similarly, if you key a hist trigger on syscall id, for example to
2290+ gather and display a list of systemwide syscall hits, you can use
2291+ the special .syscall modifier to display the syscall names rather
2292+ than raw ids. The example below keeps a running total of syscall
2293+ counts for the system during the run:
2294+
2295+ # echo 'hist:key=id.syscall:val=hitcount' > \
2296+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
2297+
2298+ # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
2299+ # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
2300+
2301+ { id: sys_fsync [ 74] } hitcount: 1
2302+ { id: sys_newuname [ 63] } hitcount: 1
2303+ { id: sys_prctl [157] } hitcount: 1
2304+ { id: sys_statfs [137] } hitcount: 1
2305+ { id: sys_symlink [ 88] } hitcount: 1
2306+ { id: sys_sendmmsg [307] } hitcount: 1
2307+ { id: sys_semctl [ 66] } hitcount: 1
2308+ { id: sys_readlink [ 89] } hitcount: 3
2309+ { id: sys_bind [ 49] } hitcount: 3
2310+ { id: sys_getsockname [ 51] } hitcount: 3
2311+ { id: sys_unlink [ 87] } hitcount: 3
2312+ { id: sys_rename [ 82] } hitcount: 4
2313+ { id: unknown_syscall [ 58] } hitcount: 4
2314+ { id: sys_connect [ 42] } hitcount: 4
2315+ { id: sys_getpid [ 39] } hitcount: 4
2316+ .
2317+ .
2318+ .
2319+ { id: sys_rt_sigprocmask [ 14] } hitcount: 952
2320+ { id: sys_futex [202] } hitcount: 1534
2321+ { id: sys_write [ 1] } hitcount: 2689
2322+ { id: sys_setitimer [ 38] } hitcount: 2797
2323+ { id: sys_read [ 0] } hitcount: 3202
2324+ { id: sys_select [ 23] } hitcount: 3773
2325+ { id: sys_writev [ 20] } hitcount: 4531
2326+ { id: sys_poll [ 7] } hitcount: 8314
2327+ { id: sys_recvmsg [ 47] } hitcount: 13738
2328+ { id: sys_ioctl [ 16] } hitcount: 21843
2329+
2330+ Totals:
2331+ Hits: 67612
2332+ Entries: 72
2333+ Dropped: 0
2334+
2335+ The syscall counts above provide a rough overall picture of system
2336+ call activity on the system; we can see for example that the most
2337+ popular system call on this system was the 'sys_ioctl' system call.
2338+
2339+ We can use 'compound' keys to refine that number and provide some
2340+ further insight as to which processes exactly contribute to the
2341+ overall ioctl count.
2342+
2343+ The command below keeps a hitcount for every unique combination of
2344+ system call id and pid - the end result is essentially a table
2345+ that keeps a per-pid sum of system call hits. The results are
2346+ sorted using the system call id as the primary key, and the
2347+ hitcount sum as the secondary key:
2348+
2349+ # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
2350+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
2351+
2352+ # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
2353+ # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
2354+
2355+ { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
2356+ { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
2357+ { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
2358+ { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
2359+ { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
2360+ { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
2361+ { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
2362+ { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
2363+ { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
2364+ { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
2365+ .
2366+ .
2367+ .
2368+ { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
2369+ { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
2370+ { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
2371+ { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
2372+ { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
2373+ .
2374+ .
2375+ .
2376+ { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
2377+ { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
2378+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
2379+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
2380+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
2381+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
2382+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
2383+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
2384+ { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
2385+ { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
2386+ { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
2387+
2388+ Totals:
2389+ Hits: 31536
2390+ Entries: 323
2391+ Dropped: 0
2392+
2393+ The above list does give us a breakdown of the ioctl syscall by
2394+ pid, but it also gives us quite a bit more than that, which we
2395+ don't really care about at the moment. Since we know the syscall
2396+ id for sys_ioctl (16, displayed next to the sys_ioctl name), we
2397+ can use that to filter out all the other syscalls:
2398+
2399+ # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
2400+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
2401+
2402+ # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
2403+ # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
2404+
2405+ { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
2406+ { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
2407+ { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
2408+ { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
2409+ { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
2410+ { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
2411+ { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
2412+ { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
2413+ { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
2414+ .
2415+ .
2416+ .
2417+ { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
2418+ { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
2419+ { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
2420+ { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
2421+ { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
2422+ { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
2423+
2424+ Totals:
2425+ Hits: 101162
2426+ Entries: 103
2427+ Dropped: 0
2428+
2429+ The above output shows that 'compiz' and 'Xorg' are far and away
2430+ the heaviest ioctl callers (which might lead to questions about
2431+ whether they really need to be making all those calls and to
2432+  possible avenues for further investigation).
2433+
2434+ The compound key examples used a key and a sum value (hitcount) to
2435+ sort the output, but we can just as easily use two keys instead.
2436+  Here's an example where we use a compound key composed of the
2437+ common_pid and size event fields. Sorting with pid as the primary
2438+ key and 'size' as the secondary key allows us to display an
2439+ ordered summary of the recvfrom sizes, with counts, received by
2440+ each process:
2441+
2442+ # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
2443+ /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
2444+
2445+ # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
2446+ # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
2447+
2448+ { common_pid: smbd [ 784], size: 4 } hitcount: 1
2449+ { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
2450+ { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
2451+ { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
2452+ { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
2453+ { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
2454+ { common_pid: compiz [ 2994], size: 8 } hitcount: 1
2455+ { common_pid: compiz [ 2994], size: 20 } hitcount: 11
2456+ { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
2457+ { common_pid: firefox [ 8817], size: 4 } hitcount: 1
2458+ { common_pid: firefox [ 8817], size: 8 } hitcount: 5
2459+ { common_pid: firefox [ 8817], size: 588 } hitcount: 2
2460+ { common_pid: firefox [ 8817], size: 628 } hitcount: 1
2461+ { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
2462+ { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
2463+ { common_pid: firefox [ 8822], size: 8 } hitcount: 2
2464+ { common_pid: firefox [ 8822], size: 160 } hitcount: 2
2465+ { common_pid: firefox [ 8822], size: 320 } hitcount: 2
2466+ { common_pid: firefox [ 8822], size: 352 } hitcount: 1
2467+ .
2468+ .
2469+ .
2470+ { common_pid: pool [ 8923], size: 1960 } hitcount: 10
2471+ { common_pid: pool [ 8923], size: 2048 } hitcount: 10
2472+ { common_pid: pool [ 8924], size: 1960 } hitcount: 10
2473+ { common_pid: pool [ 8924], size: 2048 } hitcount: 10
2474+ { common_pid: pool [ 8928], size: 1964 } hitcount: 4
2475+ { common_pid: pool [ 8928], size: 1965 } hitcount: 2
2476+ { common_pid: pool [ 8928], size: 2048 } hitcount: 6
2477+ { common_pid: pool [ 8929], size: 1982 } hitcount: 1
2478+ { common_pid: pool [ 8929], size: 2048 } hitcount: 1
2479+
2480+ Totals:
2481+ Hits: 2016
2482+ Entries: 224
2483+ Dropped: 0
2484+
2485+ The above example also illustrates the fact that although a compound
2486+ key is treated as a single entity for hashing purposes, the sub-keys
2487+ it's composed of can be accessed independently.
2488+
2489+ The next example uses a string field as the hash key and
2490+ demonstrates how you can manually pause and continue a hist trigger.
2491+ In this example, we'll aggregate fork counts and don't expect a
2492+ large number of entries in the hash table, so we'll drop it to a
2493+ much smaller number, say 256:
2494+
2495+ # echo 'hist:key=child_comm:val=hitcount:size=256' > \
2496+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
2497+
2498+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
2499+ # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
2500+
2501+ { child_comm: dconf worker } hitcount: 1
2502+ { child_comm: ibus-daemon } hitcount: 1
2503+ { child_comm: whoopsie } hitcount: 1
2504+ { child_comm: smbd } hitcount: 1
2505+ { child_comm: gdbus } hitcount: 1
2506+ { child_comm: kthreadd } hitcount: 1
2507+ { child_comm: dconf worker } hitcount: 1
2508+ { child_comm: evolution-alarm } hitcount: 2
2509+ { child_comm: Socket Thread } hitcount: 2
2510+ { child_comm: postgres } hitcount: 2
2511+ { child_comm: bash } hitcount: 3
2512+ { child_comm: compiz } hitcount: 3
2513+ { child_comm: evolution-sourc } hitcount: 4
2514+ { child_comm: dhclient } hitcount: 4
2515+ { child_comm: pool } hitcount: 5
2516+ { child_comm: nm-dispatcher.a } hitcount: 8
2517+ { child_comm: firefox } hitcount: 8
2518+ { child_comm: dbus-daemon } hitcount: 8
2519+ { child_comm: glib-pacrunner } hitcount: 10
2520+ { child_comm: evolution } hitcount: 23
2521+
2522+ Totals:
2523+ Hits: 89
2524+ Entries: 20
2525+ Dropped: 0
2526+
2527+ If we want to pause the hist trigger, we can simply append :pause to
2528+ the command that started the trigger. Notice that the trigger info
2529+ displays as [paused]:
2530+
2531+ # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
2532+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
2533+
2534+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
2535+ # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
2536+
2537+ { child_comm: dconf worker } hitcount: 1
2538+ { child_comm: kthreadd } hitcount: 1
2539+ { child_comm: dconf worker } hitcount: 1
2540+ { child_comm: gdbus } hitcount: 1
2541+ { child_comm: ibus-daemon } hitcount: 1
2542+ { child_comm: Socket Thread } hitcount: 2
2543+ { child_comm: evolution-alarm } hitcount: 2
2544+ { child_comm: smbd } hitcount: 2
2545+ { child_comm: bash } hitcount: 3
2546+ { child_comm: whoopsie } hitcount: 3
2547+ { child_comm: compiz } hitcount: 3
2548+ { child_comm: evolution-sourc } hitcount: 4
2549+ { child_comm: pool } hitcount: 5
2550+ { child_comm: postgres } hitcount: 6
2551+ { child_comm: firefox } hitcount: 8
2552+ { child_comm: dhclient } hitcount: 10
2553+ { child_comm: emacs } hitcount: 12
2554+ { child_comm: dbus-daemon } hitcount: 20
2555+ { child_comm: nm-dispatcher.a } hitcount: 20
2556+ { child_comm: evolution } hitcount: 35
2557+ { child_comm: glib-pacrunner } hitcount: 59
2558+
2559+ Totals:
2560+ Hits: 199
2561+ Entries: 21
2562+ Dropped: 0
2563+
2564+ To manually continue having the trigger aggregate events, append
2565+ :cont instead. Notice that the trigger info displays as [active]
2566+ again, and the data has changed:
2567+
2568+ # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
2569+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
2570+
2571+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
2572+ # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
2573+
2574+ { child_comm: dconf worker } hitcount: 1
2575+ { child_comm: dconf worker } hitcount: 1
2576+ { child_comm: kthreadd } hitcount: 1
2577+ { child_comm: gdbus } hitcount: 1
2578+ { child_comm: ibus-daemon } hitcount: 1
2579+ { child_comm: Socket Thread } hitcount: 2
2580+ { child_comm: evolution-alarm } hitcount: 2
2581+ { child_comm: smbd } hitcount: 2
2582+ { child_comm: whoopsie } hitcount: 3
2583+ { child_comm: compiz } hitcount: 3
2584+ { child_comm: evolution-sourc } hitcount: 4
2585+ { child_comm: bash } hitcount: 5
2586+ { child_comm: pool } hitcount: 5
2587+ { child_comm: postgres } hitcount: 6
2588+ { child_comm: firefox } hitcount: 8
2589+ { child_comm: dhclient } hitcount: 11
2590+ { child_comm: emacs } hitcount: 12
2591+ { child_comm: dbus-daemon } hitcount: 22
2592+ { child_comm: nm-dispatcher.a } hitcount: 22
2593+ { child_comm: evolution } hitcount: 35
2594+ { child_comm: glib-pacrunner } hitcount: 59
2595+
2596+ Totals:
2597+ Hits: 206
2598+ Entries: 21
2599+ Dropped: 0
2600+
2601+ The previous example showed how to start and stop a hist trigger by
2602+ appending 'pause' and 'continue' to the hist trigger command. A
2603+ hist trigger can also be started in a paused state by initially
2604+ starting the trigger with ':pause' appended. This allows you to
2605+ start the trigger only when you're ready to start collecting data
2606+ and not before. For example, you could start the trigger in a
2607+ paused state, then unpause it and do something you want to measure,
2608+ then pause the trigger again when done.
2609+
2610+ Of course, doing this manually can be difficult and error-prone, but
2611+ it is possible to automatically start and stop a hist trigger based
2612+ on some condition, via the enable_hist and disable_hist triggers.
2613+
2614+ For example, suppose we wanted to take a look at the relative
2615+ weights in terms of skb length for each callpath that leads to a
2616+  netif_receive_skb event when downloading a decent-sized file using
2617+ wget.
2618+
2619+ First we set up an initially paused stacktrace trigger on the
2620+ netif_receive_skb event:
2621+
2622+ # echo 'hist:key=stacktrace:vals=len:pause' > \
2623+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2624+
2625+ Next, we set up an 'enable_hist' trigger on the sched_process_exec
2626+ event, with an 'if filename==/usr/bin/wget' filter. The effect of
2627+ this new trigger is that it will 'unpause' the hist trigger we just
2628+ set up on netif_receive_skb if and only if it sees a
2629+ sched_process_exec event with a filename of '/usr/bin/wget'. When
2630+ that happens, all netif_receive_skb events are aggregated into a
2631+ hash table keyed on stacktrace:
2632+
2633+ # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
2634+ /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
2635+
2636+  The aggregation continues until the netif_receive_skb hist trigger is
2637+  paused again, which is what the following disable_hist trigger does by
2638+ creating a similar setup on the sched_process_exit event, using the
2639+ filter 'comm==wget':
2640+
2641+ # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
2642+ /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
2643+
2644+  Whenever a process exits and its comm field matches the disable_hist
2645+  trigger's 'comm==wget' filter, the netif_receive_skb hist trigger is
2646+  disabled.
2647+
2648+ The overall effect is that netif_receive_skb events are aggregated
2649+ into the hash table for only the duration of the wget. Executing a
2650+ wget command and then listing the 'hist' file will display the
2651+ output generated by the wget command:
2652+
2653+ $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
2654+
2655+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
2656+ # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
2657+
2658+ { stacktrace:
2659+ __netif_receive_skb_core+0x46d/0x990
2660+ __netif_receive_skb+0x18/0x60
2661+ netif_receive_skb_internal+0x23/0x90
2662+ napi_gro_receive+0xc8/0x100
2663+ ieee80211_deliver_skb+0xd6/0x270 [mac80211]
2664+ ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
2665+ ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
2666+ ieee80211_rx+0x31d/0x900 [mac80211]
2667+ iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
2668+ iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
2669+ iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
2670+ irq_thread_fn+0x20/0x50
2671+ irq_thread+0x11f/0x150
2672+ kthread+0xd2/0xf0
2673+ ret_from_fork+0x42/0x70
2674+ } hitcount: 85 len: 28884
2675+ { stacktrace:
2676+ __netif_receive_skb_core+0x46d/0x990
2677+ __netif_receive_skb+0x18/0x60
2678+ netif_receive_skb_internal+0x23/0x90
2679+ napi_gro_complete+0xa4/0xe0
2680+ dev_gro_receive+0x23a/0x360
2681+ napi_gro_receive+0x30/0x100
2682+ ieee80211_deliver_skb+0xd6/0x270 [mac80211]
2683+ ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
2684+ ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
2685+ ieee80211_rx+0x31d/0x900 [mac80211]
2686+ iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
2687+ iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
2688+ iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
2689+ irq_thread_fn+0x20/0x50
2690+ irq_thread+0x11f/0x150
2691+ kthread+0xd2/0xf0
2692+ } hitcount: 98 len: 664329
2693+ { stacktrace:
2694+ __netif_receive_skb_core+0x46d/0x990
2695+ __netif_receive_skb+0x18/0x60
2696+ process_backlog+0xa8/0x150
2697+ net_rx_action+0x15d/0x340
2698+ __do_softirq+0x114/0x2c0
2699+ do_softirq_own_stack+0x1c/0x30
2700+ do_softirq+0x65/0x70
2701+ __local_bh_enable_ip+0xb5/0xc0
2702+ ip_finish_output+0x1f4/0x840
2703+ ip_output+0x6b/0xc0
2704+ ip_local_out_sk+0x31/0x40
2705+ ip_send_skb+0x1a/0x50
2706+ udp_send_skb+0x173/0x2a0
2707+ udp_sendmsg+0x2bf/0x9f0
2708+ inet_sendmsg+0x64/0xa0
2709+ sock_sendmsg+0x3d/0x50
2710+ } hitcount: 115 len: 13030
2711+ { stacktrace:
2712+ __netif_receive_skb_core+0x46d/0x990
2713+ __netif_receive_skb+0x18/0x60
2714+ netif_receive_skb_internal+0x23/0x90
2715+ napi_gro_complete+0xa4/0xe0
2716+ napi_gro_flush+0x6d/0x90
2717+ iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
2718+ irq_thread_fn+0x20/0x50
2719+ irq_thread+0x11f/0x150
2720+ kthread+0xd2/0xf0
2721+ ret_from_fork+0x42/0x70
2722+ } hitcount: 934 len: 5512212
2723+
2724+ Totals:
2725+ Hits: 1232
2726+ Entries: 4
2727+ Dropped: 0
2728+
2729+ The above shows all the netif_receive_skb callpaths and their total
2730+ lengths for the duration of the wget command.
2731+
2732+ The 'clear' hist trigger param can be used to clear the hash table.
2733+ Suppose we wanted to try another run of the previous example but
2734+ this time also wanted to see the complete list of events that went
2735+ into the histogram. In order to avoid having to set everything up
2736+ again, we can just clear the histogram first:
2737+
2738+ # echo 'hist:key=stacktrace:vals=len:clear' >> \
2739+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2740+
2741+ Just to verify that it is in fact cleared, here's what we now see in
2742+ the hist file:
2743+
2744+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
2745+ # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
2746+
2747+ Totals:
2748+ Hits: 0
2749+ Entries: 0
2750+ Dropped: 0
2751+
2752+ Since we want to see the detailed list of every netif_receive_skb
2753+  event occurring during the new run - these are in fact the same
2754+  events being aggregated into the hash table - we add additional
2755+  'enable_event' and 'disable_event' triggers to the triggering
2756+  sched_process_exec and sched_process_exit events, as follows:
2757+
2758+ # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
2759+ /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
2760+
2761+ # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
2762+ /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
2763+
2764+ If you read the trigger files for the sched_process_exec and
2765+  sched_process_exit events, you should see two triggers for each:
2766+ one enabling/disabling the hist aggregation and the other
2767+ enabling/disabling the logging of events:
2768+
2769+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
2770+ enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
2771+ enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
2772+
2773+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
2774+ enable_event:net:netif_receive_skb:unlimited if comm==wget
2775+ disable_hist:net:netif_receive_skb:unlimited if comm==wget
2776+
2777+ In other words, whenever either of the sched_process_exec or
2778+ sched_process_exit events is hit and matches 'wget', it enables or
2779+ disables both the histogram and the event log, and what you end up
2780+ with is a hash table and set of events just covering the specified
2781+ duration. Run the wget command again:
2782+
2783+ $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
2784+
2785+ Displaying the 'hist' file should show something similar to what you
2786+ saw in the last run, but this time you should also see the
2787+ individual events in the trace file:
2788+
2789+ # cat /sys/kernel/debug/tracing/trace
2790+
2791+ # tracer: nop
2792+ #
2793+ # entries-in-buffer/entries-written: 183/1426 #P:4
2794+ #
2795+ # _-----=> irqs-off
2796+ # / _----=> need-resched
2797+ # | / _---=> hardirq/softirq
2798+ # || / _--=> preempt-depth
2799+ # ||| / delay
2800+ # TASK-PID CPU# |||| TIMESTAMP FUNCTION
2801+ # | | | |||| | |
2802+ wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
2803+ wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
2804+ dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
2805+ dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
2806+ ##### CPU 2 buffer started ####
2807+ irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
2808+ irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
2809+ irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
2810+ irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
2811+ irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
2812+ .
2813+ .
2814+ .
2815+
2816+ The following example demonstrates how multiple hist triggers can be
2817+ attached to a given event. This capability can be useful for
2818+ creating a set of different summaries derived from the same set of
2819+ events, or for comparing the effects of different filters, among
2820+ other things.
2821+
2822+ # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
2823+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2824+ # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
2825+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2826+ # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
2827+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2828+ # echo 'hist:keys=skbaddr.hex:vals=len' >> \
2829+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2830+ # echo 'hist:keys=len:vals=common_preempt_count' >> \
2831+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2832+
2833+  The above set of commands creates four triggers differing only in
2834+ their filters, along with a completely different though fairly
2835+ nonsensical trigger. Note that in order to append multiple hist
2836+ triggers to the same file, you should use the '>>' operator to
2837+ append them ('>' will also add the new hist trigger, but will remove
2838+ any existing hist triggers beforehand).
2839+
2840+ Displaying the contents of the 'hist' file for the event shows the
2841+ contents of all five histograms:
2842+
2843+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
2844+
2845+ # event histogram
2846+ #
2847+ # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
2848+ #
2849+
2850+ { len: 176 } hitcount: 1 common_preempt_count: 0
2851+ { len: 223 } hitcount: 1 common_preempt_count: 0
2852+ { len: 4854 } hitcount: 1 common_preempt_count: 0
2853+ { len: 395 } hitcount: 1 common_preempt_count: 0
2854+ { len: 177 } hitcount: 1 common_preempt_count: 0
2855+ { len: 446 } hitcount: 1 common_preempt_count: 0
2856+ { len: 1601 } hitcount: 1 common_preempt_count: 0
2857+ .
2858+ .
2859+ .
2860+ { len: 1280 } hitcount: 66 common_preempt_count: 0
2861+ { len: 116 } hitcount: 81 common_preempt_count: 40
2862+ { len: 708 } hitcount: 112 common_preempt_count: 0
2863+ { len: 46 } hitcount: 221 common_preempt_count: 0
2864+ { len: 1264 } hitcount: 458 common_preempt_count: 0
2865+
2866+ Totals:
2867+ Hits: 1428
2868+ Entries: 147
2869+ Dropped: 0
2870+
2871+
2872+ # event histogram
2873+ #
2874+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
2875+ #
2876+
2877+ { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
2878+ { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
2879+ { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
2880+ { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
2881+ { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
2882+ { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
2883+ { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
2884+ { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
2885+ { skbaddr: ffff880100065900 } hitcount: 1 len: 46
2886+ { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
2887+ { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
2888+ { skbaddr: ffff880100064700 } hitcount: 1 len: 365
2889+ { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
2890+ .
2891+ .
2892+ .
2893+ { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
2894+ { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
2895+ { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
2896+ { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
2897+ { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
2898+ { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
2899+ { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
2900+ { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
2901+ { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
2902+
2903+ Totals:
2904+ Hits: 1451
2905+ Entries: 318
2906+ Dropped: 0
2907+
2908+
2909+ # event histogram
2910+ #
2911+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
2912+ #
2913+
2914+
2915+ Totals:
2916+ Hits: 0
2917+ Entries: 0
2918+ Dropped: 0
2919+
2920+
2921+ # event histogram
2922+ #
2923+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
2924+ #
2925+
2926+ { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
2927+ { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
2928+ { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
2929+ { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
2930+ { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
2931+ { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
2932+ { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
2933+ { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
2934+ { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
2935+ { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
2936+ { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
2937+ { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
2938+
2939+ Totals:
2940+ Hits: 14
2941+ Entries: 12
2942+ Dropped: 0
2943+
2944+
2945+ # event histogram
2946+ #
2947+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
2948+ #
2949+
2950+
2951+ Totals:
2952+ Hits: 0
2953+ Entries: 0
2954+ Dropped: 0
2955+
2956+ Named triggers can be used to have triggers share a common set of
2957+ histogram data. This capability is mostly useful for combining the
2958+ output of events generated by tracepoints contained inside inline
2959+ functions, but names can be used in a hist trigger on any event.
2960+ For example, these two triggers when hit will update the same 'len'
2961+ field in the shared 'foo' histogram data:
2962+
2963+ # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
2964+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2965+ # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
2966+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
2967+
2968+ You can see that they're updating common histogram data by reading
2969+ each event's hist files at the same time:
2970+
2971+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
2972+ cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
2973+
2974+ # event histogram
2975+ #
2976+ # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
2977+ #
2978+
2979+ { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
2980+ { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
2981+ { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
2982+ { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
2983+ { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
2984+ { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
2985+ { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
2986+ { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
2987+ { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
2988+ { skbaddr: ffff880064505000 } hitcount: 1 len: 46
2989+ { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
2990+ { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
2991+ { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
2992+ { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
2993+ { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
2994+ { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
2995+ { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
2996+ { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
2997+ { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
2998+ { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
2999+ { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
3000+ { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
3001+ { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
3002+ { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
3003+ { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
3004+ { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
3005+ { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
3006+ { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
3007+ { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
3008+ { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
3009+ { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
3010+ { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
3011+ { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
3012+ { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
3013+ { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
3014+ { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
3015+ { skbaddr: ffff880064504400 } hitcount: 4 len: 184
3016+ { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
3017+ { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
3018+ { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
3019+ { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
3020+ { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
3021+
3022+ Totals:
3023+ Hits: 81
3024+ Entries: 42
3025+ Dropped: 0
3026+ # event histogram
3027+ #
3028+ # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
3029+ #
3030+
3031+ { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
3032+ { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
3033+ { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
3034+ { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
3035+ { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
3036+ { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
3037+ { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
3038+ { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
3039+ { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
3040+ { skbaddr: ffff880064505000 } hitcount: 1 len: 46
3041+ { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
3042+ { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
3043+ { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
3044+ { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
3045+ { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
3046+ { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
3047+ { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
3048+ { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
3049+ { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
3050+ { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
3051+ { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
3052+ { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
3053+ { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
3054+ { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
3055+ { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
3056+ { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
3057+ { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
3058+ { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
3059+ { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
3060+ { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
3061+ { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
3062+ { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
3063+ { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
3064+ { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
3065+ { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
3066+ { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
3067+ { skbaddr: ffff880064504400 } hitcount: 4 len: 184
3068+ { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
3069+ { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
3070+ { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
3071+ { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
3072+ { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
3073+
3074+ Totals:
3075+ Hits: 81
3076+ Entries: 42
3077+ Dropped: 0
3078+
3079+ And here's an example that shows how to combine histogram data from
3080+ any two events even if they don't share any 'compatible' fields
3081+ other than 'hitcount' and 'stacktrace'. These commands create a
3082+ couple of triggers named 'bar' using those fields:
3083+
3084+ # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
3085+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
3086+ # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
3087+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
3088+
3089+  And displaying either one shows some interesting if somewhat
3090+  confusing output:
3091+
3092+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
3093+ # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
3094+
3095+ # event histogram
3096+ #
3097+ # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
3098+ #
3099+
3100+ { stacktrace:
3101+ _do_fork+0x18e/0x330
3102+ kernel_thread+0x29/0x30
3103+ kthreadd+0x154/0x1b0
3104+ ret_from_fork+0x3f/0x70
3105+ } hitcount: 1
3106+ { stacktrace:
3107+ netif_rx_internal+0xb2/0xd0
3108+ netif_rx_ni+0x20/0x70
3109+ dev_loopback_xmit+0xaa/0xd0
3110+ ip_mc_output+0x126/0x240
3111+ ip_local_out_sk+0x31/0x40
3112+ igmp_send_report+0x1e9/0x230
3113+ igmp_timer_expire+0xe9/0x120
3114+ call_timer_fn+0x39/0xf0
3115+ run_timer_softirq+0x1e1/0x290
3116+ __do_softirq+0xfd/0x290
3117+ irq_exit+0x98/0xb0
3118+ smp_apic_timer_interrupt+0x4a/0x60
3119+ apic_timer_interrupt+0x6d/0x80
3120+ cpuidle_enter+0x17/0x20
3121+ call_cpuidle+0x3b/0x60
3122+ cpu_startup_entry+0x22d/0x310
3123+ } hitcount: 1
3124+ { stacktrace:
3125+ netif_rx_internal+0xb2/0xd0
3126+ netif_rx_ni+0x20/0x70
3127+ dev_loopback_xmit+0xaa/0xd0
3128+ ip_mc_output+0x17f/0x240
3129+ ip_local_out_sk+0x31/0x40
3130+ ip_send_skb+0x1a/0x50
3131+ udp_send_skb+0x13e/0x270
3132+ udp_sendmsg+0x2bf/0x980
3133+ inet_sendmsg+0x67/0xa0
3134+ sock_sendmsg+0x38/0x50
3135+ SYSC_sendto+0xef/0x170
3136+ SyS_sendto+0xe/0x10
3137+ entry_SYSCALL_64_fastpath+0x12/0x6a
3138+ } hitcount: 2
3139+ { stacktrace:
3140+ netif_rx_internal+0xb2/0xd0
3141+ netif_rx+0x1c/0x60
3142+ loopback_xmit+0x6c/0xb0
3143+ dev_hard_start_xmit+0x219/0x3a0
3144+ __dev_queue_xmit+0x415/0x4f0
3145+ dev_queue_xmit_sk+0x13/0x20
3146+ ip_finish_output2+0x237/0x340
3147+ ip_finish_output+0x113/0x1d0
3148+ ip_output+0x66/0xc0
3149+ ip_local_out_sk+0x31/0x40
3150+ ip_send_skb+0x1a/0x50
3151+ udp_send_skb+0x16d/0x270
3152+ udp_sendmsg+0x2bf/0x980
3153+ inet_sendmsg+0x67/0xa0
3154+ sock_sendmsg+0x38/0x50
3155+ ___sys_sendmsg+0x14e/0x270
3156+ } hitcount: 76
3157+ { stacktrace:
3158+ netif_rx_internal+0xb2/0xd0
3159+ netif_rx+0x1c/0x60
3160+ loopback_xmit+0x6c/0xb0
3161+ dev_hard_start_xmit+0x219/0x3a0
3162+ __dev_queue_xmit+0x415/0x4f0
3163+ dev_queue_xmit_sk+0x13/0x20
3164+ ip_finish_output2+0x237/0x340
3165+ ip_finish_output+0x113/0x1d0
3166+ ip_output+0x66/0xc0
3167+ ip_local_out_sk+0x31/0x40
3168+ ip_send_skb+0x1a/0x50
3169+ udp_send_skb+0x16d/0x270
3170+ udp_sendmsg+0x2bf/0x980
3171+ inet_sendmsg+0x67/0xa0
3172+ sock_sendmsg+0x38/0x50
3173+ ___sys_sendmsg+0x269/0x270
3174+ } hitcount: 77
3175+ { stacktrace:
3176+ netif_rx_internal+0xb2/0xd0
3177+ netif_rx+0x1c/0x60
3178+ loopback_xmit+0x6c/0xb0
3179+ dev_hard_start_xmit+0x219/0x3a0
3180+ __dev_queue_xmit+0x415/0x4f0
3181+ dev_queue_xmit_sk+0x13/0x20
3182+ ip_finish_output2+0x237/0x340
3183+ ip_finish_output+0x113/0x1d0
3184+ ip_output+0x66/0xc0
3185+ ip_local_out_sk+0x31/0x40
3186+ ip_send_skb+0x1a/0x50
3187+ udp_send_skb+0x16d/0x270
3188+ udp_sendmsg+0x2bf/0x980
3189+ inet_sendmsg+0x67/0xa0
3190+ sock_sendmsg+0x38/0x50
3191+ SYSC_sendto+0xef/0x170
3192+ } hitcount: 88
3193+ { stacktrace:
3194+ _do_fork+0x18e/0x330
3195+ SyS_clone+0x19/0x20
3196+ entry_SYSCALL_64_fastpath+0x12/0x6a
3197+ } hitcount: 244
3198+
3199+ Totals:
3200+ Hits: 489
3201+ Entries: 7
3202+ Dropped: 0
3203+
3204+
3205+2.2 Inter-event hist triggers
3206+-----------------------------
3207+
3208+Inter-event hist triggers are hist triggers that combine values from
3209+one or more other events and create a histogram using that data. Data
3210+from an inter-event histogram can in turn become the source for
3211+further combined histograms, thus providing a chain of related
3212+histograms, which is important for some applications.
3213+
3214+The most important example of an inter-event quantity that can be used
3215+in this manner is latency, which is simply a difference in timestamps
3216+between two events. Although latency is the most important
3217+inter-event quantity, note that because the support is completely
3218+general across the trace event subsystem, any event field can be used
3219+in an inter-event quantity.
3220+
3221+An example of a histogram that combines data from other histograms
3222+into a useful chain would be a 'wakeupswitch latency' histogram that
3223+combines a 'wakeup latency' histogram and a 'switch latency'
3224+histogram.
3225+
3226+Normally, a hist trigger specification consists of a (possibly
3227+compound) key along with one or more numeric values, which are
3228+continually updated sums associated with that key. A histogram
3229+specification in this case consists of individual key and value
3230+specifications that refer to trace event fields associated with a
3231+single event type.
3232+
3233+The inter-event hist trigger extension allows fields from multiple
3234+events to be referenced and combined into a multi-event histogram
3235+specification. In support of this overall goal, a few enabling
3236+features have been added to the hist trigger support:
3237+
3238+ - In order to compute an inter-event quantity, a value from one
3239+    event needs to be saved and then referenced from another event. This
3240+ requires the introduction of support for histogram 'variables'.
3241+
3242+ - The computation of inter-event quantities and their combination
3243+ require some minimal amount of support for applying simple
3244+ expressions to variables (+ and -).
3245+
3246+ - A histogram consisting of inter-event quantities isn't logically a
3247+ histogram on either event (so having the 'hist' file for either
3248+ event host the histogram output doesn't really make sense). To
3249+ address the idea that the histogram is associated with a
3250+ combination of events, support is added allowing the creation of
3251+ 'synthetic' events that are events derived from other events.
3252+ These synthetic events are full-fledged events just like any other
3253+ and can be used as such, as for instance to create the
3254+ 'combination' histograms mentioned previously.
3255+
3256+ - A set of 'actions' can be associated with histogram entries -
3257+ these can be used to generate the previously mentioned synthetic
3258+ events, but can also be used for other purposes, such as for
3259+ example saving context when a 'max' latency has been hit.
3260+
3261+ - Trace events don't have a 'timestamp' associated with them, but
3262+ there is an implicit timestamp saved along with an event in the
3263+ underlying ftrace ring buffer. This timestamp is now exposed as a
3264+    synthetic field named 'common_timestamp' which can be used in
3265+ histograms as if it were any other event field; it isn't an actual
3266+ field in the trace format but rather is a synthesized value that
3267+ nonetheless can be used as if it were an actual field. By default
3268+ it is in units of nanoseconds; appending '.usecs' to a
3269+ common_timestamp field changes the units to microseconds.
3270+
3271+A note on inter-event timestamps: If common_timestamp is used in a
3272+histogram, the trace buffer is automatically switched over to using
3273+absolute timestamps and the "global" trace clock, in order to avoid
3274+bogus timestamp differences with other clocks that aren't coherent
3275+across CPUs. This can be overridden by specifying one of the other
3276+trace clocks instead, using the "clock=XXX" hist trigger attribute,
3277+where XXX is any of the clocks listed in the tracing/trace_clock
3278+pseudo-file.
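+
+For example, a minimal sketch of the override syntax (not one of the
+examples in this document; it uses the variable syntax described in
+section 2.2.1 below and assumes 'mono' is among the clocks listed in
+the tracing/trace_clock pseudo-file):
+
+  # echo 'hist:keys=pid:ts0=common_timestamp.usecs:clock=mono ...' >> \
+          event/trigger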
3279+
3280+These features are described in more detail in the following sections.
3281+
3282+2.2.1 Histogram Variables
3283+-------------------------
3284+
3285+Variables are simply named locations used for saving and retrieving
3286+values between matching events. A 'matching' event is defined as an
3287+event that has a matching key - if a variable is saved for a histogram
3288+entry corresponding to that key, any subsequent event with a matching
3289+key can access that variable.
3290+
3291+A variable's value is normally available to any subsequent event until
3292+it is set to something else by a subsequent event. The one exception
3293+to that rule is that any variable used in an expression is essentially
3294+'read-once' - once it's used by an expression in a subsequent event,
3295+it's reset to its 'unset' state, which means it can't be used again
3296+unless it's set again. This ensures not only that an event doesn't
3297+use an uninitialized variable in a calculation, but that that variable
3298+is used only once and not for any unrelated subsequent match.
3299+
3300+The basic syntax for saving a variable is to simply prefix any event
3301+field with a unique variable name (one not corresponding to any
3302+keyword) and an '=' sign.
3303+
3304+Either keys or values can be saved and retrieved in this way. This
3305+creates a variable named 'ts0' for a histogram entry with the key
3306+'next_pid':
3307+
3308+ # echo 'hist:keys=next_pid:vals=$ts0:ts0=common_timestamp ... >> \
3309+ event/trigger
3310+
3311+The ts0 variable can be accessed by any subsequent event having the
3312+same pid as 'next_pid'.
3313+
3314+Variable references are formed by prepending the variable name with
3315+the '$' sign. Thus for example, the ts0 variable above would be
3316+referenced as '$ts0' in expressions.
3317+
3318+Because 'vals=' is used, the common_timestamp variable value above
3319+will also be summed as a normal histogram value would (though for a
3320+timestamp it makes little sense).
3321+
3322+The below shows that a key value can also be saved in the same way:
3323+
3324+ # echo 'hist:timer_pid=common_pid:key=timer_pid ...' >> event/trigger
3325+
3326+If a variable isn't a key variable or prefixed with 'vals=', the
3327+associated event field will be saved in a variable but won't be summed
3328+as a value:
3329+
3330+ # echo 'hist:keys=next_pid:ts1=common_timestamp ... >> event/trigger
3331+
3332+Multiple variables can be assigned at the same time. The below would
3333+result in both ts0 and b being created as variables, with both
3334+common_timestamp and field1 additionally being summed as values:
3335+
3336+ # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ... >> \
3337+ event/trigger
3338+
3339+Note that variable assignments can appear either preceding or
3340+following their use. The command below behaves identically to the
3341+command above:
3342+
3343+ # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ... >> \
3344+ event/trigger
3345+
3346+Any number of variables not bound to a 'vals=' prefix can also be
3347+assigned by simply separating them with colons. Below is the same
3348+thing but without the values being summed in the histogram:
3349+
3350+ # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ... >> event/trigger
3351+
3352+Variables set as above can be referenced and used in expressions on
3353+another event.
3354+
3355+For example, here's how a latency can be calculated:
3356+
3357+ # echo 'hist:keys=pid,prio:ts0=common_timestamp ... >> event1/trigger
3358+ # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ... >> event2/trigger
3359+
3360+In the first line above, the event's timestamp is saved into the
3361+variable ts0. In the next line, ts0 is subtracted from the second
3362+event's timestamp to produce the latency, which is then assigned into
3363+yet another variable, 'wakeup_lat'. The hist trigger below in turn
3364+makes use of the wakeup_lat variable to compute a combined latency
3365+using the same key and variable from yet another event:
3366+
3367+ # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ... >> event3/trigger
3368+
3369+2.2.2 Synthetic Events
3370+----------------------
3371+
3372+Synthetic events are user-defined events generated from hist trigger
3373+variables or fields associated with one or more other events. Their
3374+purpose is to provide a mechanism for displaying data spanning
3375+multiple events consistent with the existing and already familiar
3376+usage for normal events.
3377+
3378+To define a synthetic event, the user writes a simple specification
3379+consisting of the name of the new event along with one or more
3380+variables and their types, which can be any valid field type,
3381+separated by semicolons, to the tracing/synthetic_events file.
3382+
3383+For instance, the following creates a new event named 'wakeup_latency'
3384+with 3 fields: lat, pid, and prio. Each of those fields is simply a
3385+variable reference to a variable on another event:
3386+
3387+ # echo 'wakeup_latency \
3388+ u64 lat; \
3389+ pid_t pid; \
3390+ int prio' >> \
3391+ /sys/kernel/debug/tracing/synthetic_events
3392+
3393+Reading the tracing/synthetic_events file lists all the currently
3394+defined synthetic events, in this case the event defined above:
3395+
3396+ # cat /sys/kernel/debug/tracing/synthetic_events
3397+ wakeup_latency u64 lat; pid_t pid; int prio
3398+
3399+An existing synthetic event definition can be removed by prepending
3400+the command that defined it with a '!':
3401+
3402+ # echo '!wakeup_latency u64 lat pid_t pid int prio' >> \
3403+ /sys/kernel/debug/tracing/synthetic_events
3404+
3405+At this point, there isn't yet an actual 'wakeup_latency' event
3406+instantiated in the event subsystem - for this to happen, a 'hist
3407+trigger action' needs to be instantiated and bound to actual fields
3408+and variables defined on other events (see Section 2.2.3 below).
3409+
3410+Once that is done, an event instance is created, and a histogram can
3411+be defined using it:
3412+
3413+ # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \
3414+ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
3415+
3416+The new event is created under the tracing/events/synthetic/ directory
3417+and looks and behaves just like any other event:
3418+
3419+ # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency
3420+ enable filter format hist id trigger
3421+
3422+Like any other event, once a histogram is enabled for the event, the
3423+output can be displayed by reading the event's 'hist' file.
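+For instance, for the 'wakeup_latency' event created above:
+
+  # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist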
3424+
3425+2.2.3 Hist trigger 'actions'
3426+----------------------------
3427+
3428+A hist trigger 'action' is a function that's executed whenever a
3429+histogram entry is added or updated.
3430+
3431+The default 'action' if no special function is explicitly specified is
3432+as it always has been, to simply update the set of values associated
3433+with an entry. Some applications, however, may want to perform
3434+additional actions at that point, such as generate another event, or
3435+compare and save a maximum.
3436+
3437+The following additional actions are available. To specify an action
3438+for a given event, simply specify the action between colons in the
3439+hist trigger specification.
3440+
3441+ - onmatch(matching.event).<synthetic_event_name>(param list)
3442+
3443+ The 'onmatch(matching.event).<synthetic_event_name>(params)' hist
3444+ trigger action is invoked whenever an event matches and the
3445+ histogram entry would be added or updated. It causes the named
3446+ synthetic event to be generated with the values given in the
3447+ 'param list'. The result is the generation of a synthetic event
3448+ that consists of the values contained in those variables at the
3449+ time the invoking event was hit.
3450+
3451+ The 'param list' consists of one or more parameters which may be
3452+ either variables or fields defined on either the 'matching.event'
3453+ or the target event. The variables or fields specified in the
3454+ param list may be either fully-qualified or unqualified. If a
3455+ variable is specified as unqualified, it must be unique between
3456+ the two events. A field name used as a param can be unqualified
3457+ if it refers to the target event, but must be fully qualified if
3458+ it refers to the matching event. A fully-qualified name is of the
3459+ form 'system.event_name.$var_name' or 'system.event_name.field'.
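+
+    As an illustrative sketch (the variable names are borrowed from the
+    wakeup_latency example later in this section), the same onmatch()
+    action could pass its second parameter either unqualified or fully
+    qualified:
+
+      onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,$saved_pid,next_prio)
+      onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,sched.sched_waking.$saved_pid,next_prio)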
3460+
3461+ The 'matching.event' specification is simply the fully qualified
3462+ event name of the event that matches the target event for the
3463+ onmatch() functionality, in the form 'system.event_name'.
3464+
3465+ Finally, the number and type of variables/fields in the 'param
3466+ list' must match the number and types of the fields in the
3467+ synthetic event being generated.
3468+
3469+    As an example, the below defines a simple synthetic event and uses
3470+ a variable defined on the sched_wakeup_new event as a parameter
3471+ when invoking the synthetic event. Here we define the synthetic
3472+ event:
3473+
3474+ # echo 'wakeup_new_test pid_t pid' >> \
3475+ /sys/kernel/debug/tracing/synthetic_events
3476+
3477+ # cat /sys/kernel/debug/tracing/synthetic_events
3478+ wakeup_new_test pid_t pid
3479+
3480+ The following hist trigger both defines the missing testpid
3481+ variable and specifies an onmatch() action that generates a
3482+ wakeup_new_test synthetic event whenever a sched_wakeup_new event
3483+ occurs, which because of the 'if comm == "cyclictest"' filter only
3484+ happens when the executable is cyclictest:
3485+
3486+ # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\
3487+ wakeup_new_test($testpid) if comm=="cyclictest"' >> \
3488+ /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger
3489+
3490+ Creating and displaying a histogram based on those events is now
3491+ just a matter of using the fields and new synthetic event in the
3492+ tracing/events/synthetic directory, as usual:
3493+
3494+ # echo 'hist:keys=pid:sort=pid' >> \
3495+ /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/trigger
3496+
3497+ Running 'cyclictest' should cause wakeup_new events to generate
3498+ wakeup_new_test synthetic events which should result in histogram
3499+ output in the wakeup_new_test event's hist file:
3500+
3501+ # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/hist
3502+
3503+ A more typical usage would be to use two events to calculate a
3504+ latency. The following example uses a set of hist triggers to
3505+ produce a 'wakeup_latency' histogram:
3506+
3507+ First, we define a 'wakeup_latency' synthetic event:
3508+
3509+ # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \
3510+ /sys/kernel/debug/tracing/synthetic_events
3511+
3512+ Next, we specify that whenever we see a sched_waking event for a
3513+    cyclictest thread, the timestamp is saved in a 'ts0' variable:
3514+
3515+ # echo 'hist:keys=$saved_pid:saved_pid=pid:ts0=common_timestamp.usecs \
3516+ if comm=="cyclictest"' >> \
3517+ /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
3518+
3519+ Then, when the corresponding thread is actually scheduled onto the
3520+ CPU by a sched_switch event, calculate the latency and use that
3521+ along with another variable and an event field to generate a
3522+ wakeup_latency synthetic event:
3523+
3524+ # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\
3525+ onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,\
3526+ $saved_pid,next_prio) if next_comm=="cyclictest"' >> \
3527+ /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
3528+
3529+ We also need to create a histogram on the wakeup_latency synthetic
3530+ event in order to aggregate the generated synthetic event data:
3531+
3532+ # echo 'hist:keys=pid,prio,lat:sort=pid,lat' >> \
3533+ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
3534+
3535+ Finally, once we've run cyclictest to actually generate some
3536+ events, we can see the output by looking at the wakeup_latency
3537+ synthetic event's hist file:
3538+
3539+ # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist
3540+
3541+  - onmax(var).save(field,...)
3542+
3543+ The 'onmax(var).save(field,...)' hist trigger action is invoked
3544+ whenever the value of 'var' associated with a histogram entry
3545+ exceeds the current maximum contained in that variable.
3546+
3547+ The end result is that the trace event fields specified as the
3548+ onmax.save() params will be saved if 'var' exceeds the current
3549+ maximum for that hist trigger entry. This allows context from the
3550+ event that exhibited the new maximum to be saved for later
3551+ reference. When the histogram is displayed, additional fields
3552+ displaying the saved values will be printed.
3553+
3554+    As an example, the below defines a couple of hist triggers, one for
3555+ sched_waking and another for sched_switch, keyed on pid. Whenever
3556+ a sched_waking occurs, the timestamp is saved in the entry
3557+ corresponding to the current pid, and when the scheduler switches
3558+ back to that pid, the timestamp difference is calculated. If the
3559+ resulting latency, stored in wakeup_lat, exceeds the current
3560+ maximum latency, the values specified in the save() fields are
3561+    recorded:
3562+
3563+ # echo 'hist:keys=pid:ts0=common_timestamp.usecs \
3564+ if comm=="cyclictest"' >> \
3565+ /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
3566+
3567+ # echo 'hist:keys=next_pid:\
3568+ wakeup_lat=common_timestamp.usecs-$ts0:\
3569+ onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \
3570+ if next_comm=="cyclictest"' >> \
3571+ /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
3572+
3573+ When the histogram is displayed, the max value and the saved
3574+ values corresponding to the max are displayed following the rest
3575+ of the fields:
3576+
3577+ # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist
3578+ { next_pid: 2255 } hitcount: 239
3579+ common_timestamp-ts0: 0
3580+ max: 27
3581+ next_comm: cyclictest
3582+ prev_pid: 0 prev_prio: 120 prev_comm: swapper/1
3583+
3584+ { next_pid: 2256 } hitcount: 2355
3585+ common_timestamp-ts0: 0
3586+ max: 49 next_comm: cyclictest
3587+ prev_pid: 0 prev_prio: 120 prev_comm: swapper/0
3588+
3589+ Totals:
3590+ Hits: 12970
3591+ Entries: 2
3592+ Dropped: 0
3593diff --git a/arch/Kconfig b/arch/Kconfig
3594index 40dc31fea90c..7c6108479209 100644
3595--- a/arch/Kconfig
3596+++ b/arch/Kconfig
3597@@ -20,6 +20,7 @@ config OPROFILE
3598 tristate "OProfile system profiling"
3599 depends on PROFILING
3600 depends on HAVE_OPROFILE
3601+ depends on !PREEMPT_RT_FULL
3602 select RING_BUFFER
3603 select RING_BUFFER_ALLOW_SWAP
3604 help
3605diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h
3606index 1d5716bc060b..6883bc952d22 100644
3607--- a/arch/alpha/include/asm/spinlock_types.h
3608+++ b/arch/alpha/include/asm/spinlock_types.h
3609@@ -2,10 +2,6 @@
3610 #ifndef _ALPHA_SPINLOCK_TYPES_H
3611 #define _ALPHA_SPINLOCK_TYPES_H
3612
3613-#ifndef __LINUX_SPINLOCK_TYPES_H
3614-# error "please don't include this file directly"
3615-#endif
3616-
3617 typedef struct {
3618 volatile unsigned int lock;
3619 } arch_spinlock_t;
3620diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
3621index d1346a160760..558b0995e94a 100644
3622--- a/arch/arm/Kconfig
3623+++ b/arch/arm/Kconfig
3624@@ -45,7 +45,7 @@ config ARM
3625 select HARDIRQS_SW_RESEND
3626 select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
3627 select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
3628- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
3629+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
3630 select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
3631 select HAVE_ARCH_MMAP_RND_BITS if MMU
3632 select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
3633@@ -85,6 +85,7 @@ config ARM
3634 select HAVE_PERF_EVENTS
3635 select HAVE_PERF_REGS
3636 select HAVE_PERF_USER_STACK_DUMP
3637+ select HAVE_PREEMPT_LAZY
3638 select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
3639 select HAVE_REGS_AND_STACK_ACCESS_API
3640 select HAVE_SYSCALL_TRACEPOINTS
3641@@ -2164,7 +2165,7 @@ config NEON
3642
3643 config KERNEL_MODE_NEON
3644 bool "Support for NEON in kernel mode"
3645- depends on NEON && AEABI
3646+ depends on NEON && AEABI && !PREEMPT_RT_BASE
3647 help
3648 Say Y to include support for NEON in kernel mode.
3649
3650diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
3651index b6f319606e30..ad377ef73739 100644
3652--- a/arch/arm/include/asm/irq.h
3653+++ b/arch/arm/include/asm/irq.h
3654@@ -23,6 +23,8 @@
3655 #endif
3656
3657 #ifndef __ASSEMBLY__
3658+#include <linux/cpumask.h>
3659+
3660 struct irqaction;
3661 struct pt_regs;
3662 extern void migrate_irqs(void);
3663diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h
3664index 5976958647fe..a37c0803954b 100644
3665--- a/arch/arm/include/asm/spinlock_types.h
3666+++ b/arch/arm/include/asm/spinlock_types.h
3667@@ -2,10 +2,6 @@
3668 #ifndef __ASM_SPINLOCK_TYPES_H
3669 #define __ASM_SPINLOCK_TYPES_H
3670
3671-#ifndef __LINUX_SPINLOCK_TYPES_H
3672-# error "please don't include this file directly"
3673-#endif
3674-
3675 #define TICKET_SHIFT 16
3676
3677 typedef struct {
3678diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
3679index d3e937dcee4d..6ab96a2ce1f8 100644
3680--- a/arch/arm/include/asm/switch_to.h
3681+++ b/arch/arm/include/asm/switch_to.h
3682@@ -4,6 +4,13 @@
3683
3684 #include <linux/thread_info.h>
3685
3686+#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
3687+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
3688+#else
3689+static inline void
3690+switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
3691+#endif
3692+
3693 /*
3694 * For v7 SMP cores running a preemptible kernel we may be pre-empted
3695 * during a TLB maintenance operation, so execute an inner-shareable dsb
3696@@ -26,6 +33,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
3697 #define switch_to(prev,next,last) \
3698 do { \
3699 __complete_pending_tlbi(); \
3700+ switch_kmaps(prev, next); \
3701 last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \
3702 } while (0)
3703
3704diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
3705index 57d2ad9c75ca..cdfb6855943b 100644
3706--- a/arch/arm/include/asm/thread_info.h
3707+++ b/arch/arm/include/asm/thread_info.h
3708@@ -49,6 +49,7 @@ struct cpu_context_save {
3709 struct thread_info {
3710 unsigned long flags; /* low level flags */
3711 int preempt_count; /* 0 => preemptable, <0 => bug */
3712+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
3713 mm_segment_t addr_limit; /* address limit */
3714 struct task_struct *task; /* main task structure */
3715 __u32 cpu; /* cpu */
3716@@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
3717 #define TIF_SYSCALL_TRACE 4 /* syscall trace active */
3718 #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
3719 #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
3720-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
3721+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
3722+#define TIF_NEED_RESCHED_LAZY 7
3723
3724 #define TIF_NOHZ 12 /* in adaptive nohz mode */
3725 #define TIF_USING_IWMMXT 17
3726@@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
3727 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
3728 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
3729 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
3730+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
3731 #define _TIF_UPROBE (1 << TIF_UPROBE)
3732 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
3733 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
3734@@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
3735 * Change these and you break ASM code in entry-common.S
3736 */
3737 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
3738- _TIF_NOTIFY_RESUME | _TIF_UPROBE)
3739+ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
3740+ _TIF_NEED_RESCHED_LAZY)
3741
3742 #endif /* __KERNEL__ */
3743 #endif /* __ASM_ARM_THREAD_INFO_H */
3744diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
3745index 608008229c7d..3866da3f7bb7 100644
3746--- a/arch/arm/kernel/asm-offsets.c
3747+++ b/arch/arm/kernel/asm-offsets.c
3748@@ -65,6 +65,7 @@ int main(void)
3749 BLANK();
3750 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
3751 DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
3752+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
3753 DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
3754 DEFINE(TI_TASK, offsetof(struct thread_info, task));
3755 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
3756diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
3757index fbc707626b3e..b434c59d2b64 100644
3758--- a/arch/arm/kernel/entry-armv.S
3759+++ b/arch/arm/kernel/entry-armv.S
3760@@ -220,11 +220,18 @@ __irq_svc:
3761
3762 #ifdef CONFIG_PREEMPT
3763 ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
3764- ldr r0, [tsk, #TI_FLAGS] @ get flags
3765 teq r8, #0 @ if preempt count != 0
3766+ bne 1f @ return from exception
3767+ ldr r0, [tsk, #TI_FLAGS] @ get flags
3768+ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
3769+ blne svc_preempt @ preempt!
3770+
3771+ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
3772+ teq r8, #0 @ if preempt lazy count != 0
3773 movne r0, #0 @ force flags to 0
3774- tst r0, #_TIF_NEED_RESCHED
3775+ tst r0, #_TIF_NEED_RESCHED_LAZY
3776 blne svc_preempt
3777+1:
3778 #endif
3779
3780 svc_exit r5, irq = 1 @ return from exception
3781@@ -239,8 +246,14 @@ svc_preempt:
3782 1: bl preempt_schedule_irq @ irq en/disable is done inside
3783 ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
3784 tst r0, #_TIF_NEED_RESCHED
3785+ bne 1b
3786+ tst r0, #_TIF_NEED_RESCHED_LAZY
3787 reteq r8 @ go again
3788- b 1b
3789+ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
3790+ teq r0, #0 @ if preempt lazy count != 0
3791+ beq 1b
3792+ ret r8 @ go again
3793+
3794 #endif
3795
3796 __und_fault:
3797diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
3798index 54c10503d71f..3fdeade24e3f 100644
3799--- a/arch/arm/kernel/entry-common.S
3800+++ b/arch/arm/kernel/entry-common.S
3801@@ -53,7 +53,9 @@ ret_fast_syscall:
3802 cmp r2, #TASK_SIZE
3803 blne addr_limit_check_failed
3804 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
3805- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
3806+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
3807+ bne fast_work_pending
3808+ tst r1, #_TIF_SECCOMP
3809 bne fast_work_pending
3810
3811
3812@@ -83,8 +85,11 @@ ret_fast_syscall:
3813 cmp r2, #TASK_SIZE
3814 blne addr_limit_check_failed
3815 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
3816- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
3817+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
3818+ bne do_slower_path
3819+ tst r1, #_TIF_SECCOMP
3820 beq no_work_pending
3821+do_slower_path:
3822 UNWIND(.fnend )
3823 ENDPROC(ret_fast_syscall)
3824
3825diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
3826index a50dc00d79a2..d0a05a3bdb96 100644
3827--- a/arch/arm/kernel/patch.c
3828+++ b/arch/arm/kernel/patch.c
3829@@ -16,7 +16,7 @@ struct patch {
3830 unsigned int insn;
3831 };
3832
3833-static DEFINE_SPINLOCK(patch_lock);
3834+static DEFINE_RAW_SPINLOCK(patch_lock);
3835
3836 static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
3837 __acquires(&patch_lock)
3838@@ -33,7 +33,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
3839 return addr;
3840
3841 if (flags)
3842- spin_lock_irqsave(&patch_lock, *flags);
3843+ raw_spin_lock_irqsave(&patch_lock, *flags);
3844 else
3845 __acquire(&patch_lock);
3846
3847@@ -48,7 +48,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
3848 clear_fixmap(fixmap);
3849
3850 if (flags)
3851- spin_unlock_irqrestore(&patch_lock, *flags);
3852+ raw_spin_unlock_irqrestore(&patch_lock, *flags);
3853 else
3854 __release(&patch_lock);
3855 }
3856diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
3857index d96714e1858c..cf4e1452d4b4 100644
3858--- a/arch/arm/kernel/process.c
3859+++ b/arch/arm/kernel/process.c
3860@@ -325,6 +325,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
3861 }
3862
3863 #ifdef CONFIG_MMU
3864+/*
3865+ * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not
3866+ * initialized by pgtable_page_ctor() then a coredump of the vector page will
3867+ * fail.
3868+ */
3869+static int __init vectors_user_mapping_init_page(void)
3870+{
3871+ struct page *page;
3872+ unsigned long addr = 0xffff0000;
3873+ pgd_t *pgd;
3874+ pud_t *pud;
3875+ pmd_t *pmd;
3876+
3877+ pgd = pgd_offset_k(addr);
3878+ pud = pud_offset(pgd, addr);
3879+ pmd = pmd_offset(pud, addr);
3880+ page = pmd_page(*(pmd));
3881+
3882+ pgtable_page_ctor(page);
3883+
3884+ return 0;
3885+}
3886+late_initcall(vectors_user_mapping_init_page);
3887+
3888 #ifdef CONFIG_KUSER_HELPERS
3889 /*
3890 * The vectors page is always readable from user space for the
3891diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
3892index cdfe52b15a0a..198cf8bf0b37 100644
3893--- a/arch/arm/kernel/signal.c
3894+++ b/arch/arm/kernel/signal.c
3895@@ -615,7 +615,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
3896 */
3897 trace_hardirqs_off();
3898 do {
3899- if (likely(thread_flags & _TIF_NEED_RESCHED)) {
3900+ if (likely(thread_flags & (_TIF_NEED_RESCHED |
3901+ _TIF_NEED_RESCHED_LAZY))) {
3902 schedule();
3903 } else {
3904 if (unlikely(!user_mode(regs)))
3905diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
3906index e61af0600133..d8f2e77d5651 100644
3907--- a/arch/arm/kernel/smp.c
3908+++ b/arch/arm/kernel/smp.c
3909@@ -237,8 +237,6 @@ int __cpu_disable(void)
3910 flush_cache_louis();
3911 local_flush_tlb_all();
3912
3913- clear_tasks_mm_cpumask(cpu);
3914-
3915 return 0;
3916 }
3917
3918@@ -256,6 +254,7 @@ void __cpu_die(unsigned int cpu)
3919 }
3920 pr_debug("CPU%u: shutdown\n", cpu);
3921
3922+ clear_tasks_mm_cpumask(cpu);
3923 /*
3924 * platform_cpu_kill() is generally expected to do the powering off
3925 * and/or cutting of clocks to the dying CPU. Optionally, this may
3926diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
3927index 0bee233fef9a..314cfb232a63 100644
3928--- a/arch/arm/kernel/unwind.c
3929+++ b/arch/arm/kernel/unwind.c
3930@@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
3931 static const struct unwind_idx *__origin_unwind_idx;
3932 extern const struct unwind_idx __stop_unwind_idx[];
3933
3934-static DEFINE_SPINLOCK(unwind_lock);
3935+static DEFINE_RAW_SPINLOCK(unwind_lock);
3936 static LIST_HEAD(unwind_tables);
3937
3938 /* Convert a prel31 symbol to an absolute address */
3939@@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
3940 /* module unwind tables */
3941 struct unwind_table *table;
3942
3943- spin_lock_irqsave(&unwind_lock, flags);
3944+ raw_spin_lock_irqsave(&unwind_lock, flags);
3945 list_for_each_entry(table, &unwind_tables, list) {
3946 if (addr >= table->begin_addr &&
3947 addr < table->end_addr) {
3948@@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
3949 break;
3950 }
3951 }
3952- spin_unlock_irqrestore(&unwind_lock, flags);
3953+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
3954 }
3955
3956 pr_debug("%s: idx = %p\n", __func__, idx);
3957@@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
3958 tab->begin_addr = text_addr;
3959 tab->end_addr = text_addr + text_size;
3960
3961- spin_lock_irqsave(&unwind_lock, flags);
3962+ raw_spin_lock_irqsave(&unwind_lock, flags);
3963 list_add_tail(&tab->list, &unwind_tables);
3964- spin_unlock_irqrestore(&unwind_lock, flags);
3965+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
3966
3967 return tab;
3968 }
3969@@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
3970 if (!tab)
3971 return;
3972
3973- spin_lock_irqsave(&unwind_lock, flags);
3974+ raw_spin_lock_irqsave(&unwind_lock, flags);
3975 list_del(&tab->list);
3976- spin_unlock_irqrestore(&unwind_lock, flags);
3977+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
3978
3979 kfree(tab);
3980 }
3981diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
3982index 5a03bffe7226..3080ea833d19 100644
3983--- a/arch/arm/mach-exynos/platsmp.c
3984+++ b/arch/arm/mach-exynos/platsmp.c
3985@@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void)
3986 return (void __iomem *)(S5P_VA_SCU);
3987 }
3988
3989-static DEFINE_SPINLOCK(boot_lock);
3990+static DEFINE_RAW_SPINLOCK(boot_lock);
3991
3992 static void exynos_secondary_init(unsigned int cpu)
3993 {
3994@@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu)
3995 /*
3996 * Synchronise with the boot thread.
3997 */
3998- spin_lock(&boot_lock);
3999- spin_unlock(&boot_lock);
4000+ raw_spin_lock(&boot_lock);
4001+ raw_spin_unlock(&boot_lock);
4002 }
4003
4004 int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
4005@@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
4006 * Set synchronisation state between this boot processor
4007 * and the secondary one
4008 */
4009- spin_lock(&boot_lock);
4010+ raw_spin_lock(&boot_lock);
4011
4012 /*
4013 * The secondary processor is waiting to be released from
4014@@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
4015
4016 if (timeout == 0) {
4017 printk(KERN_ERR "cpu1 power enable failed");
4018- spin_unlock(&boot_lock);
4019+ raw_spin_unlock(&boot_lock);
4020 return -ETIMEDOUT;
4021 }
4022 }
4023@@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
4024 * calibrations, then wait for it to finish
4025 */
4026 fail:
4027- spin_unlock(&boot_lock);
4028+ raw_spin_unlock(&boot_lock);
4029
4030 return pen_release != -1 ? ret : 0;
4031 }
4032diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
4033index f66815c3dd07..00524abd963f 100644
4034--- a/arch/arm/mach-hisi/platmcpm.c
4035+++ b/arch/arm/mach-hisi/platmcpm.c
4036@@ -61,7 +61,7 @@
4037
4038 static void __iomem *sysctrl, *fabric;
4039 static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
4040-static DEFINE_SPINLOCK(boot_lock);
4041+static DEFINE_RAW_SPINLOCK(boot_lock);
4042 static u32 fabric_phys_addr;
4043 /*
4044 * [0]: bootwrapper physical address
4045@@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
4046 if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
4047 return -EINVAL;
4048
4049- spin_lock_irq(&boot_lock);
4050+ raw_spin_lock_irq(&boot_lock);
4051
4052 if (hip04_cpu_table[cluster][cpu])
4053 goto out;
4054@@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
4055
4056 out:
4057 hip04_cpu_table[cluster][cpu]++;
4058- spin_unlock_irq(&boot_lock);
4059+ raw_spin_unlock_irq(&boot_lock);
4060
4061 return 0;
4062 }
4063@@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
4064 cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
4065 cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
4066
4067- spin_lock(&boot_lock);
4068+ raw_spin_lock(&boot_lock);
4069 hip04_cpu_table[cluster][cpu]--;
4070 if (hip04_cpu_table[cluster][cpu] == 1) {
4071 /* A power_up request went ahead of us. */
4072- spin_unlock(&boot_lock);
4073+ raw_spin_unlock(&boot_lock);
4074 return;
4075 } else if (hip04_cpu_table[cluster][cpu] > 1) {
4076 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
4077@@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
4078 }
4079
4080 last_man = hip04_cluster_is_down(cluster);
4081- spin_unlock(&boot_lock);
4082+ raw_spin_unlock(&boot_lock);
4083 if (last_man) {
4084 /* Since it's Cortex A15, disable L2 prefetching. */
4085 asm volatile(
4086@@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
4087 cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
4088
4089 count = TIMEOUT_MSEC / POLL_MSEC;
4090- spin_lock_irq(&boot_lock);
4091+ raw_spin_lock_irq(&boot_lock);
4092 for (tries = 0; tries < count; tries++) {
4093 if (hip04_cpu_table[cluster][cpu])
4094 goto err;
4095@@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
4096 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
4097 if (data & CORE_WFI_STATUS(cpu))
4098 break;
4099- spin_unlock_irq(&boot_lock);
4100+ raw_spin_unlock_irq(&boot_lock);
4101 /* Wait for clean L2 when the whole cluster is down. */
4102 msleep(POLL_MSEC);
4103- spin_lock_irq(&boot_lock);
4104+ raw_spin_lock_irq(&boot_lock);
4105 }
4106 if (tries >= count)
4107 goto err;
4108@@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
4109 goto err;
4110 if (hip04_cluster_is_down(cluster))
4111 hip04_set_snoop_filter(cluster, 0);
4112- spin_unlock_irq(&boot_lock);
4113+ raw_spin_unlock_irq(&boot_lock);
4114 return 1;
4115 err:
4116- spin_unlock_irq(&boot_lock);
4117+ raw_spin_unlock_irq(&boot_lock);
4118 return 0;
4119 }
4120 #endif
4121diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
4122index 1c73694c871a..ac4d2f030b87 100644
4123--- a/arch/arm/mach-omap2/omap-smp.c
4124+++ b/arch/arm/mach-omap2/omap-smp.c
4125@@ -69,7 +69,7 @@ static const struct omap_smp_config omap5_cfg __initconst = {
4126 .startup_addr = omap5_secondary_startup,
4127 };
4128
4129-static DEFINE_SPINLOCK(boot_lock);
4130+static DEFINE_RAW_SPINLOCK(boot_lock);
4131
4132 void __iomem *omap4_get_scu_base(void)
4133 {
4134@@ -177,8 +177,8 @@ static void omap4_secondary_init(unsigned int cpu)
4135 /*
4136 * Synchronise with the boot thread.
4137 */
4138- spin_lock(&boot_lock);
4139- spin_unlock(&boot_lock);
4140+ raw_spin_lock(&boot_lock);
4141+ raw_spin_unlock(&boot_lock);
4142 }
4143
4144 static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
4145@@ -191,7 +191,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
4146 * Set synchronisation state between this boot processor
4147 * and the secondary one
4148 */
4149- spin_lock(&boot_lock);
4150+ raw_spin_lock(&boot_lock);
4151
4152 /*
4153 * Update the AuxCoreBoot0 with boot state for secondary core.
4154@@ -270,7 +270,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
4155 * Now the secondary core is starting up let it run its
4156 * calibrations, then wait for it to finish
4157 */
4158- spin_unlock(&boot_lock);
4159+ raw_spin_unlock(&boot_lock);
4160
4161 return 0;
4162 }
4163diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
4164index 75ef5d4be554..c17c86e5d860 100644
4165--- a/arch/arm/mach-prima2/platsmp.c
4166+++ b/arch/arm/mach-prima2/platsmp.c
4167@@ -22,7 +22,7 @@
4168
4169 static void __iomem *clk_base;
4170
4171-static DEFINE_SPINLOCK(boot_lock);
4172+static DEFINE_RAW_SPINLOCK(boot_lock);
4173
4174 static void sirfsoc_secondary_init(unsigned int cpu)
4175 {
4176@@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
4177 /*
4178 * Synchronise with the boot thread.
4179 */
4180- spin_lock(&boot_lock);
4181- spin_unlock(&boot_lock);
4182+ raw_spin_lock(&boot_lock);
4183+ raw_spin_unlock(&boot_lock);
4184 }
4185
4186 static const struct of_device_id clk_ids[] = {
4187@@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
4188 /* make sure write buffer is drained */
4189 mb();
4190
4191- spin_lock(&boot_lock);
4192+ raw_spin_lock(&boot_lock);
4193
4194 /*
4195 * The secondary processor is waiting to be released from
4196@@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
4197 * now the secondary core is starting up let it run its
4198 * calibrations, then wait for it to finish
4199 */
4200- spin_unlock(&boot_lock);
4201+ raw_spin_unlock(&boot_lock);
4202
4203 return pen_release != -1 ? -ENOSYS : 0;
4204 }
4205diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
4206index 5494c9e0c909..e8ce157d3548 100644
4207--- a/arch/arm/mach-qcom/platsmp.c
4208+++ b/arch/arm/mach-qcom/platsmp.c
4209@@ -46,7 +46,7 @@
4210
4211 extern void secondary_startup_arm(void);
4212
4213-static DEFINE_SPINLOCK(boot_lock);
4214+static DEFINE_RAW_SPINLOCK(boot_lock);
4215
4216 #ifdef CONFIG_HOTPLUG_CPU
4217 static void qcom_cpu_die(unsigned int cpu)
4218@@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
4219 /*
4220 * Synchronise with the boot thread.
4221 */
4222- spin_lock(&boot_lock);
4223- spin_unlock(&boot_lock);
4224+ raw_spin_lock(&boot_lock);
4225+ raw_spin_unlock(&boot_lock);
4226 }
4227
4228 static int scss_release_secondary(unsigned int cpu)
4229@@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
4230 * set synchronisation state between this boot processor
4231 * and the secondary one
4232 */
4233- spin_lock(&boot_lock);
4234+ raw_spin_lock(&boot_lock);
4235
4236 /*
4237 * Send the secondary CPU a soft interrupt, thereby causing
4238@@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
4239 * now the secondary core is starting up let it run its
4240 * calibrations, then wait for it to finish
4241 */
4242- spin_unlock(&boot_lock);
4243+ raw_spin_unlock(&boot_lock);
4244
4245 return ret;
4246 }
4247diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
4248index 39038a03836a..6da5c93872bf 100644
4249--- a/arch/arm/mach-spear/platsmp.c
4250+++ b/arch/arm/mach-spear/platsmp.c
4251@@ -32,7 +32,7 @@ static void write_pen_release(int val)
4252 sync_cache_w(&pen_release);
4253 }
4254
4255-static DEFINE_SPINLOCK(boot_lock);
4256+static DEFINE_RAW_SPINLOCK(boot_lock);
4257
4258 static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
4259
4260@@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
4261 /*
4262 * Synchronise with the boot thread.
4263 */
4264- spin_lock(&boot_lock);
4265- spin_unlock(&boot_lock);
4266+ raw_spin_lock(&boot_lock);
4267+ raw_spin_unlock(&boot_lock);
4268 }
4269
4270 static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
4271@@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
4272 * set synchronisation state between this boot processor
4273 * and the secondary one
4274 */
4275- spin_lock(&boot_lock);
4276+ raw_spin_lock(&boot_lock);
4277
4278 /*
4279 * The secondary processor is waiting to be released from
4280@@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
4281 * now the secondary core is starting up let it run its
4282 * calibrations, then wait for it to finish
4283 */
4284- spin_unlock(&boot_lock);
4285+ raw_spin_unlock(&boot_lock);
4286
4287 return pen_release != -1 ? -ENOSYS : 0;
4288 }
4289diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
4290index 231f19e17436..a3419b7003e6 100644
4291--- a/arch/arm/mach-sti/platsmp.c
4292+++ b/arch/arm/mach-sti/platsmp.c
4293@@ -35,7 +35,7 @@ static void write_pen_release(int val)
4294 sync_cache_w(&pen_release);
4295 }
4296
4297-static DEFINE_SPINLOCK(boot_lock);
4298+static DEFINE_RAW_SPINLOCK(boot_lock);
4299
4300 static void sti_secondary_init(unsigned int cpu)
4301 {
4302@@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
4303 /*
4304 * Synchronise with the boot thread.
4305 */
4306- spin_lock(&boot_lock);
4307- spin_unlock(&boot_lock);
4308+ raw_spin_lock(&boot_lock);
4309+ raw_spin_unlock(&boot_lock);
4310 }
4311
4312 static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
4313@@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
4314 * set synchronisation state between this boot processor
4315 * and the secondary one
4316 */
4317- spin_lock(&boot_lock);
4318+ raw_spin_lock(&boot_lock);
4319
4320 /*
4321 * The secondary processor is waiting to be released from
4322@@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
4323 * now the secondary core is starting up let it run its
4324 * calibrations, then wait for it to finish
4325 */
4326- spin_unlock(&boot_lock);
4327+ raw_spin_unlock(&boot_lock);
4328
4329 return pen_release != -1 ? -ENOSYS : 0;
4330 }
4331diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
4332index 49b1b8048635..b261967ea028 100644
4333--- a/arch/arm/mm/fault.c
4334+++ b/arch/arm/mm/fault.c
4335@@ -437,6 +437,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
4336 if (addr < TASK_SIZE)
4337 return do_page_fault(addr, fsr, regs);
4338
4339+ if (interrupts_enabled(regs))
4340+ local_irq_enable();
4341+
4342 if (user_mode(regs))
4343 goto bad_area;
4344
4345@@ -504,6 +507,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
4346 static int
4347 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
4348 {
4349+ if (interrupts_enabled(regs))
4350+ local_irq_enable();
4351+
4352 do_bad_area(addr, fsr, regs);
4353 return 0;
4354 }
4355diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
4356index d02f8187b1cc..542692dbd40a 100644
4357--- a/arch/arm/mm/highmem.c
4358+++ b/arch/arm/mm/highmem.c
4359@@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
4360 return *ptep;
4361 }
4362
4363+static unsigned int fixmap_idx(int type)
4364+{
4365+ return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4366+}
4367+
4368 void *kmap(struct page *page)
4369 {
4370 might_sleep();
4371@@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
4372
4373 void *kmap_atomic(struct page *page)
4374 {
4375+ pte_t pte = mk_pte(page, kmap_prot);
4376 unsigned int idx;
4377 unsigned long vaddr;
4378 void *kmap;
4379 int type;
4380
4381- preempt_disable();
4382+ preempt_disable_nort();
4383 pagefault_disable();
4384 if (!PageHighMem(page))
4385 return page_address(page);
4386@@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
4387
4388 type = kmap_atomic_idx_push();
4389
4390- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4391+ idx = fixmap_idx(type);
4392 vaddr = __fix_to_virt(idx);
4393 #ifdef CONFIG_DEBUG_HIGHMEM
4394 /*
4395@@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
4396 * in place, so the contained TLB flush ensures the TLB is updated
4397 * with the new mapping.
4398 */
4399- set_fixmap_pte(idx, mk_pte(page, kmap_prot));
4400+#ifdef CONFIG_PREEMPT_RT_FULL
4401+ current->kmap_pte[type] = pte;
4402+#endif
4403+ set_fixmap_pte(idx, pte);
4404
4405 return (void *)vaddr;
4406 }
4407@@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
4408
4409 if (kvaddr >= (void *)FIXADDR_START) {
4410 type = kmap_atomic_idx();
4411- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4412+ idx = fixmap_idx(type);
4413
4414 if (cache_is_vivt())
4415 __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
4416+#ifdef CONFIG_PREEMPT_RT_FULL
4417+ current->kmap_pte[type] = __pte(0);
4418+#endif
4419 #ifdef CONFIG_DEBUG_HIGHMEM
4420 BUG_ON(vaddr != __fix_to_virt(idx));
4421- set_fixmap_pte(idx, __pte(0));
4422 #else
4423 (void) idx; /* to kill a warning */
4424 #endif
4425+ set_fixmap_pte(idx, __pte(0));
4426 kmap_atomic_idx_pop();
4427 } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
4428 /* this address was obtained through kmap_high_get() */
4429 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
4430 }
4431 pagefault_enable();
4432- preempt_enable();
4433+ preempt_enable_nort();
4434 }
4435 EXPORT_SYMBOL(__kunmap_atomic);
4436
4437 void *kmap_atomic_pfn(unsigned long pfn)
4438 {
4439+ pte_t pte = pfn_pte(pfn, kmap_prot);
4440 unsigned long vaddr;
4441 int idx, type;
4442 struct page *page = pfn_to_page(pfn);
4443
4444- preempt_disable();
4445+ preempt_disable_nort();
4446 pagefault_disable();
4447 if (!PageHighMem(page))
4448 return page_address(page);
4449
4450 type = kmap_atomic_idx_push();
4451- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4452+ idx = fixmap_idx(type);
4453 vaddr = __fix_to_virt(idx);
4454 #ifdef CONFIG_DEBUG_HIGHMEM
4455 BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
4456 #endif
4457- set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
4458+#ifdef CONFIG_PREEMPT_RT_FULL
4459+ current->kmap_pte[type] = pte;
4460+#endif
4461+ set_fixmap_pte(idx, pte);
4462
4463 return (void *)vaddr;
4464 }
4465+#if defined CONFIG_PREEMPT_RT_FULL
4466+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
4467+{
4468+ int i;
4469+
4470+ /*
4471+ * Clear @prev's kmap_atomic mappings
4472+ */
4473+ for (i = 0; i < prev_p->kmap_idx; i++) {
4474+ int idx = fixmap_idx(i);
4475+
4476+ set_fixmap_pte(idx, __pte(0));
4477+ }
4478+ /*
4479+ * Restore @next_p's kmap_atomic mappings
4480+ */
4481+ for (i = 0; i < next_p->kmap_idx; i++) {
4482+ int idx = fixmap_idx(i);
4483+
4484+ if (!pte_none(next_p->kmap_pte[i]))
4485+ set_fixmap_pte(idx, next_p->kmap_pte[i]);
4486+ }
4487+}
4488+#endif
4489diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
4490index c2366510187a..6b60f582b738 100644
4491--- a/arch/arm/plat-versatile/platsmp.c
4492+++ b/arch/arm/plat-versatile/platsmp.c
4493@@ -32,7 +32,7 @@ static void write_pen_release(int val)
4494 sync_cache_w(&pen_release);
4495 }
4496
4497-static DEFINE_SPINLOCK(boot_lock);
4498+static DEFINE_RAW_SPINLOCK(boot_lock);
4499
4500 void versatile_secondary_init(unsigned int cpu)
4501 {
4502@@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu)
4503 /*
4504 * Synchronise with the boot thread.
4505 */
4506- spin_lock(&boot_lock);
4507- spin_unlock(&boot_lock);
4508+ raw_spin_lock(&boot_lock);
4509+ raw_spin_unlock(&boot_lock);
4510 }
4511
4512 int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
4513@@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
4514 * Set synchronisation state between this boot processor
4515 * and the secondary one
4516 */
4517- spin_lock(&boot_lock);
4518+ raw_spin_lock(&boot_lock);
4519
4520 /*
4521 * This is really belt and braces; we hold unintended secondary
4522@@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
4523 * now the secondary core is starting up let it run its
4524 * calibrations, then wait for it to finish
4525 */
4526- spin_unlock(&boot_lock);
4527+ raw_spin_unlock(&boot_lock);
4528
4529 return pen_release != -1 ? -ENOSYS : 0;
4530 }
4531diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
4532index c30cd78b6918..458d2033ffde 100644
4533--- a/arch/arm64/Kconfig
4534+++ b/arch/arm64/Kconfig
4535@@ -103,6 +103,7 @@ config ARM64
4536 select HAVE_PERF_EVENTS
4537 select HAVE_PERF_REGS
4538 select HAVE_PERF_USER_STACK_DUMP
4539+ select HAVE_PREEMPT_LAZY
4540 select HAVE_REGS_AND_STACK_ACCESS_API
4541 select HAVE_RCU_TABLE_FREE
4542 select HAVE_SYSCALL_TRACEPOINTS
4543diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
4544index 70c517aa4501..2a5f05b5a19a 100644
4545--- a/arch/arm64/crypto/Kconfig
4546+++ b/arch/arm64/crypto/Kconfig
4547@@ -19,19 +19,19 @@ config CRYPTO_SHA512_ARM64
4548
4549 config CRYPTO_SHA1_ARM64_CE
4550 tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
4551- depends on KERNEL_MODE_NEON
4552+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4553 select CRYPTO_HASH
4554 select CRYPTO_SHA1
4555
4556 config CRYPTO_SHA2_ARM64_CE
4557 tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
4558- depends on KERNEL_MODE_NEON
4559+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4560 select CRYPTO_HASH
4561 select CRYPTO_SHA256_ARM64
4562
4563 config CRYPTO_GHASH_ARM64_CE
4564 tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
4565- depends on KERNEL_MODE_NEON
4566+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4567 select CRYPTO_HASH
4568 select CRYPTO_GF128MUL
4569 select CRYPTO_AES
4570@@ -39,7 +39,7 @@ config CRYPTO_GHASH_ARM64_CE
4571
4572 config CRYPTO_CRCT10DIF_ARM64_CE
4573 tristate "CRCT10DIF digest algorithm using PMULL instructions"
4574- depends on KERNEL_MODE_NEON && CRC_T10DIF
4575+ depends on KERNEL_MODE_NEON && CRC_T10DIF && !PREEMPT_RT_BASE
4576 select CRYPTO_HASH
4577
4578 config CRYPTO_CRC32_ARM64_CE
4579@@ -53,13 +53,13 @@ config CRYPTO_AES_ARM64
4580
4581 config CRYPTO_AES_ARM64_CE
4582 tristate "AES core cipher using ARMv8 Crypto Extensions"
4583- depends on ARM64 && KERNEL_MODE_NEON
4584+ depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4585 select CRYPTO_ALGAPI
4586 select CRYPTO_AES_ARM64
4587
4588 config CRYPTO_AES_ARM64_CE_CCM
4589 tristate "AES in CCM mode using ARMv8 Crypto Extensions"
4590- depends on ARM64 && KERNEL_MODE_NEON
4591+ depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4592 select CRYPTO_ALGAPI
4593 select CRYPTO_AES_ARM64_CE
4594 select CRYPTO_AES_ARM64
4595@@ -67,7 +67,7 @@ config CRYPTO_AES_ARM64_CE_CCM
4596
4597 config CRYPTO_AES_ARM64_CE_BLK
4598 tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
4599- depends on KERNEL_MODE_NEON
4600+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4601 select CRYPTO_BLKCIPHER
4602 select CRYPTO_AES_ARM64_CE
4603 select CRYPTO_AES_ARM64
4604@@ -75,7 +75,7 @@ config CRYPTO_AES_ARM64_CE_BLK
4605
4606 config CRYPTO_AES_ARM64_NEON_BLK
4607 tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
4608- depends on KERNEL_MODE_NEON
4609+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4610 select CRYPTO_BLKCIPHER
4611 select CRYPTO_AES_ARM64
4612 select CRYPTO_AES
4613@@ -83,13 +83,13 @@ config CRYPTO_AES_ARM64_NEON_BLK
4614
4615 config CRYPTO_CHACHA20_NEON
4616 tristate "NEON accelerated ChaCha20 symmetric cipher"
4617- depends on KERNEL_MODE_NEON
4618+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4619 select CRYPTO_BLKCIPHER
4620 select CRYPTO_CHACHA20
4621
4622 config CRYPTO_AES_ARM64_BS
4623 tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
4624- depends on KERNEL_MODE_NEON
4625+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4626 select CRYPTO_BLKCIPHER
4627 select CRYPTO_AES_ARM64_NEON_BLK
4628 select CRYPTO_AES_ARM64
4629diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
4630index 34b4e3d46aab..ae055cdad8cf 100644
4631--- a/arch/arm64/crypto/crc32-ce-glue.c
4632+++ b/arch/arm64/crypto/crc32-ce-glue.c
4633@@ -208,7 +208,8 @@ static struct shash_alg crc32_pmull_algs[] = { {
4634
4635 static int __init crc32_pmull_mod_init(void)
4636 {
4637- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
4638+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
4639+ !IS_ENABLED(CONFIG_PREEMPT_RT_BASE) && (elf_hwcap & HWCAP_PMULL)) {
4640 crc32_pmull_algs[0].update = crc32_pmull_update;
4641 crc32_pmull_algs[1].update = crc32c_pmull_update;
4642
4643diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h
4644index 55be59a35e3f..ba0cf1361f65 100644
4645--- a/arch/arm64/include/asm/spinlock_types.h
4646+++ b/arch/arm64/include/asm/spinlock_types.h
4647@@ -16,10 +16,6 @@
4648 #ifndef __ASM_SPINLOCK_TYPES_H
4649 #define __ASM_SPINLOCK_TYPES_H
4650
4651-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H)
4652-# error "please don't include this file directly"
4653-#endif
4654-
4655 #include <linux/types.h>
4656
4657 #define TICKET_SHIFT 16
4658diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
4659index fc786d344e46..b833258b7594 100644
4660--- a/arch/arm64/include/asm/thread_info.h
4661+++ b/arch/arm64/include/asm/thread_info.h
4662@@ -43,6 +43,7 @@ struct thread_info {
4663 u64 ttbr0; /* saved TTBR0_EL1 */
4664 #endif
4665 int preempt_count; /* 0 => preemptable, <0 => bug */
4666+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
4667 };
4668
4669 #define INIT_THREAD_INFO(tsk) \
4670@@ -82,6 +83,7 @@ void arch_setup_new_exec(void);
4671 #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
4672 #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
4673 #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */
4674+#define TIF_NEED_RESCHED_LAZY 6
4675 #define TIF_NOHZ 7
4676 #define TIF_SYSCALL_TRACE 8
4677 #define TIF_SYSCALL_AUDIT 9
4678@@ -98,6 +100,7 @@ void arch_setup_new_exec(void);
4679 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
4680 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
4681 #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
4682+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
4683 #define _TIF_NOHZ (1 << TIF_NOHZ)
4684 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
4685 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
4686@@ -109,8 +112,9 @@ void arch_setup_new_exec(void);
4687
4688 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
4689 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
4690- _TIF_UPROBE | _TIF_FSCHECK)
4691+ _TIF_UPROBE | _TIF_FSCHECK | _TIF_NEED_RESCHED_LAZY)
4692
4693+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
4694 #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
4695 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
4696 _TIF_NOHZ)
4697diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
4698index b5e43b01b396..ae26a1664436 100644
4699--- a/arch/arm64/kernel/asm-offsets.c
4700+++ b/arch/arm64/kernel/asm-offsets.c
4701@@ -39,6 +39,7 @@ int main(void)
4702 BLANK();
4703 DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
4704 DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
4705+ DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count));
4706 DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit));
4707 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
4708 DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
4709diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
4710index c1ffa95c0ad2..c60ecb5a3916 100644
4711--- a/arch/arm64/kernel/entry.S
4712+++ b/arch/arm64/kernel/entry.S
4713@@ -637,11 +637,16 @@ el1_irq:
4714
4715 #ifdef CONFIG_PREEMPT
4716 ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count
4717- cbnz w24, 1f // preempt count != 0
4718+ cbnz w24, 2f // preempt count != 0
4719 ldr x0, [tsk, #TSK_TI_FLAGS] // get flags
4720- tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
4721- bl el1_preempt
4722+ tbnz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
4723+
4724+ ldr w24, [tsk, #TSK_TI_PREEMPT_LAZY] // get preempt lazy count
4725+ cbnz w24, 2f // preempt lazy count != 0
4726+ tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling?
4727 1:
4728+ bl el1_preempt
4729+2:
4730 #endif
4731 #ifdef CONFIG_TRACE_IRQFLAGS
4732 bl trace_hardirqs_on
4733@@ -655,6 +660,7 @@ el1_preempt:
4734 1: bl preempt_schedule_irq // irq en/disable is done inside
4735 ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS
4736 tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
4737+ tbnz x0, #TIF_NEED_RESCHED_LAZY, 1b // needs rescheduling?
4738 ret x24
4739 #endif
4740
4741diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
4742index 43442b3a463f..81bf9545a589 100644
4743--- a/arch/arm64/kernel/signal.c
4744+++ b/arch/arm64/kernel/signal.c
4745@@ -756,7 +756,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
4746 /* Check valid user FS if needed */
4747 addr_limit_user_check();
4748
4749- if (thread_flags & _TIF_NEED_RESCHED) {
4750+ if (thread_flags & _TIF_NEED_RESCHED_MASK) {
4751 schedule();
4752 } else {
4753 local_irq_enable();
4754diff --git a/arch/blackfin/include/asm/spinlock_types.h b/arch/blackfin/include/asm/spinlock_types.h
4755index 1a33608c958b..103b34d3dcf6 100644
4756--- a/arch/blackfin/include/asm/spinlock_types.h
4757+++ b/arch/blackfin/include/asm/spinlock_types.h
4758@@ -7,10 +7,6 @@
4759 #ifndef __ASM_SPINLOCK_TYPES_H
4760 #define __ASM_SPINLOCK_TYPES_H
4761
4762-#ifndef __LINUX_SPINLOCK_TYPES_H
4763-# error "please don't include this file directly"
4764-#endif
4765-
4766 #include <asm/rwlock.h>
4767
4768 typedef struct {
4769diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h
4770index 7a906b5214a4..d8f596fec022 100644
4771--- a/arch/hexagon/include/asm/spinlock_types.h
4772+++ b/arch/hexagon/include/asm/spinlock_types.h
4773@@ -21,10 +21,6 @@
4774 #ifndef _ASM_SPINLOCK_TYPES_H
4775 #define _ASM_SPINLOCK_TYPES_H
4776
4777-#ifndef __LINUX_SPINLOCK_TYPES_H
4778-# error "please don't include this file directly"
4779-#endif
4780-
4781 typedef struct {
4782 volatile unsigned int lock;
4783 } arch_spinlock_t;
4784diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h
4785index 6e345fefcdca..681408d6816f 100644
4786--- a/arch/ia64/include/asm/spinlock_types.h
4787+++ b/arch/ia64/include/asm/spinlock_types.h
4788@@ -2,10 +2,6 @@
4789 #ifndef _ASM_IA64_SPINLOCK_TYPES_H
4790 #define _ASM_IA64_SPINLOCK_TYPES_H
4791
4792-#ifndef __LINUX_SPINLOCK_TYPES_H
4793-# error "please don't include this file directly"
4794-#endif
4795-
4796 typedef struct {
4797 volatile unsigned int lock;
4798 } arch_spinlock_t;
4799diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
4800index 555b11180156..6866201a7603 100644
4801--- a/arch/ia64/kernel/mca.c
4802+++ b/arch/ia64/kernel/mca.c
4803@@ -1824,7 +1824,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
4804 ti->cpu = cpu;
4805 p->stack = ti;
4806 p->state = TASK_UNINTERRUPTIBLE;
4807- cpumask_set_cpu(cpu, &p->cpus_allowed);
4808+ cpumask_set_cpu(cpu, &p->cpus_mask);
4809 INIT_LIST_HEAD(&p->tasks);
4810 p->parent = p->real_parent = p->group_leader = p;
4811 INIT_LIST_HEAD(&p->children);
4812diff --git a/arch/m32r/include/asm/spinlock_types.h b/arch/m32r/include/asm/spinlock_types.h
4813index bb0d17b64198..fc6afa42fe11 100644
4814--- a/arch/m32r/include/asm/spinlock_types.h
4815+++ b/arch/m32r/include/asm/spinlock_types.h
4816@@ -2,10 +2,6 @@
4817 #ifndef _ASM_M32R_SPINLOCK_TYPES_H
4818 #define _ASM_M32R_SPINLOCK_TYPES_H
4819
4820-#ifndef __LINUX_SPINLOCK_TYPES_H
4821-# error "please don't include this file directly"
4822-#endif
4823-
4824 typedef struct {
4825 volatile int slock;
4826 } arch_spinlock_t;
4827diff --git a/arch/metag/include/asm/spinlock_types.h b/arch/metag/include/asm/spinlock_types.h
4828index cd197f1bed59..adc26e9797c5 100644
4829--- a/arch/metag/include/asm/spinlock_types.h
4830+++ b/arch/metag/include/asm/spinlock_types.h
4831@@ -2,10 +2,6 @@
4832 #ifndef _ASM_METAG_SPINLOCK_TYPES_H
4833 #define _ASM_METAG_SPINLOCK_TYPES_H
4834
4835-#ifndef __LINUX_SPINLOCK_TYPES_H
4836-# error "please don't include this file directly"
4837-#endif
4838-
4839 typedef struct {
4840 volatile unsigned int lock;
4841 } arch_spinlock_t;
4842diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
4843index c82457b0e733..7bb1838508de 100644
4844--- a/arch/mips/Kconfig
4845+++ b/arch/mips/Kconfig
4846@@ -2519,7 +2519,7 @@ config MIPS_ASID_BITS_VARIABLE
4847 #
4848 config HIGHMEM
4849 bool "High Memory Support"
4850- depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
4851+ depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
4852
4853 config CPU_SUPPORTS_HIGHMEM
4854 bool
4855diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h
4856index e610473d61b8..1428b4febbc9 100644
4857--- a/arch/mips/include/asm/switch_to.h
4858+++ b/arch/mips/include/asm/switch_to.h
4859@@ -42,7 +42,7 @@ extern struct task_struct *ll_task;
4860 * inline to try to keep the overhead down. If we have been forced to run on
4861 * a "CPU" with an FPU because of a previous high level of FP computation,
4862 * but did not actually use the FPU during the most recent time-slice (CU1
4863- * isn't set), we undo the restriction on cpus_allowed.
4864+ * isn't set), we undo the restriction on cpus_mask.
4865 *
4866 * We're not calling set_cpus_allowed() here, because we have no need to
4867 * force prompt migration - we're already switching the current CPU to a
4868@@ -57,7 +57,7 @@ do { \
4869 test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \
4870 (!(KSTK_STATUS(prev) & ST0_CU1))) { \
4871 clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \
4872- prev->cpus_allowed = prev->thread.user_cpus_allowed; \
4873+ prev->cpus_mask = prev->thread.user_cpus_allowed; \
4874 } \
4875 next->thread.emulated_fp = 0; \
4876 } while(0)
4877diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
4878index a7c0f97e4b0d..1a08428eedcf 100644
4879--- a/arch/mips/kernel/mips-mt-fpaff.c
4880+++ b/arch/mips/kernel/mips-mt-fpaff.c
4881@@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
4882 if (retval)
4883 goto out_unlock;
4884
4885- cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed);
4886+ cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
4887 cpumask_and(&mask, &allowed, cpu_active_mask);
4888
4889 out_unlock:
4890diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
4891index 583aed906933..24ad7aaca5eb 100644
4892--- a/arch/mips/kernel/traps.c
4893+++ b/arch/mips/kernel/traps.c
4894@@ -1193,12 +1193,12 @@ static void mt_ase_fp_affinity(void)
4895 * restricted the allowed set to exclude any CPUs with FPUs,
4896 * we'll skip the procedure.
4897 */
4898- if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) {
4899+ if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
4900 cpumask_t tmask;
4901
4902 current->thread.user_cpus_allowed
4903- = current->cpus_allowed;
4904- cpumask_and(&tmask, &current->cpus_allowed,
4905+ = current->cpus_mask;
4906+ cpumask_and(&tmask, &current->cpus_mask,
4907 &mt_fpu_cpumask);
4908 set_cpus_allowed_ptr(current, &tmask);
4909 set_thread_flag(TIF_FPUBOUND);
4910diff --git a/arch/mn10300/include/asm/spinlock_types.h b/arch/mn10300/include/asm/spinlock_types.h
4911index 32abdc89bbc7..c45230a12d60 100644
4912--- a/arch/mn10300/include/asm/spinlock_types.h
4913+++ b/arch/mn10300/include/asm/spinlock_types.h
4914@@ -2,10 +2,6 @@
4915 #ifndef _ASM_SPINLOCK_TYPES_H
4916 #define _ASM_SPINLOCK_TYPES_H
4917
4918-#ifndef __LINUX_SPINLOCK_TYPES_H
4919-# error "please don't include this file directly"
4920-#endif
4921-
4922 typedef struct arch_spinlock {
4923 unsigned int slock;
4924 } arch_spinlock_t;
4925diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
4926index fe418226df7f..b5658e925465 100644
4927--- a/arch/powerpc/Kconfig
4928+++ b/arch/powerpc/Kconfig
4929@@ -111,10 +111,11 @@ config LOCKDEP_SUPPORT
4930
4931 config RWSEM_GENERIC_SPINLOCK
4932 bool
4933+ default y if PREEMPT_RT_FULL
4934
4935 config RWSEM_XCHGADD_ALGORITHM
4936 bool
4937- default y
4938+ default y if !PREEMPT_RT_FULL
4939
4940 config GENERIC_LOCKBREAK
4941 bool
4942@@ -215,6 +216,7 @@ config PPC
4943 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
4944 select HAVE_PERF_REGS
4945 select HAVE_PERF_USER_STACK_DUMP
4946+ select HAVE_PREEMPT_LAZY
4947 select HAVE_RCU_TABLE_FREE if SMP
4948 select HAVE_REGS_AND_STACK_ACCESS_API
4949 select HAVE_SYSCALL_TRACEPOINTS
4950@@ -390,7 +392,7 @@ menu "Kernel options"
4951
4952 config HIGHMEM
4953 bool "High memory support"
4954- depends on PPC32
4955+ depends on PPC32 && !PREEMPT_RT_FULL
4956
4957 source kernel/Kconfig.hz
4958 source kernel/Kconfig.preempt
4959diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
4960index 87adaf13b7e8..7305cb6a53e4 100644
4961--- a/arch/powerpc/include/asm/spinlock_types.h
4962+++ b/arch/powerpc/include/asm/spinlock_types.h
4963@@ -2,10 +2,6 @@
4964 #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H
4965 #define _ASM_POWERPC_SPINLOCK_TYPES_H
4966
4967-#ifndef __LINUX_SPINLOCK_TYPES_H
4968-# error "please don't include this file directly"
4969-#endif
4970-
4971 typedef struct {
4972 volatile unsigned int slock;
4973 } arch_spinlock_t;
4974diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
4975index a264c3ad366b..020afb8329a1 100644
4976--- a/arch/powerpc/include/asm/thread_info.h
4977+++ b/arch/powerpc/include/asm/thread_info.h
4978@@ -36,6 +36,8 @@ struct thread_info {
4979 int cpu; /* cpu we're on */
4980 int preempt_count; /* 0 => preemptable,
4981 <0 => BUG */
4982+ int preempt_lazy_count; /* 0 => preemptable,
4983+ <0 => BUG */
4984 unsigned long local_flags; /* private flags for thread */
4985 #ifdef CONFIG_LIVEPATCH
4986 unsigned long *livepatch_sp;
4987@@ -81,8 +83,7 @@ static inline struct thread_info *current_thread_info(void)
4988 #define TIF_SYSCALL_TRACE 0 /* syscall trace active */
4989 #define TIF_SIGPENDING 1 /* signal pending */
4990 #define TIF_NEED_RESCHED 2 /* rescheduling necessary */
4991-#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling
4992- TIF_NEED_RESCHED */
4993+#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling necessary */
4994 #define TIF_32BIT 4 /* 32 bit binary */
4995 #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */
4996 #define TIF_PATCH_PENDING 6 /* pending live patching update */
4997@@ -101,6 +102,8 @@ static inline struct thread_info *current_thread_info(void)
4998 #if defined(CONFIG_PPC64)
4999 #define TIF_ELF2ABI 18 /* function descriptors must die! */
5000 #endif
5001+#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling
5002+ TIF_NEED_RESCHED */
5003
5004 /* as above, but as bit values */
5005 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
5006@@ -120,14 +123,16 @@ static inline struct thread_info *current_thread_info(void)
5007 #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
5008 #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE)
5009 #define _TIF_NOHZ (1<<TIF_NOHZ)
5010+#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
5011 #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
5012 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
5013 _TIF_NOHZ)
5014
5015 #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
5016 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
5017- _TIF_RESTORE_TM | _TIF_PATCH_PENDING)
5018+ _TIF_RESTORE_TM | _TIF_PATCH_PENDING | _TIF_NEED_RESCHED_LAZY)
5019 #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
5020+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
5021
5022 /* Bits in local_flags */
5023 /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
5024diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
5025index 2e5ea300258a..a2cb40098d7c 100644
5026--- a/arch/powerpc/kernel/asm-offsets.c
5027+++ b/arch/powerpc/kernel/asm-offsets.c
5028@@ -156,6 +156,7 @@ int main(void)
5029 OFFSET(TI_FLAGS, thread_info, flags);
5030 OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags);
5031 OFFSET(TI_PREEMPT, thread_info, preempt_count);
5032+ OFFSET(TI_PREEMPT_LAZY, thread_info, preempt_lazy_count);
5033 OFFSET(TI_TASK, thread_info, task);
5034 OFFSET(TI_CPU, thread_info, cpu);
5035
5036diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
5037index e780e1fbf6c2..dc7fe90ff6a9 100644
5038--- a/arch/powerpc/kernel/entry_32.S
5039+++ b/arch/powerpc/kernel/entry_32.S
5040@@ -866,7 +866,14 @@ resume_kernel:
5041 cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
5042 bne restore
5043 andi. r8,r8,_TIF_NEED_RESCHED
5044+ bne+ 1f
5045+ lwz r0,TI_PREEMPT_LAZY(r9)
5046+ cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
5047+ bne restore
5048+ lwz r0,TI_FLAGS(r9)
5049+ andi. r0,r0,_TIF_NEED_RESCHED_LAZY
5050 beq+ restore
5051+1:
5052 lwz r3,_MSR(r1)
5053 andi. r0,r3,MSR_EE /* interrupts off? */
5054 beq restore /* don't schedule if so */
5055@@ -877,11 +884,11 @@ resume_kernel:
5056 */
5057 bl trace_hardirqs_off
5058 #endif
5059-1: bl preempt_schedule_irq
5060+2: bl preempt_schedule_irq
5061 CURRENT_THREAD_INFO(r9, r1)
5062 lwz r3,TI_FLAGS(r9)
5063- andi. r0,r3,_TIF_NEED_RESCHED
5064- bne- 1b
5065+ andi. r0,r3,_TIF_NEED_RESCHED_MASK
5066+ bne- 2b
5067 #ifdef CONFIG_TRACE_IRQFLAGS
5068 /* And now, to properly rebalance the above, we tell lockdep they
5069 * are being turned back on, which will happen when we return
5070@@ -1204,7 +1211,7 @@ global_dbcr0:
5071 #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
5072
5073 do_work: /* r10 contains MSR_KERNEL here */
5074- andi. r0,r9,_TIF_NEED_RESCHED
5075+ andi. r0,r9,_TIF_NEED_RESCHED_MASK
5076 beq do_user_signal
5077
5078 do_resched: /* r10 contains MSR_KERNEL here */
5079@@ -1225,7 +1232,7 @@ recheck:
5080 MTMSRD(r10) /* disable interrupts */
5081 CURRENT_THREAD_INFO(r9, r1)
5082 lwz r9,TI_FLAGS(r9)
5083- andi. r0,r9,_TIF_NEED_RESCHED
5084+ andi. r0,r9,_TIF_NEED_RESCHED_MASK
5085 bne- do_resched
5086 andi. r0,r9,_TIF_USER_WORK_MASK
5087 beq restore_user
5088diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
5089index c194f4c8e66b..117c1f6cab66 100644
5090--- a/arch/powerpc/kernel/entry_64.S
5091+++ b/arch/powerpc/kernel/entry_64.S
5092@@ -690,7 +690,7 @@ _GLOBAL(ret_from_except_lite)
5093 bl restore_math
5094 b restore
5095 #endif
5096-1: andi. r0,r4,_TIF_NEED_RESCHED
5097+1: andi. r0,r4,_TIF_NEED_RESCHED_MASK
5098 beq 2f
5099 bl restore_interrupts
5100 SCHEDULE_USER
5101@@ -752,10 +752,18 @@ resume_kernel:
5102
5103 #ifdef CONFIG_PREEMPT
5104 /* Check if we need to preempt */
5105+ lwz r8,TI_PREEMPT(r9)
5106+ cmpwi 0,r8,0 /* if non-zero, just restore regs and return */
5107+ bne restore
5108 andi. r0,r4,_TIF_NEED_RESCHED
5109+ bne+ check_count
5110+
5111+ andi. r0,r4,_TIF_NEED_RESCHED_LAZY
5112 beq+ restore
5113+ lwz r8,TI_PREEMPT_LAZY(r9)
5114+
5115 /* Check that preempt_count() == 0 and interrupts are enabled */
5116- lwz r8,TI_PREEMPT(r9)
5117+check_count:
5118 cmpwi cr1,r8,0
5119 ld r0,SOFTE(r1)
5120 cmpdi r0,0
5121@@ -772,7 +780,7 @@ resume_kernel:
5122 /* Re-test flags and eventually loop */
5123 CURRENT_THREAD_INFO(r9, r1)
5124 ld r4,TI_FLAGS(r9)
5125- andi. r0,r4,_TIF_NEED_RESCHED
5126+ andi. r0,r4,_TIF_NEED_RESCHED_MASK
5127 bne 1b
5128
5129 /*
5130diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
5131index 0ce8b0e5d7ba..375adb3048fc 100644
5132--- a/arch/powerpc/kernel/irq.c
5133+++ b/arch/powerpc/kernel/irq.c
5134@@ -693,6 +693,7 @@ void irq_ctx_init(void)
5135 }
5136 }
5137
5138+#ifndef CONFIG_PREEMPT_RT_FULL
5139 void do_softirq_own_stack(void)
5140 {
5141 struct thread_info *curtp, *irqtp;
5142@@ -710,6 +711,7 @@ void do_softirq_own_stack(void)
5143 if (irqtp->flags)
5144 set_bits(irqtp->flags, &curtp->flags);
5145 }
5146+#endif
5147
5148 irq_hw_number_t virq_to_hw(unsigned int virq)
5149 {
5150diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
5151index 3f7a9a2d2435..1795359d27b6 100644
5152--- a/arch/powerpc/kernel/misc_32.S
5153+++ b/arch/powerpc/kernel/misc_32.S
5154@@ -41,6 +41,7 @@
5155 * We store the saved ksp_limit in the unused part
5156 * of the STACK_FRAME_OVERHEAD
5157 */
5158+#ifndef CONFIG_PREEMPT_RT_FULL
5159 _GLOBAL(call_do_softirq)
5160 mflr r0
5161 stw r0,4(r1)
5162@@ -57,6 +58,7 @@ _GLOBAL(call_do_softirq)
5163 stw r10,THREAD+KSP_LIMIT(r2)
5164 mtlr r0
5165 blr
5166+#endif
5167
5168 /*
5169 * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
5170diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
5171index 3280953a82cf..dd2a80d190c4 100644
5172--- a/arch/powerpc/kernel/misc_64.S
5173+++ b/arch/powerpc/kernel/misc_64.S
5174@@ -31,6 +31,7 @@
5175
5176 .text
5177
5178+#ifndef CONFIG_PREEMPT_RT_FULL
5179 _GLOBAL(call_do_softirq)
5180 mflr r0
5181 std r0,16(r1)
5182@@ -41,6 +42,7 @@ _GLOBAL(call_do_softirq)
5183 ld r0,16(r1)
5184 mtlr r0
5185 blr
5186+#endif
5187
5188 _GLOBAL(call_do_irq)
5189 mflr r0
5190diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
5191index 648160334abf..9d24331fc9b4 100644
5192--- a/arch/powerpc/kvm/Kconfig
5193+++ b/arch/powerpc/kvm/Kconfig
5194@@ -177,6 +177,7 @@ config KVM_E500MC
5195 config KVM_MPIC
5196 bool "KVM in-kernel MPIC emulation"
5197 depends on KVM && E500
5198+ depends on !PREEMPT_RT_FULL
5199 select HAVE_KVM_IRQCHIP
5200 select HAVE_KVM_IRQFD
5201 select HAVE_KVM_IRQ_ROUTING
5202diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
5203index 1fbb5da17dd2..ca86366d5424 100644
5204--- a/arch/powerpc/platforms/cell/spufs/sched.c
5205+++ b/arch/powerpc/platforms/cell/spufs/sched.c
5206@@ -141,7 +141,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
5207 * runqueue. The context will be rescheduled on the proper node
5208 * if it is timesliced or preempted.
5209 */
5210- cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed);
5211+ cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
5212
5213 /* Save the current cpu id for spu interrupt routing. */
5214 ctx->last_ran = raw_smp_processor_id();
5215diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
5216index e48462447ff0..2670cee66064 100644
5217--- a/arch/powerpc/platforms/ps3/device-init.c
5218+++ b/arch/powerpc/platforms/ps3/device-init.c
5219@@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
5220 }
5221 pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
5222
5223- res = wait_event_interruptible(dev->done.wait,
5224+ res = swait_event_interruptible(dev->done.wait,
5225 dev->done.done || kthread_should_stop());
5226 if (kthread_should_stop())
5227 res = -EINTR;
5228diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h
5229index 1861a0c5dd47..74092ebaca3c 100644
5230--- a/arch/s390/include/asm/spinlock_types.h
5231+++ b/arch/s390/include/asm/spinlock_types.h
5232@@ -2,10 +2,6 @@
5233 #ifndef __ASM_SPINLOCK_TYPES_H
5234 #define __ASM_SPINLOCK_TYPES_H
5235
5236-#ifndef __LINUX_SPINLOCK_TYPES_H
5237-# error "please don't include this file directly"
5238-#endif
5239-
5240 typedef struct {
5241 int lock;
5242 } __attribute__ ((aligned (4))) arch_spinlock_t;
5243diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h
5244index e82369f286a2..22ca9a98bbb8 100644
5245--- a/arch/sh/include/asm/spinlock_types.h
5246+++ b/arch/sh/include/asm/spinlock_types.h
5247@@ -2,10 +2,6 @@
5248 #ifndef __ASM_SH_SPINLOCK_TYPES_H
5249 #define __ASM_SH_SPINLOCK_TYPES_H
5250
5251-#ifndef __LINUX_SPINLOCK_TYPES_H
5252-# error "please don't include this file directly"
5253-#endif
5254-
5255 typedef struct {
5256 volatile unsigned int lock;
5257 } arch_spinlock_t;
5258diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
5259index 245dbeb20afe..e298c82d2a69 100644
5260--- a/arch/sh/kernel/irq.c
5261+++ b/arch/sh/kernel/irq.c
5262@@ -148,6 +148,7 @@ void irq_ctx_exit(int cpu)
5263 hardirq_ctx[cpu] = NULL;
5264 }
5265
5266+#ifndef CONFIG_PREEMPT_RT_FULL
5267 void do_softirq_own_stack(void)
5268 {
5269 struct thread_info *curctx;
5270@@ -175,6 +176,7 @@ void do_softirq_own_stack(void)
5271 "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
5272 );
5273 }
5274+#endif
5275 #else
5276 static inline void handle_one_irq(unsigned int irq)
5277 {
5278diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
5279index 4e83f950713e..7f9d71523763 100644
5280--- a/arch/sparc/Kconfig
5281+++ b/arch/sparc/Kconfig
5282@@ -206,12 +206,10 @@ config NR_CPUS
5283 source kernel/Kconfig.hz
5284
5285 config RWSEM_GENERIC_SPINLOCK
5286- bool
5287- default y if SPARC32
5288+ def_bool PREEMPT_RT_FULL
5289
5290 config RWSEM_XCHGADD_ALGORITHM
5291- bool
5292- default y if SPARC64
5293+ def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
5294
5295 config GENERIC_HWEIGHT
5296 bool
5297diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
5298index d66dde833f5e..f87b3f8f4d43 100644
5299--- a/arch/sparc/kernel/irq_64.c
5300+++ b/arch/sparc/kernel/irq_64.c
5301@@ -855,6 +855,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
5302 set_irq_regs(old_regs);
5303 }
5304
5305+#ifndef CONFIG_PREEMPT_RT_FULL
5306 void do_softirq_own_stack(void)
5307 {
5308 void *orig_sp, *sp = softirq_stack[smp_processor_id()];
5309@@ -869,6 +870,7 @@ void do_softirq_own_stack(void)
5310 __asm__ __volatile__("mov %0, %%sp"
5311 : : "r" (orig_sp));
5312 }
5313+#endif
5314
5315 #ifdef CONFIG_HOTPLUG_CPU
5316 void fixup_irqs(void)
5317diff --git a/arch/tile/include/asm/setup.h b/arch/tile/include/asm/setup.h
5318index 2a0347af0702..670fa2f4cfc3 100644
5319--- a/arch/tile/include/asm/setup.h
5320+++ b/arch/tile/include/asm/setup.h
5321@@ -49,7 +49,7 @@ int hardwall_ipi_valid(int cpu);
5322
5323 /* Hook hardwall code into changes in affinity. */
5324 #define arch_set_cpus_allowed(p, new_mask) do { \
5325- if (!cpumask_equal(&p->cpus_allowed, new_mask)) \
5326+ if (!cpumask_equal(p->cpus_ptr, new_mask)) \
5327 hardwall_deactivate_all(p); \
5328 } while (0)
5329 #endif
5330diff --git a/arch/tile/include/asm/spinlock_types.h b/arch/tile/include/asm/spinlock_types.h
5331index a71f59b49c50..9311c6ff2abc 100644
5332--- a/arch/tile/include/asm/spinlock_types.h
5333+++ b/arch/tile/include/asm/spinlock_types.h
5334@@ -15,10 +15,6 @@
5335 #ifndef _ASM_TILE_SPINLOCK_TYPES_H
5336 #define _ASM_TILE_SPINLOCK_TYPES_H
5337
5338-#ifndef __LINUX_SPINLOCK_TYPES_H
5339-# error "please don't include this file directly"
5340-#endif
5341-
5342 #ifdef __tilegx__
5343
5344 /* Low 15 bits are "next"; high 15 bits are "current". */
5345diff --git a/arch/tile/kernel/hardwall.c b/arch/tile/kernel/hardwall.c
5346index 2fd1694ac1d0..98f4fb696289 100644
5347--- a/arch/tile/kernel/hardwall.c
5348+++ b/arch/tile/kernel/hardwall.c
5349@@ -590,12 +590,12 @@ static int hardwall_activate(struct hardwall_info *info)
5350 * Get our affinity; if we're not bound to this tile uniquely,
5351 * we can't access the network registers.
5352 */
5353- if (cpumask_weight(&p->cpus_allowed) != 1)
5354+ if (p->nr_cpus_allowed != 1)
5355 return -EPERM;
5356
5357 /* Make sure we are bound to a cpu assigned to this resource. */
5358 cpu = smp_processor_id();
5359- BUG_ON(cpumask_first(&p->cpus_allowed) != cpu);
5360+ BUG_ON(cpumask_first(p->cpus_ptr) != cpu);
5361 if (!cpumask_test_cpu(cpu, &info->cpumask))
5362 return -EINVAL;
5363
5364@@ -621,17 +621,17 @@ static int hardwall_activate(struct hardwall_info *info)
5365 * Deactivate a task's hardwall. Must hold lock for hardwall_type.
5366 * This method may be called from exit_thread(), so we don't want to
5367 * rely on too many fields of struct task_struct still being valid.
5368- * We assume the cpus_allowed, pid, and comm fields are still valid.
5369+ * We assume the nr_cpus_allowed, pid, and comm fields are still valid.
5370 */
5371 static void _hardwall_deactivate(struct hardwall_type *hwt,
5372 struct task_struct *task)
5373 {
5374 struct thread_struct *ts = &task->thread;
5375
5376- if (cpumask_weight(&task->cpus_allowed) != 1) {
5377+ if (task->nr_cpus_allowed != 1) {
5378 pr_err("pid %d (%s) releasing %s hardwall with an affinity mask containing %d cpus!\n",
5379 task->pid, task->comm, hwt->name,
5380- cpumask_weight(&task->cpus_allowed));
5381+ task->nr_cpus_allowed);
5382 BUG();
5383 }
5384
5385diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
5386index 2af0af33362a..7764f936d6ab 100644
5387--- a/arch/x86/Kconfig
5388+++ b/arch/x86/Kconfig
5389@@ -169,6 +169,7 @@ config X86
5390 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
5391 select HAVE_PERF_REGS
5392 select HAVE_PERF_USER_STACK_DUMP
5393+ select HAVE_PREEMPT_LAZY
5394 select HAVE_RCU_TABLE_FREE
5395 select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE
5396 select HAVE_REGS_AND_STACK_ACCESS_API
5397@@ -257,8 +258,11 @@ config ARCH_MAY_HAVE_PC_FDC
5398 def_bool y
5399 depends on ISA_DMA_API
5400
5401+config RWSEM_GENERIC_SPINLOCK
5402+ def_bool PREEMPT_RT_FULL
5403+
5404 config RWSEM_XCHGADD_ALGORITHM
5405- def_bool y
5406+ def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
5407
5408 config GENERIC_CALIBRATE_DELAY
5409 def_bool y
5410@@ -933,7 +937,7 @@ config IOMMU_HELPER
5411 config MAXSMP
5412 bool "Enable Maximum number of SMP Processors and NUMA Nodes"
5413 depends on X86_64 && SMP && DEBUG_KERNEL
5414- select CPUMASK_OFFSTACK
5415+ select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
5416 ---help---
5417 Enable maximum number of CPUS and NUMA Nodes for this architecture.
5418 If unsure, say N.
5419diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
5420index c690ddc78c03..7a3138d33e33 100644
5421--- a/arch/x86/crypto/aesni-intel_glue.c
5422+++ b/arch/x86/crypto/aesni-intel_glue.c
5423@@ -387,14 +387,14 @@ static int ecb_encrypt(struct skcipher_request *req)
5424
5425 err = skcipher_walk_virt(&walk, req, true);
5426
5427- kernel_fpu_begin();
5428 while ((nbytes = walk.nbytes)) {
5429+ kernel_fpu_begin();
5430 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5431 nbytes & AES_BLOCK_MASK);
5432+ kernel_fpu_end();
5433 nbytes &= AES_BLOCK_SIZE - 1;
5434 err = skcipher_walk_done(&walk, nbytes);
5435 }
5436- kernel_fpu_end();
5437
5438 return err;
5439 }
5440@@ -409,14 +409,14 @@ static int ecb_decrypt(struct skcipher_request *req)
5441
5442 err = skcipher_walk_virt(&walk, req, true);
5443
5444- kernel_fpu_begin();
5445 while ((nbytes = walk.nbytes)) {
5446+ kernel_fpu_begin();
5447 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5448 nbytes & AES_BLOCK_MASK);
5449+ kernel_fpu_end();
5450 nbytes &= AES_BLOCK_SIZE - 1;
5451 err = skcipher_walk_done(&walk, nbytes);
5452 }
5453- kernel_fpu_end();
5454
5455 return err;
5456 }
5457@@ -431,14 +431,14 @@ static int cbc_encrypt(struct skcipher_request *req)
5458
5459 err = skcipher_walk_virt(&walk, req, true);
5460
5461- kernel_fpu_begin();
5462 while ((nbytes = walk.nbytes)) {
5463+ kernel_fpu_begin();
5464 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5465 nbytes & AES_BLOCK_MASK, walk.iv);
5466+ kernel_fpu_end();
5467 nbytes &= AES_BLOCK_SIZE - 1;
5468 err = skcipher_walk_done(&walk, nbytes);
5469 }
5470- kernel_fpu_end();
5471
5472 return err;
5473 }
5474@@ -453,14 +453,14 @@ static int cbc_decrypt(struct skcipher_request *req)
5475
5476 err = skcipher_walk_virt(&walk, req, true);
5477
5478- kernel_fpu_begin();
5479 while ((nbytes = walk.nbytes)) {
5480+ kernel_fpu_begin();
5481 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5482 nbytes & AES_BLOCK_MASK, walk.iv);
5483+ kernel_fpu_end();
5484 nbytes &= AES_BLOCK_SIZE - 1;
5485 err = skcipher_walk_done(&walk, nbytes);
5486 }
5487- kernel_fpu_end();
5488
5489 return err;
5490 }
5491@@ -510,18 +510,20 @@ static int ctr_crypt(struct skcipher_request *req)
5492
5493 err = skcipher_walk_virt(&walk, req, true);
5494
5495- kernel_fpu_begin();
5496 while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
5497+ kernel_fpu_begin();
5498 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5499 nbytes & AES_BLOCK_MASK, walk.iv);
5500+ kernel_fpu_end();
5501 nbytes &= AES_BLOCK_SIZE - 1;
5502 err = skcipher_walk_done(&walk, nbytes);
5503 }
5504 if (walk.nbytes) {
5505+ kernel_fpu_begin();
5506 ctr_crypt_final(ctx, &walk);
5507+ kernel_fpu_end();
5508 err = skcipher_walk_done(&walk, 0);
5509 }
5510- kernel_fpu_end();
5511
5512 return err;
5513 }
5514diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
5515index 60907c139c4e..0902db7d326a 100644
5516--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
5517+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
5518@@ -206,6 +206,20 @@ struct crypt_priv {
5519 bool fpu_enabled;
5520 };
5521
5522+#ifdef CONFIG_PREEMPT_RT_FULL
5523+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
5524+{
5525+ bool fpu_enabled = ctx->fpu_enabled;
5526+
5527+ if (!fpu_enabled)
5528+ return;
5529+ camellia_fpu_end(fpu_enabled);
5530+ ctx->fpu_enabled = false;
5531+}
5532+#else
5533+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
5534+#endif
5535+
5536 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5537 {
5538 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
5539@@ -221,16 +235,19 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5540 }
5541
5542 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
5543+ kernel_fpu_resched();
5544 camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
5545 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5546 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5547 }
5548
5549 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5550+ kernel_fpu_resched();
5551 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
5552 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5553 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5554 }
5555+ camellia_fpu_end_rt(ctx);
5556
5557 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5558 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
5559@@ -251,16 +268,19 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5560 }
5561
5562 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
5563+ kernel_fpu_resched();
5564 camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
5565 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5566 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5567 }
5568
5569 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5570+ kernel_fpu_resched();
5571 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
5572 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5573 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5574 }
5575+ camellia_fpu_end_rt(ctx);
5576
5577 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5578 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
5579diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
5580index d96429da88eb..3b8e91841039 100644
5581--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
5582+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
5583@@ -210,6 +210,21 @@ struct crypt_priv {
5584 bool fpu_enabled;
5585 };
5586
5587+#ifdef CONFIG_PREEMPT_RT_FULL
5588+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
5589+{
5590+ bool fpu_enabled = ctx->fpu_enabled;
5591+
5592+ if (!fpu_enabled)
5593+ return;
5594+ camellia_fpu_end(fpu_enabled);
5595+ ctx->fpu_enabled = false;
5596+}
5597+
5598+#else
5599+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
5600+#endif
5601+
5602 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5603 {
5604 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
5605@@ -225,10 +240,12 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5606 }
5607
5608 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5609+ kernel_fpu_resched();
5610 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
5611 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5612 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5613 }
5614+ camellia_fpu_end_rt(ctx);
5615
5616 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5617 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
5618@@ -249,10 +266,12 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5619 }
5620
5621 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5622+ kernel_fpu_resched();
5623 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
5624 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5625 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5626 }
5627+ camellia_fpu_end_rt(ctx);
5628
5629 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5630 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
5631diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
5632index 575292a33bdf..0a4b0a222b18 100644
5633--- a/arch/x86/crypto/cast5_avx_glue.c
5634+++ b/arch/x86/crypto/cast5_avx_glue.c
5635@@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
5636 static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
5637 bool enc)
5638 {
5639- bool fpu_enabled = false;
5640+ bool fpu_enabled;
5641 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
5642 const unsigned int bsize = CAST5_BLOCK_SIZE;
5643 unsigned int nbytes;
5644@@ -73,7 +73,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
5645 u8 *wsrc = walk->src.virt.addr;
5646 u8 *wdst = walk->dst.virt.addr;
5647
5648- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
5649+ fpu_enabled = cast5_fpu_begin(false, nbytes);
5650
5651 /* Process multi-block batch */
5652 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
5653@@ -102,10 +102,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
5654 } while (nbytes >= bsize);
5655
5656 done:
5657+ cast5_fpu_end(fpu_enabled);
5658 err = blkcipher_walk_done(desc, walk, nbytes);
5659 }
5660-
5661- cast5_fpu_end(fpu_enabled);
5662 return err;
5663 }
5664
5665@@ -226,7 +225,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
5666 static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5667 struct scatterlist *src, unsigned int nbytes)
5668 {
5669- bool fpu_enabled = false;
5670+ bool fpu_enabled;
5671 struct blkcipher_walk walk;
5672 int err;
5673
5674@@ -235,12 +234,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5675 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
5676
5677 while ((nbytes = walk.nbytes)) {
5678- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
5679+ fpu_enabled = cast5_fpu_begin(false, nbytes);
5680 nbytes = __cbc_decrypt(desc, &walk);
5681+ cast5_fpu_end(fpu_enabled);
5682 err = blkcipher_walk_done(desc, &walk, nbytes);
5683 }
5684-
5685- cast5_fpu_end(fpu_enabled);
5686 return err;
5687 }
5688
5689@@ -309,7 +307,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
5690 static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5691 struct scatterlist *src, unsigned int nbytes)
5692 {
5693- bool fpu_enabled = false;
5694+ bool fpu_enabled;
5695 struct blkcipher_walk walk;
5696 int err;
5697
5698@@ -318,13 +316,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5699 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
5700
5701 while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
5702- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
5703+ fpu_enabled = cast5_fpu_begin(false, nbytes);
5704 nbytes = __ctr_crypt(desc, &walk);
5705+ cast5_fpu_end(fpu_enabled);
5706 err = blkcipher_walk_done(desc, &walk, nbytes);
5707 }
5708
5709- cast5_fpu_end(fpu_enabled);
5710-
5711 if (walk.nbytes) {
5712 ctr_crypt_final(desc, &walk);
5713 err = blkcipher_walk_done(desc, &walk, 0);
5714diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
5715index 50e684768c55..8caf9ba8c1da 100644
5716--- a/arch/x86/crypto/cast6_avx_glue.c
5717+++ b/arch/x86/crypto/cast6_avx_glue.c
5718@@ -205,19 +205,33 @@ struct crypt_priv {
5719 bool fpu_enabled;
5720 };
5721
5722+#ifdef CONFIG_PREEMPT_RT_FULL
5723+static void cast6_fpu_end_rt(struct crypt_priv *ctx)
5724+{
5725+ bool fpu_enabled = ctx->fpu_enabled;
5726+
5727+ if (!fpu_enabled)
5728+ return;
5729+ cast6_fpu_end(fpu_enabled);
5730+ ctx->fpu_enabled = false;
5731+}
5732+
5733+#else
5734+static void cast6_fpu_end_rt(struct crypt_priv *ctx) { }
5735+#endif
5736+
5737 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5738 {
5739 const unsigned int bsize = CAST6_BLOCK_SIZE;
5740 struct crypt_priv *ctx = priv;
5741 int i;
5742
5743- ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5744-
5745 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
5746+ ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5747 cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
5748+ cast6_fpu_end_rt(ctx);
5749 return;
5750 }
5751-
5752 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5753 __cast6_encrypt(ctx->ctx, srcdst, srcdst);
5754 }
5755@@ -228,10 +242,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5756 struct crypt_priv *ctx = priv;
5757 int i;
5758
5759- ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5760-
5761 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
5762+ ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5763 cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
5764+ cast6_fpu_end_rt(ctx);
5765 return;
5766 }
5767
5768diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
5769index 1e6af1b35f7b..e7809fd2a4fd 100644
5770--- a/arch/x86/crypto/chacha20_glue.c
5771+++ b/arch/x86/crypto/chacha20_glue.c
5772@@ -81,23 +81,24 @@ static int chacha20_simd(struct skcipher_request *req)
5773
5774 crypto_chacha20_init(state, ctx, walk.iv);
5775
5776- kernel_fpu_begin();
5777-
5778 while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
5779+ kernel_fpu_begin();
5780+
5781 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
5782 rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
5783+ kernel_fpu_end();
5784 err = skcipher_walk_done(&walk,
5785 walk.nbytes % CHACHA20_BLOCK_SIZE);
5786 }
5787
5788 if (walk.nbytes) {
5789+ kernel_fpu_begin();
5790 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
5791 walk.nbytes);
5792+ kernel_fpu_end();
5793 err = skcipher_walk_done(&walk, 0);
5794 }
5795
5796- kernel_fpu_end();
5797-
5798 return err;
5799 }
5800
5801diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
5802index d61e57960fe0..c67560d9718a 100644
5803--- a/arch/x86/crypto/glue_helper.c
5804+++ b/arch/x86/crypto/glue_helper.c
5805@@ -40,7 +40,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
5806 void *ctx = crypto_blkcipher_ctx(desc->tfm);
5807 const unsigned int bsize = 128 / 8;
5808 unsigned int nbytes, i, func_bytes;
5809- bool fpu_enabled = false;
5810+ bool fpu_enabled;
5811 int err;
5812
5813 err = blkcipher_walk_virt(desc, walk);
5814@@ -50,7 +50,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
5815 u8 *wdst = walk->dst.virt.addr;
5816
5817 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5818- desc, fpu_enabled, nbytes);
5819+ desc, false, nbytes);
5820
5821 for (i = 0; i < gctx->num_funcs; i++) {
5822 func_bytes = bsize * gctx->funcs[i].num_blocks;
5823@@ -72,10 +72,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
5824 }
5825
5826 done:
5827+ glue_fpu_end(fpu_enabled);
5828 err = blkcipher_walk_done(desc, walk, nbytes);
5829 }
5830
5831- glue_fpu_end(fpu_enabled);
5832 return err;
5833 }
5834
5835@@ -192,7 +192,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
5836 struct scatterlist *src, unsigned int nbytes)
5837 {
5838 const unsigned int bsize = 128 / 8;
5839- bool fpu_enabled = false;
5840+ bool fpu_enabled;
5841 struct blkcipher_walk walk;
5842 int err;
5843
5844@@ -201,12 +201,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
5845
5846 while ((nbytes = walk.nbytes)) {
5847 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5848- desc, fpu_enabled, nbytes);
5849+ desc, false, nbytes);
5850 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
5851+ glue_fpu_end(fpu_enabled);
5852 err = blkcipher_walk_done(desc, &walk, nbytes);
5853 }
5854
5855- glue_fpu_end(fpu_enabled);
5856 return err;
5857 }
5858 EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
5859@@ -275,7 +275,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
5860 struct scatterlist *src, unsigned int nbytes)
5861 {
5862 const unsigned int bsize = 128 / 8;
5863- bool fpu_enabled = false;
5864+ bool fpu_enabled;
5865 struct blkcipher_walk walk;
5866 int err;
5867
5868@@ -284,13 +284,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
5869
5870 while ((nbytes = walk.nbytes) >= bsize) {
5871 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5872- desc, fpu_enabled, nbytes);
5873+ desc, false, nbytes);
5874 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
5875+ glue_fpu_end(fpu_enabled);
5876 err = blkcipher_walk_done(desc, &walk, nbytes);
5877 }
5878
5879- glue_fpu_end(fpu_enabled);
5880-
5881 if (walk.nbytes) {
5882 glue_ctr_crypt_final_128bit(
5883 gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
5884@@ -380,7 +379,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
5885 void *tweak_ctx, void *crypt_ctx)
5886 {
5887 const unsigned int bsize = 128 / 8;
5888- bool fpu_enabled = false;
5889+ bool fpu_enabled;
5890 struct blkcipher_walk walk;
5891 int err;
5892
5893@@ -393,21 +392,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
5894
5895 /* set minimum length to bsize, for tweak_fn */
5896 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5897- desc, fpu_enabled,
5898+ desc, false,
5899 nbytes < bsize ? bsize : nbytes);
5900-
5901 /* calculate first value of T */
5902 tweak_fn(tweak_ctx, walk.iv, walk.iv);
5903+ glue_fpu_end(fpu_enabled);
5904
5905 while (nbytes) {
5906+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5907+ desc, false, nbytes);
5908 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
5909
5910+ glue_fpu_end(fpu_enabled);
5911 err = blkcipher_walk_done(desc, &walk, nbytes);
5912 nbytes = walk.nbytes;
5913 }
5914-
5915- glue_fpu_end(fpu_enabled);
5916-
5917 return err;
5918 }
5919 EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
5920diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
5921index 870f6d812a2d..5c806bf39f1d 100644
5922--- a/arch/x86/crypto/serpent_avx2_glue.c
5923+++ b/arch/x86/crypto/serpent_avx2_glue.c
5924@@ -184,6 +184,21 @@ struct crypt_priv {
5925 bool fpu_enabled;
5926 };
5927
5928+#ifdef CONFIG_PREEMPT_RT_FULL
5929+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
5930+{
5931+ bool fpu_enabled = ctx->fpu_enabled;
5932+
5933+ if (!fpu_enabled)
5934+ return;
5935+ serpent_fpu_end(fpu_enabled);
5936+ ctx->fpu_enabled = false;
5937+}
5938+
5939+#else
5940+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
5941+#endif
5942+
5943 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5944 {
5945 const unsigned int bsize = SERPENT_BLOCK_SIZE;
5946@@ -199,10 +214,12 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5947 }
5948
5949 while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
5950+ kernel_fpu_resched();
5951 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
5952 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
5953 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
5954 }
5955+ serpent_fpu_end_rt(ctx);
5956
5957 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5958 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
5959@@ -223,10 +240,12 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5960 }
5961
5962 while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
5963+ kernel_fpu_resched();
5964 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
5965 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
5966 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
5967 }
5968+ serpent_fpu_end_rt(ctx);
5969
5970 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5971 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
5972diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
5973index 6f778d3daa22..46dcbdbd0518 100644
5974--- a/arch/x86/crypto/serpent_avx_glue.c
5975+++ b/arch/x86/crypto/serpent_avx_glue.c
5976@@ -218,16 +218,31 @@ struct crypt_priv {
5977 bool fpu_enabled;
5978 };
5979
5980+#ifdef CONFIG_PREEMPT_RT_FULL
5981+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
5982+{
5983+ bool fpu_enabled = ctx->fpu_enabled;
5984+
5985+ if (!fpu_enabled)
5986+ return;
5987+ serpent_fpu_end(fpu_enabled);
5988+ ctx->fpu_enabled = false;
5989+}
5990+
5991+#else
5992+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
5993+#endif
5994+
5995 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5996 {
5997 const unsigned int bsize = SERPENT_BLOCK_SIZE;
5998 struct crypt_priv *ctx = priv;
5999 int i;
6000
6001- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6002-
6003 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6004+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6005 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
6006+ serpent_fpu_end_rt(ctx);
6007 return;
6008 }
6009
6010@@ -241,10 +256,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6011 struct crypt_priv *ctx = priv;
6012 int i;
6013
6014- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6015-
6016 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6017+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6018 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
6019+ serpent_fpu_end_rt(ctx);
6020 return;
6021 }
6022
6023diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
6024index ac0e831943f5..d35f607d067f 100644
6025--- a/arch/x86/crypto/serpent_sse2_glue.c
6026+++ b/arch/x86/crypto/serpent_sse2_glue.c
6027@@ -187,16 +187,31 @@ struct crypt_priv {
6028 bool fpu_enabled;
6029 };
6030
6031+#ifdef CONFIG_PREEMPT_RT_FULL
6032+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
6033+{
6034+ bool fpu_enabled = ctx->fpu_enabled;
6035+
6036+ if (!fpu_enabled)
6037+ return;
6038+ serpent_fpu_end(fpu_enabled);
6039+ ctx->fpu_enabled = false;
6040+}
6041+
6042+#else
6043+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
6044+#endif
6045+
6046 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6047 {
6048 const unsigned int bsize = SERPENT_BLOCK_SIZE;
6049 struct crypt_priv *ctx = priv;
6050 int i;
6051
6052- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6053-
6054 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6055+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6056 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
6057+ serpent_fpu_end_rt(ctx);
6058 return;
6059 }
6060
6061@@ -210,10 +225,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6062 struct crypt_priv *ctx = priv;
6063 int i;
6064
6065- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6066-
6067 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6068+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6069 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
6070+ serpent_fpu_end_rt(ctx);
6071 return;
6072 }
6073
6074diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
6075index b7a3904b953c..de00fe24927e 100644
6076--- a/arch/x86/crypto/twofish_avx_glue.c
6077+++ b/arch/x86/crypto/twofish_avx_glue.c
6078@@ -218,6 +218,21 @@ struct crypt_priv {
6079 bool fpu_enabled;
6080 };
6081
6082+#ifdef CONFIG_PREEMPT_RT_FULL
6083+static void twofish_fpu_end_rt(struct crypt_priv *ctx)
6084+{
6085+ bool fpu_enabled = ctx->fpu_enabled;
6086+
6087+ if (!fpu_enabled)
6088+ return;
6089+ twofish_fpu_end(fpu_enabled);
6090+ ctx->fpu_enabled = false;
6091+}
6092+
6093+#else
6094+static void twofish_fpu_end_rt(struct crypt_priv *ctx) { }
6095+#endif
6096+
6097 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6098 {
6099 const unsigned int bsize = TF_BLOCK_SIZE;
6100@@ -228,12 +243,16 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6101
6102 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
6103 twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
6104+ twofish_fpu_end_rt(ctx);
6105 return;
6106 }
6107
6108- for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
6109+ for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
6110+ kernel_fpu_resched();
6111 twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
6112+ }
6113
6114+ twofish_fpu_end_rt(ctx);
6115 nbytes %= bsize * 3;
6116
6117 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
6118@@ -250,11 +269,15 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6119
6120 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
6121 twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
6122+ twofish_fpu_end_rt(ctx);
6123 return;
6124 }
6125
6126- for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
6127+ for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
6128+ kernel_fpu_resched();
6129 twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
6130+ }
6131+ twofish_fpu_end_rt(ctx);
6132
6133 nbytes %= bsize * 3;
6134
6135diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
6136index 60e21ccfb6d6..0e27f35febe7 100644
6137--- a/arch/x86/entry/common.c
6138+++ b/arch/x86/entry/common.c
6139@@ -133,7 +133,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
6140
6141 #define EXIT_TO_USERMODE_LOOP_FLAGS \
6142 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
6143- _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
6144+ _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
6145
6146 static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
6147 {
6148@@ -148,9 +148,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
6149 /* We have work to do. */
6150 local_irq_enable();
6151
6152- if (cached_flags & _TIF_NEED_RESCHED)
6153+ if (cached_flags & _TIF_NEED_RESCHED_MASK)
6154 schedule();
6155
6156+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
6157+ if (unlikely(current->forced_info.si_signo)) {
6158+ struct task_struct *t = current;
6159+ force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
6160+ t->forced_info.si_signo = 0;
6161+ }
6162+#endif
6163 if (cached_flags & _TIF_UPROBE)
6164 uprobe_notify_resume(regs);
6165
6166diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
6167index 60c4c342316c..cd0c7c56e2dd 100644
6168--- a/arch/x86/entry/entry_32.S
6169+++ b/arch/x86/entry/entry_32.S
6170@@ -350,8 +350,25 @@ END(ret_from_exception)
6171 ENTRY(resume_kernel)
6172 DISABLE_INTERRUPTS(CLBR_ANY)
6173 .Lneed_resched:
6174+ # preempt count == 0 + NEED_RS set?
6175 cmpl $0, PER_CPU_VAR(__preempt_count)
6176+#ifndef CONFIG_PREEMPT_LAZY
6177 jnz restore_all
6178+#else
6179+ jz test_int_off
6180+
6181+ # at least preempt count == 0 ?
6182+ cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
6183+ jne restore_all
6184+
6185+ movl PER_CPU_VAR(current_task), %ebp
6186+ cmpl $0,TASK_TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ?
6187+ jnz restore_all
6188+
6189+ testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
6190+ jz restore_all
6191+test_int_off:
6192+#endif
6193 testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
6194 jz restore_all
6195 call preempt_schedule_irq
6196diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
6197index 164cd7529f0b..75d42cb8a7c9 100644
6198--- a/arch/x86/entry/entry_64.S
6199+++ b/arch/x86/entry/entry_64.S
6200@@ -633,7 +633,23 @@ retint_kernel:
6201 btl $9, EFLAGS(%rsp) /* were interrupts off? */
6202 jnc 1f
6203 0: cmpl $0, PER_CPU_VAR(__preempt_count)
6204+#ifndef CONFIG_PREEMPT_LAZY
6205 jnz 1f
6206+#else
6207+ jz do_preempt_schedule_irq
6208+
6209+ # at least preempt count == 0 ?
6210+ cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
6211+ jnz 1f
6212+
6213+ movq PER_CPU_VAR(current_task), %rcx
6214+ cmpl $0, TASK_TI_preempt_lazy_count(%rcx)
6215+ jnz 1f
6216+
6217+ bt $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
6218+ jnc 1f
6219+do_preempt_schedule_irq:
6220+#endif
6221 call preempt_schedule_irq
6222 jmp 0b
6223 1:
6224@@ -988,6 +1004,7 @@ bad_gs:
6225 jmp 2b
6226 .previous
6227
6228+#ifndef CONFIG_PREEMPT_RT_FULL
6229 /* Call softirq on interrupt stack. Interrupts are off. */
6230 ENTRY(do_softirq_own_stack)
6231 pushq %rbp
6232@@ -998,6 +1015,7 @@ ENTRY(do_softirq_own_stack)
6233 leaveq
6234 ret
6235 ENDPROC(do_softirq_own_stack)
6236+#endif
6237
6238 #ifdef CONFIG_XEN
6239 idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
6240diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
6241index a9caac9d4a72..18b31f22ca5d 100644
6242--- a/arch/x86/include/asm/fpu/api.h
6243+++ b/arch/x86/include/asm/fpu/api.h
6244@@ -25,6 +25,7 @@ extern void __kernel_fpu_begin(void);
6245 extern void __kernel_fpu_end(void);
6246 extern void kernel_fpu_begin(void);
6247 extern void kernel_fpu_end(void);
6248+extern void kernel_fpu_resched(void);
6249 extern bool irq_fpu_usable(void);
6250
6251 /*
6252diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
6253index 7f2dbd91fc74..22992c837795 100644
6254--- a/arch/x86/include/asm/preempt.h
6255+++ b/arch/x86/include/asm/preempt.h
6256@@ -86,17 +86,46 @@ static __always_inline void __preempt_count_sub(int val)
6257 * a decrement which hits zero means we have no preempt_count and should
6258 * reschedule.
6259 */
6260-static __always_inline bool __preempt_count_dec_and_test(void)
6261+static __always_inline bool ____preempt_count_dec_and_test(void)
6262 {
6263 GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
6264 }
6265
6266+static __always_inline bool __preempt_count_dec_and_test(void)
6267+{
6268+ if (____preempt_count_dec_and_test())
6269+ return true;
6270+#ifdef CONFIG_PREEMPT_LAZY
6271+ if (current_thread_info()->preempt_lazy_count)
6272+ return false;
6273+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
6274+#else
6275+ return false;
6276+#endif
6277+}
6278+
6279 /*
6280 * Returns true when we need to resched and can (barring IRQ state).
6281 */
6282 static __always_inline bool should_resched(int preempt_offset)
6283 {
6284+#ifdef CONFIG_PREEMPT_LAZY
6285+ u32 tmp;
6286+
6287+ tmp = raw_cpu_read_4(__preempt_count);
6288+ if (tmp == preempt_offset)
6289+ return true;
6290+
6291+ /* preempt count == 0 ? */
6292+ tmp &= ~PREEMPT_NEED_RESCHED;
6293+ if (tmp)
6294+ return false;
6295+ if (current_thread_info()->preempt_lazy_count)
6296+ return false;
6297+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
6298+#else
6299 return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
6300+#endif
6301 }
6302
6303 #ifdef CONFIG_PREEMPT
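The preempt.h hunk above folds the lazy flag into the resched decision. Read as plain C, the check should_resched() now performs amounts to the sketch below; this is an illustrative restatement with explicit parameters, not code from the patch, and the names lazy_should_preempt and need_resched_lazy are invented for the example.

#include <stdbool.h>

/*
 * Illustrative restatement of the lazy-aware should_resched() path from
 * the x86 preempt.h hunk: preempt when the hard count hits zero with a
 * hard resched request, or when the hard count is zero, no lazy
 * sections are held, and only the lazy resched flag is pending.
 */
static inline bool lazy_should_preempt(unsigned int preempt_count,
                                       int preempt_lazy_count,
                                       bool need_resched,
                                       bool need_resched_lazy)
{
        if (preempt_count == 0 && need_resched)
                return true;              /* hard request, count is zero */
        if (preempt_count != 0)
                return false;             /* inside a preempt-off section */
        if (preempt_lazy_count != 0)
                return false;             /* inside a lazy-preempt-off section */
        return need_resched_lazy;         /* honour the lazy request now */
}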
6304diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
6305index 5f9012ff52ed..39117e57caf2 100644
6306--- a/arch/x86/include/asm/signal.h
6307+++ b/arch/x86/include/asm/signal.h
6308@@ -28,6 +28,19 @@ typedef struct {
6309 #define SA_IA32_ABI 0x02000000u
6310 #define SA_X32_ABI 0x01000000u
6311
6312+/*
6313+ * Because some traps use the IST stack, we must keep preemption
6314+ * disabled while calling do_trap(), but do_trap() may call
6315+ * force_sig_info() which will grab the signal spin_locks for the
6316+ * task, which in PREEMPT_RT_FULL are mutexes. When
6317+ * ARCH_RT_DELAYS_SIGNAL_SEND is defined, force_sig_info() instead sets
6318+ * TIF_NOTIFY_RESUME and arranges for the signal to be sent on exit
6319+ * from the trap.
6320+ */
6321+#if defined(CONFIG_PREEMPT_RT_FULL)
6322+#define ARCH_RT_DELAYS_SIGNAL_SEND
6323+#endif
6324+
6325 #ifndef CONFIG_COMPAT
6326 typedef sigset_t compat_sigset_t;
6327 #endif
6328diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
6329index 371b3a4af000..06613a805b25 100644
6330--- a/arch/x86/include/asm/stackprotector.h
6331+++ b/arch/x86/include/asm/stackprotector.h
6332@@ -60,7 +60,7 @@
6333 */
6334 static __always_inline void boot_init_stack_canary(void)
6335 {
6336- u64 canary;
6337+ u64 uninitialized_var(canary);
6338 u64 tsc;
6339
6340 #ifdef CONFIG_X86_64
6341@@ -71,8 +71,14 @@ static __always_inline void boot_init_stack_canary(void)
6342 * of randomness. The TSC only matters for very early init,
6343 * there it already has some randomness on most systems. Later
6344 * on during the bootup the random pool has true entropy too.
6345+ * For preempt-rt we need to weaken the randomness a bit, as
6346+ * we can't call into the random generator from atomic context
6347+ * due to locking constraints. We just leave the canary
6348+ * uninitialized and use the TSC-based randomness on top of it.
6349 */
6350+#ifndef CONFIG_PREEMPT_RT_FULL
6351 get_random_bytes(&canary, sizeof(canary));
6352+#endif
6353 tsc = rdtsc();
6354 canary += tsc + (tsc << 32UL);
6355 canary &= CANARY_MASK;
6356diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
6357index 95ff2d7f553f..b1c9129f64fc 100644
6358--- a/arch/x86/include/asm/thread_info.h
6359+++ b/arch/x86/include/asm/thread_info.h
6360@@ -56,11 +56,14 @@ struct task_struct;
6361 struct thread_info {
6362 unsigned long flags; /* low level flags */
6363 u32 status; /* thread synchronous flags */
6364+ int preempt_lazy_count; /* 0 => lazy preemptable
6365+ <0 => BUG */
6366 };
6367
6368 #define INIT_THREAD_INFO(tsk) \
6369 { \
6370 .flags = 0, \
6371+ .preempt_lazy_count = 0, \
6372 }
6373
6374 #define init_stack (init_thread_union.stack)
6375@@ -69,6 +72,10 @@ struct thread_info {
6376
6377 #include <asm/asm-offsets.h>
6378
6379+#define GET_THREAD_INFO(reg) \
6380+ _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
6381+ _ASM_SUB $(THREAD_SIZE),reg ;
6382+
6383 #endif
6384
6385 /*
6386@@ -85,6 +92,7 @@ struct thread_info {
6387 #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
6388 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
6389 #define TIF_SECCOMP 8 /* secure computing */
6390+#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
6391 #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
6392 #define TIF_UPROBE 12 /* breakpointed or singlestepping */
6393 #define TIF_PATCH_PENDING 13 /* pending live patching update */
6394@@ -112,6 +120,7 @@ struct thread_info {
6395 #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
6396 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
6397 #define _TIF_SECCOMP (1 << TIF_SECCOMP)
6398+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
6399 #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
6400 #define _TIF_UPROBE (1 << TIF_UPROBE)
6401 #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
6402@@ -153,6 +162,8 @@ struct thread_info {
6403 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
6404 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
6405
6406+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
6407+
6408 #define STACK_WARN (THREAD_SIZE/8)
6409
6410 /*
6411diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
6412index 96a8a68f9c79..c9af5afebc4a 100644
6413--- a/arch/x86/kernel/apic/io_apic.c
6414+++ b/arch/x86/kernel/apic/io_apic.c
6415@@ -1688,19 +1688,20 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
6416 return false;
6417 }
6418
6419-static inline bool ioapic_irqd_mask(struct irq_data *data)
6420+static inline bool ioapic_prepare_move(struct irq_data *data)
6421 {
6422 /* If we are moving the irq we need to mask it */
6423 if (unlikely(irqd_is_setaffinity_pending(data))) {
6424- mask_ioapic_irq(data);
6425+ if (!irqd_irq_masked(data))
6426+ mask_ioapic_irq(data);
6427 return true;
6428 }
6429 return false;
6430 }
6431
6432-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
6433+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
6434 {
6435- if (unlikely(masked)) {
6436+ if (unlikely(moveit)) {
6437 /* Only migrate the irq if the ack has been received.
6438 *
6439 * On rare occasions the broadcast level triggered ack gets
6440@@ -1729,15 +1730,17 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
6441 */
6442 if (!io_apic_level_ack_pending(data->chip_data))
6443 irq_move_masked_irq(data);
6444- unmask_ioapic_irq(data);
6445+ /* If the irq is masked in the core, leave it */
6446+ if (!irqd_irq_masked(data))
6447+ unmask_ioapic_irq(data);
6448 }
6449 }
6450 #else
6451-static inline bool ioapic_irqd_mask(struct irq_data *data)
6452+static inline bool ioapic_prepare_move(struct irq_data *data)
6453 {
6454 return false;
6455 }
6456-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
6457+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
6458 {
6459 }
6460 #endif
6461@@ -1746,11 +1749,11 @@ static void ioapic_ack_level(struct irq_data *irq_data)
6462 {
6463 struct irq_cfg *cfg = irqd_cfg(irq_data);
6464 unsigned long v;
6465- bool masked;
6466+ bool moveit;
6467 int i;
6468
6469 irq_complete_move(cfg);
6470- masked = ioapic_irqd_mask(irq_data);
6471+ moveit = ioapic_prepare_move(irq_data);
6472
6473 /*
6474 * It appears there is an erratum which affects at least version 0x11
6475@@ -1805,7 +1808,7 @@ static void ioapic_ack_level(struct irq_data *irq_data)
6476 eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
6477 }
6478
6479- ioapic_irqd_unmask(irq_data, masked);
6480+ ioapic_finish_move(irq_data, moveit);
6481 }
6482
6483 static void ioapic_ir_ack_level(struct irq_data *irq_data)
6484diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
6485index 76417a9aab73..62c3e27c8e1c 100644
6486--- a/arch/x86/kernel/asm-offsets.c
6487+++ b/arch/x86/kernel/asm-offsets.c
6488@@ -38,6 +38,7 @@ void common(void) {
6489
6490 BLANK();
6491 OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
6492+ OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
6493 OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
6494
6495 BLANK();
6496@@ -94,6 +95,7 @@ void common(void) {
6497
6498 BLANK();
6499 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
6500+ DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
6501
6502 /* TLB state for the entry code */
6503 OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
6504diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
6505index 7f85b76f43bc..9e74b805070f 100644
6506--- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
6507+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
6508@@ -14,6 +14,7 @@
6509 #include <linux/slab.h>
6510 #include <linux/kmod.h>
6511 #include <linux/poll.h>
6512+#include <linux/swork.h>
6513
6514 #include "mce-internal.h"
6515
6516@@ -86,13 +87,43 @@ static void mce_do_trigger(struct work_struct *work)
6517
6518 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
6519
6520-
6521-void mce_work_trigger(void)
6522+static void __mce_work_trigger(struct swork_event *event)
6523 {
6524 if (mce_helper[0])
6525 schedule_work(&mce_trigger_work);
6526 }
6527
6528+#ifdef CONFIG_PREEMPT_RT_FULL
6529+static bool notify_work_ready __read_mostly;
6530+static struct swork_event notify_work;
6531+
6532+static int mce_notify_work_init(void)
6533+{
6534+ int err;
6535+
6536+ err = swork_get();
6537+ if (err)
6538+ return err;
6539+
6540+ INIT_SWORK(&notify_work, __mce_work_trigger);
6541+ notify_work_ready = true;
6542+ return 0;
6543+}
6544+
6545+void mce_work_trigger(void)
6546+{
6547+ if (notify_work_ready)
6548+ swork_queue(&notify_work);
6549+}
6550+
6551+#else
6552+void mce_work_trigger(void)
6553+{
6554+ __mce_work_trigger(NULL);
6555+}
6556+static inline int mce_notify_work_init(void) { return 0; }
6557+#endif
6558+
6559 static ssize_t
6560 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
6561 {
6562@@ -356,7 +387,7 @@ static __init int dev_mcelog_init_device(void)
6563
6564 return err;
6565 }
6566-
6567+ mce_notify_work_init();
6568 mce_register_decode_chain(&dev_mcelog_nb);
6569 return 0;
6570 }
6571diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
6572index 98e4e4dc4a3b..5cce2ee3b9f6 100644
6573--- a/arch/x86/kernel/cpu/mcheck/mce.c
6574+++ b/arch/x86/kernel/cpu/mcheck/mce.c
6575@@ -42,6 +42,7 @@
6576 #include <linux/debugfs.h>
6577 #include <linux/irq_work.h>
6578 #include <linux/export.h>
6579+#include <linux/jiffies.h>
6580 #include <linux/jump_label.h>
6581
6582 #include <asm/intel-family.h>
6583@@ -1365,7 +1366,7 @@ int memory_failure(unsigned long pfn, int vector, int flags)
6584 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
6585
6586 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
6587-static DEFINE_PER_CPU(struct timer_list, mce_timer);
6588+static DEFINE_PER_CPU(struct hrtimer, mce_timer);
6589
6590 static unsigned long mce_adjust_timer_default(unsigned long interval)
6591 {
6592@@ -1374,27 +1375,19 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
6593
6594 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
6595
6596-static void __start_timer(struct timer_list *t, unsigned long interval)
6597+static void __start_timer(struct hrtimer *t, unsigned long iv)
6598 {
6599- unsigned long when = jiffies + interval;
6600- unsigned long flags;
6601-
6602- local_irq_save(flags);
6603-
6604- if (!timer_pending(t) || time_before(when, t->expires))
6605- mod_timer(t, round_jiffies(when));
6606+ if (!iv)
6607+ return;
6608
6609- local_irq_restore(flags);
6610+ hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
6611+ 0, HRTIMER_MODE_REL_PINNED);
6612 }
6613
6614-static void mce_timer_fn(unsigned long data)
6615+static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
6616 {
6617- struct timer_list *t = this_cpu_ptr(&mce_timer);
6618- int cpu = smp_processor_id();
6619 unsigned long iv;
6620
6621- WARN_ON(cpu != data);
6622-
6623 iv = __this_cpu_read(mce_next_interval);
6624
6625 if (mce_available(this_cpu_ptr(&cpu_info))) {
6626@@ -1417,7 +1410,11 @@ static void mce_timer_fn(unsigned long data)
6627
6628 done:
6629 __this_cpu_write(mce_next_interval, iv);
6630- __start_timer(t, iv);
6631+ if (!iv)
6632+ return HRTIMER_NORESTART;
6633+
6634+ hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(iv)));
6635+ return HRTIMER_RESTART;
6636 }
6637
6638 /*
6639@@ -1425,7 +1422,7 @@ static void mce_timer_fn(unsigned long data)
6640 */
6641 void mce_timer_kick(unsigned long interval)
6642 {
6643- struct timer_list *t = this_cpu_ptr(&mce_timer);
6644+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
6645 unsigned long iv = __this_cpu_read(mce_next_interval);
6646
6647 __start_timer(t, interval);
6648@@ -1440,7 +1437,7 @@ static void mce_timer_delete_all(void)
6649 int cpu;
6650
6651 for_each_online_cpu(cpu)
6652- del_timer_sync(&per_cpu(mce_timer, cpu));
6653+ hrtimer_cancel(&per_cpu(mce_timer, cpu));
6654 }
6655
6656 /*
6657@@ -1769,7 +1766,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
6658 }
6659 }
6660
6661-static void mce_start_timer(struct timer_list *t)
6662+static void mce_start_timer(struct hrtimer *t)
6663 {
6664 unsigned long iv = check_interval * HZ;
6665
6666@@ -1782,18 +1779,19 @@ static void mce_start_timer(struct timer_list *t)
6667
6668 static void __mcheck_cpu_setup_timer(void)
6669 {
6670- struct timer_list *t = this_cpu_ptr(&mce_timer);
6671- unsigned int cpu = smp_processor_id();
6672+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
6673
6674- setup_pinned_timer(t, mce_timer_fn, cpu);
6675+ hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6676+ t->function = mce_timer_fn;
6677 }
6678
6679 static void __mcheck_cpu_init_timer(void)
6680 {
6681- struct timer_list *t = this_cpu_ptr(&mce_timer);
6682- unsigned int cpu = smp_processor_id();
6683+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
6684+
6685+ hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6686+ t->function = mce_timer_fn;
6687
6688- setup_pinned_timer(t, mce_timer_fn, cpu);
6689 mce_start_timer(t);
6690 }
6691
6692@@ -2309,7 +2307,7 @@ static int mce_cpu_dead(unsigned int cpu)
6693
6694 static int mce_cpu_online(unsigned int cpu)
6695 {
6696- struct timer_list *t = this_cpu_ptr(&mce_timer);
6697+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
6698 int ret;
6699
6700 mce_device_create(cpu);
6701@@ -2326,10 +2324,10 @@ static int mce_cpu_online(unsigned int cpu)
6702
6703 static int mce_cpu_pre_down(unsigned int cpu)
6704 {
6705- struct timer_list *t = this_cpu_ptr(&mce_timer);
6706+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
6707
6708 mce_disable_cpu();
6709- del_timer_sync(t);
6710+ hrtimer_cancel(t);
6711 mce_threshold_remove_device(cpu);
6712 mce_device_remove(cpu);
6713 return 0;
6714diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
6715index 2ea85b32421a..6914dc569d1e 100644
6716--- a/arch/x86/kernel/fpu/core.c
6717+++ b/arch/x86/kernel/fpu/core.c
6718@@ -138,6 +138,18 @@ void kernel_fpu_end(void)
6719 }
6720 EXPORT_SYMBOL_GPL(kernel_fpu_end);
6721
6722+void kernel_fpu_resched(void)
6723+{
6724+ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
6725+
6726+ if (should_resched(PREEMPT_OFFSET)) {
6727+ kernel_fpu_end();
6728+ cond_resched();
6729+ kernel_fpu_begin();
6730+ }
6731+}
6732+EXPORT_SYMBOL_GPL(kernel_fpu_resched);
6733+
6734 /*
6735 * Save the FPU state (mark it for reload if necessary):
6736 *
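kernel_fpu_resched(), declared in asm/fpu/api.h earlier in this patch and defined in the hunk above, is the reschedule point the crypto glue changes call between bulk blocks. A minimal sketch of that usage pattern follows; crypt_blocks(), process_block() and BLOCK_SIZE are hypothetical names for the example, only the kernel_fpu_*() calls come from the patch.

#include <linux/types.h>
#include <asm/fpu/api.h>

#define BLOCK_SIZE 16                    /* hypothetical block size */

static void process_block(void *ctx, u8 *data);  /* hypothetical SIMD helper */

/*
 * Keep the FPU section short: offer a reschedule point after each block
 * instead of holding the FPU (and preemption) across the whole request,
 * which is the pattern the camellia/serpent/twofish glue hunks above follow.
 */
static void crypt_blocks(void *ctx, u8 *data, unsigned int nblocks)
{
        kernel_fpu_begin();
        while (nblocks--) {
                process_block(ctx, data);
                data += BLOCK_SIZE;
                /* ends, cond_resched()s and restarts the FPU section if needed */
                kernel_fpu_resched();
        }
        kernel_fpu_end();
}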
6737diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
6738index 95600a99ae93..9192d76085ba 100644
6739--- a/arch/x86/kernel/irq_32.c
6740+++ b/arch/x86/kernel/irq_32.c
6741@@ -130,6 +130,7 @@ void irq_ctx_init(int cpu)
6742 cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
6743 }
6744
6745+#ifndef CONFIG_PREEMPT_RT_FULL
6746 void do_softirq_own_stack(void)
6747 {
6748 struct irq_stack *irqstk;
6749@@ -146,6 +147,7 @@ void do_softirq_own_stack(void)
6750
6751 call_on_stack(__do_softirq, isp);
6752 }
6753+#endif
6754
6755 bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
6756 {
6757diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
6758index 5224c6099184..9b2b1f0409c5 100644
6759--- a/arch/x86/kernel/process_32.c
6760+++ b/arch/x86/kernel/process_32.c
6761@@ -38,6 +38,7 @@
6762 #include <linux/io.h>
6763 #include <linux/kdebug.h>
6764 #include <linux/syscalls.h>
6765+#include <linux/highmem.h>
6766
6767 #include <asm/pgtable.h>
6768 #include <asm/ldt.h>
6769@@ -198,6 +199,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
6770 }
6771 EXPORT_SYMBOL_GPL(start_thread);
6772
6773+#ifdef CONFIG_PREEMPT_RT_FULL
6774+static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
6775+{
6776+ int i;
6777+
6778+ /*
6779+ * Clear @prev_p's kmap_atomic mappings
6780+ */
6781+ for (i = 0; i < prev_p->kmap_idx; i++) {
6782+ int idx = i + KM_TYPE_NR * smp_processor_id();
6783+ pte_t *ptep = kmap_pte - idx;
6784+
6785+ kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
6786+ }
6787+ /*
6788+ * Restore @next_p's kmap_atomic mappings
6789+ */
6790+ for (i = 0; i < next_p->kmap_idx; i++) {
6791+ int idx = i + KM_TYPE_NR * smp_processor_id();
6792+
6793+ if (!pte_none(next_p->kmap_pte[i]))
6794+ set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
6795+ }
6796+}
6797+#else
6798+static inline void
6799+switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
6800+#endif
6801+
6802
6803 /*
6804 * switch_to(x,y) should switch tasks from x to y.
6805@@ -273,6 +303,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
6806 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
6807 __switch_to_xtra(prev_p, next_p, tss);
6808
6809+ switch_kmaps(prev_p, next_p);
6810+
6811 /*
6812 * Leave lazy mode, flushing any hypercalls made here.
6813 * This must be done before restoring TLS segments so
6814diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
6815index 13dfb55b84db..dd66f629d1d0 100644
6816--- a/arch/x86/kvm/lapic.c
6817+++ b/arch/x86/kvm/lapic.c
6818@@ -2136,7 +2136,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
6819 apic->vcpu = vcpu;
6820
6821 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
6822- HRTIMER_MODE_ABS_PINNED);
6823+ HRTIMER_MODE_ABS_PINNED_HARD);
6824 apic->lapic_timer.timer.function = apic_timer_fn;
6825
6826 /*
6827diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
6828index 3856828ee1dc..407658146ae1 100644
6829--- a/arch/x86/kvm/x86.c
6830+++ b/arch/x86/kvm/x86.c
6831@@ -6287,6 +6287,13 @@ int kvm_arch_init(void *opaque)
6832 goto out;
6833 }
6834
6835+#ifdef CONFIG_PREEMPT_RT_FULL
6836+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
6837+ printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
6838+ return -EOPNOTSUPP;
6839+ }
6840+#endif
6841+
6842 r = kvm_mmu_module_init();
6843 if (r)
6844 goto out_free_percpu;
6845diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
6846index 6d18b70ed5a9..f752724c22e8 100644
6847--- a/arch/x86/mm/highmem_32.c
6848+++ b/arch/x86/mm/highmem_32.c
6849@@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
6850 */
6851 void *kmap_atomic_prot(struct page *page, pgprot_t prot)
6852 {
6853+ pte_t pte = mk_pte(page, prot);
6854 unsigned long vaddr;
6855 int idx, type;
6856
6857- preempt_disable();
6858+ preempt_disable_nort();
6859 pagefault_disable();
6860
6861 if (!PageHighMem(page))
6862@@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
6863 idx = type + KM_TYPE_NR*smp_processor_id();
6864 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
6865 BUG_ON(!pte_none(*(kmap_pte-idx)));
6866- set_pte(kmap_pte-idx, mk_pte(page, prot));
6867+#ifdef CONFIG_PREEMPT_RT_FULL
6868+ current->kmap_pte[type] = pte;
6869+#endif
6870+ set_pte(kmap_pte-idx, pte);
6871 arch_flush_lazy_mmu_mode();
6872
6873 return (void *)vaddr;
6874@@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
6875 * is a bad idea also, in case the page changes cacheability
6876 * attributes or becomes a protected page in a hypervisor.
6877 */
6878+#ifdef CONFIG_PREEMPT_RT_FULL
6879+ current->kmap_pte[type] = __pte(0);
6880+#endif
6881 kpte_clear_flush(kmap_pte-idx, vaddr);
6882 kmap_atomic_idx_pop();
6883 arch_flush_lazy_mmu_mode();
6884@@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
6885 #endif
6886
6887 pagefault_enable();
6888- preempt_enable();
6889+ preempt_enable_nort();
6890 }
6891 EXPORT_SYMBOL(__kunmap_atomic);
6892
6893diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
6894index ada98b39b8ad..585f6829653b 100644
6895--- a/arch/x86/mm/iomap_32.c
6896+++ b/arch/x86/mm/iomap_32.c
6897@@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
6898
6899 void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
6900 {
6901+ pte_t pte = pfn_pte(pfn, prot);
6902 unsigned long vaddr;
6903 int idx, type;
6904
6905@@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
6906 type = kmap_atomic_idx_push();
6907 idx = type + KM_TYPE_NR * smp_processor_id();
6908 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
6909- set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
6910+ WARN_ON(!pte_none(*(kmap_pte - idx)));
6911+
6912+#ifdef CONFIG_PREEMPT_RT_FULL
6913+ current->kmap_pte[type] = pte;
6914+#endif
6915+ set_pte(kmap_pte - idx, pte);
6916 arch_flush_lazy_mmu_mode();
6917
6918 return (void *)vaddr;
6919@@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
6920 * is a bad idea also, in case the page changes cacheability
6921 * attributes or becomes a protected page in a hypervisor.
6922 */
6923+#ifdef CONFIG_PREEMPT_RT_FULL
6924+ current->kmap_pte[type] = __pte(0);
6925+#endif
6926 kpte_clear_flush(kmap_pte-idx, vaddr);
6927 kmap_atomic_idx_pop();
6928 }
6929diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h
6930index bb1fe6c1816e..8a22f1e7b6c9 100644
6931--- a/arch/xtensa/include/asm/spinlock_types.h
6932+++ b/arch/xtensa/include/asm/spinlock_types.h
6933@@ -2,10 +2,6 @@
6934 #ifndef __ASM_SPINLOCK_TYPES_H
6935 #define __ASM_SPINLOCK_TYPES_H
6936
6937-#ifndef __LINUX_SPINLOCK_TYPES_H
6938-# error "please don't include this file directly"
6939-#endif
6940-
6941 typedef struct {
6942 volatile unsigned int slock;
6943 } arch_spinlock_t;
6944diff --git a/block/blk-core.c b/block/blk-core.c
6945index 6aa2bc4e9652..f005077ae291 100644
6946--- a/block/blk-core.c
6947+++ b/block/blk-core.c
6948@@ -116,6 +116,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
6949
6950 INIT_LIST_HEAD(&rq->queuelist);
6951 INIT_LIST_HEAD(&rq->timeout_list);
6952+#ifdef CONFIG_PREEMPT_RT_FULL
6953+ INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
6954+#endif
6955 rq->cpu = -1;
6956 rq->q = q;
6957 rq->__sector = (sector_t) -1;
6958@@ -280,7 +283,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
6959 void blk_start_queue(struct request_queue *q)
6960 {
6961 lockdep_assert_held(q->queue_lock);
6962- WARN_ON(!in_interrupt() && !irqs_disabled());
6963+ WARN_ON_NONRT(!in_interrupt() && !irqs_disabled());
6964 WARN_ON_ONCE(q->mq_ops);
6965
6966 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
6967@@ -812,12 +815,21 @@ void blk_queue_exit(struct request_queue *q)
6968 percpu_ref_put(&q->q_usage_counter);
6969 }
6970
6971+static void blk_queue_usage_counter_release_swork(struct swork_event *sev)
6972+{
6973+ struct request_queue *q =
6974+ container_of(sev, struct request_queue, mq_pcpu_wake);
6975+
6976+ wake_up_all(&q->mq_freeze_wq);
6977+}
6978+
6979 static void blk_queue_usage_counter_release(struct percpu_ref *ref)
6980 {
6981 struct request_queue *q =
6982 container_of(ref, struct request_queue, q_usage_counter);
6983
6984- wake_up_all(&q->mq_freeze_wq);
6985+ if (wq_has_sleeper(&q->mq_freeze_wq))
6986+ swork_queue(&q->mq_pcpu_wake);
6987 }
6988
6989 static void blk_rq_timed_out_timer(unsigned long data)
6990@@ -894,6 +906,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
6991 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
6992
6993 init_waitqueue_head(&q->mq_freeze_wq);
6994+ INIT_SWORK(&q->mq_pcpu_wake, blk_queue_usage_counter_release_swork);
6995
6996 /*
6997 * Init percpu_ref in atomic mode so that it's faster to shutdown.
6998@@ -3313,7 +3326,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
6999 blk_run_queue_async(q);
7000 else
7001 __blk_run_queue(q);
7002- spin_unlock(q->queue_lock);
7003+ spin_unlock_irq(q->queue_lock);
7004 }
7005
7006 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
7007@@ -3361,7 +3374,6 @@ EXPORT_SYMBOL(blk_check_plugged);
7008 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7009 {
7010 struct request_queue *q;
7011- unsigned long flags;
7012 struct request *rq;
7013 LIST_HEAD(list);
7014 unsigned int depth;
7015@@ -3381,11 +3393,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7016 q = NULL;
7017 depth = 0;
7018
7019- /*
7020- * Save and disable interrupts here, to avoid doing it for every
7021- * queue lock we have to take.
7022- */
7023- local_irq_save(flags);
7024 while (!list_empty(&list)) {
7025 rq = list_entry_rq(list.next);
7026 list_del_init(&rq->queuelist);
7027@@ -3398,7 +3405,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7028 queue_unplugged(q, depth, from_schedule);
7029 q = rq->q;
7030 depth = 0;
7031- spin_lock(q->queue_lock);
7032+ spin_lock_irq(q->queue_lock);
7033 }
7034
7035 /*
7036@@ -3425,8 +3432,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7037 */
7038 if (q)
7039 queue_unplugged(q, depth, from_schedule);
7040-
7041- local_irq_restore(flags);
7042 }
7043
7044 void blk_finish_plug(struct blk_plug *plug)
7045@@ -3638,6 +3643,8 @@ int __init blk_dev_init(void)
7046 if (!kblockd_workqueue)
7047 panic("Failed to create kblockd\n");
7048
7049+ BUG_ON(swork_get());
7050+
7051 request_cachep = kmem_cache_create("blkdev_requests",
7052 sizeof(struct request), 0, SLAB_PANIC, NULL);
7053
7054diff --git a/block/blk-ioc.c b/block/blk-ioc.c
7055index f23311e4b201..ca9ea624f159 100644
7056--- a/block/blk-ioc.c
7057+++ b/block/blk-ioc.c
7058@@ -9,6 +9,7 @@
7059 #include <linux/blkdev.h>
7060 #include <linux/slab.h>
7061 #include <linux/sched/task.h>
7062+#include <linux/delay.h>
7063
7064 #include "blk.h"
7065
7066@@ -118,7 +119,7 @@ static void ioc_release_fn(struct work_struct *work)
7067 spin_unlock(q->queue_lock);
7068 } else {
7069 spin_unlock_irqrestore(&ioc->lock, flags);
7070- cpu_relax();
7071+ cpu_chill();
7072 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
7073 }
7074 }
7075@@ -202,7 +203,7 @@ void put_io_context_active(struct io_context *ioc)
7076 spin_unlock(icq->q->queue_lock);
7077 } else {
7078 spin_unlock_irqrestore(&ioc->lock, flags);
7079- cpu_relax();
7080+ cpu_chill();
7081 goto retry;
7082 }
7083 }
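
Both blk-ioc hunks above keep their trylock-and-retry shape and only swap cpu_relax() for cpu_chill(); on RT, cpu_chill() is expected to sleep briefly instead of busy-waiting, so a preempted lock holder can make progress. A simplified sketch of that shape (nesting annotations omitted):

	while (!spin_trylock(q->queue_lock)) {
		spin_unlock_irqrestore(&ioc->lock, flags);
		cpu_chill();		/* RT: short sleep rather than a busy spin */
		spin_lock_irqsave(&ioc->lock, flags);
	}
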
7084diff --git a/block/blk-mq.c b/block/blk-mq.c
7085index eac444804736..a6314b82273e 100644
7086--- a/block/blk-mq.c
7087+++ b/block/blk-mq.c
7088@@ -339,6 +339,9 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
7089 /* tag was already set */
7090 rq->extra_len = 0;
7091
7092+#ifdef CONFIG_PREEMPT_RT_FULL
7093+ INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
7094+#endif
7095 INIT_LIST_HEAD(&rq->timeout_list);
7096 rq->timeout = 0;
7097
7098@@ -533,12 +536,24 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
7099 }
7100 EXPORT_SYMBOL(blk_mq_end_request);
7101
7102+#ifdef CONFIG_PREEMPT_RT_FULL
7103+
7104+void __blk_mq_complete_request_remote_work(struct work_struct *work)
7105+{
7106+ struct request *rq = container_of(work, struct request, work);
7107+
7108+ rq->q->softirq_done_fn(rq);
7109+}
7110+
7111+#else
7112+
7113 static void __blk_mq_complete_request_remote(void *data)
7114 {
7115 struct request *rq = data;
7116
7117 rq->q->softirq_done_fn(rq);
7118 }
7119+#endif
7120
7121 static void __blk_mq_complete_request(struct request *rq)
7122 {
7123@@ -558,19 +573,27 @@ static void __blk_mq_complete_request(struct request *rq)
7124 return;
7125 }
7126
7127- cpu = get_cpu();
7128+ cpu = get_cpu_light();
7129 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
7130 shared = cpus_share_cache(cpu, ctx->cpu);
7131
7132 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
7133+#ifdef CONFIG_PREEMPT_RT_FULL
7134+ /*
 7135+ * We could force QUEUE_FLAG_SAME_FORCE and then we would not get
 7136+ * here. But we can also invoke the completion on that CPU like this.
7137+ */
7138+ schedule_work_on(ctx->cpu, &rq->work);
7139+#else
7140 rq->csd.func = __blk_mq_complete_request_remote;
7141 rq->csd.info = rq;
7142 rq->csd.flags = 0;
7143 smp_call_function_single_async(ctx->cpu, &rq->csd);
7144+#endif
7145 } else {
7146 rq->q->softirq_done_fn(rq);
7147 }
7148- put_cpu();
7149+ put_cpu_light();
7150 }
7151
7152 /**
7153@@ -1238,14 +1261,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
7154 return;
7155
7156 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
7157- int cpu = get_cpu();
7158+ int cpu = get_cpu_light();
7159 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
7160 __blk_mq_run_hw_queue(hctx);
7161- put_cpu();
7162+ put_cpu_light();
7163 return;
7164 }
7165
7166- put_cpu();
7167+ put_cpu_light();
7168 }
7169
7170 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
7171@@ -2863,10 +2886,9 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
7172 kt = nsecs;
7173
7174 mode = HRTIMER_MODE_REL;
7175- hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
7176+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current);
7177 hrtimer_set_expires(&hs.timer, kt);
7178
7179- hrtimer_init_sleeper(&hs, current);
7180 do {
7181 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
7182 break;
7183diff --git a/block/blk-mq.h b/block/blk-mq.h
7184index 877237e09083..d944750bade0 100644
7185--- a/block/blk-mq.h
7186+++ b/block/blk-mq.h
7187@@ -98,12 +98,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
7188 */
7189 static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
7190 {
7191- return __blk_mq_get_ctx(q, get_cpu());
7192+ return __blk_mq_get_ctx(q, get_cpu_light());
7193 }
7194
7195 static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
7196 {
7197- put_cpu();
7198+ put_cpu_light();
7199 }
7200
7201 struct blk_mq_alloc_data {
7202diff --git a/block/blk-softirq.c b/block/blk-softirq.c
7203index 01e2b353a2b9..e8c0d4945f5a 100644
7204--- a/block/blk-softirq.c
7205+++ b/block/blk-softirq.c
7206@@ -53,6 +53,7 @@ static void trigger_softirq(void *data)
7207 raise_softirq_irqoff(BLOCK_SOFTIRQ);
7208
7209 local_irq_restore(flags);
7210+ preempt_check_resched_rt();
7211 }
7212
7213 /*
7214@@ -91,6 +92,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
7215 this_cpu_ptr(&blk_cpu_done));
7216 raise_softirq_irqoff(BLOCK_SOFTIRQ);
7217 local_irq_enable();
7218+ preempt_check_resched_rt();
7219
7220 return 0;
7221 }
7222@@ -143,6 +145,7 @@ void __blk_complete_request(struct request *req)
7223 goto do_local;
7224
7225 local_irq_restore(flags);
7226+ preempt_check_resched_rt();
7227 }
7228
7229 /**
7230diff --git a/block/bounce.c b/block/bounce.c
7231index 1d05c422c932..0101ffefddc4 100644
7232--- a/block/bounce.c
7233+++ b/block/bounce.c
7234@@ -66,11 +66,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
7235 unsigned long flags;
7236 unsigned char *vto;
7237
7238- local_irq_save(flags);
7239+ local_irq_save_nort(flags);
7240 vto = kmap_atomic(to->bv_page);
7241 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
7242 kunmap_atomic(vto);
7243- local_irq_restore(flags);
7244+ local_irq_restore_nort(flags);
7245 }
7246
7247 #else /* CONFIG_HIGHMEM */
7248diff --git a/crypto/algapi.c b/crypto/algapi.c
7249index 50eb828db767..7bce92a6599a 100644
7250--- a/crypto/algapi.c
7251+++ b/crypto/algapi.c
7252@@ -731,13 +731,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
7253
7254 int crypto_register_notifier(struct notifier_block *nb)
7255 {
7256- return blocking_notifier_chain_register(&crypto_chain, nb);
7257+ return srcu_notifier_chain_register(&crypto_chain, nb);
7258 }
7259 EXPORT_SYMBOL_GPL(crypto_register_notifier);
7260
7261 int crypto_unregister_notifier(struct notifier_block *nb)
7262 {
7263- return blocking_notifier_chain_unregister(&crypto_chain, nb);
7264+ return srcu_notifier_chain_unregister(&crypto_chain, nb);
7265 }
7266 EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
7267
7268diff --git a/crypto/api.c b/crypto/api.c
7269index e485aed11ad0..089e648d2fa9 100644
7270--- a/crypto/api.c
7271+++ b/crypto/api.c
7272@@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
7273 DECLARE_RWSEM(crypto_alg_sem);
7274 EXPORT_SYMBOL_GPL(crypto_alg_sem);
7275
7276-BLOCKING_NOTIFIER_HEAD(crypto_chain);
7277+SRCU_NOTIFIER_HEAD(crypto_chain);
7278 EXPORT_SYMBOL_GPL(crypto_chain);
7279
7280 static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
7281@@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
7282 {
7283 int ok;
7284
7285- ok = blocking_notifier_call_chain(&crypto_chain, val, v);
7286+ ok = srcu_notifier_call_chain(&crypto_chain, val, v);
7287 if (ok == NOTIFY_DONE) {
7288 request_module("cryptomgr");
7289- ok = blocking_notifier_call_chain(&crypto_chain, val, v);
7290+ ok = srcu_notifier_call_chain(&crypto_chain, val, v);
7291 }
7292
7293 return ok;
7294diff --git a/crypto/cryptd.c b/crypto/cryptd.c
7295index 248f6ba41688..54b7985c8caa 100644
7296--- a/crypto/cryptd.c
7297+++ b/crypto/cryptd.c
7298@@ -37,6 +37,7 @@
7299 struct cryptd_cpu_queue {
7300 struct crypto_queue queue;
7301 struct work_struct work;
7302+ spinlock_t qlock;
7303 };
7304
7305 struct cryptd_queue {
7306@@ -115,6 +116,7 @@ static int cryptd_init_queue(struct cryptd_queue *queue,
7307 cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu);
7308 crypto_init_queue(&cpu_queue->queue, max_cpu_qlen);
7309 INIT_WORK(&cpu_queue->work, cryptd_queue_worker);
7310+ spin_lock_init(&cpu_queue->qlock);
7311 }
7312 return 0;
7313 }
7314@@ -139,8 +141,10 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue,
7315 atomic_t *refcnt;
7316 bool may_backlog;
7317
7318- cpu = get_cpu();
7319- cpu_queue = this_cpu_ptr(queue->cpu_queue);
7320+ cpu_queue = raw_cpu_ptr(queue->cpu_queue);
7321+ spin_lock_bh(&cpu_queue->qlock);
7322+ cpu = smp_processor_id();
7323+
7324 err = crypto_enqueue_request(&cpu_queue->queue, request);
7325
7326 refcnt = crypto_tfm_ctx(request->tfm);
7327@@ -157,7 +161,7 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue,
7328 atomic_inc(refcnt);
7329
7330 out_put_cpu:
7331- put_cpu();
7332+ spin_unlock_bh(&cpu_queue->qlock);
7333
7334 return err;
7335 }
7336@@ -173,16 +177,11 @@ static void cryptd_queue_worker(struct work_struct *work)
7337 cpu_queue = container_of(work, struct cryptd_cpu_queue, work);
7338 /*
7339 * Only handle one request at a time to avoid hogging crypto workqueue.
7340- * preempt_disable/enable is used to prevent being preempted by
7341- * cryptd_enqueue_request(). local_bh_disable/enable is used to prevent
7342- * cryptd_enqueue_request() being accessed from software interrupts.
7343 */
7344- local_bh_disable();
7345- preempt_disable();
7346+ spin_lock_bh(&cpu_queue->qlock);
7347 backlog = crypto_get_backlog(&cpu_queue->queue);
7348 req = crypto_dequeue_request(&cpu_queue->queue);
7349- preempt_enable();
7350- local_bh_enable();
7351+ spin_unlock_bh(&cpu_queue->qlock);
7352
7353 if (!req)
7354 return;
7355diff --git a/crypto/internal.h b/crypto/internal.h
7356index f07320423191..333d985088fe 100644
7357--- a/crypto/internal.h
7358+++ b/crypto/internal.h
7359@@ -47,7 +47,7 @@ struct crypto_larval {
7360
7361 extern struct list_head crypto_alg_list;
7362 extern struct rw_semaphore crypto_alg_sem;
7363-extern struct blocking_notifier_head crypto_chain;
7364+extern struct srcu_notifier_head crypto_chain;
7365
7366 #ifdef CONFIG_PROC_FS
7367 void __init crypto_init_proc(void);
7368@@ -143,7 +143,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
7369
7370 static inline void crypto_notify(unsigned long val, void *v)
7371 {
7372- blocking_notifier_call_chain(&crypto_chain, val, v);
7373+ srcu_notifier_call_chain(&crypto_chain, val, v);
7374 }
7375
7376 #endif /* _CRYPTO_INTERNAL_H */
7377diff --git a/crypto/scompress.c b/crypto/scompress.c
7378index 2075e2c4e7df..c6b4e265c6bf 100644
7379--- a/crypto/scompress.c
7380+++ b/crypto/scompress.c
7381@@ -24,6 +24,7 @@
7382 #include <linux/cryptouser.h>
7383 #include <net/netlink.h>
7384 #include <linux/scatterlist.h>
7385+#include <linux/locallock.h>
7386 #include <crypto/scatterwalk.h>
7387 #include <crypto/internal/acompress.h>
7388 #include <crypto/internal/scompress.h>
7389@@ -34,6 +35,7 @@ static void * __percpu *scomp_src_scratches;
7390 static void * __percpu *scomp_dst_scratches;
7391 static int scomp_scratch_users;
7392 static DEFINE_MUTEX(scomp_lock);
7393+static DEFINE_LOCAL_IRQ_LOCK(scomp_scratches_lock);
7394
7395 #ifdef CONFIG_NET
7396 static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg)
7397@@ -193,7 +195,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
7398 void **tfm_ctx = acomp_tfm_ctx(tfm);
7399 struct crypto_scomp *scomp = *tfm_ctx;
7400 void **ctx = acomp_request_ctx(req);
7401- const int cpu = get_cpu();
7402+ const int cpu = local_lock_cpu(scomp_scratches_lock);
7403 u8 *scratch_src = *per_cpu_ptr(scomp_src_scratches, cpu);
7404 u8 *scratch_dst = *per_cpu_ptr(scomp_dst_scratches, cpu);
7405 int ret;
7406@@ -228,7 +230,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
7407 1);
7408 }
7409 out:
7410- put_cpu();
7411+ local_unlock_cpu(scomp_scratches_lock);
7412 return ret;
7413 }
7414
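
The scompress conversion above is the usual local-lock pattern for per-CPU scratch buffers: on non-RT builds local_lock_cpu()/local_unlock_cpu() are expected to fall back to get_cpu()/put_cpu() semantics, while on RT they take a per-CPU sleeping lock so the section stays preemptible. In outline (names follow the hunk, the compression body is abridged):

	static DEFINE_LOCAL_IRQ_LOCK(scomp_scratches_lock);

	const int cpu = local_lock_cpu(scomp_scratches_lock);	/* pin + serialize on this CPU */
	u8 *scratch_src = *per_cpu_ptr(scomp_src_scratches, cpu);
	u8 *scratch_dst = *per_cpu_ptr(scomp_dst_scratches, cpu);
	/* ... compress/decompress via the scratch buffers ... */
	local_unlock_cpu(scomp_scratches_lock);
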
7415diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
7416index 95eed442703f..50bc5b61d899 100644
7417--- a/drivers/acpi/acpica/acglobal.h
7418+++ b/drivers/acpi/acpica/acglobal.h
7419@@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
7420 * interrupt level
7421 */
7422 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
7423-ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
7424+ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
7425 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
7426
7427 /* Mutex for _OSI support */
7428diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
7429index acb417b58bbb..ea49e08c263f 100644
7430--- a/drivers/acpi/acpica/hwregs.c
7431+++ b/drivers/acpi/acpica/hwregs.c
7432@@ -428,14 +428,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
7433 ACPI_BITMASK_ALL_FIXED_STATUS,
7434 ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
7435
7436- lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7437+ raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7438
7439 /* Clear the fixed events in PM1 A/B */
7440
7441 status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
7442 ACPI_BITMASK_ALL_FIXED_STATUS);
7443
7444- acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7445+ raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7446
7447 if (ACPI_FAILURE(status)) {
7448 goto exit;
7449diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
7450index 34684ae89981..fb84983e1839 100644
7451--- a/drivers/acpi/acpica/hwxface.c
7452+++ b/drivers/acpi/acpica/hwxface.c
7453@@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
7454 return_ACPI_STATUS(AE_BAD_PARAMETER);
7455 }
7456
7457- lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7458+ raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7459
7460 /*
7461 * At this point, we know that the parent register is one of the
7462@@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
7463
7464 unlock_and_exit:
7465
7466- acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7467+ raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7468 return_ACPI_STATUS(status);
7469 }
7470
7471diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
7472index 586354788018..3a3c2a86437f 100644
7473--- a/drivers/acpi/acpica/utmutex.c
7474+++ b/drivers/acpi/acpica/utmutex.c
7475@@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
7476 return_ACPI_STATUS (status);
7477 }
7478
7479- status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
7480+ status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
7481 if (ACPI_FAILURE (status)) {
7482 return_ACPI_STATUS (status);
7483 }
7484@@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void)
7485 /* Delete the spinlocks */
7486
7487 acpi_os_delete_lock(acpi_gbl_gpe_lock);
7488- acpi_os_delete_lock(acpi_gbl_hardware_lock);
7489+ acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
7490 acpi_os_delete_lock(acpi_gbl_reference_count_lock);
7491
7492 /* Delete the reader/writer lock */
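
The three ACPICA hunks above turn acpi_gbl_hardware_lock into a raw spinlock; on RT a plain spinlock_t becomes a sleeping lock, which the low-level hardware-register paths cannot tolerate, so they switch to raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore(), which keeps spinning with interrupts off even on RT. A condensed sketch of that pattern (hw_lock is an illustrative name, not the ACPICA wrapper used above):

	static DEFINE_RAW_SPINLOCK(hw_lock);
	unsigned long flags;

	raw_spin_lock_irqsave(&hw_lock, flags);
	/* short, non-sleeping hardware register access only */
	raw_spin_unlock_irqrestore(&hw_lock, flags);
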
7493diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
7494index cc2f2e35f4c2..0f0bc86e02df 100644
7495--- a/drivers/ata/libata-sff.c
7496+++ b/drivers/ata/libata-sff.c
7497@@ -679,9 +679,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_queued_cmd *qc, unsigned char *b
7498 unsigned long flags;
7499 unsigned int consumed;
7500
7501- local_irq_save(flags);
7502+ local_irq_save_nort(flags);
7503 consumed = ata_sff_data_xfer32(qc, buf, buflen, rw);
7504- local_irq_restore(flags);
7505+ local_irq_restore_nort(flags);
7506
7507 return consumed;
7508 }
7509diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
7510index cdd6f256da59..2269d379c92f 100644
7511--- a/drivers/base/power/wakeup.c
7512+++ b/drivers/base/power/wakeup.c
7513@@ -52,7 +52,7 @@ static void split_counters(unsigned int *cnt, unsigned int *inpr)
7514 /* A preserved old value of the events counter. */
7515 static unsigned int saved_count;
7516
7517-static DEFINE_SPINLOCK(events_lock);
7518+static DEFINE_RAW_SPINLOCK(events_lock);
7519
7520 static void pm_wakeup_timer_fn(unsigned long data);
7521
7522@@ -180,9 +180,9 @@ void wakeup_source_add(struct wakeup_source *ws)
7523 ws->active = false;
7524 ws->last_time = ktime_get();
7525
7526- spin_lock_irqsave(&events_lock, flags);
7527+ raw_spin_lock_irqsave(&events_lock, flags);
7528 list_add_rcu(&ws->entry, &wakeup_sources);
7529- spin_unlock_irqrestore(&events_lock, flags);
7530+ raw_spin_unlock_irqrestore(&events_lock, flags);
7531 }
7532 EXPORT_SYMBOL_GPL(wakeup_source_add);
7533
7534@@ -197,9 +197,9 @@ void wakeup_source_remove(struct wakeup_source *ws)
7535 if (WARN_ON(!ws))
7536 return;
7537
7538- spin_lock_irqsave(&events_lock, flags);
7539+ raw_spin_lock_irqsave(&events_lock, flags);
7540 list_del_rcu(&ws->entry);
7541- spin_unlock_irqrestore(&events_lock, flags);
7542+ raw_spin_unlock_irqrestore(&events_lock, flags);
7543 synchronize_srcu(&wakeup_srcu);
7544 }
7545 EXPORT_SYMBOL_GPL(wakeup_source_remove);
7546@@ -844,7 +844,7 @@ bool pm_wakeup_pending(void)
7547 unsigned long flags;
7548 bool ret = false;
7549
7550- spin_lock_irqsave(&events_lock, flags);
7551+ raw_spin_lock_irqsave(&events_lock, flags);
7552 if (events_check_enabled) {
7553 unsigned int cnt, inpr;
7554
7555@@ -852,7 +852,7 @@ bool pm_wakeup_pending(void)
7556 ret = (cnt != saved_count || inpr > 0);
7557 events_check_enabled = !ret;
7558 }
7559- spin_unlock_irqrestore(&events_lock, flags);
7560+ raw_spin_unlock_irqrestore(&events_lock, flags);
7561
7562 if (ret) {
7563 pr_info("PM: Wakeup pending, aborting suspend\n");
7564@@ -941,13 +941,13 @@ bool pm_save_wakeup_count(unsigned int count)
7565 unsigned long flags;
7566
7567 events_check_enabled = false;
7568- spin_lock_irqsave(&events_lock, flags);
7569+ raw_spin_lock_irqsave(&events_lock, flags);
7570 split_counters(&cnt, &inpr);
7571 if (cnt == count && inpr == 0) {
7572 saved_count = count;
7573 events_check_enabled = true;
7574 }
7575- spin_unlock_irqrestore(&events_lock, flags);
7576+ raw_spin_unlock_irqrestore(&events_lock, flags);
7577 return events_check_enabled;
7578 }
7579
7580diff --git a/drivers/block/brd.c b/drivers/block/brd.c
7581index 2d7178f7754e..c1cf87718c2e 100644
7582--- a/drivers/block/brd.c
7583+++ b/drivers/block/brd.c
7584@@ -60,7 +60,6 @@ struct brd_device {
7585 /*
7586 * Look up and return a brd's page for a given sector.
7587 */
7588-static DEFINE_MUTEX(brd_mutex);
7589 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
7590 {
7591 pgoff_t idx;
7592diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
7593index 5b8992beffec..40345483a022 100644
7594--- a/drivers/block/zram/zcomp.c
7595+++ b/drivers/block/zram/zcomp.c
7596@@ -116,12 +116,20 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
7597
7598 struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
7599 {
7600- return *get_cpu_ptr(comp->stream);
7601+ struct zcomp_strm *zstrm;
7602+
7603+ zstrm = *get_local_ptr(comp->stream);
7604+ spin_lock(&zstrm->zcomp_lock);
7605+ return zstrm;
7606 }
7607
7608 void zcomp_stream_put(struct zcomp *comp)
7609 {
7610- put_cpu_ptr(comp->stream);
7611+ struct zcomp_strm *zstrm;
7612+
7613+ zstrm = *this_cpu_ptr(comp->stream);
7614+ spin_unlock(&zstrm->zcomp_lock);
7615+ put_local_ptr(zstrm);
7616 }
7617
7618 int zcomp_compress(struct zcomp_strm *zstrm,
7619@@ -171,6 +179,7 @@ int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7620 pr_err("Can't allocate a compression stream\n");
7621 return -ENOMEM;
7622 }
7623+ spin_lock_init(&zstrm->zcomp_lock);
7624 *per_cpu_ptr(comp->stream, cpu) = zstrm;
7625 return 0;
7626 }
7627diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
7628index 41c1002a7d7d..d424eafcbf8e 100644
7629--- a/drivers/block/zram/zcomp.h
7630+++ b/drivers/block/zram/zcomp.h
7631@@ -14,6 +14,7 @@ struct zcomp_strm {
7632 /* compression/decompression buffer */
7633 void *buffer;
7634 struct crypto_comp *tfm;
7635+ spinlock_t zcomp_lock;
7636 };
7637
7638 /* dynamic per-device compression frontend */
7639diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
7640index 1e2648e4c286..c5d61209eb05 100644
7641--- a/drivers/block/zram/zram_drv.c
7642+++ b/drivers/block/zram/zram_drv.c
7643@@ -761,6 +761,30 @@ static DEVICE_ATTR_RO(io_stat);
7644 static DEVICE_ATTR_RO(mm_stat);
7645 static DEVICE_ATTR_RO(debug_stat);
7646
7647+#ifdef CONFIG_PREEMPT_RT_BASE
7648+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages)
7649+{
7650+ size_t index;
7651+
7652+ for (index = 0; index < num_pages; index++)
7653+ spin_lock_init(&zram->table[index].lock);
7654+}
7655+
7656+static void zram_slot_lock(struct zram *zram, u32 index)
7657+{
7658+ spin_lock(&zram->table[index].lock);
7659+ __set_bit(ZRAM_ACCESS, &zram->table[index].value);
7660+}
7661+
7662+static void zram_slot_unlock(struct zram *zram, u32 index)
7663+{
7664+ __clear_bit(ZRAM_ACCESS, &zram->table[index].value);
7665+ spin_unlock(&zram->table[index].lock);
7666+}
7667+
7668+#else
7669+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }
7670+
7671 static void zram_slot_lock(struct zram *zram, u32 index)
7672 {
7673 bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
7674@@ -770,6 +794,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index)
7675 {
7676 bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
7677 }
7678+#endif
7679
7680 static void zram_meta_free(struct zram *zram, u64 disksize)
7681 {
7682@@ -799,6 +824,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
7683 return false;
7684 }
7685
7686+ zram_meta_init_table_locks(zram, num_pages);
7687 return true;
7688 }
7689
7690@@ -850,6 +876,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
7691 unsigned long handle;
7692 unsigned int size;
7693 void *src, *dst;
7694+ struct zcomp_strm *zstrm;
7695
7696 if (zram_wb_enabled(zram)) {
7697 zram_slot_lock(zram, index);
7698@@ -884,6 +911,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
7699
7700 size = zram_get_obj_size(zram, index);
7701
7702+ zstrm = zcomp_stream_get(zram->comp);
7703 src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
7704 if (size == PAGE_SIZE) {
7705 dst = kmap_atomic(page);
7706@@ -891,14 +919,13 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
7707 kunmap_atomic(dst);
7708 ret = 0;
7709 } else {
7710- struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
7711
7712 dst = kmap_atomic(page);
7713 ret = zcomp_decompress(zstrm, src, size, dst);
7714 kunmap_atomic(dst);
7715- zcomp_stream_put(zram->comp);
7716 }
7717 zs_unmap_object(zram->mem_pool, handle);
7718+ zcomp_stream_put(zram->comp);
7719 zram_slot_unlock(zram, index);
7720
7721 /* Should NEVER happen. Return bio error if it does. */
7722diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
7723index 31762db861e3..a417c96b8f3f 100644
7724--- a/drivers/block/zram/zram_drv.h
7725+++ b/drivers/block/zram/zram_drv.h
7726@@ -77,6 +77,9 @@ struct zram_table_entry {
7727 unsigned long element;
7728 };
7729 unsigned long value;
7730+#ifdef CONFIG_PREEMPT_RT_BASE
7731+ spinlock_t lock;
7732+#endif
7733 };
7734
7735 struct zram_stats {
7736diff --git a/drivers/char/random.c b/drivers/char/random.c
7737index ea4dbfa30657..c72a7f0b4494 100644
7738--- a/drivers/char/random.c
7739+++ b/drivers/char/random.c
7740@@ -265,6 +265,7 @@
7741 #include <linux/syscalls.h>
7742 #include <linux/completion.h>
7743 #include <linux/uuid.h>
7744+#include <linux/locallock.h>
7745 #include <crypto/chacha20.h>
7746
7747 #include <asm/processor.h>
7748@@ -856,7 +857,7 @@ static int crng_fast_load(const char *cp, size_t len)
7749 invalidate_batched_entropy();
7750 crng_init = 1;
7751 wake_up_interruptible(&crng_init_wait);
7752- pr_notice("random: fast init done\n");
7753+ /* pr_notice("random: fast init done\n"); */
7754 }
7755 return 1;
7756 }
7757@@ -941,17 +942,21 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
7758 crng_init = 2;
7759 process_random_ready_list();
7760 wake_up_interruptible(&crng_init_wait);
7761- pr_notice("random: crng init done\n");
7762+ /* pr_notice("random: crng init done\n"); */
7763 if (unseeded_warning.missed) {
7764+#if 0
7765 pr_notice("random: %d get_random_xx warning(s) missed "
7766 "due to ratelimiting\n",
7767 unseeded_warning.missed);
7768+#endif
7769 unseeded_warning.missed = 0;
7770 }
7771 if (urandom_warning.missed) {
7772+#if 0
7773 pr_notice("random: %d urandom warning(s) missed "
7774 "due to ratelimiting\n",
7775 urandom_warning.missed);
7776+#endif
7777 urandom_warning.missed = 0;
7778 }
7779 }
7780@@ -1122,8 +1127,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
7781 } sample;
7782 long delta, delta2, delta3;
7783
7784- preempt_disable();
7785-
7786 sample.jiffies = jiffies;
7787 sample.cycles = random_get_entropy();
7788 sample.num = num;
7789@@ -1164,7 +1167,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
7790 */
7791 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
7792 }
7793- preempt_enable();
7794 }
7795
7796 void add_input_randomness(unsigned int type, unsigned int code,
7797@@ -1221,28 +1223,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
7798 return *ptr;
7799 }
7800
7801-void add_interrupt_randomness(int irq, int irq_flags)
7802+void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
7803 {
7804 struct entropy_store *r;
7805 struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
7806- struct pt_regs *regs = get_irq_regs();
7807 unsigned long now = jiffies;
7808 cycles_t cycles = random_get_entropy();
7809 __u32 c_high, j_high;
7810- __u64 ip;
7811 unsigned long seed;
7812 int credit = 0;
7813
7814 if (cycles == 0)
7815- cycles = get_reg(fast_pool, regs);
7816+ cycles = get_reg(fast_pool, NULL);
7817 c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
7818 j_high = (sizeof(now) > 4) ? now >> 32 : 0;
7819 fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
7820 fast_pool->pool[1] ^= now ^ c_high;
7821- ip = regs ? instruction_pointer(regs) : _RET_IP_;
7822+ if (!ip)
7823+ ip = _RET_IP_;
7824 fast_pool->pool[2] ^= ip;
7825 fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
7826- get_reg(fast_pool, regs);
7827+ get_reg(fast_pool, NULL);
7828
7829 fast_mix(fast_pool);
7830 add_interrupt_bench(cycles);
7831@@ -2200,6 +2201,7 @@ static rwlock_t batched_entropy_reset_lock = __RW_LOCK_UNLOCKED(batched_entropy_
7832 * at any point prior.
7833 */
7834 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64);
7835+static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u64_lock);
7836 u64 get_random_u64(void)
7837 {
7838 u64 ret;
7839@@ -2220,7 +2222,7 @@ u64 get_random_u64(void)
7840 warn_unseeded_randomness(&previous);
7841
7842 use_lock = READ_ONCE(crng_init) < 2;
7843- batch = &get_cpu_var(batched_entropy_u64);
7844+ batch = &get_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7845 if (use_lock)
7846 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7847 if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) {
7848@@ -2230,12 +2232,13 @@ u64 get_random_u64(void)
7849 ret = batch->entropy_u64[batch->position++];
7850 if (use_lock)
7851 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7852- put_cpu_var(batched_entropy_u64);
7853+ put_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7854 return ret;
7855 }
7856 EXPORT_SYMBOL(get_random_u64);
7857
7858 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32);
7859+static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u32_lock);
7860 u32 get_random_u32(void)
7861 {
7862 u32 ret;
7863@@ -2250,7 +2253,7 @@ u32 get_random_u32(void)
7864 warn_unseeded_randomness(&previous);
7865
7866 use_lock = READ_ONCE(crng_init) < 2;
7867- batch = &get_cpu_var(batched_entropy_u32);
7868+ batch = &get_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7869 if (use_lock)
7870 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7871 if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) {
7872@@ -2260,7 +2263,7 @@ u32 get_random_u32(void)
7873 ret = batch->entropy_u32[batch->position++];
7874 if (use_lock)
7875 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7876- put_cpu_var(batched_entropy_u32);
7877+ put_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7878 return ret;
7879 }
7880 EXPORT_SYMBOL(get_random_u32);
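
The batched-entropy hunks above apply the same local-lock idea: get_locked_var()/put_locked_var() acquire the named local lock and hand back the per-CPU variable, keeping the section preemptible on RT. In outline (following the hunk, refill logic elided):

	static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64);
	static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u64_lock);

	batch = &get_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
	/* ... consume batch->entropy_u64[batch->position++] ... */
	put_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
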
7881diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
7882index 50b59a69dc33..cbdb0a6c5337 100644
7883--- a/drivers/char/tpm/tpm_tis.c
7884+++ b/drivers/char/tpm/tpm_tis.c
7885@@ -52,6 +52,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da
7886 return container_of(data, struct tpm_tis_tcg_phy, priv);
7887 }
7888
7889+#ifdef CONFIG_PREEMPT_RT_FULL
7890+/*
 7891+ * Flushes previous write operations to the chip so that subsequent
 7892+ * ioread*()s won't stall a CPU.
7893+ */
7894+static inline void tpm_tis_flush(void __iomem *iobase)
7895+{
7896+ ioread8(iobase + TPM_ACCESS(0));
7897+}
7898+#else
7899+#define tpm_tis_flush(iobase) do { } while (0)
7900+#endif
7901+
7902+static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
7903+{
7904+ iowrite8(b, iobase + addr);
7905+ tpm_tis_flush(iobase);
7906+}
7907+
7908+static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
7909+{
7910+ iowrite32(b, iobase + addr);
7911+ tpm_tis_flush(iobase);
7912+}
7913+
7914 static bool interrupts = true;
7915 module_param(interrupts, bool, 0444);
7916 MODULE_PARM_DESC(interrupts, "Enable interrupts");
7917@@ -149,7 +174,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len,
7918 struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7919
7920 while (len--)
7921- iowrite8(*value++, phy->iobase + addr);
7922+ tpm_tis_iowrite8(*value++, phy->iobase, addr);
7923
7924 return 0;
7925 }
7926@@ -176,7 +201,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value)
7927 {
7928 struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7929
7930- iowrite32(value, phy->iobase + addr);
7931+ tpm_tis_iowrite32(value, phy->iobase, addr);
7932
7933 return 0;
7934 }
7935diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
7936index 9de47d4d2d9e..05f4b88bb955 100644
7937--- a/drivers/clocksource/tcb_clksrc.c
7938+++ b/drivers/clocksource/tcb_clksrc.c
7939@@ -25,8 +25,7 @@
7940 * this 32 bit free-running counter. the second channel is not used.
7941 *
7942 * - The third channel may be used to provide a 16-bit clockevent
7943- * source, used in either periodic or oneshot mode. This runs
7944- * at 32 KiHZ, and can handle delays of up to two seconds.
7945+ * source, used in either periodic or oneshot mode.
7946 *
7947 * A boot clocksource and clockevent source are also currently needed,
7948 * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
7949@@ -126,6 +125,8 @@ static struct clocksource clksrc = {
7950 struct tc_clkevt_device {
7951 struct clock_event_device clkevt;
7952 struct clk *clk;
7953+ bool clk_enabled;
7954+ u32 freq;
7955 void __iomem *regs;
7956 };
7957
7958@@ -134,15 +135,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
7959 return container_of(clkevt, struct tc_clkevt_device, clkevt);
7960 }
7961
7962-/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
7963- * because using one of the divided clocks would usually mean the
7964- * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
7965- *
7966- * A divided clock could be good for high resolution timers, since
7967- * 30.5 usec resolution can seem "low".
7968- */
7969 static u32 timer_clock;
7970
7971+static void tc_clk_disable(struct clock_event_device *d)
7972+{
7973+ struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7974+
7975+ clk_disable(tcd->clk);
7976+ tcd->clk_enabled = false;
7977+}
7978+
7979+static void tc_clk_enable(struct clock_event_device *d)
7980+{
7981+ struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7982+
7983+ if (tcd->clk_enabled)
7984+ return;
7985+ clk_enable(tcd->clk);
7986+ tcd->clk_enabled = true;
7987+}
7988+
7989 static int tc_shutdown(struct clock_event_device *d)
7990 {
7991 struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7992@@ -150,8 +162,14 @@ static int tc_shutdown(struct clock_event_device *d)
7993
7994 writel(0xff, regs + ATMEL_TC_REG(2, IDR));
7995 writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
7996+ return 0;
7997+}
7998+
7999+static int tc_shutdown_clk_off(struct clock_event_device *d)
8000+{
8001+ tc_shutdown(d);
8002 if (!clockevent_state_detached(d))
8003- clk_disable(tcd->clk);
8004+ tc_clk_disable(d);
8005
8006 return 0;
8007 }
8008@@ -164,9 +182,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
8009 if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
8010 tc_shutdown(d);
8011
8012- clk_enable(tcd->clk);
8013+ tc_clk_enable(d);
8014
8015- /* slow clock, count up to RC, then irq and stop */
8016+ /* count up to RC, then irq and stop */
8017 writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
8018 ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
8019 writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8020@@ -186,12 +204,12 @@ static int tc_set_periodic(struct clock_event_device *d)
8021 /* By not making the gentime core emulate periodic mode on top
8022 * of oneshot, we get lower overhead and improved accuracy.
8023 */
8024- clk_enable(tcd->clk);
8025+ tc_clk_enable(d);
8026
8027- /* slow clock, count up to RC, then irq and restart */
8028+ /* count up to RC, then irq and restart */
8029 writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
8030 regs + ATMEL_TC_REG(2, CMR));
8031- writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8032+ writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8033
8034 /* Enable clock and interrupts on RC compare */
8035 writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8036@@ -218,9 +236,13 @@ static struct tc_clkevt_device clkevt = {
8037 .features = CLOCK_EVT_FEAT_PERIODIC |
8038 CLOCK_EVT_FEAT_ONESHOT,
8039 /* Should be lower than at91rm9200's system timer */
8040+#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8041 .rating = 125,
8042+#else
8043+ .rating = 200,
8044+#endif
8045 .set_next_event = tc_next_event,
8046- .set_state_shutdown = tc_shutdown,
8047+ .set_state_shutdown = tc_shutdown_clk_off,
8048 .set_state_periodic = tc_set_periodic,
8049 .set_state_oneshot = tc_set_oneshot,
8050 },
8051@@ -240,8 +262,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
8052 return IRQ_NONE;
8053 }
8054
8055-static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8056+static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
8057 {
8058+ unsigned divisor = atmel_tc_divisors[divisor_idx];
8059 int ret;
8060 struct clk *t2_clk = tc->clk[2];
8061 int irq = tc->irq[2];
8062@@ -262,7 +285,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8063 clkevt.regs = tc->regs;
8064 clkevt.clk = t2_clk;
8065
8066- timer_clock = clk32k_divisor_idx;
8067+ timer_clock = divisor_idx;
8068+ if (!divisor)
8069+ clkevt.freq = 32768;
8070+ else
8071+ clkevt.freq = clk_get_rate(t2_clk) / divisor;
8072
8073 clkevt.clkevt.cpumask = cpumask_of(0);
8074
8075@@ -273,7 +300,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8076 return ret;
8077 }
8078
8079- clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
8080+ clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
8081
8082 return ret;
8083 }
8084@@ -410,7 +437,11 @@ static int __init tcb_clksrc_init(void)
8085 goto err_disable_t1;
8086
8087 /* channel 2: periodic and oneshot timer support */
8088+#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8089 ret = setup_clkevents(tc, clk32k_divisor_idx);
8090+#else
8091+ ret = setup_clkevents(tc, best_divisor_idx);
8092+#endif
8093 if (ret)
8094 goto err_unregister_clksrc;
8095
8096diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
8097index 2fab18fae4fc..98460c1bdec0 100644
8098--- a/drivers/clocksource/timer-atmel-pit.c
8099+++ b/drivers/clocksource/timer-atmel-pit.c
8100@@ -46,6 +46,7 @@ struct pit_data {
8101 u32 cycle;
8102 u32 cnt;
8103 unsigned int irq;
8104+ bool irq_requested;
8105 struct clk *mck;
8106 };
8107
8108@@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
8109
8110 /* disable irq, leaving the clocksource active */
8111 pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
8112+ if (data->irq_requested) {
8113+ free_irq(data->irq, data);
8114+ data->irq_requested = false;
8115+ }
8116 return 0;
8117 }
8118
8119+static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
8120 /*
8121 * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16)
8122 */
8123 static int pit_clkevt_set_periodic(struct clock_event_device *dev)
8124 {
8125 struct pit_data *data = clkevt_to_pit_data(dev);
8126+ int ret;
8127+
8128+ ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8129+ IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8130+ "at91_tick", data);
8131+ if (ret)
8132+ panic(pr_fmt("Unable to setup IRQ\n"));
8133+
8134+ data->irq_requested = true;
8135
8136 /* update clocksource counter */
8137 data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
8138@@ -233,16 +248,6 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node)
8139 goto exit;
8140 }
8141
8142- /* Set up irq handler */
8143- ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8144- IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8145- "at91_tick", data);
8146- if (ret) {
8147- pr_err("Unable to setup IRQ\n");
8148- clocksource_unregister(&data->clksrc);
8149- goto exit;
8150- }
8151-
8152 /* Set up and register clockevents */
8153 data->clkevt.name = "pit";
8154 data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
8155diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
8156index d2e660f475af..c63b96cfc23e 100644
8157--- a/drivers/clocksource/timer-atmel-st.c
8158+++ b/drivers/clocksource/timer-atmel-st.c
8159@@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
8160 last_crtr = read_CRTR();
8161 }
8162
8163+static int atmel_st_irq;
8164+
8165 static int clkevt32k_shutdown(struct clock_event_device *evt)
8166 {
8167 clkdev32k_disable_and_flush_irq();
8168 irqmask = 0;
8169 regmap_write(regmap_st, AT91_ST_IER, irqmask);
8170+ free_irq(atmel_st_irq, regmap_st);
8171 return 0;
8172 }
8173
8174 static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8175 {
8176+ int ret;
8177+
8178 clkdev32k_disable_and_flush_irq();
8179
8180+ ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8181+ IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8182+ "at91_tick", regmap_st);
8183+ if (ret)
8184+ panic(pr_fmt("Unable to setup IRQ\n"));
8185+
8186 /*
8187 * ALM for oneshot irqs, set by next_event()
8188 * before 32 seconds have passed.
8189@@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8190
8191 static int clkevt32k_set_periodic(struct clock_event_device *dev)
8192 {
8193+ int ret;
8194+
8195 clkdev32k_disable_and_flush_irq();
8196
8197+ ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8198+ IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8199+ "at91_tick", regmap_st);
8200+ if (ret)
8201+ panic(pr_fmt("Unable to setup IRQ\n"));
8202+
8203 /* PIT for periodic irqs; fixed rate of 1/HZ */
8204 irqmask = AT91_ST_PITS;
8205 regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
8206@@ -198,7 +217,7 @@ static int __init atmel_st_timer_init(struct device_node *node)
8207 {
8208 struct clk *sclk;
8209 unsigned int sclk_rate, val;
8210- int irq, ret;
8211+ int ret;
8212
8213 regmap_st = syscon_node_to_regmap(node);
8214 if (IS_ERR(regmap_st)) {
8215@@ -212,21 +231,12 @@ static int __init atmel_st_timer_init(struct device_node *node)
8216 regmap_read(regmap_st, AT91_ST_SR, &val);
8217
8218 /* Get the interrupts property */
8219- irq = irq_of_parse_and_map(node, 0);
8220- if (!irq) {
8221+ atmel_st_irq = irq_of_parse_and_map(node, 0);
8222+ if (!atmel_st_irq) {
8223 pr_err("Unable to get IRQ from DT\n");
8224 return -EINVAL;
8225 }
8226
8227- /* Make IRQs happen for the system timer */
8228- ret = request_irq(irq, at91rm9200_timer_interrupt,
8229- IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8230- "at91_tick", regmap_st);
8231- if (ret) {
8232- pr_err("Unable to setup IRQ\n");
8233- return ret;
8234- }
8235-
8236 sclk = of_clk_get(node, 0);
8237 if (IS_ERR(sclk)) {
8238 pr_err("Unable to get slow clock\n");
8239diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
8240index a782ce87715c..19d265948526 100644
8241--- a/drivers/connector/cn_proc.c
8242+++ b/drivers/connector/cn_proc.c
8243@@ -32,6 +32,7 @@
8244 #include <linux/pid_namespace.h>
8245
8246 #include <linux/cn_proc.h>
8247+#include <linux/locallock.h>
8248
8249 /*
8250 * Size of a cn_msg followed by a proc_event structure. Since the
8251@@ -54,10 +55,11 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
8252
8253 /* proc_event_counts is used as the sequence number of the netlink message */
8254 static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
8255+static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
8256
8257 static inline void send_msg(struct cn_msg *msg)
8258 {
8259- preempt_disable();
8260+ local_lock(send_msg_lock);
8261
8262 msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
8263 ((struct proc_event *)msg->data)->cpu = smp_processor_id();
8264@@ -70,7 +72,7 @@ static inline void send_msg(struct cn_msg *msg)
8265 */
8266 cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
8267
8268- preempt_enable();
8269+ local_unlock(send_msg_lock);
8270 }
8271
8272 void proc_fork_connector(struct task_struct *task)
8273diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
8274index 35f71825b7f3..bb4a6160d0f7 100644
8275--- a/drivers/cpufreq/Kconfig.x86
8276+++ b/drivers/cpufreq/Kconfig.x86
8277@@ -125,7 +125,7 @@ config X86_POWERNOW_K7_ACPI
8278
8279 config X86_POWERNOW_K8
8280 tristate "AMD Opteron/Athlon64 PowerNow!"
8281- depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
8282+ depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
8283 help
8284 This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
8285 Support for K10 and newer processors is now in acpi-cpufreq.
8286diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
8287index c3eefa126e3b..47093745a53c 100644
8288--- a/drivers/firmware/efi/efi.c
8289+++ b/drivers/firmware/efi/efi.c
8290@@ -74,7 +74,7 @@ static unsigned long *efi_tables[] = {
8291 &efi.mem_attr_table,
8292 };
8293
8294-static bool disable_runtime;
8295+static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT_BASE);
8296 static int __init setup_noefi(char *arg)
8297 {
8298 disable_runtime = true;
8299@@ -100,6 +100,9 @@ static int __init parse_efi_cmdline(char *str)
8300 if (parse_option_str(str, "noruntime"))
8301 disable_runtime = true;
8302
8303+ if (parse_option_str(str, "runtime"))
8304+ disable_runtime = false;
8305+
8306 return 0;
8307 }
8308 early_param("efi", parse_efi_cmdline);
8309diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c
8310index c597ce277a04..c1108d3921f8 100644
8311--- a/drivers/gpu/drm/i915/i915_gem_timeline.c
8312+++ b/drivers/gpu/drm/i915/i915_gem_timeline.c
8313@@ -33,11 +33,8 @@ static void __intel_timeline_init(struct intel_timeline *tl,
8314 {
8315 tl->fence_context = context;
8316 tl->common = parent;
8317-#ifdef CONFIG_DEBUG_SPINLOCK
8318- __raw_spin_lock_init(&tl->lock.rlock, lockname, lockclass);
8319-#else
8320 spin_lock_init(&tl->lock);
8321-#endif
8322+ lockdep_set_class_and_name(&tl->lock, lockclass, lockname);
8323 init_request_active(&tl->last_request, NULL);
8324 INIT_LIST_HEAD(&tl->requests);
8325 i915_syncmap_init(&tl->sync);
8326diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
8327index 20a471ad0ad2..5d34d48a8b7b 100644
8328--- a/drivers/gpu/drm/i915/i915_irq.c
8329+++ b/drivers/gpu/drm/i915/i915_irq.c
8330@@ -867,6 +867,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8331 spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
8332
8333 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8334+ preempt_disable_rt();
8335
8336 /* Get optional system timestamp before query. */
8337 if (stime)
8338@@ -918,6 +919,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8339 *etime = ktime_get();
8340
8341 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8342+ preempt_enable_rt();
8343
8344 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
8345
8346diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
8347index 41e31a454604..7e0cadf51b31 100644
8348--- a/drivers/gpu/drm/i915/intel_sprite.c
8349+++ b/drivers/gpu/drm/i915/intel_sprite.c
8350@@ -36,6 +36,7 @@
8351 #include <drm/drm_rect.h>
8352 #include <drm/drm_atomic.h>
8353 #include <drm/drm_plane_helper.h>
8354+#include <linux/locallock.h>
8355 #include "intel_drv.h"
8356 #include "intel_frontbuffer.h"
8357 #include <drm/i915_drm.h>
8358@@ -67,7 +68,7 @@ int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
8359 }
8360
8361 #define VBLANK_EVASION_TIME_US 100
8362-
8363+static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
8364 /**
8365 * intel_pipe_update_start() - start update of a set of display registers
8366 * @crtc: the crtc of which the registers are going to be updated
8367@@ -102,7 +103,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8368 VBLANK_EVASION_TIME_US);
8369 max = vblank_start - 1;
8370
8371- local_irq_disable();
8372+ local_lock_irq(pipe_update_lock);
8373
8374 if (min <= 0 || max <= 0)
8375 return;
8376@@ -132,11 +133,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8377 break;
8378 }
8379
8380- local_irq_enable();
8381+ local_unlock_irq(pipe_update_lock);
8382
8383 timeout = schedule_timeout(timeout);
8384
8385- local_irq_disable();
8386+ local_lock_irq(pipe_update_lock);
8387 }
8388
8389 finish_wait(wq, &wait);
8390@@ -201,7 +202,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc)
8391 crtc->base.state->event = NULL;
8392 }
8393
8394- local_irq_enable();
8395+ local_unlock_irq(pipe_update_lock);
8396
8397 if (intel_vgpu_active(dev_priv))
8398 return;
8399diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
8400index ddfe91efa61e..3157bcf6428f 100644
8401--- a/drivers/gpu/drm/radeon/radeon_display.c
8402+++ b/drivers/gpu/drm/radeon/radeon_display.c
8403@@ -1839,6 +1839,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8404 struct radeon_device *rdev = dev->dev_private;
8405
8406 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8407+ preempt_disable_rt();
8408
8409 /* Get optional system timestamp before query. */
8410 if (stime)
8411@@ -1931,6 +1932,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8412 *etime = ktime_get();
8413
8414 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8415+ preempt_enable_rt();
8416
8417 /* Decode into vertical and horizontal scanout position. */
8418 *vpos = position & 0x1fff;
8419diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
8420index 49569f8fe038..a3608cd52805 100644
8421--- a/drivers/hv/hyperv_vmbus.h
8422+++ b/drivers/hv/hyperv_vmbus.h
8423@@ -30,6 +30,7 @@
8424 #include <linux/atomic.h>
8425 #include <linux/hyperv.h>
8426 #include <linux/interrupt.h>
8427+#include <linux/irq.h>
8428
8429 /*
8430 * Timeout for services such as KVP and fcopy.
8431diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
8432index 2cd134dd94d2..cedf225d4182 100644
8433--- a/drivers/hv/vmbus_drv.c
8434+++ b/drivers/hv/vmbus_drv.c
8435@@ -966,6 +966,8 @@ static void vmbus_isr(void)
8436 void *page_addr = hv_cpu->synic_event_page;
8437 struct hv_message *msg;
8438 union hv_synic_event_flags *event;
8439+ struct pt_regs *regs = get_irq_regs();
8440+ u64 ip = regs ? instruction_pointer(regs) : 0;
8441 bool handled = false;
8442
8443 if (unlikely(page_addr == NULL))
8444@@ -1009,7 +1011,7 @@ static void vmbus_isr(void)
8445 tasklet_schedule(&hv_cpu->msg_dpc);
8446 }
8447
8448- add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
8449+ add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
8450 }
8451
8452
8453diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
8454index 36f76e28a0bf..394f142f90c7 100644
8455--- a/drivers/ide/alim15x3.c
8456+++ b/drivers/ide/alim15x3.c
8457@@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8458
8459 isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
8460
8461- local_irq_save(flags);
8462+ local_irq_save_nort(flags);
8463
8464 if (m5229_revision < 0xC2) {
8465 /*
8466@@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8467 }
8468 pci_dev_put(north);
8469 pci_dev_put(isa_dev);
8470- local_irq_restore(flags);
8471+ local_irq_restore_nort(flags);
8472 return 0;
8473 }
8474
8475diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
8476index 4b5dc0162e67..590cc7d64622 100644
8477--- a/drivers/ide/hpt366.c
8478+++ b/drivers/ide/hpt366.c
8479@@ -1236,7 +1236,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8480
8481 dma_old = inb(base + 2);
8482
8483- local_irq_save(flags);
8484+ local_irq_save_nort(flags);
8485
8486 dma_new = dma_old;
8487 pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
8488@@ -1247,7 +1247,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8489 if (dma_new != dma_old)
8490 outb(dma_new, base + 2);
8491
8492- local_irq_restore(flags);
8493+ local_irq_restore_nort(flags);
8494
8495 printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n",
8496 hwif->name, base, base + 7);
8497diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
8498index 19763977568c..4169433faab5 100644
8499--- a/drivers/ide/ide-io-std.c
8500+++ b/drivers/ide/ide-io-std.c
8501@@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8502 unsigned long uninitialized_var(flags);
8503
8504 if ((io_32bit & 2) && !mmio) {
8505- local_irq_save(flags);
8506+ local_irq_save_nort(flags);
8507 ata_vlb_sync(io_ports->nsect_addr);
8508 }
8509
8510@@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8511 insl(data_addr, buf, words);
8512
8513 if ((io_32bit & 2) && !mmio)
8514- local_irq_restore(flags);
8515+ local_irq_restore_nort(flags);
8516
8517 if (((len + 1) & 3) < 2)
8518 return;
8519@@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8520 unsigned long uninitialized_var(flags);
8521
8522 if ((io_32bit & 2) && !mmio) {
8523- local_irq_save(flags);
8524+ local_irq_save_nort(flags);
8525 ata_vlb_sync(io_ports->nsect_addr);
8526 }
8527
8528@@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8529 outsl(data_addr, buf, words);
8530
8531 if ((io_32bit & 2) && !mmio)
8532- local_irq_restore(flags);
8533+ local_irq_restore_nort(flags);
8534
8535 if (((len + 1) & 3) < 2)
8536 return;
8537diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
8538index 3a234701d92c..420e4e645856 100644
8539--- a/drivers/ide/ide-io.c
8540+++ b/drivers/ide/ide-io.c
8541@@ -660,7 +660,7 @@ void ide_timer_expiry (unsigned long data)
8542 /* disable_irq_nosync ?? */
8543 disable_irq(hwif->irq);
8544 /* local CPU only, as if we were handling an interrupt */
8545- local_irq_disable();
8546+ local_irq_disable_nort();
8547 if (hwif->polling) {
8548 startstop = handler(drive);
8549 } else if (drive_is_ready(drive)) {
8550diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
8551index 210a0887dd29..7bf05b6147e8 100644
8552--- a/drivers/ide/ide-iops.c
8553+++ b/drivers/ide/ide-iops.c
8554@@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
8555 if ((stat & ATA_BUSY) == 0)
8556 break;
8557
8558- local_irq_restore(flags);
8559+ local_irq_restore_nort(flags);
8560 *rstat = stat;
8561 return -EBUSY;
8562 }
8563 }
8564- local_irq_restore(flags);
8565+ local_irq_restore_nort(flags);
8566 }
8567 /*
8568 * Allow status to settle, then read it again.
8569diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
8570index eaf39e5db08b..be4c941eaa83 100644
8571--- a/drivers/ide/ide-probe.c
8572+++ b/drivers/ide/ide-probe.c
8573@@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
8574 int bswap = 1;
8575
8576 /* local CPU only; some systems need this */
8577- local_irq_save(flags);
8578+ local_irq_save_nort(flags);
8579 /* read 512 bytes of id info */
8580 hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
8581- local_irq_restore(flags);
8582+ local_irq_restore_nort(flags);
8583
8584 drive->dev_flags |= IDE_DFLAG_ID_READ;
8585 #ifdef DEBUG
8586diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
8587index 4efe4c6e956c..7eae3aa1def7 100644
8588--- a/drivers/ide/ide-taskfile.c
8589+++ b/drivers/ide/ide-taskfile.c
8590@@ -251,7 +251,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8591
8592 page_is_high = PageHighMem(page);
8593 if (page_is_high)
8594- local_irq_save(flags);
8595+ local_irq_save_nort(flags);
8596
8597 buf = kmap_atomic(page) + offset;
8598
8599@@ -272,7 +272,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8600 kunmap_atomic(buf);
8601
8602 if (page_is_high)
8603- local_irq_restore(flags);
8604+ local_irq_restore_nort(flags);
8605
8606 len -= nr_bytes;
8607 }
8608@@ -415,7 +415,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
8609 }
8610
8611 if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
8612- local_irq_disable();
8613+ local_irq_disable_nort();
8614
8615 ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
8616
8617diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
8618index b197e925fe36..95ac319c8e69 100644
8619--- a/drivers/infiniband/hw/hfi1/affinity.c
8620+++ b/drivers/infiniband/hw/hfi1/affinity.c
8621@@ -593,7 +593,7 @@ int hfi1_get_proc_affinity(int node)
8622 struct hfi1_affinity_node *entry;
8623 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
8624 const struct cpumask *node_mask,
8625- *proc_mask = &current->cpus_allowed;
8626+ *proc_mask = current->cpus_ptr;
8627 struct hfi1_affinity_node_list *affinity = &node_affinity;
8628 struct cpu_mask_set *set = &affinity->proc;
8629
8630@@ -601,7 +601,7 @@ int hfi1_get_proc_affinity(int node)
8631 * check whether process/context affinity has already
8632 * been set
8633 */
8634- if (cpumask_weight(proc_mask) == 1) {
8635+ if (current->nr_cpus_allowed == 1) {
8636 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
8637 current->pid, current->comm,
8638 cpumask_pr_args(proc_mask));
8639@@ -612,7 +612,7 @@ int hfi1_get_proc_affinity(int node)
8640 cpu = cpumask_first(proc_mask);
8641 cpumask_set_cpu(cpu, &set->used);
8642 goto done;
8643- } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
8644+ } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
8645 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
8646 current->pid, current->comm,
8647 cpumask_pr_args(proc_mask));
8648diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
8649index 6781bcdb10b3..d069ad261572 100644
8650--- a/drivers/infiniband/hw/hfi1/sdma.c
8651+++ b/drivers/infiniband/hw/hfi1/sdma.c
8652@@ -856,14 +856,13 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
8653 {
8654 struct sdma_rht_node *rht_node;
8655 struct sdma_engine *sde = NULL;
8656- const struct cpumask *current_mask = &current->cpus_allowed;
8657 unsigned long cpu_id;
8658
8659 /*
8660 * To ensure that always the same sdma engine(s) will be
8661 * selected make sure the process is pinned to this CPU only.
8662 */
8663- if (cpumask_weight(current_mask) != 1)
8664+ if (current->nr_cpus_allowed != 1)
8665 goto out;
8666
8667 cpu_id = smp_processor_id();
8668diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
8669index 40efc9151ec4..12924aad90cc 100644
8670--- a/drivers/infiniband/hw/qib/qib_file_ops.c
8671+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
8672@@ -1167,7 +1167,7 @@ static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt)
8673 static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
8674 {
8675 struct qib_filedata *fd = fp->private_data;
8676- const unsigned int weight = cpumask_weight(&current->cpus_allowed);
8677+ const unsigned int weight = current->nr_cpus_allowed;
8678 const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
8679 int local_cpu;
8680
8681@@ -1648,9 +1648,8 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
8682 ret = find_free_ctxt(i_minor - 1, fp, uinfo);
8683 else {
8684 int unit;
8685- const unsigned int cpu = cpumask_first(&current->cpus_allowed);
8686- const unsigned int weight =
8687- cpumask_weight(&current->cpus_allowed);
8688+ const unsigned int cpu = cpumask_first(current->cpus_ptr);
8689+ const unsigned int weight = current->nr_cpus_allowed;
8690
8691 if (weight == 1 && !test_bit(cpu, qib_cpulist))
8692 if (!find_hca(cpu, &unit) && unit >= 0)
8693diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8694index 9b3f47ae2016..8327b598d909 100644
8695--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8696+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8697@@ -898,7 +898,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
8698
8699 ipoib_dbg_mcast(priv, "restarting multicast task\n");
8700
8701- local_irq_save(flags);
8702+ local_irq_save_nort(flags);
8703 netif_addr_lock(dev);
8704 spin_lock(&priv->lock);
8705
8706@@ -980,7 +980,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
8707
8708 spin_unlock(&priv->lock);
8709 netif_addr_unlock(dev);
8710- local_irq_restore(flags);
8711+ local_irq_restore_nort(flags);
8712
8713 ipoib_mcast_remove_list(&remove_list);
8714
8715diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
8716index cedc665364cd..4a4fdef151aa 100644
8717--- a/drivers/input/gameport/gameport.c
8718+++ b/drivers/input/gameport/gameport.c
8719@@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
8720 tx = ~0;
8721
8722 for (i = 0; i < 50; i++) {
8723- local_irq_save(flags);
8724+ local_irq_save_nort(flags);
8725 t1 = ktime_get_ns();
8726 for (t = 0; t < 50; t++)
8727 gameport_read(gameport);
8728 t2 = ktime_get_ns();
8729 t3 = ktime_get_ns();
8730- local_irq_restore(flags);
8731+ local_irq_restore_nort(flags);
8732 udelay(i * 10);
8733 t = (t2 - t1) - (t3 - t2);
8734 if (t < tx)
8735@@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
8736 tx = 1 << 30;
8737
8738 for(i = 0; i < 50; i++) {
8739- local_irq_save(flags);
8740+ local_irq_save_nort(flags);
8741 GET_TIME(t1);
8742 for (t = 0; t < 50; t++) gameport_read(gameport);
8743 GET_TIME(t2);
8744 GET_TIME(t3);
8745- local_irq_restore(flags);
8746+ local_irq_restore_nort(flags);
8747 udelay(i * 10);
8748 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
8749 }
8750@@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
8751 tx = 1 << 30;
8752
8753 for(i = 0; i < 50; i++) {
8754- local_irq_save(flags);
8755+ local_irq_save_nort(flags);
8756 t1 = rdtsc();
8757 for (t = 0; t < 50; t++) gameport_read(gameport);
8758 t2 = rdtsc();
8759- local_irq_restore(flags);
8760+ local_irq_restore_nort(flags);
8761 udelay(i * 10);
8762 if (t2 - t1 < tx) tx = t2 - t1;
8763 }
8764diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
8765index efa6cd2500b9..7d9d41f803d1 100644
8766--- a/drivers/iommu/amd_iommu.c
8767+++ b/drivers/iommu/amd_iommu.c
8768@@ -81,11 +81,12 @@
8769 */
8770 #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38))
8771
8772-static DEFINE_RWLOCK(amd_iommu_devtable_lock);
8773+static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
8774+static DEFINE_SPINLOCK(pd_bitmap_lock);
8775+static DEFINE_SPINLOCK(iommu_table_lock);
8776
8777 /* List of all available dev_data structures */
8778-static LIST_HEAD(dev_data_list);
8779-static DEFINE_SPINLOCK(dev_data_list_lock);
8780+static LLIST_HEAD(dev_data_list);
8781
8782 LIST_HEAD(ioapic_map);
8783 LIST_HEAD(hpet_map);
8784@@ -204,40 +205,33 @@ static struct dma_ops_domain* to_dma_ops_domain(struct protection_domain *domain
8785 static struct iommu_dev_data *alloc_dev_data(u16 devid)
8786 {
8787 struct iommu_dev_data *dev_data;
8788- unsigned long flags;
8789
8790 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
8791 if (!dev_data)
8792 return NULL;
8793
8794 dev_data->devid = devid;
8795-
8796- spin_lock_irqsave(&dev_data_list_lock, flags);
8797- list_add_tail(&dev_data->dev_data_list, &dev_data_list);
8798- spin_unlock_irqrestore(&dev_data_list_lock, flags);
8799-
8800 ratelimit_default_init(&dev_data->rs);
8801
8802+ llist_add(&dev_data->dev_data_list, &dev_data_list);
8803 return dev_data;
8804 }
8805
8806 static struct iommu_dev_data *search_dev_data(u16 devid)
8807 {
8808 struct iommu_dev_data *dev_data;
8809- unsigned long flags;
8810+ struct llist_node *node;
8811+
8812+ if (llist_empty(&dev_data_list))
8813+ return NULL;
8814
8815- spin_lock_irqsave(&dev_data_list_lock, flags);
8816- list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
8817+ node = dev_data_list.first;
8818+ llist_for_each_entry(dev_data, node, dev_data_list) {
8819 if (dev_data->devid == devid)
8820- goto out_unlock;
8821+ return dev_data;
8822 }
8823
8824- dev_data = NULL;
8825-
8826-out_unlock:
8827- spin_unlock_irqrestore(&dev_data_list_lock, flags);
8828-
8829- return dev_data;
8830+ return NULL;
8831 }
8832
8833 static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
8834@@ -1062,9 +1056,9 @@ static int iommu_queue_command_sync(struct amd_iommu *iommu,
8835 unsigned long flags;
8836 int ret;
8837
8838- spin_lock_irqsave(&iommu->lock, flags);
8839+ raw_spin_lock_irqsave(&iommu->lock, flags);
8840 ret = __iommu_queue_command_sync(iommu, cmd, sync);
8841- spin_unlock_irqrestore(&iommu->lock, flags);
8842+ raw_spin_unlock_irqrestore(&iommu->lock, flags);
8843
8844 return ret;
8845 }
8846@@ -1090,7 +1084,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
8847
8848 build_completion_wait(&cmd, (u64)&iommu->cmd_sem);
8849
8850- spin_lock_irqsave(&iommu->lock, flags);
8851+ raw_spin_lock_irqsave(&iommu->lock, flags);
8852
8853 iommu->cmd_sem = 0;
8854
8855@@ -1101,7 +1095,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
8856 ret = wait_on_sem(&iommu->cmd_sem);
8857
8858 out_unlock:
8859- spin_unlock_irqrestore(&iommu->lock, flags);
8860+ raw_spin_unlock_irqrestore(&iommu->lock, flags);
8861
8862 return ret;
8863 }
8864@@ -1610,29 +1604,26 @@ static void del_domain_from_list(struct protection_domain *domain)
8865
8866 static u16 domain_id_alloc(void)
8867 {
8868- unsigned long flags;
8869 int id;
8870
8871- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8872+ spin_lock(&pd_bitmap_lock);
8873 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
8874 BUG_ON(id == 0);
8875 if (id > 0 && id < MAX_DOMAIN_ID)
8876 __set_bit(id, amd_iommu_pd_alloc_bitmap);
8877 else
8878 id = 0;
8879- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8880+ spin_unlock(&pd_bitmap_lock);
8881
8882 return id;
8883 }
8884
8885 static void domain_id_free(int id)
8886 {
8887- unsigned long flags;
8888-
8889- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8890+ spin_lock(&pd_bitmap_lock);
8891 if (id > 0 && id < MAX_DOMAIN_ID)
8892 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
8893- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8894+ spin_unlock(&pd_bitmap_lock);
8895 }
8896
8897 #define DEFINE_FREE_PT_FN(LVL, FN) \
8898@@ -1952,10 +1943,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
8899 int ret;
8900
8901 /*
8902- * Must be called with IRQs disabled. Warn here to detect early
8903- * when its not.
8904+ * Must be called with IRQs disabled on a non-RT kernel. Warn here to
8905+ * detect early when it's not.
8906 */
8907- WARN_ON(!irqs_disabled());
8908+ WARN_ON_NONRT(!irqs_disabled());
8909
8910 /* lock domain */
8911 spin_lock(&domain->lock);
8912@@ -2101,9 +2092,9 @@ static int attach_device(struct device *dev,
8913 }
8914
8915 skip_ats_check:
8916- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8917+ spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8918 ret = __attach_device(dev_data, domain);
8919- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8920+ spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8921
8922 /*
8923 * We might boot into a crash-kernel here. The crashed kernel
8924@@ -2123,10 +2114,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
8925 struct protection_domain *domain;
8926
8927 /*
8928- * Must be called with IRQs disabled. Warn here to detect early
8929- * when its not.
8930+ * Must be called with IRQs disabled on a non-RT kernel. Warn here to
8931+ * detect early when it's not.
8932 */
8933- WARN_ON(!irqs_disabled());
8934+ WARN_ON_NONRT(!irqs_disabled());
8935
8936 if (WARN_ON(!dev_data->domain))
8937 return;
8938@@ -2153,9 +2144,9 @@ static void detach_device(struct device *dev)
8939 domain = dev_data->domain;
8940
8941 /* lock device table */
8942- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8943+ spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8944 __detach_device(dev_data);
8945- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8946+ spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8947
8948 if (!dev_is_pci(dev))
8949 return;
8950@@ -2819,7 +2810,7 @@ static void cleanup_domain(struct protection_domain *domain)
8951 struct iommu_dev_data *entry;
8952 unsigned long flags;
8953
8954- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8955+ spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8956
8957 while (!list_empty(&domain->dev_list)) {
8958 entry = list_first_entry(&domain->dev_list,
8959@@ -2827,7 +2818,7 @@ static void cleanup_domain(struct protection_domain *domain)
8960 __detach_device(entry);
8961 }
8962
8963- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8964+ spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8965 }
8966
8967 static void protection_domain_free(struct protection_domain *domain)
8968@@ -3594,14 +3585,62 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
8969 amd_iommu_dev_table[devid].data[2] = dte;
8970 }
8971
8972-static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
8973+static struct irq_remap_table *get_irq_table(u16 devid)
8974+{
8975+ struct irq_remap_table *table;
8976+
8977+ if (WARN_ONCE(!amd_iommu_rlookup_table[devid],
8978+ "%s: no iommu for devid %x\n", __func__, devid))
8979+ return NULL;
8980+
8981+ table = irq_lookup_table[devid];
8982+ if (WARN_ONCE(!table, "%s: no table for devid %x\n", __func__, devid))
8983+ return NULL;
8984+
8985+ return table;
8986+}
8987+
8988+static struct irq_remap_table *__alloc_irq_table(void)
8989+{
8990+ struct irq_remap_table *table;
8991+
8992+ table = kzalloc(sizeof(*table), GFP_KERNEL);
8993+ if (!table)
8994+ return NULL;
8995+
8996+ table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
8997+ if (!table->table) {
8998+ kfree(table);
8999+ return NULL;
9000+ }
9001+ raw_spin_lock_init(&table->lock);
9002+
9003+ if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
9004+ memset(table->table, 0,
9005+ MAX_IRQS_PER_TABLE * sizeof(u32));
9006+ else
9007+ memset(table->table, 0,
9008+ (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
9009+ return table;
9010+}
9011+
9012+static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
9013+ struct irq_remap_table *table)
9014+{
9015+ irq_lookup_table[devid] = table;
9016+ set_dte_irq_entry(devid, table);
9017+ iommu_flush_dte(iommu, devid);
9018+}
9019+
9020+static struct irq_remap_table *alloc_irq_table(u16 devid)
9021 {
9022 struct irq_remap_table *table = NULL;
9023+ struct irq_remap_table *new_table = NULL;
9024 struct amd_iommu *iommu;
9025 unsigned long flags;
9026 u16 alias;
9027
9028- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
9029+ spin_lock_irqsave(&iommu_table_lock, flags);
9030
9031 iommu = amd_iommu_rlookup_table[devid];
9032 if (!iommu)
9033@@ -3614,60 +3653,45 @@ static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
9034 alias = amd_iommu_alias_table[devid];
9035 table = irq_lookup_table[alias];
9036 if (table) {
9037- irq_lookup_table[devid] = table;
9038- set_dte_irq_entry(devid, table);
9039- iommu_flush_dte(iommu, devid);
9040- goto out;
9041+ set_remap_table_entry(iommu, devid, table);
9042+ goto out_wait;
9043 }
9044+ spin_unlock_irqrestore(&iommu_table_lock, flags);
9045
9046 /* Nothing there yet, allocate new irq remapping table */
9047- table = kzalloc(sizeof(*table), GFP_ATOMIC);
9048- if (!table)
9049- goto out_unlock;
9050-
9051- /* Initialize table spin-lock */
9052- spin_lock_init(&table->lock);
9053+ new_table = __alloc_irq_table();
9054+ if (!new_table)
9055+ return NULL;
9056
9057- if (ioapic)
9058- /* Keep the first 32 indexes free for IOAPIC interrupts */
9059- table->min_index = 32;
9060+ spin_lock_irqsave(&iommu_table_lock, flags);
9061
9062- table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC);
9063- if (!table->table) {
9064- kfree(table);
9065- table = NULL;
9066+ table = irq_lookup_table[devid];
9067+ if (table)
9068 goto out_unlock;
9069- }
9070-
9071- if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
9072- memset(table->table, 0,
9073- MAX_IRQS_PER_TABLE * sizeof(u32));
9074- else
9075- memset(table->table, 0,
9076- (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
9077
9078- if (ioapic) {
9079- int i;
9080-
9081- for (i = 0; i < 32; ++i)
9082- iommu->irte_ops->set_allocated(table, i);
9083+ table = irq_lookup_table[alias];
9084+ if (table) {
9085+ set_remap_table_entry(iommu, devid, table);
9086+ goto out_wait;
9087 }
9088
9089- irq_lookup_table[devid] = table;
9090- set_dte_irq_entry(devid, table);
9091- iommu_flush_dte(iommu, devid);
9092- if (devid != alias) {
9093- irq_lookup_table[alias] = table;
9094- set_dte_irq_entry(alias, table);
9095- iommu_flush_dte(iommu, alias);
9096- }
9097+ table = new_table;
9098+ new_table = NULL;
9099
9100-out:
9101+ set_remap_table_entry(iommu, devid, table);
9102+ if (devid != alias)
9103+ set_remap_table_entry(iommu, alias, table);
9104+
9105+out_wait:
9106 iommu_completion_wait(iommu);
9107
9108 out_unlock:
9109- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
9110+ spin_unlock_irqrestore(&iommu_table_lock, flags);
9111
9112+ if (new_table) {
9113+ kmem_cache_free(amd_iommu_irq_cache, new_table->table);
9114+ kfree(new_table);
9115+ }
9116 return table;
9117 }
9118
9119@@ -3681,11 +3705,11 @@ static int alloc_irq_index(u16 devid, int count)
9120 if (!iommu)
9121 return -ENODEV;
9122
9123- table = get_irq_table(devid, false);
9124+ table = alloc_irq_table(devid);
9125 if (!table)
9126 return -ENODEV;
9127
9128- spin_lock_irqsave(&table->lock, flags);
9129+ raw_spin_lock_irqsave(&table->lock, flags);
9130
9131 /* Scan table for free entries */
9132 for (c = 0, index = table->min_index;
9133@@ -3708,7 +3732,7 @@ static int alloc_irq_index(u16 devid, int count)
9134 index = -ENOSPC;
9135
9136 out:
9137- spin_unlock_irqrestore(&table->lock, flags);
9138+ raw_spin_unlock_irqrestore(&table->lock, flags);
9139
9140 return index;
9141 }
9142@@ -3725,11 +3749,11 @@ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
9143 if (iommu == NULL)
9144 return -EINVAL;
9145
9146- table = get_irq_table(devid, false);
9147+ table = get_irq_table(devid);
9148 if (!table)
9149 return -ENOMEM;
9150
9151- spin_lock_irqsave(&table->lock, flags);
9152+ raw_spin_lock_irqsave(&table->lock, flags);
9153
9154 entry = (struct irte_ga *)table->table;
9155 entry = &entry[index];
9156@@ -3740,7 +3764,7 @@ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
9157 if (data)
9158 data->ref = entry;
9159
9160- spin_unlock_irqrestore(&table->lock, flags);
9161+ raw_spin_unlock_irqrestore(&table->lock, flags);
9162
9163 iommu_flush_irt(iommu, devid);
9164 iommu_completion_wait(iommu);
9165@@ -3758,13 +3782,13 @@ static int modify_irte(u16 devid, int index, union irte *irte)
9166 if (iommu == NULL)
9167 return -EINVAL;
9168
9169- table = get_irq_table(devid, false);
9170+ table = get_irq_table(devid);
9171 if (!table)
9172 return -ENOMEM;
9173
9174- spin_lock_irqsave(&table->lock, flags);
9175+ raw_spin_lock_irqsave(&table->lock, flags);
9176 table->table[index] = irte->val;
9177- spin_unlock_irqrestore(&table->lock, flags);
9178+ raw_spin_unlock_irqrestore(&table->lock, flags);
9179
9180 iommu_flush_irt(iommu, devid);
9181 iommu_completion_wait(iommu);
9182@@ -3782,13 +3806,13 @@ static void free_irte(u16 devid, int index)
9183 if (iommu == NULL)
9184 return;
9185
9186- table = get_irq_table(devid, false);
9187+ table = get_irq_table(devid);
9188 if (!table)
9189 return;
9190
9191- spin_lock_irqsave(&table->lock, flags);
9192+ raw_spin_lock_irqsave(&table->lock, flags);
9193 iommu->irte_ops->clear_allocated(table, index);
9194- spin_unlock_irqrestore(&table->lock, flags);
9195+ raw_spin_unlock_irqrestore(&table->lock, flags);
9196
9197 iommu_flush_irt(iommu, devid);
9198 iommu_completion_wait(iommu);
9199@@ -3869,10 +3893,8 @@ static void irte_ga_set_affinity(void *entry, u16 devid, u16 index,
9200 u8 vector, u32 dest_apicid)
9201 {
9202 struct irte_ga *irte = (struct irte_ga *) entry;
9203- struct iommu_dev_data *dev_data = search_dev_data(devid);
9204
9205- if (!dev_data || !dev_data->use_vapic ||
9206- !irte->lo.fields_remap.guest_mode) {
9207+ if (!irte->lo.fields_remap.guest_mode) {
9208 irte->hi.fields.vector = vector;
9209 irte->lo.fields_remap.destination = dest_apicid;
9210 modify_irte_ga(devid, index, irte, NULL);
9211@@ -4078,7 +4100,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
9212 struct amd_ir_data *data = NULL;
9213 struct irq_cfg *cfg;
9214 int i, ret, devid;
9215- int index = -1;
9216+ int index;
9217
9218 if (!info)
9219 return -EINVAL;
9220@@ -4102,10 +4124,26 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
9221 return ret;
9222
9223 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
9224- if (get_irq_table(devid, true))
9225+ struct irq_remap_table *table;
9226+ struct amd_iommu *iommu;
9227+
9228+ table = alloc_irq_table(devid);
9229+ if (table) {
9230+ if (!table->min_index) {
9231+ /*
9232+ * Keep the first 32 indexes free for IOAPIC
9233+ * interrupts.
9234+ */
9235+ table->min_index = 32;
9236+ iommu = amd_iommu_rlookup_table[devid];
9237+ for (i = 0; i < 32; ++i)
9238+ iommu->irte_ops->set_allocated(table, i);
9239+ }
9240+ WARN_ON(table->min_index != 32);
9241 index = info->ioapic_pin;
9242- else
9243- ret = -ENOMEM;
9244+ } else {
9245+ index = -ENOMEM;
9246+ }
9247 } else {
9248 index = alloc_irq_index(devid, nr_irqs);
9249 }
9250@@ -4349,7 +4387,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
9251 {
9252 unsigned long flags;
9253 struct amd_iommu *iommu;
9254- struct irq_remap_table *irt;
9255+ struct irq_remap_table *table;
9256 struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
9257 int devid = ir_data->irq_2_irte.devid;
9258 struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
9259@@ -4363,11 +4401,11 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
9260 if (!iommu)
9261 return -ENODEV;
9262
9263- irt = get_irq_table(devid, false);
9264- if (!irt)
9265+ table = get_irq_table(devid);
9266+ if (!table)
9267 return -ENODEV;
9268
9269- spin_lock_irqsave(&irt->lock, flags);
9270+ raw_spin_lock_irqsave(&table->lock, flags);
9271
9272 if (ref->lo.fields_vapic.guest_mode) {
9273 if (cpu >= 0)
9274@@ -4376,7 +4414,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
9275 barrier();
9276 }
9277
9278- spin_unlock_irqrestore(&irt->lock, flags);
9279+ raw_spin_unlock_irqrestore(&table->lock, flags);
9280
9281 iommu_flush_irt(iommu, devid);
9282 iommu_completion_wait(iommu);
9283diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
9284index 6fe2d0346073..e3cd81b32a33 100644
9285--- a/drivers/iommu/amd_iommu_init.c
9286+++ b/drivers/iommu/amd_iommu_init.c
9287@@ -1474,7 +1474,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
9288 {
9289 int ret;
9290
9291- spin_lock_init(&iommu->lock);
9292+ raw_spin_lock_init(&iommu->lock);
9293
9294 /* Add IOMMU to internal data structures */
9295 list_add_tail(&iommu->list, &amd_iommu_list);
9296diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
9297index f6b24c7d8b70..16b1404da58c 100644
9298--- a/drivers/iommu/amd_iommu_types.h
9299+++ b/drivers/iommu/amd_iommu_types.h
9300@@ -406,7 +406,7 @@ extern bool amd_iommu_iotlb_sup;
9301 #define IRQ_TABLE_ALIGNMENT 128
9302
9303 struct irq_remap_table {
9304- spinlock_t lock;
9305+ raw_spinlock_t lock;
9306 unsigned min_index;
9307 u32 *table;
9308 };
9309@@ -488,7 +488,7 @@ struct amd_iommu {
9310 int index;
9311
9312 /* locks the accesses to the hardware */
9313- spinlock_t lock;
9314+ raw_spinlock_t lock;
9315
9316 /* Pointer to PCI device of this IOMMU */
9317 struct pci_dev *dev;
9318@@ -625,7 +625,7 @@ struct devid_map {
9319 */
9320 struct iommu_dev_data {
9321 struct list_head list; /* For domain->dev_list */
9322- struct list_head dev_data_list; /* For global dev_data_list */
9323+ struct llist_node dev_data_list; /* For global dev_data_list */
9324 struct protection_domain *domain; /* Domain the device is bound to */
9325 u16 devid; /* PCI Device ID */
9326 u16 alias; /* Alias Device ID */
9327diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
9328index 33edfa794ae9..b30900025c62 100644
9329--- a/drivers/iommu/iova.c
9330+++ b/drivers/iommu/iova.c
9331@@ -570,7 +570,7 @@ void queue_iova(struct iova_domain *iovad,
9332 unsigned long pfn, unsigned long pages,
9333 unsigned long data)
9334 {
9335- struct iova_fq *fq = get_cpu_ptr(iovad->fq);
9336+ struct iova_fq *fq = raw_cpu_ptr(iovad->fq);
9337 unsigned long flags;
9338 unsigned idx;
9339
9340@@ -600,8 +600,6 @@ void queue_iova(struct iova_domain *iovad,
9341 if (atomic_cmpxchg(&iovad->fq_timer_on, 0, 1) == 0)
9342 mod_timer(&iovad->fq_timer,
9343 jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
9344-
9345- put_cpu_ptr(iovad->fq);
9346 }
9347 EXPORT_SYMBOL_GPL(queue_iova);
9348
9349diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
9350index 2ea39a83737f..a3e23d0fc4af 100644
9351--- a/drivers/irqchip/irq-gic-v3-its.c
9352+++ b/drivers/irqchip/irq-gic-v3-its.c
9353@@ -148,7 +148,7 @@ static struct {
9354 } vpe_proxy;
9355
9356 static LIST_HEAD(its_nodes);
9357-static DEFINE_SPINLOCK(its_lock);
9358+static DEFINE_RAW_SPINLOCK(its_lock);
9359 static struct rdists *gic_rdists;
9360 static struct irq_domain *its_parent;
9361
9362@@ -165,6 +165,7 @@ static DEFINE_RAW_SPINLOCK(vmovp_lock);
9363 static DEFINE_IDA(its_vpeid_ida);
9364
9365 #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist))
9366+#define gic_data_rdist_cpu(cpu) (per_cpu_ptr(gic_rdists->rdist, cpu))
9367 #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base)
9368 #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K)
9369
9370@@ -1432,7 +1433,7 @@ static void its_free_prop_table(struct page *prop_page)
9371 get_order(LPI_PROPBASE_SZ));
9372 }
9373
9374-static int __init its_alloc_lpi_tables(void)
9375+static int __init its_alloc_lpi_prop_table(void)
9376 {
9377 phys_addr_t paddr;
9378
9379@@ -1758,30 +1759,47 @@ static void its_free_pending_table(struct page *pt)
9380 get_order(max_t(u32, LPI_PENDBASE_SZ, SZ_64K)));
9381 }
9382
9383-static void its_cpu_init_lpis(void)
9384+static int __init allocate_lpi_tables(void)
9385 {
9386- void __iomem *rbase = gic_data_rdist_rd_base();
9387- struct page *pend_page;
9388- u64 val, tmp;
9389+ int err, cpu;
9390
9391- /* If we didn't allocate the pending table yet, do it now */
9392- pend_page = gic_data_rdist()->pend_page;
9393- if (!pend_page) {
9394- phys_addr_t paddr;
9395+ err = its_alloc_lpi_prop_table();
9396+ if (err)
9397+ return err;
9398+
9399+ /*
9400+ * We allocate all the pending tables anyway, as we may have a
9401+ * mix of RDs that have had LPIs enabled, and some that
9402+ * don't. We'll free the unused ones as each CPU comes online.
9403+ */
9404+ for_each_possible_cpu(cpu) {
9405+ struct page *pend_page;
9406
9407 pend_page = its_allocate_pending_table(GFP_NOWAIT);
9408 if (!pend_page) {
9409- pr_err("Failed to allocate PENDBASE for CPU%d\n",
9410- smp_processor_id());
9411- return;
9412+ pr_err("Failed to allocate PENDBASE for CPU%d\n", cpu);
9413+ return -ENOMEM;
9414 }
9415
9416- paddr = page_to_phys(pend_page);
9417- pr_info("CPU%d: using LPI pending table @%pa\n",
9418- smp_processor_id(), &paddr);
9419- gic_data_rdist()->pend_page = pend_page;
9420+ gic_data_rdist_cpu(cpu)->pend_page = pend_page;
9421 }
9422
9423+ return 0;
9424+}
9425+
9426+static void its_cpu_init_lpis(void)
9427+{
9428+ void __iomem *rbase = gic_data_rdist_rd_base();
9429+ struct page *pend_page;
9430+ phys_addr_t paddr;
9431+ u64 val, tmp;
9432+
9433+ if (gic_data_rdist()->lpi_enabled)
9434+ return;
9435+
9436+ pend_page = gic_data_rdist()->pend_page;
9437+ paddr = page_to_phys(pend_page);
9438+
9439 /* Disable LPIs */
9440 val = readl_relaxed(rbase + GICR_CTLR);
9441 val &= ~GICR_CTLR_ENABLE_LPIS;
9442@@ -1843,6 +1861,10 @@ static void its_cpu_init_lpis(void)
9443
9444 /* Make sure the GIC has seen the above */
9445 dsb(sy);
9446+ gic_data_rdist()->lpi_enabled = true;
9447+ pr_info("GICv3: CPU%d: using LPI pending table @%pa\n",
9448+ smp_processor_id(),
9449+ &paddr);
9450 }
9451
9452 static void its_cpu_init_collection(void)
9453@@ -1850,7 +1872,7 @@ static void its_cpu_init_collection(void)
9454 struct its_node *its;
9455 int cpu;
9456
9457- spin_lock(&its_lock);
9458+ raw_spin_lock(&its_lock);
9459 cpu = smp_processor_id();
9460
9461 list_for_each_entry(its, &its_nodes, entry) {
9462@@ -1892,7 +1914,7 @@ static void its_cpu_init_collection(void)
9463 its_send_invall(its, &its->collections[cpu]);
9464 }
9465
9466- spin_unlock(&its_lock);
9467+ raw_spin_unlock(&its_lock);
9468 }
9469
9470 static struct its_device *its_find_device(struct its_node *its, u32 dev_id)
9471@@ -3041,9 +3063,9 @@ static int __init its_probe_one(struct resource *res,
9472 if (err)
9473 goto out_free_tables;
9474
9475- spin_lock(&its_lock);
9476+ raw_spin_lock(&its_lock);
9477 list_add(&its->entry, &its_nodes);
9478- spin_unlock(&its_lock);
9479+ raw_spin_unlock(&its_lock);
9480
9481 return 0;
9482
9483@@ -3278,7 +3300,8 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists,
9484 }
9485
9486 gic_rdists = rdists;
9487- err = its_alloc_lpi_tables();
9488+
9489+ err = allocate_lpi_tables();
9490 if (err)
9491 return err;
9492
9493diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
9494index 3f9ddb9fafa7..09da5b6b44a1 100644
9495--- a/drivers/leds/trigger/Kconfig
9496+++ b/drivers/leds/trigger/Kconfig
9497@@ -69,7 +69,7 @@ config LEDS_TRIGGER_BACKLIGHT
9498
9499 config LEDS_TRIGGER_CPU
9500 bool "LED CPU Trigger"
9501- depends on LEDS_TRIGGERS
9502+ depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
9503 help
9504 This allows LEDs to be controlled by active CPUs. This shows
9505 the active CPUs across an array of LEDs so you can see which
9506diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
9507index 4d200883c505..98b64ed5cb81 100644
9508--- a/drivers/md/bcache/Kconfig
9509+++ b/drivers/md/bcache/Kconfig
9510@@ -1,6 +1,7 @@
9511
9512 config BCACHE
9513 tristate "Block device as cache"
9514+ depends on !PREEMPT_RT_FULL
9515 ---help---
9516 Allows a block device to be used as cache for other devices; uses
9517 a btree for indexing and the layout is optimized for SSDs.
9518diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
9519index eadfcfd106ff..8824aeda85cf 100644
9520--- a/drivers/md/dm-rq.c
9521+++ b/drivers/md/dm-rq.c
9522@@ -671,7 +671,7 @@ static void dm_old_request_fn(struct request_queue *q)
9523 /* Establish tio->ti before queuing work (map_tio_request) */
9524 tio->ti = ti;
9525 kthread_queue_work(&md->kworker, &tio->work);
9526- BUG_ON(!irqs_disabled());
9527+ BUG_ON_NONRT(!irqs_disabled());
9528 }
9529 }
9530
9531diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
9532index dbf51b4c21b3..5cfccaf87687 100644
9533--- a/drivers/md/raid5.c
9534+++ b/drivers/md/raid5.c
9535@@ -410,7 +410,7 @@ void raid5_release_stripe(struct stripe_head *sh)
9536 md_wakeup_thread(conf->mddev->thread);
9537 return;
9538 slow_path:
9539- local_irq_save(flags);
9540+ local_irq_save_nort(flags);
9541 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
9542 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
9543 INIT_LIST_HEAD(&list);
9544@@ -419,7 +419,7 @@ void raid5_release_stripe(struct stripe_head *sh)
9545 spin_unlock(&conf->device_lock);
9546 release_inactive_stripe_list(conf, &list, hash);
9547 }
9548- local_irq_restore(flags);
9549+ local_irq_restore_nort(flags);
9550 }
9551
9552 static inline void remove_hash(struct stripe_head *sh)
9553@@ -2067,8 +2067,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9554 struct raid5_percpu *percpu;
9555 unsigned long cpu;
9556
9557- cpu = get_cpu();
9558+ cpu = get_cpu_light();
9559 percpu = per_cpu_ptr(conf->percpu, cpu);
9560+ spin_lock(&percpu->lock);
9561 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
9562 ops_run_biofill(sh);
9563 overlap_clear++;
9564@@ -2127,7 +2128,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9565 if (test_and_clear_bit(R5_Overlap, &dev->flags))
9566 wake_up(&sh->raid_conf->wait_for_overlap);
9567 }
9568- put_cpu();
9569+ spin_unlock(&percpu->lock);
9570+ put_cpu_light();
9571 }
9572
9573 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
9574@@ -6781,6 +6783,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
9575 __func__, cpu);
9576 return -ENOMEM;
9577 }
9578+ spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
9579 return 0;
9580 }
9581
9582@@ -6791,7 +6794,6 @@ static int raid5_alloc_percpu(struct r5conf *conf)
9583 conf->percpu = alloc_percpu(struct raid5_percpu);
9584 if (!conf->percpu)
9585 return -ENOMEM;
9586-
9587 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
9588 if (!err) {
9589 conf->scribble_disks = max(conf->raid_disks,
9590diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
9591index 2e6123825095..37a6021418a2 100644
9592--- a/drivers/md/raid5.h
9593+++ b/drivers/md/raid5.h
9594@@ -624,6 +624,7 @@ struct r5conf {
9595 int recovery_disabled;
9596 /* per cpu variables */
9597 struct raid5_percpu {
9598+ spinlock_t lock; /* Protection for -RT */
9599 struct page *spare_page; /* Used when checking P/Q in raid6 */
9600 struct flex_array *scribble; /* space for constructing buffer
9601 * lists and performing address
9602diff --git a/drivers/mfd/atmel-smc.c b/drivers/mfd/atmel-smc.c
9603index 7d77948567d7..0adbd2e796fe 100644
9604--- a/drivers/mfd/atmel-smc.c
9605+++ b/drivers/mfd/atmel-smc.c
9606@@ -12,6 +12,7 @@
9607 */
9608
9609 #include <linux/mfd/syscon/atmel-smc.h>
9610+#include <linux/string.h>
9611
9612 /**
9613 * atmel_smc_cs_conf_init - initialize a SMC CS conf
9614diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
9615index 8136dc7e863d..86e83b9629d7 100644
9616--- a/drivers/misc/Kconfig
9617+++ b/drivers/misc/Kconfig
9618@@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
9619 config ATMEL_TCLIB
9620 bool "Atmel AT32/AT91 Timer/Counter Library"
9621 depends on (AVR32 || ARCH_AT91)
9622+ default y if PREEMPT_RT_FULL
9623 help
9624 Select this if you want a library to allocate the Timer/Counter
9625 blocks found on many Atmel processors. This facilitates using
9626@@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
9627 are combined to make a single 32-bit timer.
9628
9629 When GENERIC_CLOCKEVENTS is defined, the third timer channel
9630- may be used as a clock event device supporting oneshot mode
9631- (delays of up to two seconds) based on the 32 KiHz clock.
9632+ may be used as a clock event device supporting oneshot mode.
9633
9634 config ATMEL_TCB_CLKSRC_BLOCK
9635 int
9636@@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
9637 TC can be used for other purposes, such as PWM generation and
9638 interval timing.
9639
9640+config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
9641+ bool "TC Block use 32 KiHz clock"
9642+ depends on ATMEL_TCB_CLKSRC
9643+ default y if !PREEMPT_RT_FULL
9644+ help
9645+ Select this to use the 32 KiHz base clock rate as the TC block clock
9646+ source for clock events.
9647+
9648+
9649 config DUMMY_IRQ
9650 tristate "Dummy IRQ handler"
9651 default n
9652diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
9653index f1f54a818489..ce102378df02 100644
9654--- a/drivers/mmc/host/mmci.c
9655+++ b/drivers/mmc/host/mmci.c
9656@@ -1200,15 +1200,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
9657 struct sg_mapping_iter *sg_miter = &host->sg_miter;
9658 struct variant_data *variant = host->variant;
9659 void __iomem *base = host->base;
9660- unsigned long flags;
9661 u32 status;
9662
9663 status = readl(base + MMCISTATUS);
9664
9665 dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
9666
9667- local_irq_save(flags);
9668-
9669 do {
9670 unsigned int remain, len;
9671 char *buffer;
9672@@ -1248,8 +1245,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
9673
9674 sg_miter_stop(sg_miter);
9675
9676- local_irq_restore(flags);
9677-
9678 /*
9679 * If we have less than the fifo 'half-full' threshold to transfer,
9680 * trigger a PIO interrupt as soon as any data is available.
9681diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
9682index 402d9090ad29..9bc02563b853 100644
9683--- a/drivers/net/ethernet/3com/3c59x.c
9684+++ b/drivers/net/ethernet/3com/3c59x.c
9685@@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
9686 {
9687 struct vortex_private *vp = netdev_priv(dev);
9688 unsigned long flags;
9689- local_irq_save(flags);
9690+ local_irq_save_nort(flags);
9691 (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
9692- local_irq_restore(flags);
9693+ local_irq_restore_nort(flags);
9694 }
9695 #endif
9696
9697@@ -1908,12 +1908,12 @@ static void vortex_tx_timeout(struct net_device *dev)
9698 * Block interrupts because vortex_interrupt does a bare spin_lock()
9699 */
9700 unsigned long flags;
9701- local_irq_save(flags);
9702+ local_irq_save_nort(flags);
9703 if (vp->full_bus_master_tx)
9704 boomerang_interrupt(dev->irq, dev);
9705 else
9706 vortex_interrupt(dev->irq, dev);
9707- local_irq_restore(flags);
9708+ local_irq_restore_nort(flags);
9709 }
9710 }
9711
9712diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
9713index 00e6f1d155a6..9c69ab2c5b07 100644
9714--- a/drivers/net/ethernet/marvell/mvpp2.c
9715+++ b/drivers/net/ethernet/marvell/mvpp2.c
9716@@ -831,9 +831,8 @@ struct mvpp2_pcpu_stats {
9717 /* Per-CPU port control */
9718 struct mvpp2_port_pcpu {
9719 struct hrtimer tx_done_timer;
9720+ struct net_device *dev;
9721 bool timer_scheduled;
9722- /* Tasklet for egress finalization */
9723- struct tasklet_struct tx_done_tasklet;
9724 };
9725
9726 struct mvpp2_queue_vector {
9727@@ -5955,46 +5954,34 @@ static void mvpp2_link_event(struct net_device *dev)
9728 }
9729 }
9730
9731-static void mvpp2_timer_set(struct mvpp2_port_pcpu *port_pcpu)
9732-{
9733- ktime_t interval;
9734-
9735- if (!port_pcpu->timer_scheduled) {
9736- port_pcpu->timer_scheduled = true;
9737- interval = MVPP2_TXDONE_HRTIMER_PERIOD_NS;
9738- hrtimer_start(&port_pcpu->tx_done_timer, interval,
9739- HRTIMER_MODE_REL_PINNED);
9740- }
9741-}
9742-
9743-static void mvpp2_tx_proc_cb(unsigned long data)
9744+static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9745 {
9746- struct net_device *dev = (struct net_device *)data;
9747- struct mvpp2_port *port = netdev_priv(dev);
9748- struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9749+ struct net_device *dev;
9750+ struct mvpp2_port *port;
9751+ struct mvpp2_port_pcpu *port_pcpu;
9752 unsigned int tx_todo, cause;
9753
9754+ port_pcpu = container_of(timer, struct mvpp2_port_pcpu, tx_done_timer);
9755+ dev = port_pcpu->dev;
9756+
9757 if (!netif_running(dev))
9758- return;
9759+ return HRTIMER_NORESTART;
9760+
9761 port_pcpu->timer_scheduled = false;
9762+ port = netdev_priv(dev);
9763
9764 /* Process all the Tx queues */
9765 cause = (1 << port->ntxqs) - 1;
9766 tx_todo = mvpp2_tx_done(port, cause, smp_processor_id());
9767
9768 /* Set the timer in case not all the packets were processed */
9769- if (tx_todo)
9770- mvpp2_timer_set(port_pcpu);
9771-}
9772-
9773-static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9774-{
9775- struct mvpp2_port_pcpu *port_pcpu = container_of(timer,
9776- struct mvpp2_port_pcpu,
9777- tx_done_timer);
9778-
9779- tasklet_schedule(&port_pcpu->tx_done_tasklet);
9780+ if (tx_todo && !port_pcpu->timer_scheduled) {
9781+ port_pcpu->timer_scheduled = true;
9782+ hrtimer_forward_now(&port_pcpu->tx_done_timer,
9783+ MVPP2_TXDONE_HRTIMER_PERIOD_NS);
9784
9785+ return HRTIMER_RESTART;
9786+ }
9787 return HRTIMER_NORESTART;
9788 }
9789
9790@@ -6484,7 +6471,12 @@ static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev)
9791 txq_pcpu->count > 0) {
9792 struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9793
9794- mvpp2_timer_set(port_pcpu);
9795+ if (!port_pcpu->timer_scheduled) {
9796+ port_pcpu->timer_scheduled = true;
9797+ hrtimer_start(&port_pcpu->tx_done_timer,
9798+ MVPP2_TXDONE_HRTIMER_PERIOD_NS,
9799+ HRTIMER_MODE_REL_PINNED_SOFT);
9800+ }
9801 }
9802
9803 return NETDEV_TX_OK;
9804@@ -6875,7 +6867,6 @@ static int mvpp2_stop(struct net_device *dev)
9805
9806 hrtimer_cancel(&port_pcpu->tx_done_timer);
9807 port_pcpu->timer_scheduled = false;
9808- tasklet_kill(&port_pcpu->tx_done_tasklet);
9809 }
9810 }
9811 mvpp2_cleanup_rxqs(port);
9812@@ -7648,13 +7639,10 @@ static int mvpp2_port_probe(struct platform_device *pdev,
9813 port_pcpu = per_cpu_ptr(port->pcpu, cpu);
9814
9815 hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
9816- HRTIMER_MODE_REL_PINNED);
9817+ HRTIMER_MODE_REL_PINNED_SOFT);
9818 port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
9819 port_pcpu->timer_scheduled = false;
9820-
9821- tasklet_init(&port_pcpu->tx_done_tasklet,
9822- mvpp2_tx_proc_cb,
9823- (unsigned long)dev);
9824+ port_pcpu->dev = dev;
9825 }
9826 }
9827
9828diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9829index 56f6e3b71f48..a50350d01a80 100644
9830--- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9831+++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9832@@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
9833 while (!ctx->done.done && msecs--)
9834 udelay(1000);
9835 } else {
9836- wait_event_interruptible(ctx->done.wait,
9837+ swait_event_interruptible(ctx->done.wait,
9838 ctx->done.done);
9839 }
9840 break;
9841diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
9842index aafa7aa18fbd..388f6d71ba71 100644
9843--- a/drivers/net/wireless/mac80211_hwsim.c
9844+++ b/drivers/net/wireless/mac80211_hwsim.c
9845@@ -537,7 +537,7 @@ struct mac80211_hwsim_data {
9846 unsigned int rx_filter;
9847 bool started, idle, scanning;
9848 struct mutex mutex;
9849- struct tasklet_hrtimer beacon_timer;
9850+ struct hrtimer beacon_timer;
9851 enum ps_mode {
9852 PS_DISABLED, PS_ENABLED, PS_AUTO_POLL, PS_MANUAL_POLL
9853 } ps;
9854@@ -1423,7 +1423,7 @@ static void mac80211_hwsim_stop(struct ieee80211_hw *hw)
9855 {
9856 struct mac80211_hwsim_data *data = hw->priv;
9857 data->started = false;
9858- tasklet_hrtimer_cancel(&data->beacon_timer);
9859+ hrtimer_cancel(&data->beacon_timer);
9860 wiphy_debug(hw->wiphy, "%s\n", __func__);
9861 }
9862
9863@@ -1546,14 +1546,12 @@ static enum hrtimer_restart
9864 mac80211_hwsim_beacon(struct hrtimer *timer)
9865 {
9866 struct mac80211_hwsim_data *data =
9867- container_of(timer, struct mac80211_hwsim_data,
9868- beacon_timer.timer);
9869+ container_of(timer, struct mac80211_hwsim_data, beacon_timer);
9870 struct ieee80211_hw *hw = data->hw;
9871 u64 bcn_int = data->beacon_int;
9872- ktime_t next_bcn;
9873
9874 if (!data->started)
9875- goto out;
9876+ return HRTIMER_NORESTART;
9877
9878 ieee80211_iterate_active_interfaces_atomic(
9879 hw, IEEE80211_IFACE_ITER_NORMAL,
9880@@ -1565,11 +1563,9 @@ mac80211_hwsim_beacon(struct hrtimer *timer)
9881 data->bcn_delta = 0;
9882 }
9883
9884- next_bcn = ktime_add(hrtimer_get_expires(timer),
9885- ns_to_ktime(bcn_int * 1000));
9886- tasklet_hrtimer_start(&data->beacon_timer, next_bcn, HRTIMER_MODE_ABS);
9887-out:
9888- return HRTIMER_NORESTART;
9889+ hrtimer_forward(&data->beacon_timer, hrtimer_get_expires(timer),
9890+ ns_to_ktime(bcn_int * NSEC_PER_USEC));
9891+ return HRTIMER_RESTART;
9892 }
9893
9894 static const char * const hwsim_chanwidths[] = {
9895@@ -1643,15 +1639,15 @@ static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed)
9896 mutex_unlock(&data->mutex);
9897
9898 if (!data->started || !data->beacon_int)
9899- tasklet_hrtimer_cancel(&data->beacon_timer);
9900- else if (!hrtimer_is_queued(&data->beacon_timer.timer)) {
9901+ hrtimer_cancel(&data->beacon_timer);
9902+ else if (!hrtimer_is_queued(&data->beacon_timer)) {
9903 u64 tsf = mac80211_hwsim_get_tsf(hw, NULL);
9904 u32 bcn_int = data->beacon_int;
9905 u64 until_tbtt = bcn_int - do_div(tsf, bcn_int);
9906
9907- tasklet_hrtimer_start(&data->beacon_timer,
9908- ns_to_ktime(until_tbtt * 1000),
9909- HRTIMER_MODE_REL);
9910+ hrtimer_start(&data->beacon_timer,
9911+ ns_to_ktime(until_tbtt * 1000),
9912+ HRTIMER_MODE_REL_SOFT);
9913 }
9914
9915 return 0;
9916@@ -1714,7 +1710,7 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
9917 info->enable_beacon, info->beacon_int);
9918 vp->bcn_en = info->enable_beacon;
9919 if (data->started &&
9920- !hrtimer_is_queued(&data->beacon_timer.timer) &&
9921+ !hrtimer_is_queued(&data->beacon_timer) &&
9922 info->enable_beacon) {
9923 u64 tsf, until_tbtt;
9924 u32 bcn_int;
9925@@ -1722,9 +1718,9 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
9926 tsf = mac80211_hwsim_get_tsf(hw, vif);
9927 bcn_int = data->beacon_int;
9928 until_tbtt = bcn_int - do_div(tsf, bcn_int);
9929- tasklet_hrtimer_start(&data->beacon_timer,
9930- ns_to_ktime(until_tbtt * 1000),
9931- HRTIMER_MODE_REL);
9932+ hrtimer_start(&data->beacon_timer,
9933+ ns_to_ktime(until_tbtt * 1000),
9934+ HRTIMER_MODE_REL_SOFT);
9935 } else if (!info->enable_beacon) {
9936 unsigned int count = 0;
9937 ieee80211_iterate_active_interfaces_atomic(
9938@@ -1733,7 +1729,7 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
9939 wiphy_debug(hw->wiphy, " beaconing vifs remaining: %u",
9940 count);
9941 if (count == 0) {
9942- tasklet_hrtimer_cancel(&data->beacon_timer);
9943+ hrtimer_cancel(&data->beacon_timer);
9944 data->beacon_int = 0;
9945 }
9946 }
9947@@ -2722,9 +2718,9 @@ static int mac80211_hwsim_new_radio(struct genl_info *info,
9948 data->debugfs,
9949 data, &hwsim_simulate_radar);
9950
9951- tasklet_hrtimer_init(&data->beacon_timer,
9952- mac80211_hwsim_beacon,
9953- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
9954+ hrtimer_init(&data->beacon_timer, CLOCK_MONOTONIC,
9955+ HRTIMER_MODE_ABS_SOFT);
9956+ data->beacon_timer.function = mac80211_hwsim_beacon;
9957
9958 spin_lock_bh(&hwsim_radio_lock);
9959 list_add_tail(&data->list, &hwsim_radios);
9960diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
9961index 620f5b995a12..7fd1548a2905 100644
9962--- a/drivers/pci/switch/switchtec.c
9963+++ b/drivers/pci/switch/switchtec.c
9964@@ -308,10 +308,11 @@ struct switchtec_user {
9965
9966 enum mrpc_state state;
9967
9968- struct completion comp;
9969+ wait_queue_head_t cmd_comp;
9970 struct kref kref;
9971 struct list_head list;
9972
9973+ bool cmd_done;
9974 u32 cmd;
9975 u32 status;
9976 u32 return_code;
9977@@ -333,7 +334,7 @@ static struct switchtec_user *stuser_create(struct switchtec_dev *stdev)
9978 stuser->stdev = stdev;
9979 kref_init(&stuser->kref);
9980 INIT_LIST_HEAD(&stuser->list);
9981- init_completion(&stuser->comp);
9982+ init_waitqueue_head(&stuser->cmd_comp);
9983 stuser->event_cnt = atomic_read(&stdev->event_cnt);
9984
9985 dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
9986@@ -416,7 +417,7 @@ static int mrpc_queue_cmd(struct switchtec_user *stuser)
9987 kref_get(&stuser->kref);
9988 stuser->read_len = sizeof(stuser->data);
9989 stuser_set_state(stuser, MRPC_QUEUED);
9990- init_completion(&stuser->comp);
9991+ stuser->cmd_done = false;
9992 list_add_tail(&stuser->list, &stdev->mrpc_queue);
9993
9994 mrpc_cmd_submit(stdev);
9995@@ -453,7 +454,8 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev)
9996 stuser->read_len);
9997
9998 out:
9999- complete_all(&stuser->comp);
10000+ stuser->cmd_done = true;
10001+ wake_up_interruptible(&stuser->cmd_comp);
10002 list_del_init(&stuser->list);
10003 stuser_put(stuser);
10004 stdev->mrpc_busy = 0;
10005@@ -723,10 +725,11 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
10006 mutex_unlock(&stdev->mrpc_mutex);
10007
10008 if (filp->f_flags & O_NONBLOCK) {
10009- if (!try_wait_for_completion(&stuser->comp))
10010+ if (!READ_ONCE(stuser->cmd_done))
10011 return -EAGAIN;
10012 } else {
10013- rc = wait_for_completion_interruptible(&stuser->comp);
10014+ rc = wait_event_interruptible(stuser->cmd_comp,
10015+ stuser->cmd_done);
10016 if (rc < 0)
10017 return rc;
10018 }
10019@@ -774,7 +777,7 @@ static unsigned int switchtec_dev_poll(struct file *filp, poll_table *wait)
10020 struct switchtec_dev *stdev = stuser->stdev;
10021 int ret = 0;
10022
10023- poll_wait(filp, &stuser->comp.wait, wait);
10024+ poll_wait(filp, &stuser->cmd_comp, wait);
10025 poll_wait(filp, &stdev->event_wq, wait);
10026
10027 if (lock_mutex_and_test_alive(stdev))
10028@@ -782,7 +785,7 @@ static unsigned int switchtec_dev_poll(struct file *filp, poll_table *wait)
10029
10030 mutex_unlock(&stdev->mrpc_mutex);
10031
10032- if (try_wait_for_completion(&stuser->comp))
10033+ if (READ_ONCE(stuser->cmd_done))
10034 ret |= POLLIN | POLLRDNORM;
10035
10036 if (stuser->event_cnt != atomic_read(&stdev->event_cnt))
10037@@ -1259,7 +1262,8 @@ static void stdev_kill(struct switchtec_dev *stdev)
10038
10039 /* Wake up and kill any users waiting on an MRPC request */
10040 list_for_each_entry_safe(stuser, tmpuser, &stdev->mrpc_queue, list) {
10041- complete_all(&stuser->comp);
10042+ stuser->cmd_done = true;
10043+ wake_up_interruptible(&stuser->cmd_comp);
10044 list_del_init(&stuser->list);
10045 stuser_put(stuser);
10046 }
10047diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
10048index 85f9a3eba387..08ea05ddcd82 100644
10049--- a/drivers/scsi/fcoe/fcoe.c
10050+++ b/drivers/scsi/fcoe/fcoe.c
10051@@ -1464,11 +1464,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev,
10052 static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
10053 {
10054 struct fcoe_percpu_s *fps;
10055- int rc;
10056+ int rc, cpu = get_cpu_light();
10057
10058- fps = &get_cpu_var(fcoe_percpu);
10059+ fps = &per_cpu(fcoe_percpu, cpu);
10060 rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
10061- put_cpu_var(fcoe_percpu);
10062+ put_cpu_light();
10063
10064 return rc;
10065 }
10066@@ -1655,11 +1655,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
10067 return 0;
10068 }
10069
10070- stats = per_cpu_ptr(lport->stats, get_cpu());
10071+ stats = per_cpu_ptr(lport->stats, get_cpu_light());
10072 stats->InvalidCRCCount++;
10073 if (stats->InvalidCRCCount < 5)
10074 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
10075- put_cpu();
10076+ put_cpu_light();
10077 return -EINVAL;
10078 }
10079
10080@@ -1702,7 +1702,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10081 */
10082 hp = (struct fcoe_hdr *) skb_network_header(skb);
10083
10084- stats = per_cpu_ptr(lport->stats, get_cpu());
10085+ stats = per_cpu_ptr(lport->stats, get_cpu_light());
10086 if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
10087 if (stats->ErrorFrames < 5)
10088 printk(KERN_WARNING "fcoe: FCoE version "
10089@@ -1734,13 +1734,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10090 goto drop;
10091
10092 if (!fcoe_filter_frames(lport, fp)) {
10093- put_cpu();
10094+ put_cpu_light();
10095 fc_exch_recv(lport, fp);
10096 return;
10097 }
10098 drop:
10099 stats->ErrorFrames++;
10100- put_cpu();
10101+ put_cpu_light();
10102 kfree_skb(skb);
10103 }
10104
10105diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
10106index 03019e07abb9..9ec11316bfe6 100644
10107--- a/drivers/scsi/fcoe/fcoe_ctlr.c
10108+++ b/drivers/scsi/fcoe/fcoe_ctlr.c
10109@@ -835,7 +835,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10110
10111 INIT_LIST_HEAD(&del_list);
10112
10113- stats = per_cpu_ptr(fip->lp->stats, get_cpu());
10114+ stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
10115
10116 list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
10117 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
10118@@ -871,7 +871,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10119 sel_time = fcf->time;
10120 }
10121 }
10122- put_cpu();
10123+ put_cpu_light();
10124
10125 list_for_each_entry_safe(fcf, next, &del_list, list) {
10126 /* Removes fcf from current list */
10127diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
10128index 42bcf7f3a0f9..2ce045d6860c 100644
10129--- a/drivers/scsi/libfc/fc_exch.c
10130+++ b/drivers/scsi/libfc/fc_exch.c
10131@@ -833,10 +833,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
10132 }
10133 memset(ep, 0, sizeof(*ep));
10134
10135- cpu = get_cpu();
10136+ cpu = get_cpu_light();
10137 pool = per_cpu_ptr(mp->pool, cpu);
10138 spin_lock_bh(&pool->lock);
10139- put_cpu();
10140+ put_cpu_light();
10141
10142 /* peek cache of free slot */
10143 if (pool->left != FC_XID_UNKNOWN) {
10144diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
10145index 70be4425ae0b..a23ef685deac 100644
10146--- a/drivers/scsi/libsas/sas_ata.c
10147+++ b/drivers/scsi/libsas/sas_ata.c
10148@@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10149 /* TODO: audit callers to ensure they are ready for qc_issue to
10150 * unconditionally re-enable interrupts
10151 */
10152- local_irq_save(flags);
10153+ local_irq_save_nort(flags);
10154 spin_unlock(ap->lock);
10155
10156 /* If the device fell off, no sense in issuing commands */
10157@@ -252,7 +252,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10158
10159 out:
10160 spin_lock(ap->lock);
10161- local_irq_restore(flags);
10162+ local_irq_restore_nort(flags);
10163 return ret;
10164 }
10165
10166diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
10167index 3f5a0f0f8b62..c75783143dc1 100644
10168--- a/drivers/scsi/qla2xxx/qla_inline.h
10169+++ b/drivers/scsi/qla2xxx/qla_inline.h
10170@@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
10171 {
10172 unsigned long flags;
10173 struct qla_hw_data *ha = rsp->hw;
10174- local_irq_save(flags);
10175+ local_irq_save_nort(flags);
10176 if (IS_P3P_TYPE(ha))
10177 qla82xx_poll(0, rsp);
10178 else
10179 ha->isp_ops->intr_handler(0, rsp);
10180- local_irq_restore(flags);
10181+ local_irq_restore_nort(flags);
10182 }
10183
10184 static inline uint8_t *
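
sas_ata and qla2xxx above use the local_irq_save_nort()/local_irq_restore_nort()
variants provided elsewhere in this patch set: plain local_irq_save()/restore()
on a non-RT kernel, while on PREEMPT_RT_FULL they leave hard interrupts enabled
so the section stays preemptible. A minimal sketch, with a hypothetical handler
pointer, not part of the patch:

#include <linux/interrupt.h>

static void demo_poll_hw(void (*handler)(void))
{
        unsigned long flags;

        local_irq_save_nort(flags);     /* hard irqs off only on non-RT builds */
        handler();                      /* run the polled interrupt handler */
        local_irq_restore_nort(flags);
}
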
10185diff --git a/drivers/staging/greybus/audio_manager.c b/drivers/staging/greybus/audio_manager.c
10186index aa6508b44fab..045696ce85c7 100644
10187--- a/drivers/staging/greybus/audio_manager.c
10188+++ b/drivers/staging/greybus/audio_manager.c
10189@@ -10,7 +10,7 @@
10190 #include <linux/sysfs.h>
10191 #include <linux/module.h>
10192 #include <linux/init.h>
10193-#include <linux/rwlock.h>
10194+#include <linux/spinlock.h>
10195 #include <linux/idr.h>
10196
10197 #include "audio_manager.h"
10198diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c
10199index 9c7bc1ca341a..3d35dad1de2c 100644
10200--- a/drivers/target/target_core_tmr.c
10201+++ b/drivers/target/target_core_tmr.c
10202@@ -114,8 +114,6 @@ static bool __target_check_io_state(struct se_cmd *se_cmd,
10203 {
10204 struct se_session *sess = se_cmd->se_sess;
10205
10206- assert_spin_locked(&sess->sess_cmd_lock);
10207- WARN_ON_ONCE(!irqs_disabled());
10208 /*
10209 * If command already reached CMD_T_COMPLETE state within
10210 * target_complete_cmd() or CMD_T_FABRIC_STOP due to shutdown,
10211diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
10212index 0d0be7d8b9d6..f652e58e2988 100644
10213--- a/drivers/target/target_core_transport.c
10214+++ b/drivers/target/target_core_transport.c
10215@@ -2967,9 +2967,6 @@ __transport_wait_for_tasks(struct se_cmd *cmd, bool fabric_stop,
10216 __acquires(&cmd->t_state_lock)
10217 {
10218
10219- assert_spin_locked(&cmd->t_state_lock);
10220- WARN_ON_ONCE(!irqs_disabled());
10221-
10222 if (fabric_stop)
10223 cmd->transport_state |= CMD_T_FABRIC_STOP;
10224
10225@@ -3239,9 +3236,6 @@ static int __transport_check_aborted_status(struct se_cmd *cmd, int send_status)
10226 {
10227 int ret;
10228
10229- assert_spin_locked(&cmd->t_state_lock);
10230- WARN_ON_ONCE(!irqs_disabled());
10231-
10232 if (!(cmd->transport_state & CMD_T_ABORTED))
10233 return 0;
10234 /*
10235diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
10236index d93eee2f101b..0287333b1f3c 100644
10237--- a/drivers/thermal/x86_pkg_temp_thermal.c
10238+++ b/drivers/thermal/x86_pkg_temp_thermal.c
10239@@ -29,6 +29,7 @@
10240 #include <linux/pm.h>
10241 #include <linux/thermal.h>
10242 #include <linux/debugfs.h>
10243+#include <linux/swork.h>
10244 #include <asm/cpu_device_id.h>
10245 #include <asm/mce.h>
10246
10247@@ -329,7 +330,7 @@ static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
10248 schedule_delayed_work_on(cpu, work, ms);
10249 }
10250
10251-static int pkg_thermal_notify(u64 msr_val)
10252+static void pkg_thermal_notify_work(struct swork_event *event)
10253 {
10254 int cpu = smp_processor_id();
10255 struct pkg_device *pkgdev;
10256@@ -348,9 +349,47 @@ static int pkg_thermal_notify(u64 msr_val)
10257 }
10258
10259 spin_unlock_irqrestore(&pkg_temp_lock, flags);
10260+}
10261+
10262+#ifdef CONFIG_PREEMPT_RT_FULL
10263+static struct swork_event notify_work;
10264+
10265+static int pkg_thermal_notify_work_init(void)
10266+{
10267+ int err;
10268+
10269+ err = swork_get();
10270+ if (err)
10271+ return err;
10272+
10273+ INIT_SWORK(&notify_work, pkg_thermal_notify_work);
10274 return 0;
10275 }
10276
10277+static void pkg_thermal_notify_work_cleanup(void)
10278+{
10279+ swork_put();
10280+}
10281+
10282+static int pkg_thermal_notify(u64 msr_val)
10283+{
10284+ swork_queue(&notify_work);
10285+ return 0;
10286+}
10287+
10288+#else /* !CONFIG_PREEMPT_RT_FULL */
10289+
10290+static int pkg_thermal_notify_work_init(void) { return 0; }
10291+
10292+static void pkg_thermal_notify_work_cleanup(void) { }
10293+
10294+static int pkg_thermal_notify(u64 msr_val)
10295+{
10296+ pkg_thermal_notify_work(NULL);
10297+ return 0;
10298+}
10299+#endif /* CONFIG_PREEMPT_RT_FULL */
10300+
10301 static int pkg_temp_thermal_device_add(unsigned int cpu)
10302 {
10303 int pkgid = topology_logical_package_id(cpu);
10304@@ -515,10 +554,15 @@ static int __init pkg_temp_thermal_init(void)
10305 if (!x86_match_cpu(pkg_temp_thermal_ids))
10306 return -ENODEV;
10307
10308+ if (pkg_thermal_notify_work_init())
10309+ return -ENODEV;
10310+
10311 max_packages = topology_max_packages();
10312 packages = kzalloc(max_packages * sizeof(struct pkg_device *), GFP_KERNEL);
10313- if (!packages)
10314- return -ENOMEM;
10315+ if (!packages) {
10316+ ret = -ENOMEM;
10317+ goto err;
10318+ }
10319
10320 ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
10321 pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
10322@@ -536,6 +580,7 @@ static int __init pkg_temp_thermal_init(void)
10323 return 0;
10324
10325 err:
10326+ pkg_thermal_notify_work_cleanup();
10327 kfree(packages);
10328 return ret;
10329 }
10330@@ -549,6 +594,7 @@ static void __exit pkg_temp_thermal_exit(void)
10331 cpuhp_remove_state(pkg_thermal_hp_state);
10332 debugfs_remove_recursive(debugfs);
10333 kfree(packages);
10334+ pkg_thermal_notify_work_cleanup();
10335 }
10336 module_exit(pkg_temp_thermal_exit)
10337
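
The x86_pkg_temp_thermal change above defers the MSR thermal notification from
interrupt context to the -rt "simple work" (swork) infrastructure, so the
spinlock-protected handler runs in task context; the fs/aio.c hunk further down
uses the same API for free_ioctx_users(). A minimal sketch of the swork calls
as they appear here, with hypothetical demo_* names, not part of the patch:

#include <linux/swork.h>

static struct swork_event demo_event;

static void demo_work(struct swork_event *ev)
{
        /* runs from the swork kthread: task context, sleeping locks allowed */
}

static int __init demo_init(void)
{
        int err = swork_get();          /* bring up the swork worker */

        if (err)
                return err;
        INIT_SWORK(&demo_event, demo_work);
        return 0;
}

static void demo_notify(void)           /* may be called from atomic context */
{
        swork_queue(&demo_event);
}

static void __exit demo_exit(void)
{
        swork_put();                    /* drop the worker reference */
}
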
10338diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
10339index d29b512a7d9f..bc8cbb995b29 100644
10340--- a/drivers/tty/serial/8250/8250_core.c
10341+++ b/drivers/tty/serial/8250/8250_core.c
10342@@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
10343
10344 static unsigned int skip_txen_test; /* force skip of txen test at init time */
10345
10346-#define PASS_LIMIT 512
10347+/*
10348+ * On -rt we can have more delays, and legitimately
10349+ * so - don't drop work spuriously and spam the
10350+ * syslog:
10351+ */
10352+#ifdef CONFIG_PREEMPT_RT_FULL
10353+# define PASS_LIMIT 1000000
10354+#else
10355+# define PASS_LIMIT 512
10356+#endif
10357
10358 #include <asm/serial.h>
10359 /*
10360diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
10361index ecf3d631bc09..6e029f34f37f 100644
10362--- a/drivers/tty/serial/8250/8250_port.c
10363+++ b/drivers/tty/serial/8250/8250_port.c
10364@@ -35,6 +35,7 @@
10365 #include <linux/nmi.h>
10366 #include <linux/mutex.h>
10367 #include <linux/slab.h>
10368+#include <linux/kdb.h>
10369 #include <linux/uaccess.h>
10370 #include <linux/pm_runtime.h>
10371 #include <linux/ktime.h>
10372@@ -3224,9 +3225,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
10373
10374 serial8250_rpm_get(up);
10375
10376- if (port->sysrq)
10377+ if (port->sysrq || oops_in_progress)
10378 locked = 0;
10379- else if (oops_in_progress)
10380+ else if (in_kdb_printk())
10381 locked = spin_trylock_irqsave(&port->lock, flags);
10382 else
10383 spin_lock_irqsave(&port->lock, flags);
10384diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
10385index c9f701aca677..81d6b15fb80a 100644
10386--- a/drivers/tty/serial/amba-pl011.c
10387+++ b/drivers/tty/serial/amba-pl011.c
10388@@ -2236,13 +2236,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
10389
10390 clk_enable(uap->clk);
10391
10392- local_irq_save(flags);
10393+ /*
10394+ * local_irq_save(flags);
10395+ *
10396+ * This local_irq_save() is nonsense. If we come in via sysrq
10397+ * handling, interrupts are already disabled. Aside from
10398+ * that, the port.sysrq check is racy on SMP regardless.
10399+ */
10400 if (uap->port.sysrq)
10401 locked = 0;
10402 else if (oops_in_progress)
10403- locked = spin_trylock(&uap->port.lock);
10404+ locked = spin_trylock_irqsave(&uap->port.lock, flags);
10405 else
10406- spin_lock(&uap->port.lock);
10407+ spin_lock_irqsave(&uap->port.lock, flags);
10408
10409 /*
10410 * First save the CR then disable the interrupts
10411@@ -2268,8 +2274,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
10412 pl011_write(old_cr, uap, REG_CR);
10413
10414 if (locked)
10415- spin_unlock(&uap->port.lock);
10416- local_irq_restore(flags);
10417+ spin_unlock_irqrestore(&uap->port.lock, flags);
10418
10419 clk_disable(uap->clk);
10420 }
10421diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
10422index 26a22b100df1..69117e355bcd 100644
10423--- a/drivers/tty/serial/omap-serial.c
10424+++ b/drivers/tty/serial/omap-serial.c
10425@@ -1311,13 +1311,10 @@ serial_omap_console_write(struct console *co, const char *s,
10426
10427 pm_runtime_get_sync(up->dev);
10428
10429- local_irq_save(flags);
10430- if (up->port.sysrq)
10431- locked = 0;
10432- else if (oops_in_progress)
10433- locked = spin_trylock(&up->port.lock);
10434+ if (up->port.sysrq || oops_in_progress)
10435+ locked = spin_trylock_irqsave(&up->port.lock, flags);
10436 else
10437- spin_lock(&up->port.lock);
10438+ spin_lock_irqsave(&up->port.lock, flags);
10439
10440 /*
10441 * First save the IER then disable the interrupts
10442@@ -1346,8 +1343,7 @@ serial_omap_console_write(struct console *co, const char *s,
10443 pm_runtime_mark_last_busy(up->dev);
10444 pm_runtime_put_autosuspend(up->dev);
10445 if (locked)
10446- spin_unlock(&up->port.lock);
10447- local_irq_restore(flags);
10448+ spin_unlock_irqrestore(&up->port.lock, flags);
10449 }
10450
10451 static int __init
10452diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
10453index d0b2e0ed9bab..91f4f2bd55b0 100644
10454--- a/drivers/usb/core/hcd.c
10455+++ b/drivers/usb/core/hcd.c
10456@@ -1775,9 +1775,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
10457 * and no one may trigger the above deadlock situation when
10458 * running complete() in tasklet.
10459 */
10460- local_irq_save(flags);
10461+ local_irq_save_nort(flags);
10462 urb->complete(urb);
10463- local_irq_restore(flags);
10464+ local_irq_restore_nort(flags);
10465
10466 usb_anchor_resume_wakeups(anchor);
10467 atomic_dec(&urb->use_count);
10468diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
10469index 17467545391b..42ec6f2db6a9 100644
10470--- a/drivers/usb/gadget/function/f_fs.c
10471+++ b/drivers/usb/gadget/function/f_fs.c
10472@@ -1623,7 +1623,7 @@ static void ffs_data_put(struct ffs_data *ffs)
10473 pr_info("%s(): freeing\n", __func__);
10474 ffs_data_clear(ffs);
10475 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
10476- waitqueue_active(&ffs->ep0req_completion.wait) ||
10477+ swait_active(&ffs->ep0req_completion.wait) ||
10478 waitqueue_active(&ffs->wait));
10479 destroy_workqueue(ffs->io_completion_wq);
10480 kfree(ffs->dev_name);
10481diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c
10482index 45b334ceaf2e..5f24e6d3b6eb 100644
10483--- a/drivers/usb/gadget/function/f_ncm.c
10484+++ b/drivers/usb/gadget/function/f_ncm.c
10485@@ -77,9 +77,7 @@ struct f_ncm {
10486 struct sk_buff *skb_tx_ndp;
10487 u16 ndp_dgram_count;
10488 bool timer_force_tx;
10489- struct tasklet_struct tx_tasklet;
10490 struct hrtimer task_timer;
10491-
10492 bool timer_stopping;
10493 };
10494
10495@@ -1108,7 +1106,7 @@ static struct sk_buff *ncm_wrap_ntb(struct gether *port,
10496
10497 /* Delay the timer. */
10498 hrtimer_start(&ncm->task_timer, TX_TIMEOUT_NSECS,
10499- HRTIMER_MODE_REL);
10500+ HRTIMER_MODE_REL_SOFT);
10501
10502 /* Add the datagram position entries */
10503 ntb_ndp = skb_put_zero(ncm->skb_tx_ndp, dgram_idx_len);
10504@@ -1152,17 +1150,15 @@ static struct sk_buff *ncm_wrap_ntb(struct gether *port,
10505 }
10506
10507 /*
10508- * This transmits the NTB if there are frames waiting.
10509+ * The transmit should only be run if no skb data has been sent
10510+ * for a certain duration.
10511 */
10512-static void ncm_tx_tasklet(unsigned long data)
10513+static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10514 {
10515- struct f_ncm *ncm = (void *)data;
10516-
10517- if (ncm->timer_stopping)
10518- return;
10519+ struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10520
10521 /* Only send if data is available. */
10522- if (ncm->skb_tx_data) {
10523+ if (!ncm->timer_stopping && ncm->skb_tx_data) {
10524 ncm->timer_force_tx = true;
10525
10526 /* XXX This allowance of a NULL skb argument to ndo_start_xmit
10527@@ -1175,16 +1171,6 @@ static void ncm_tx_tasklet(unsigned long data)
10528
10529 ncm->timer_force_tx = false;
10530 }
10531-}
10532-
10533-/*
10534- * The transmit should only be run if no skb data has been sent
10535- * for a certain duration.
10536- */
10537-static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10538-{
10539- struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10540- tasklet_schedule(&ncm->tx_tasklet);
10541 return HRTIMER_NORESTART;
10542 }
10543
10544@@ -1517,8 +1503,7 @@ static int ncm_bind(struct usb_configuration *c, struct usb_function *f)
10545 ncm->port.open = ncm_open;
10546 ncm->port.close = ncm_close;
10547
10548- tasklet_init(&ncm->tx_tasklet, ncm_tx_tasklet, (unsigned long) ncm);
10549- hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
10550+ hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
10551 ncm->task_timer.function = ncm_tx_timeout;
10552
10553 DBG(cdev, "CDC Network: %s speed IN/%s OUT/%s NOTIFY/%s\n",
10554@@ -1627,7 +1612,6 @@ static void ncm_unbind(struct usb_configuration *c, struct usb_function *f)
10555 DBG(c->cdev, "ncm unbind\n");
10556
10557 hrtimer_cancel(&ncm->task_timer);
10558- tasklet_kill(&ncm->tx_tasklet);
10559
10560 ncm_string_defs[0].id = 0;
10561 usb_free_all_descriptors(f);
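
In f_ncm above the tx tasklet is removed and the timer becomes a soft hrtimer
(HRTIMER_MODE_REL_SOFT), so the callback already runs in softirq context and
can do the transmit directly instead of bouncing through a tasklet. A minimal
sketch of a soft hrtimer, assuming the soft-mode support this patch series
carries, with hypothetical demo_* names:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timeout(struct hrtimer *t)
{
        /* HRTIMER_MODE_REL_SOFT: invoked from softirq, not hard-irq context */
        return HRTIMER_NORESTART;
}

static void demo_arm(void)
{
        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
        demo_timer.function = demo_timeout;
        hrtimer_start(&demo_timer, ms_to_ktime(5), HRTIMER_MODE_REL_SOFT);
}
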
10562diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
10563index 5c28bee327e1..ed49dba4704d 100644
10564--- a/drivers/usb/gadget/legacy/inode.c
10565+++ b/drivers/usb/gadget/legacy/inode.c
10566@@ -347,7 +347,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
10567 spin_unlock_irq (&epdata->dev->lock);
10568
10569 if (likely (value == 0)) {
10570- value = wait_event_interruptible (done.wait, done.done);
10571+ value = swait_event_interruptible (done.wait, done.done);
10572 if (value != 0) {
10573 spin_lock_irq (&epdata->dev->lock);
10574 if (likely (epdata->ep != NULL)) {
10575@@ -356,7 +356,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
10576 usb_ep_dequeue (epdata->ep, epdata->req);
10577 spin_unlock_irq (&epdata->dev->lock);
10578
10579- wait_event (done.wait, done.done);
10580+ swait_event (done.wait, done.done);
10581 if (epdata->status == -ECONNRESET)
10582 epdata->status = -EINTR;
10583 } else {
10584diff --git a/fs/aio.c b/fs/aio.c
10585index 3a749c3a92e3..24c6ceadaae6 100644
10586--- a/fs/aio.c
10587+++ b/fs/aio.c
10588@@ -40,6 +40,7 @@
10589 #include <linux/ramfs.h>
10590 #include <linux/percpu-refcount.h>
10591 #include <linux/mount.h>
10592+#include <linux/swork.h>
10593
10594 #include <asm/kmap_types.h>
10595 #include <linux/uaccess.h>
10596@@ -117,6 +118,7 @@ struct kioctx {
10597
10598 struct rcu_head free_rcu;
10599 struct work_struct free_work; /* see free_ioctx() */
10600+ struct swork_event free_swork; /* see free_ioctx() */
10601
10602 /*
10603 * signals when all in-flight requests are done
10604@@ -259,6 +261,7 @@ static int __init aio_setup(void)
10605 .mount = aio_mount,
10606 .kill_sb = kill_anon_super,
10607 };
10608+ BUG_ON(swork_get());
10609 aio_mnt = kern_mount(&aio_fs);
10610 if (IS_ERR(aio_mnt))
10611 panic("Failed to create aio fs mount.");
10612@@ -633,9 +636,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
10613 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
10614 * now it's safe to cancel any that need to be.
10615 */
10616-static void free_ioctx_users(struct percpu_ref *ref)
10617+static void free_ioctx_users_work(struct swork_event *sev)
10618 {
10619- struct kioctx *ctx = container_of(ref, struct kioctx, users);
10620+ struct kioctx *ctx = container_of(sev, struct kioctx, free_swork);
10621 struct aio_kiocb *req;
10622
10623 spin_lock_irq(&ctx->ctx_lock);
10624@@ -653,6 +656,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
10625 percpu_ref_put(&ctx->reqs);
10626 }
10627
10628+static void free_ioctx_users(struct percpu_ref *ref)
10629+{
10630+ struct kioctx *ctx = container_of(ref, struct kioctx, users);
10631+
10632+ INIT_SWORK(&ctx->free_swork, free_ioctx_users_work);
10633+ swork_queue(&ctx->free_swork);
10634+}
10635+
10636 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
10637 {
10638 unsigned i, new_nr;
10639diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
10640index ce696d6c4641..b120fbd41483 100644
10641--- a/fs/autofs4/autofs_i.h
10642+++ b/fs/autofs4/autofs_i.h
10643@@ -20,6 +20,7 @@
10644 #include <linux/sched.h>
10645 #include <linux/mount.h>
10646 #include <linux/namei.h>
10647+#include <linux/delay.h>
10648 #include <linux/uaccess.h>
10649 #include <linux/mutex.h>
10650 #include <linux/spinlock.h>
10651diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
10652index 57725d4a8c59..62220508bace 100644
10653--- a/fs/autofs4/expire.c
10654+++ b/fs/autofs4/expire.c
10655@@ -148,7 +148,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
10656 parent = p->d_parent;
10657 if (!spin_trylock(&parent->d_lock)) {
10658 spin_unlock(&p->d_lock);
10659- cpu_relax();
10660+ cpu_chill();
10661 goto relock;
10662 }
10663 spin_unlock(&p->d_lock);
10664diff --git a/fs/buffer.c b/fs/buffer.c
10665index b96f3b98a6ef..4ca5f222537a 100644
10666--- a/fs/buffer.c
10667+++ b/fs/buffer.c
10668@@ -302,8 +302,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
10669 * decide that the page is now completely done.
10670 */
10671 first = page_buffers(page);
10672- local_irq_save(flags);
10673- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10674+ flags = bh_uptodate_lock_irqsave(first);
10675 clear_buffer_async_read(bh);
10676 unlock_buffer(bh);
10677 tmp = bh;
10678@@ -316,8 +315,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
10679 }
10680 tmp = tmp->b_this_page;
10681 } while (tmp != bh);
10682- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10683- local_irq_restore(flags);
10684+ bh_uptodate_unlock_irqrestore(first, flags);
10685
10686 /*
10687 * If none of the buffers had errors and they are all
10688@@ -329,9 +327,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
10689 return;
10690
10691 still_busy:
10692- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10693- local_irq_restore(flags);
10694- return;
10695+ bh_uptodate_unlock_irqrestore(first, flags);
10696 }
10697
10698 /*
10699@@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
10700 }
10701
10702 first = page_buffers(page);
10703- local_irq_save(flags);
10704- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10705+ flags = bh_uptodate_lock_irqsave(first);
10706
10707 clear_buffer_async_write(bh);
10708 unlock_buffer(bh);
10709@@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
10710 }
10711 tmp = tmp->b_this_page;
10712 }
10713- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10714- local_irq_restore(flags);
10715+ bh_uptodate_unlock_irqrestore(first, flags);
10716 end_page_writeback(page);
10717 return;
10718
10719 still_busy:
10720- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10721- local_irq_restore(flags);
10722- return;
10723+ bh_uptodate_unlock_irqrestore(first, flags);
10724 }
10725 EXPORT_SYMBOL(end_buffer_async_write);
10726
10727@@ -3417,6 +3409,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
10728 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
10729 if (ret) {
10730 INIT_LIST_HEAD(&ret->b_assoc_buffers);
10731+ buffer_head_init_locks(ret);
10732 preempt_disable();
10733 __this_cpu_inc(bh_accounting.nr);
10734 recalc_bh_state();
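
fs/buffer.c above, and the ext4, ntfs and xfs hunks further down, replace the
open-coded local_irq_save() + bit_spin_lock(BH_Uptodate_Lock) sequence with
bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore(), helpers this patch
adds to include/linux/buffer_head.h (a bit spinlock on mainline, a real
spinlock inside the buffer_head on -rt). A minimal sketch of an end_io-style
caller, with a hypothetical demo_* name:

#include <linux/buffer_head.h>

static void demo_end_io(struct buffer_head *bh)
{
        struct buffer_head *first = page_buffers(bh->b_page);
        unsigned long flags;

        flags = bh_uptodate_lock_irqsave(first);
        /* walk or update this page's buffer ring here */
        bh_uptodate_unlock_irqrestore(first, flags);
}
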
10735diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
10736index ef24b4527459..3ce6331a1101 100644
10737--- a/fs/cifs/readdir.c
10738+++ b/fs/cifs/readdir.c
10739@@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
10740 struct inode *inode;
10741 struct super_block *sb = parent->d_sb;
10742 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
10743- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10744+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10745
10746 cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
10747
10748diff --git a/fs/dcache.c b/fs/dcache.c
10749index 28b2e770bb69..b08506ef464a 100644
10750--- a/fs/dcache.c
10751+++ b/fs/dcache.c
10752@@ -19,6 +19,7 @@
10753 #include <linux/mm.h>
10754 #include <linux/fs.h>
10755 #include <linux/fsnotify.h>
10756+#include <linux/delay.h>
10757 #include <linux/slab.h>
10758 #include <linux/init.h>
10759 #include <linux/hash.h>
10760@@ -808,6 +809,8 @@ static inline bool fast_dput(struct dentry *dentry)
10761 */
10762 void dput(struct dentry *dentry)
10763 {
10764+ struct dentry *parent;
10765+
10766 if (unlikely(!dentry))
10767 return;
10768
10769@@ -844,9 +847,18 @@ void dput(struct dentry *dentry)
10770 return;
10771
10772 kill_it:
10773- dentry = dentry_kill(dentry);
10774- if (dentry) {
10775- cond_resched();
10776+ parent = dentry_kill(dentry);
10777+ if (parent) {
10778+ int r;
10779+
10780+ if (parent == dentry) {
10781+ /* cond_resched() is a no-op for the highest-priority task */
10782+ r = cond_resched();
10783+ if (!r)
10784+ cpu_chill();
10785+ } else {
10786+ dentry = parent;
10787+ }
10788 goto repeat;
10789 }
10790 }
10791@@ -2414,7 +2426,7 @@ void d_delete(struct dentry * dentry)
10792 if (dentry->d_lockref.count == 1) {
10793 if (!spin_trylock(&inode->i_lock)) {
10794 spin_unlock(&dentry->d_lock);
10795- cpu_relax();
10796+ cpu_chill();
10797 goto again;
10798 }
10799 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
10800@@ -2459,9 +2471,10 @@ EXPORT_SYMBOL(d_rehash);
10801 static inline unsigned start_dir_add(struct inode *dir)
10802 {
10803
10804+ preempt_disable_rt();
10805 for (;;) {
10806- unsigned n = dir->i_dir_seq;
10807- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
10808+ unsigned n = dir->__i_dir_seq;
10809+ if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n)
10810 return n;
10811 cpu_relax();
10812 }
10813@@ -2469,26 +2482,30 @@ static inline unsigned start_dir_add(struct inode *dir)
10814
10815 static inline void end_dir_add(struct inode *dir, unsigned n)
10816 {
10817- smp_store_release(&dir->i_dir_seq, n + 2);
10818+ smp_store_release(&dir->__i_dir_seq, n + 2);
10819+ preempt_enable_rt();
10820 }
10821
10822 static void d_wait_lookup(struct dentry *dentry)
10823 {
10824- if (d_in_lookup(dentry)) {
10825- DECLARE_WAITQUEUE(wait, current);
10826- add_wait_queue(dentry->d_wait, &wait);
10827- do {
10828- set_current_state(TASK_UNINTERRUPTIBLE);
10829- spin_unlock(&dentry->d_lock);
10830- schedule();
10831- spin_lock(&dentry->d_lock);
10832- } while (d_in_lookup(dentry));
10833- }
10834+ struct swait_queue __wait;
10835+
10836+ if (!d_in_lookup(dentry))
10837+ return;
10838+
10839+ INIT_LIST_HEAD(&__wait.task_list);
10840+ do {
10841+ prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
10842+ spin_unlock(&dentry->d_lock);
10843+ schedule();
10844+ spin_lock(&dentry->d_lock);
10845+ } while (d_in_lookup(dentry));
10846+ finish_swait(dentry->d_wait, &__wait);
10847 }
10848
10849 struct dentry *d_alloc_parallel(struct dentry *parent,
10850 const struct qstr *name,
10851- wait_queue_head_t *wq)
10852+ struct swait_queue_head *wq)
10853 {
10854 unsigned int hash = name->hash;
10855 struct hlist_bl_head *b = in_lookup_hash(parent, hash);
10856@@ -2502,7 +2519,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
10857
10858 retry:
10859 rcu_read_lock();
10860- seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
10861+ seq = smp_load_acquire(&parent->d_inode->__i_dir_seq);
10862 r_seq = read_seqbegin(&rename_lock);
10863 dentry = __d_lookup_rcu(parent, name, &d_seq);
10864 if (unlikely(dentry)) {
10865@@ -2530,7 +2547,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
10866 }
10867
10868 hlist_bl_lock(b);
10869- if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
10870+ if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) {
10871 hlist_bl_unlock(b);
10872 rcu_read_unlock();
10873 goto retry;
10874@@ -2603,7 +2620,7 @@ void __d_lookup_done(struct dentry *dentry)
10875 hlist_bl_lock(b);
10876 dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
10877 __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
10878- wake_up_all(dentry->d_wait);
10879+ swake_up_all(dentry->d_wait);
10880 dentry->d_wait = NULL;
10881 hlist_bl_unlock(b);
10882 INIT_HLIST_NODE(&dentry->d_u.d_alias);
10883@@ -3638,6 +3655,8 @@ __setup("dhash_entries=", set_dhash_entries);
10884
10885 static void __init dcache_init_early(void)
10886 {
10887+ unsigned int loop;
10888+
10889 /* If hashes are distributed across NUMA nodes, defer
10890 * hash allocation until vmalloc space is available.
10891 */
10892@@ -3654,10 +3673,14 @@ static void __init dcache_init_early(void)
10893 &d_hash_mask,
10894 0,
10895 0);
10896+
10897+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
10898+ INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10899 }
10900
10901 static void __init dcache_init(void)
10902 {
10903+ unsigned int loop;
10904 /*
10905 * A constructor could be added for stable state like the lists,
10906 * but it is probably not worth it because of the cache nature
10907@@ -3680,6 +3703,10 @@ static void __init dcache_init(void)
10908 &d_hash_mask,
10909 0,
10910 0);
10911+
10912+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
10913+ INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10914+
10915 }
10916
10917 /* SLAB cache for __getname() consumers */
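
The dcache/namei changes above move the in-lookup dentry wakeup from
wait_queue_head_t to the simple waitqueue (swait) API: d_alloc_parallel() now
takes a struct swait_queue_head, __d_lookup_done() uses swake_up_all(), and
d_wait_lookup() open-codes prepare_to_swait()/finish_swait(). A minimal sketch
of the swait primitives involved, with hypothetical demo_* names, not part of
the patch:

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_done;

static void demo_wait(void)
{
        swait_event(demo_wq, READ_ONCE(demo_done));     /* sleep until done */
}

static void demo_finish(void)
{
        WRITE_ONCE(demo_done, true);
        swake_up_all(&demo_wq);                         /* wake all waiters */
}
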
10918diff --git a/fs/eventpoll.c b/fs/eventpoll.c
10919index 2fabd19cdeea..b768c32631eb 100644
10920--- a/fs/eventpoll.c
10921+++ b/fs/eventpoll.c
10922@@ -587,12 +587,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
10923 */
10924 static void ep_poll_safewake(wait_queue_head_t *wq)
10925 {
10926- int this_cpu = get_cpu();
10927+ int this_cpu = get_cpu_light();
10928
10929 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
10930 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
10931
10932- put_cpu();
10933+ put_cpu_light();
10934 }
10935
10936 static void ep_remove_wait_queue(struct eppoll_entry *pwq)
10937diff --git a/fs/exec.c b/fs/exec.c
10938index 0da4d748b4e6..609aee4dbfa9 100644
10939--- a/fs/exec.c
10940+++ b/fs/exec.c
10941@@ -1024,12 +1024,14 @@ static int exec_mmap(struct mm_struct *mm)
10942 }
10943 }
10944 task_lock(tsk);
10945+ preempt_disable_rt();
10946 active_mm = tsk->active_mm;
10947 tsk->mm = mm;
10948 tsk->active_mm = mm;
10949 activate_mm(active_mm, mm);
10950 tsk->mm->vmacache_seqnum = 0;
10951 vmacache_flush(tsk);
10952+ preempt_enable_rt();
10953 task_unlock(tsk);
10954 if (old_mm) {
10955 up_read(&old_mm->mmap_sem);
10956diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
10957index db7590178dfc..d76364124443 100644
10958--- a/fs/ext4/page-io.c
10959+++ b/fs/ext4/page-io.c
10960@@ -95,8 +95,7 @@ static void ext4_finish_bio(struct bio *bio)
10961 * We check all buffers in the page under BH_Uptodate_Lock
10962 * to avoid races with other end io clearing async_write flags
10963 */
10964- local_irq_save(flags);
10965- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
10966+ flags = bh_uptodate_lock_irqsave(head);
10967 do {
10968 if (bh_offset(bh) < bio_start ||
10969 bh_offset(bh) + bh->b_size > bio_end) {
10970@@ -108,8 +107,7 @@ static void ext4_finish_bio(struct bio *bio)
10971 if (bio->bi_status)
10972 buffer_io_error(bh);
10973 } while ((bh = bh->b_this_page) != head);
10974- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
10975- local_irq_restore(flags);
10976+ bh_uptodate_unlock_irqrestore(head, flags);
10977 if (!under_io) {
10978 #ifdef CONFIG_EXT4_FS_ENCRYPTION
10979 if (data_page)
10980diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
10981index 29868c35c19a..76d354eee035 100644
10982--- a/fs/fuse/dir.c
10983+++ b/fs/fuse/dir.c
10984@@ -1188,7 +1188,7 @@ static int fuse_direntplus_link(struct file *file,
10985 struct inode *dir = d_inode(parent);
10986 struct fuse_conn *fc;
10987 struct inode *inode;
10988- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10989+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10990
10991 if (!o->nodeid) {
10992 /*
10993diff --git a/fs/inode.c b/fs/inode.c
10994index cfc36d11bcb3..b77ce179798a 100644
10995--- a/fs/inode.c
10996+++ b/fs/inode.c
10997@@ -154,7 +154,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
10998 inode->i_bdev = NULL;
10999 inode->i_cdev = NULL;
11000 inode->i_link = NULL;
11001- inode->i_dir_seq = 0;
11002+ inode->__i_dir_seq = 0;
11003 inode->i_rdev = 0;
11004 inode->dirtied_when = 0;
11005
11006diff --git a/fs/libfs.c b/fs/libfs.c
11007index 3aabe553fc45..b5d63bf1ad8e 100644
11008--- a/fs/libfs.c
11009+++ b/fs/libfs.c
11010@@ -90,7 +90,7 @@ static struct dentry *next_positive(struct dentry *parent,
11011 struct list_head *from,
11012 int count)
11013 {
11014- unsigned *seq = &parent->d_inode->i_dir_seq, n;
11015+ unsigned *seq = &parent->d_inode->__i_dir_seq, n;
11016 struct dentry *res;
11017 struct list_head *p;
11018 bool skipped;
11019@@ -123,8 +123,9 @@ static struct dentry *next_positive(struct dentry *parent,
11020 static void move_cursor(struct dentry *cursor, struct list_head *after)
11021 {
11022 struct dentry *parent = cursor->d_parent;
11023- unsigned n, *seq = &parent->d_inode->i_dir_seq;
11024+ unsigned n, *seq = &parent->d_inode->__i_dir_seq;
11025 spin_lock(&parent->d_lock);
11026+ preempt_disable_rt();
11027 for (;;) {
11028 n = *seq;
11029 if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
11030@@ -137,6 +138,7 @@ static void move_cursor(struct dentry *cursor, struct list_head *after)
11031 else
11032 list_add_tail(&cursor->d_child, &parent->d_subdirs);
11033 smp_store_release(seq, n + 2);
11034+ preempt_enable_rt();
11035 spin_unlock(&parent->d_lock);
11036 }
11037
11038diff --git a/fs/locks.c b/fs/locks.c
11039index 665e3ce9ab47..47b66bfc4fa3 100644
11040--- a/fs/locks.c
11041+++ b/fs/locks.c
11042@@ -945,7 +945,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
11043 return -ENOMEM;
11044 }
11045
11046- percpu_down_read_preempt_disable(&file_rwsem);
11047+ percpu_down_read(&file_rwsem);
11048 spin_lock(&ctx->flc_lock);
11049 if (request->fl_flags & FL_ACCESS)
11050 goto find_conflict;
11051@@ -986,7 +986,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
11052
11053 out:
11054 spin_unlock(&ctx->flc_lock);
11055- percpu_up_read_preempt_enable(&file_rwsem);
11056+ percpu_up_read(&file_rwsem);
11057 if (new_fl)
11058 locks_free_lock(new_fl);
11059 locks_dispose_list(&dispose);
11060@@ -1023,7 +1023,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
11061 new_fl2 = locks_alloc_lock();
11062 }
11063
11064- percpu_down_read_preempt_disable(&file_rwsem);
11065+ percpu_down_read(&file_rwsem);
11066 spin_lock(&ctx->flc_lock);
11067 /*
11068 * New lock request. Walk all POSIX locks and look for conflicts. If
11069@@ -1195,7 +1195,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
11070 }
11071 out:
11072 spin_unlock(&ctx->flc_lock);
11073- percpu_up_read_preempt_enable(&file_rwsem);
11074+ percpu_up_read(&file_rwsem);
11075 /*
11076 * Free any unused locks.
11077 */
11078@@ -1470,7 +1470,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
11079 return error;
11080 }
11081
11082- percpu_down_read_preempt_disable(&file_rwsem);
11083+ percpu_down_read(&file_rwsem);
11084 spin_lock(&ctx->flc_lock);
11085
11086 time_out_leases(inode, &dispose);
11087@@ -1522,13 +1522,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
11088 locks_insert_block(fl, new_fl);
11089 trace_break_lease_block(inode, new_fl);
11090 spin_unlock(&ctx->flc_lock);
11091- percpu_up_read_preempt_enable(&file_rwsem);
11092+ percpu_up_read(&file_rwsem);
11093
11094 locks_dispose_list(&dispose);
11095 error = wait_event_interruptible_timeout(new_fl->fl_wait,
11096 !new_fl->fl_next, break_time);
11097
11098- percpu_down_read_preempt_disable(&file_rwsem);
11099+ percpu_down_read(&file_rwsem);
11100 spin_lock(&ctx->flc_lock);
11101 trace_break_lease_unblock(inode, new_fl);
11102 locks_delete_block(new_fl);
11103@@ -1545,7 +1545,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
11104 }
11105 out:
11106 spin_unlock(&ctx->flc_lock);
11107- percpu_up_read_preempt_enable(&file_rwsem);
11108+ percpu_up_read(&file_rwsem);
11109 locks_dispose_list(&dispose);
11110 locks_free_lock(new_fl);
11111 return error;
11112@@ -1619,7 +1619,7 @@ int fcntl_getlease(struct file *filp)
11113
11114 ctx = smp_load_acquire(&inode->i_flctx);
11115 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
11116- percpu_down_read_preempt_disable(&file_rwsem);
11117+ percpu_down_read(&file_rwsem);
11118 spin_lock(&ctx->flc_lock);
11119 time_out_leases(inode, &dispose);
11120 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
11121@@ -1629,7 +1629,7 @@ int fcntl_getlease(struct file *filp)
11122 break;
11123 }
11124 spin_unlock(&ctx->flc_lock);
11125- percpu_up_read_preempt_enable(&file_rwsem);
11126+ percpu_up_read(&file_rwsem);
11127
11128 locks_dispose_list(&dispose);
11129 }
11130@@ -1704,7 +1704,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
11131 return -EINVAL;
11132 }
11133
11134- percpu_down_read_preempt_disable(&file_rwsem);
11135+ percpu_down_read(&file_rwsem);
11136 spin_lock(&ctx->flc_lock);
11137 time_out_leases(inode, &dispose);
11138 error = check_conflicting_open(dentry, arg, lease->fl_flags);
11139@@ -1775,7 +1775,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
11140 lease->fl_lmops->lm_setup(lease, priv);
11141 out:
11142 spin_unlock(&ctx->flc_lock);
11143- percpu_up_read_preempt_enable(&file_rwsem);
11144+ percpu_up_read(&file_rwsem);
11145 locks_dispose_list(&dispose);
11146 if (is_deleg)
11147 inode_unlock(inode);
11148@@ -1798,7 +1798,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
11149 return error;
11150 }
11151
11152- percpu_down_read_preempt_disable(&file_rwsem);
11153+ percpu_down_read(&file_rwsem);
11154 spin_lock(&ctx->flc_lock);
11155 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
11156 if (fl->fl_file == filp &&
11157@@ -1811,7 +1811,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
11158 if (victim)
11159 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
11160 spin_unlock(&ctx->flc_lock);
11161- percpu_up_read_preempt_enable(&file_rwsem);
11162+ percpu_up_read(&file_rwsem);
11163 locks_dispose_list(&dispose);
11164 return error;
11165 }
11166@@ -2542,13 +2542,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
11167 if (list_empty(&ctx->flc_lease))
11168 return;
11169
11170- percpu_down_read_preempt_disable(&file_rwsem);
11171+ percpu_down_read(&file_rwsem);
11172 spin_lock(&ctx->flc_lock);
11173 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
11174 if (filp == fl->fl_file)
11175 lease_modify(fl, F_UNLCK, &dispose);
11176 spin_unlock(&ctx->flc_lock);
11177- percpu_up_read_preempt_enable(&file_rwsem);
11178+ percpu_up_read(&file_rwsem);
11179
11180 locks_dispose_list(&dispose);
11181 }
11182diff --git a/fs/namei.c b/fs/namei.c
11183index 0b46b858cd42..f5c6c2ec44ce 100644
11184--- a/fs/namei.c
11185+++ b/fs/namei.c
11186@@ -1627,7 +1627,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
11187 {
11188 struct dentry *dentry = ERR_PTR(-ENOENT), *old;
11189 struct inode *inode = dir->d_inode;
11190- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11191+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11192
11193 inode_lock_shared(inode);
11194 /* Don't go there if it's already dead */
11195@@ -3100,7 +3100,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
11196 struct dentry *dentry;
11197 int error, create_error = 0;
11198 umode_t mode = op->mode;
11199- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11200+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11201
11202 if (unlikely(IS_DEADDIR(dir_inode)))
11203 return -ENOENT;
11204diff --git a/fs/namespace.c b/fs/namespace.c
11205index 9dc146e7b5e0..85bfe5e55adf 100644
11206--- a/fs/namespace.c
11207+++ b/fs/namespace.c
11208@@ -14,6 +14,7 @@
11209 #include <linux/mnt_namespace.h>
11210 #include <linux/user_namespace.h>
11211 #include <linux/namei.h>
11212+#include <linux/delay.h>
11213 #include <linux/security.h>
11214 #include <linux/cred.h>
11215 #include <linux/idr.h>
11216@@ -353,8 +354,11 @@ int __mnt_want_write(struct vfsmount *m)
11217 * incremented count after it has set MNT_WRITE_HOLD.
11218 */
11219 smp_mb();
11220- while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
11221- cpu_relax();
11222+ while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
11223+ preempt_enable();
11224+ cpu_chill();
11225+ preempt_disable();
11226+ }
11227 /*
11228 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
11229 * be set to match its requirements. So we must not load that until
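
The autofs, dcache and namespace hunks above replace cpu_relax() busy-waiting
with cpu_chill(): on -rt a high-priority task spinning on a trylock can starve
the lock holder, so cpu_chill() sleeps briefly instead (it stays cpu_relax()
on non-RT builds). A minimal sketch of the trylock/retry shape being
converted, with a hypothetical demo_* name:

#include <linux/spinlock.h>
#include <linux/delay.h>        /* cpu_chill() is declared here by this patch */

static void demo_lock_pair(spinlock_t *a, spinlock_t *b)
{
again:
        spin_lock(a);
        if (!spin_trylock(b)) {
                spin_unlock(a);
                cpu_chill();    /* let the holder of b make progress */
                goto again;
        }
        /* ... both locks held ... */
        spin_unlock(b);
        spin_unlock(a);
}
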
11230diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
11231index 606dd3871f66..fa41eb75b4d8 100644
11232--- a/fs/nfs/delegation.c
11233+++ b/fs/nfs/delegation.c
11234@@ -150,11 +150,11 @@ static int nfs_delegation_claim_opens(struct inode *inode,
11235 sp = state->owner;
11236 /* Block nfs4_proc_unlck */
11237 mutex_lock(&sp->so_delegreturn_mutex);
11238- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
11239+ seq = read_seqbegin(&sp->so_reclaim_seqlock);
11240 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
11241 if (!err)
11242 err = nfs_delegation_claim_locks(ctx, state, stateid);
11243- if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
11244+ if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
11245 err = -EAGAIN;
11246 mutex_unlock(&sp->so_delegreturn_mutex);
11247 put_nfs_open_context(ctx);
11248diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
11249index bf2c43635062..f43f5da4a8c3 100644
11250--- a/fs/nfs/dir.c
11251+++ b/fs/nfs/dir.c
11252@@ -452,7 +452,7 @@ static
11253 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
11254 {
11255 struct qstr filename = QSTR_INIT(entry->name, entry->len);
11256- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11257+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11258 struct dentry *dentry;
11259 struct dentry *alias;
11260 struct inode *dir = d_inode(parent);
11261@@ -1443,7 +1443,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
11262 struct file *file, unsigned open_flags,
11263 umode_t mode, int *opened)
11264 {
11265- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11266+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11267 struct nfs_open_context *ctx;
11268 struct dentry *res;
11269 struct iattr attr = { .ia_valid = ATTR_OPEN };
11270@@ -1763,7 +1763,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
11271
11272 trace_nfs_rmdir_enter(dir, dentry);
11273 if (d_really_is_positive(dentry)) {
11274+#ifdef CONFIG_PREEMPT_RT_BASE
11275+ down(&NFS_I(d_inode(dentry))->rmdir_sem);
11276+#else
11277 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
11278+#endif
11279 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
11280 /* Ensure the VFS deletes this inode */
11281 switch (error) {
11282@@ -1773,7 +1777,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
11283 case -ENOENT:
11284 nfs_dentry_handle_enoent(dentry);
11285 }
11286+#ifdef CONFIG_PREEMPT_RT_BASE
11287+ up(&NFS_I(d_inode(dentry))->rmdir_sem);
11288+#else
11289 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
11290+#endif
11291 } else
11292 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
11293 trace_nfs_rmdir_exit(dir, dentry, error);
11294diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
11295index 134d9f560240..ff64167f9811 100644
11296--- a/fs/nfs/inode.c
11297+++ b/fs/nfs/inode.c
11298@@ -2014,7 +2014,11 @@ static void init_once(void *foo)
11299 atomic_long_set(&nfsi->nrequests, 0);
11300 atomic_long_set(&nfsi->commit_info.ncommit, 0);
11301 atomic_set(&nfsi->commit_info.rpcs_out, 0);
11302+#ifdef CONFIG_PREEMPT_RT_BASE
11303+ sema_init(&nfsi->rmdir_sem, 1);
11304+#else
11305 init_rwsem(&nfsi->rmdir_sem);
11306+#endif
11307 mutex_init(&nfsi->commit_mutex);
11308 nfs4_init_once(nfsi);
11309 }
11310diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
11311index a73144b3cb8c..0c403d280b96 100644
11312--- a/fs/nfs/nfs4_fs.h
11313+++ b/fs/nfs/nfs4_fs.h
11314@@ -112,7 +112,7 @@ struct nfs4_state_owner {
11315 unsigned long so_flags;
11316 struct list_head so_states;
11317 struct nfs_seqid_counter so_seqid;
11318- seqcount_t so_reclaim_seqcount;
11319+ seqlock_t so_reclaim_seqlock;
11320 struct mutex so_delegreturn_mutex;
11321 };
11322
11323diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
11324index a3b67d3b1dfb..4ce6ec109c2b 100644
11325--- a/fs/nfs/nfs4proc.c
11326+++ b/fs/nfs/nfs4proc.c
11327@@ -2700,7 +2700,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
11328 unsigned int seq;
11329 int ret;
11330
11331- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
11332+ seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
11333
11334 ret = _nfs4_proc_open(opendata);
11335 if (ret != 0)
11336@@ -2738,7 +2738,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
11337
11338 if (d_inode(dentry) == state->inode) {
11339 nfs_inode_attach_open_context(ctx);
11340- if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
11341+ if (read_seqretry(&sp->so_reclaim_seqlock, seq))
11342 nfs4_schedule_stateid_recovery(server, state);
11343 }
11344 out:
11345diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
11346index e1d88bca815e..c51bcc176026 100644
11347--- a/fs/nfs/nfs4state.c
11348+++ b/fs/nfs/nfs4state.c
11349@@ -494,7 +494,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
11350 nfs4_init_seqid_counter(&sp->so_seqid);
11351 atomic_set(&sp->so_count, 1);
11352 INIT_LIST_HEAD(&sp->so_lru);
11353- seqcount_init(&sp->so_reclaim_seqcount);
11354+ seqlock_init(&sp->so_reclaim_seqlock);
11355 mutex_init(&sp->so_delegreturn_mutex);
11356 return sp;
11357 }
11358@@ -1521,8 +1521,12 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
11359 * recovering after a network partition or a reboot from a
11360 * server that doesn't support a grace period.
11361 */
11362+#ifdef CONFIG_PREEMPT_RT_FULL
11363+ write_seqlock(&sp->so_reclaim_seqlock);
11364+#else
11365+ write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
11366+#endif
11367 spin_lock(&sp->so_lock);
11368- raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
11369 restart:
11370 list_for_each_entry(state, &sp->so_states, open_states) {
11371 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
11372@@ -1591,14 +1595,20 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
11373 spin_lock(&sp->so_lock);
11374 goto restart;
11375 }
11376- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
11377 spin_unlock(&sp->so_lock);
11378+#ifdef CONFIG_PREEMPT_RT_FULL
11379+ write_sequnlock(&sp->so_reclaim_seqlock);
11380+#else
11381+ write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
11382+#endif
11383 return 0;
11384 out_err:
11385 nfs4_put_open_state(state);
11386- spin_lock(&sp->so_lock);
11387- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
11388- spin_unlock(&sp->so_lock);
11389+#ifdef CONFIG_PREEMPT_RT_FULL
11390+ write_sequnlock(&sp->so_reclaim_seqlock);
11391+#else
11392+ write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
11393+#endif
11394 return status;
11395 }
11396
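
The NFSv4 state-owner changes above turn so_reclaim_seqcount into a full
seqlock_t, so on RT the reclaim writer is serialized by the seqlock's own lock
instead of relying on preemption being off, and readers such as
nfs_delegation_claim_opens() use read_seqbegin()/read_seqretry(). A minimal
sketch of the seqlock_t reader/writer pattern, with hypothetical demo_* names,
not part of the patch:

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(demo_seqlock);
static u64 demo_value;

static void demo_write(u64 v)
{
        write_seqlock(&demo_seqlock);   /* serializes writers, bumps sequence */
        demo_value = v;
        write_sequnlock(&demo_seqlock);
}

static u64 demo_read(void)
{
        unsigned int seq;
        u64 v;

        do {
                seq = read_seqbegin(&demo_seqlock);
                v = demo_value;
        } while (read_seqretry(&demo_seqlock, seq));
        return v;
}
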
11397diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
11398index 630b4a3c1a93..0dc1d3e6a62f 100644
11399--- a/fs/nfs/unlink.c
11400+++ b/fs/nfs/unlink.c
11401@@ -13,7 +13,7 @@
11402 #include <linux/sunrpc/clnt.h>
11403 #include <linux/nfs_fs.h>
11404 #include <linux/sched.h>
11405-#include <linux/wait.h>
11406+#include <linux/swait.h>
11407 #include <linux/namei.h>
11408 #include <linux/fsnotify.h>
11409
11410@@ -52,6 +52,29 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
11411 rpc_restart_call_prepare(task);
11412 }
11413
11414+#ifdef CONFIG_PREEMPT_RT_BASE
11415+static void nfs_down_anon(struct semaphore *sema)
11416+{
11417+ down(sema);
11418+}
11419+
11420+static void nfs_up_anon(struct semaphore *sema)
11421+{
11422+ up(sema);
11423+}
11424+
11425+#else
11426+static void nfs_down_anon(struct rw_semaphore *rwsem)
11427+{
11428+ down_read_non_owner(rwsem);
11429+}
11430+
11431+static void nfs_up_anon(struct rw_semaphore *rwsem)
11432+{
11433+ up_read_non_owner(rwsem);
11434+}
11435+#endif
11436+
11437 /**
11438 * nfs_async_unlink_release - Release the sillydelete data.
11439 * @task: rpc_task of the sillydelete
11440@@ -65,7 +88,7 @@ static void nfs_async_unlink_release(void *calldata)
11441 struct dentry *dentry = data->dentry;
11442 struct super_block *sb = dentry->d_sb;
11443
11444- up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
11445+ nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
11446 d_lookup_done(dentry);
11447 nfs_free_unlinkdata(data);
11448 dput(dentry);
11449@@ -118,10 +141,10 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
11450 struct inode *dir = d_inode(dentry->d_parent);
11451 struct dentry *alias;
11452
11453- down_read_non_owner(&NFS_I(dir)->rmdir_sem);
11454+ nfs_down_anon(&NFS_I(dir)->rmdir_sem);
11455 alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
11456 if (IS_ERR(alias)) {
11457- up_read_non_owner(&NFS_I(dir)->rmdir_sem);
11458+ nfs_up_anon(&NFS_I(dir)->rmdir_sem);
11459 return 0;
11460 }
11461 if (!d_in_lookup(alias)) {
11462@@ -143,7 +166,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
11463 ret = 0;
11464 spin_unlock(&alias->d_lock);
11465 dput(alias);
11466- up_read_non_owner(&NFS_I(dir)->rmdir_sem);
11467+ nfs_up_anon(&NFS_I(dir)->rmdir_sem);
11468 /*
11469 * If we'd displaced old cached devname, free it. At that
11470 * point dentry is definitely not a root, so we won't need
11471@@ -183,7 +206,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
11472 goto out_free_name;
11473 }
11474 data->res.dir_attr = &data->dir_attr;
11475- init_waitqueue_head(&data->wq);
11476+ init_swait_queue_head(&data->wq);
11477
11478 status = -EBUSY;
11479 spin_lock(&dentry->d_lock);
11480diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
11481index cc91856b5e2d..a982d7c3ad91 100644
11482--- a/fs/ntfs/aops.c
11483+++ b/fs/ntfs/aops.c
11484@@ -93,13 +93,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11485 ofs = 0;
11486 if (file_ofs < init_size)
11487 ofs = init_size - file_ofs;
11488- local_irq_save(flags);
11489+ local_irq_save_nort(flags);
11490 kaddr = kmap_atomic(page);
11491 memset(kaddr + bh_offset(bh) + ofs, 0,
11492 bh->b_size - ofs);
11493 flush_dcache_page(page);
11494 kunmap_atomic(kaddr);
11495- local_irq_restore(flags);
11496+ local_irq_restore_nort(flags);
11497 }
11498 } else {
11499 clear_buffer_uptodate(bh);
11500@@ -108,8 +108,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11501 "0x%llx.", (unsigned long long)bh->b_blocknr);
11502 }
11503 first = page_buffers(page);
11504- local_irq_save(flags);
11505- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11506+ flags = bh_uptodate_lock_irqsave(first);
11507 clear_buffer_async_read(bh);
11508 unlock_buffer(bh);
11509 tmp = bh;
11510@@ -124,8 +123,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11511 }
11512 tmp = tmp->b_this_page;
11513 } while (tmp != bh);
11514- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11515- local_irq_restore(flags);
11516+ bh_uptodate_unlock_irqrestore(first, flags);
11517 /*
11518 * If none of the buffers had errors then we can set the page uptodate,
11519 * but we first have to perform the post read mst fixups, if the
11520@@ -146,13 +144,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11521 recs = PAGE_SIZE / rec_size;
11522 /* Should have been verified before we got here... */
11523 BUG_ON(!recs);
11524- local_irq_save(flags);
11525+ local_irq_save_nort(flags);
11526 kaddr = kmap_atomic(page);
11527 for (i = 0; i < recs; i++)
11528 post_read_mst_fixup((NTFS_RECORD*)(kaddr +
11529 i * rec_size), rec_size);
11530 kunmap_atomic(kaddr);
11531- local_irq_restore(flags);
11532+ local_irq_restore_nort(flags);
11533 flush_dcache_page(page);
11534 if (likely(page_uptodate && !PageError(page)))
11535 SetPageUptodate(page);
11536@@ -160,9 +158,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11537 unlock_page(page);
11538 return;
11539 still_busy:
11540- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11541- local_irq_restore(flags);
11542- return;
11543+ bh_uptodate_unlock_irqrestore(first, flags);
11544 }
11545
11546 /**
11547diff --git a/fs/proc/array.c b/fs/proc/array.c
11548index 4ac811e1a26c..9dcb40690cde 100644
11549--- a/fs/proc/array.c
11550+++ b/fs/proc/array.c
11551@@ -386,9 +386,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
11552 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
11553 {
11554 seq_printf(m, "Cpus_allowed:\t%*pb\n",
11555- cpumask_pr_args(&task->cpus_allowed));
11556+ cpumask_pr_args(task->cpus_ptr));
11557 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
11558- cpumask_pr_args(&task->cpus_allowed));
11559+ cpumask_pr_args(task->cpus_ptr));
11560 }
11561
11562 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
11563diff --git a/fs/proc/base.c b/fs/proc/base.c
11564index 9063738ff1f0..4085e56e261c 100644
11565--- a/fs/proc/base.c
11566+++ b/fs/proc/base.c
11567@@ -1900,7 +1900,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
11568
11569 child = d_hash_and_lookup(dir, &qname);
11570 if (!child) {
11571- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11572+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11573 child = d_alloc_parallel(dir, &qname, &wq);
11574 if (IS_ERR(child))
11575 goto end_instantiate;
11576diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
11577index 82ac5f682b73..c35714621a38 100644
11578--- a/fs/proc/proc_sysctl.c
11579+++ b/fs/proc/proc_sysctl.c
11580@@ -679,7 +679,7 @@ static bool proc_sys_fill_cache(struct file *file,
11581
11582 child = d_lookup(dir, &qname);
11583 if (!child) {
11584- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11585+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11586 child = d_alloc_parallel(dir, &qname, &wq);
11587 if (IS_ERR(child))
11588 return false;
11589diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
11590index 23a9c28ad8ea..6a73c4fa88e7 100644
11591--- a/fs/squashfs/decompressor_multi_percpu.c
11592+++ b/fs/squashfs/decompressor_multi_percpu.c
11593@@ -10,6 +10,7 @@
11594 #include <linux/slab.h>
11595 #include <linux/percpu.h>
11596 #include <linux/buffer_head.h>
11597+#include <linux/locallock.h>
11598
11599 #include "squashfs_fs.h"
11600 #include "squashfs_fs_sb.h"
11601@@ -25,6 +26,8 @@ struct squashfs_stream {
11602 void *stream;
11603 };
11604
11605+static DEFINE_LOCAL_IRQ_LOCK(stream_lock);
11606+
11607 void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
11608 void *comp_opts)
11609 {
11610@@ -79,10 +82,15 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
11611 {
11612 struct squashfs_stream __percpu *percpu =
11613 (struct squashfs_stream __percpu *) msblk->stream;
11614- struct squashfs_stream *stream = get_cpu_ptr(percpu);
11615- int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
11616- offset, length, output);
11617- put_cpu_ptr(stream);
11618+ struct squashfs_stream *stream;
11619+ int res;
11620+
11621+ stream = get_locked_ptr(stream_lock, percpu);
11622+
11623+ res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
11624+ offset, length, output);
11625+
11626+ put_locked_ptr(stream_lock, stream);
11627
11628 if (res < 0)
11629 ERROR("%s decompression failed, data probably corrupt\n",
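
The squashfs hunk above replaces get_cpu_ptr()/put_cpu_ptr() with the -rt
local-lock helpers from linux/locallock.h (introduced by this patch set):
DEFINE_LOCAL_IRQ_LOCK() plus get_locked_ptr()/put_locked_ptr() protect the
per-CPU decompressor state with a lock that can sleep on -rt instead of
disabling preemption. A minimal sketch, with hypothetical demo_* names, not
part of the patch:

#include <linux/locallock.h>
#include <linux/percpu.h>

struct demo_state {
        int     scratch;
};

static struct demo_state __percpu *demo_state;  /* from alloc_percpu() at init */
static DEFINE_LOCAL_IRQ_LOCK(demo_lock);

static void demo_use(void)
{
        struct demo_state *s;

        s = get_locked_ptr(demo_lock, demo_state);      /* lock + this CPU's copy */
        s->scratch++;
        put_locked_ptr(demo_lock, s);
}
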
11630diff --git a/fs/timerfd.c b/fs/timerfd.c
11631index 040612ec9598..b3d9d435926c 100644
11632--- a/fs/timerfd.c
11633+++ b/fs/timerfd.c
11634@@ -471,7 +471,10 @@ static int do_timerfd_settime(int ufd, int flags,
11635 break;
11636 }
11637 spin_unlock_irq(&ctx->wqh.lock);
11638- cpu_relax();
11639+ if (isalarm(ctx))
11640+ hrtimer_wait_for_timer(&ctx->t.alarm.timer);
11641+ else
11642+ hrtimer_wait_for_timer(&ctx->t.tmr);
11643 }
11644
11645 /*
11646diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
11647index b0cccf8a81a8..eaa4383defec 100644
11648--- a/fs/xfs/xfs_aops.c
11649+++ b/fs/xfs/xfs_aops.c
11650@@ -120,8 +120,7 @@ xfs_finish_page_writeback(
11651 ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
11652 ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
11653
11654- local_irq_save(flags);
11655- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
11656+ flags = bh_uptodate_lock_irqsave(head);
11657 do {
11658 if (off >= bvec->bv_offset &&
11659 off < bvec->bv_offset + bvec->bv_len) {
11660@@ -143,8 +142,7 @@ xfs_finish_page_writeback(
11661 }
11662 off += bh->b_size;
11663 } while ((bh = bh->b_this_page) != head);
11664- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
11665- local_irq_restore(flags);
11666+ bh_uptodate_unlock_irqrestore(head, flags);
11667
11668 if (!busy)
11669 end_page_writeback(bvec->bv_page);
11670diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
11671index 1b473efd9eb6..89ee5e1dac48 100644
11672--- a/include/acpi/platform/aclinux.h
11673+++ b/include/acpi/platform/aclinux.h
11674@@ -134,6 +134,7 @@
11675
11676 #define acpi_cache_t struct kmem_cache
11677 #define acpi_spinlock spinlock_t *
11678+#define acpi_raw_spinlock raw_spinlock_t *
11679 #define acpi_cpu_flags unsigned long
11680
11681 /* Use native linux version of acpi_os_allocate_zeroed */
11682@@ -152,6 +153,20 @@
11683 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
11684 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
11685
11686+#define acpi_os_create_raw_lock(__handle) \
11687+({ \
11688+ raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \
11689+ \
11690+ if (lock) { \
11691+ *(__handle) = lock; \
11692+ raw_spin_lock_init(*(__handle)); \
11693+ } \
11694+ lock ? AE_OK : AE_NO_MEMORY; \
11695+ })
11696+
11697+#define acpi_os_delete_raw_lock(__handle) kfree(__handle)
11698+
11699+
11700 /*
11701 * OSL interfaces used by debugger/disassembler
11702 */
11703diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
11704index ae1a33aa8955..c6d04eca8345 100644
11705--- a/include/asm-generic/bug.h
11706+++ b/include/asm-generic/bug.h
11707@@ -234,6 +234,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
11708 # define WARN_ON_SMP(x) ({0;})
11709 #endif
11710
11711+#ifdef CONFIG_PREEMPT_RT_BASE
11712+# define BUG_ON_RT(c) BUG_ON(c)
11713+# define BUG_ON_NONRT(c) do { } while (0)
11714+# define WARN_ON_RT(condition) WARN_ON(condition)
11715+# define WARN_ON_NONRT(condition) do { } while (0)
11716+# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
11717+#else
11718+# define BUG_ON_RT(c) do { } while (0)
11719+# define BUG_ON_NONRT(c) BUG_ON(c)
11720+# define WARN_ON_RT(condition) do { } while (0)
11721+# define WARN_ON_NONRT(condition) WARN_ON(condition)
11722+# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
11723+#endif
11724+
11725 #endif /* __ASSEMBLY__ */
11726
11727 #endif
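
The bug.h hunk above adds RT-only and non-RT-only flavours of BUG_ON()/WARN_ON(), so an assertion that only holds for one configuration compiles away on the other. A compilable sketch of the same switch, with assert() standing in for BUG_ON() and a hand-rolled define standing in for the Kconfig symbol:

#include <assert.h>
#include <stdio.h>

#define PREEMPT_RT_BASE 1       /* stand-in for CONFIG_PREEMPT_RT_BASE; flip to 0 for non-RT */

#if PREEMPT_RT_BASE
# define BUG_ON_RT(c)           assert(!(c))    /* checked only on RT     */
# define BUG_ON_NONRT(c)        do { } while (0)
#else
# define BUG_ON_RT(c)           do { } while (0)
# define BUG_ON_NONRT(c)        assert(!(c))    /* checked only on non-RT */
#endif

int main(void)
{
        int preemptible = 1;

        BUG_ON_RT(!preemptible);         /* an RT-only invariant     */
        BUG_ON_NONRT(preemptible == 2);  /* a non-RT-only invariant  */
        puts("each assertion exists in exactly one configuration");
        return 0;
}
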
11728diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
11729index 994cbb0f7ffc..0d4b7e3489a9 100644
11730--- a/include/linux/blk-mq.h
11731+++ b/include/linux/blk-mq.h
11732@@ -226,7 +226,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
11733 return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
11734 }
11735
11736-
11737+void __blk_mq_complete_request_remote_work(struct work_struct *work);
11738 int blk_mq_request_started(struct request *rq);
11739 void blk_mq_start_request(struct request *rq);
11740 void blk_mq_end_request(struct request *rq, blk_status_t error);
11741diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
11742index 4d4af0e94059..cbf9d5730dd3 100644
11743--- a/include/linux/blkdev.h
11744+++ b/include/linux/blkdev.h
11745@@ -27,6 +27,7 @@
11746 #include <linux/percpu-refcount.h>
11747 #include <linux/scatterlist.h>
11748 #include <linux/blkzoned.h>
11749+#include <linux/swork.h>
11750
11751 struct module;
11752 struct scsi_ioctl_command;
11753@@ -134,6 +135,9 @@ typedef __u32 __bitwise req_flags_t;
11754 */
11755 struct request {
11756 struct list_head queuelist;
11757+#ifdef CONFIG_PREEMPT_RT_FULL
11758+ struct work_struct work;
11759+#endif
11760 union {
11761 struct __call_single_data csd;
11762 u64 fifo_time;
11763@@ -596,6 +600,7 @@ struct request_queue {
11764 #endif
11765 struct rcu_head rcu_head;
11766 wait_queue_head_t mq_freeze_wq;
11767+ struct swork_event mq_pcpu_wake;
11768 struct percpu_ref q_usage_counter;
11769 struct list_head all_q_node;
11770
11771diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
11772index a19519f4241d..40dd5ef9c154 100644
11773--- a/include/linux/bottom_half.h
11774+++ b/include/linux/bottom_half.h
11775@@ -4,6 +4,39 @@
11776
11777 #include <linux/preempt.h>
11778
11779+#ifdef CONFIG_PREEMPT_RT_FULL
11780+
11781+extern void __local_bh_disable(void);
11782+extern void _local_bh_enable(void);
11783+extern void __local_bh_enable(void);
11784+
11785+static inline void local_bh_disable(void)
11786+{
11787+ __local_bh_disable();
11788+}
11789+
11790+static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
11791+{
11792+ __local_bh_disable();
11793+}
11794+
11795+static inline void local_bh_enable(void)
11796+{
11797+ __local_bh_enable();
11798+}
11799+
11800+static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
11801+{
11802+ __local_bh_enable();
11803+}
11804+
11805+static inline void local_bh_enable_ip(unsigned long ip)
11806+{
11807+ __local_bh_enable();
11808+}
11809+
11810+#else
11811+
11812 #ifdef CONFIG_TRACE_IRQFLAGS
11813 extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
11814 #else
11815@@ -31,5 +64,6 @@ static inline void local_bh_enable(void)
11816 {
11817 __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
11818 }
11819+#endif
11820
11821 #endif /* _LINUX_BH_H */
11822diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
11823index afa37f807f12..48505fade7e1 100644
11824--- a/include/linux/buffer_head.h
11825+++ b/include/linux/buffer_head.h
11826@@ -76,8 +76,50 @@ struct buffer_head {
11827 struct address_space *b_assoc_map; /* mapping this buffer is
11828 associated with */
11829 atomic_t b_count; /* users using this buffer_head */
11830+#ifdef CONFIG_PREEMPT_RT_BASE
11831+ spinlock_t b_uptodate_lock;
11832+#if IS_ENABLED(CONFIG_JBD2)
11833+ spinlock_t b_state_lock;
11834+ spinlock_t b_journal_head_lock;
11835+#endif
11836+#endif
11837 };
11838
11839+static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
11840+{
11841+ unsigned long flags;
11842+
11843+#ifndef CONFIG_PREEMPT_RT_BASE
11844+ local_irq_save(flags);
11845+ bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
11846+#else
11847+ spin_lock_irqsave(&bh->b_uptodate_lock, flags);
11848+#endif
11849+ return flags;
11850+}
11851+
11852+static inline void
11853+bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
11854+{
11855+#ifndef CONFIG_PREEMPT_RT_BASE
11856+ bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
11857+ local_irq_restore(flags);
11858+#else
11859+ spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
11860+#endif
11861+}
11862+
11863+static inline void buffer_head_init_locks(struct buffer_head *bh)
11864+{
11865+#ifdef CONFIG_PREEMPT_RT_BASE
11866+ spin_lock_init(&bh->b_uptodate_lock);
11867+#if IS_ENABLED(CONFIG_JBD2)
11868+ spin_lock_init(&bh->b_state_lock);
11869+ spin_lock_init(&bh->b_journal_head_lock);
11870+#endif
11871+#endif
11872+}
11873+
11874 /*
11875 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
11876 * and buffer_foo() functions.
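
bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore() above keep the mainline bit-spinlock-plus-IRQ-off path and switch to a per-buffer-head spinlock on RT, where interrupts must not stay disabled for the duration. A user-space sketch of such a dual-path helper, using a C11 atomic flag for the busy-waiting side and a pthread mutex for the sleeping side; the RT_BUILD switch and all names are invented, and the saved-interrupt-flags handling has no user-space analogue, so it is omitted:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define RT_BUILD 0      /* 0: spin on a state bit, 1: take a sleeping lock */

struct fake_bh {
        atomic_flag state_bit;          /* stands in for BH_Uptodate_Lock   */
        pthread_mutex_t uptodate_lock;  /* stands in for b_uptodate_lock    */
        int uptodate;
};

static void bh_lock(struct fake_bh *bh)
{
#if !RT_BUILD
        while (atomic_flag_test_and_set_explicit(&bh->state_bit,
                                                 memory_order_acquire))
                ;       /* busy-wait, like bit_spin_lock() */
#else
        pthread_mutex_lock(&bh->uptodate_lock); /* may sleep, RT-friendly */
#endif
}

static void bh_unlock(struct fake_bh *bh)
{
#if !RT_BUILD
        atomic_flag_clear_explicit(&bh->state_bit, memory_order_release);
#else
        pthread_mutex_unlock(&bh->uptodate_lock);
#endif
}

int main(void)
{
        struct fake_bh bh = { .state_bit = ATOMIC_FLAG_INIT,
                              .uptodate_lock = PTHREAD_MUTEX_INITIALIZER };

        bh_lock(&bh);
        bh.uptodate = 1;        /* the state the real helper protects */
        bh_unlock(&bh);
        printf("uptodate=%d\n", bh.uptodate);
        return 0;
}
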
11877diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
11878index e7905d9353e8..4ecf7875e04f 100644
11879--- a/include/linux/cgroup-defs.h
11880+++ b/include/linux/cgroup-defs.h
11881@@ -19,6 +19,7 @@
11882 #include <linux/percpu-rwsem.h>
11883 #include <linux/workqueue.h>
11884 #include <linux/bpf-cgroup.h>
11885+#include <linux/swork.h>
11886
11887 #ifdef CONFIG_CGROUPS
11888
11889@@ -152,6 +153,7 @@ struct cgroup_subsys_state {
11890 /* percpu_ref killing and RCU release */
11891 struct rcu_head rcu_head;
11892 struct work_struct destroy_work;
11893+ struct swork_event destroy_swork;
11894
11895 /*
11896 * PI: the parent css. Placed here for cache proximity to following
11897diff --git a/include/linux/completion.h b/include/linux/completion.h
11898index 7828451e161a..f5838b10cf84 100644
11899--- a/include/linux/completion.h
11900+++ b/include/linux/completion.h
11901@@ -9,7 +9,7 @@
11902 * See kernel/sched/completion.c for details.
11903 */
11904
11905-#include <linux/wait.h>
11906+#include <linux/swait.h>
11907 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11908 #include <linux/lockdep.h>
11909 #endif
11910@@ -28,7 +28,7 @@
11911 */
11912 struct completion {
11913 unsigned int done;
11914- wait_queue_head_t wait;
11915+ struct swait_queue_head wait;
11916 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11917 struct lockdep_map_cross map;
11918 #endif
11919@@ -67,11 +67,11 @@ static inline void complete_release_commit(struct completion *x) {}
11920
11921 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11922 #define COMPLETION_INITIALIZER(work) \
11923- { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
11924+ { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
11925 STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
11926 #else
11927 #define COMPLETION_INITIALIZER(work) \
11928- { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11929+ { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11930 #endif
11931
11932 #define COMPLETION_INITIALIZER_ONSTACK(work) \
11933@@ -117,7 +117,7 @@ static inline void complete_release_commit(struct completion *x) {}
11934 static inline void __init_completion(struct completion *x)
11935 {
11936 x->done = 0;
11937- init_waitqueue_head(&x->wait);
11938+ init_swait_queue_head(&x->wait);
11939 }
11940
11941 /**
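
The completion.h hunk above moves struct completion from a regular waitqueue to a simple wait queue (swait), which RT can use from contexts that hold raw locks. Functionally a completion is a counter plus a wait/wake pair; a user-space analogue built on a condition variable (illustrative only, not the kernel API):

#include <pthread.h>
#include <stdio.h>

/* User-space analogue of struct completion, not the kernel type. */
struct completion {
        unsigned int done;
        pthread_mutex_t lock;
        pthread_cond_t wait;
};

static void init_completion(struct completion *x)
{
        x->done = 0;
        pthread_mutex_init(&x->lock, NULL);
        pthread_cond_init(&x->wait, NULL);
}

static void wait_for_completion(struct completion *x)
{
        pthread_mutex_lock(&x->lock);
        while (!x->done)                        /* sleep until complete() ran */
                pthread_cond_wait(&x->wait, &x->lock);
        x->done--;                              /* consume one completion     */
        pthread_mutex_unlock(&x->lock);
}

static void complete(struct completion *x)
{
        pthread_mutex_lock(&x->lock);
        x->done++;
        pthread_cond_signal(&x->wait);          /* wake one waiter            */
        pthread_mutex_unlock(&x->lock);
}

static void *worker(void *arg)
{
        complete(arg);
        return NULL;
}

int main(void)
{
        struct completion c;
        pthread_t t;

        init_completion(&c);
        pthread_create(&t, NULL, worker, &c);
        wait_for_completion(&c);
        pthread_join(t, NULL);
        puts("completed");
        return 0;
}

wait_for_completion() consumes one count per call, so complete() and the waiter pair up the same way the kernel primitives do. Build with cc -pthread.
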
11942diff --git a/include/linux/cpu.h b/include/linux/cpu.h
11943index 2a378d261914..b418d3c5159d 100644
11944--- a/include/linux/cpu.h
11945+++ b/include/linux/cpu.h
11946@@ -120,6 +120,8 @@ extern void cpu_hotplug_disable(void);
11947 extern void cpu_hotplug_enable(void);
11948 void clear_tasks_mm_cpumask(int cpu);
11949 int cpu_down(unsigned int cpu);
11950+extern void pin_current_cpu(void);
11951+extern void unpin_current_cpu(void);
11952
11953 #else /* CONFIG_HOTPLUG_CPU */
11954
11955@@ -130,6 +132,9 @@ static inline void cpus_read_unlock(void) { }
11956 static inline void lockdep_assert_cpus_held(void) { }
11957 static inline void cpu_hotplug_disable(void) { }
11958 static inline void cpu_hotplug_enable(void) { }
11959+static inline void pin_current_cpu(void) { }
11960+static inline void unpin_current_cpu(void) { }
11961+
11962 #endif /* !CONFIG_HOTPLUG_CPU */
11963
11964 /* Wrappers which go away once all code is converted */
11965diff --git a/include/linux/dcache.h b/include/linux/dcache.h
11966index 006f4ccda5f5..d413993f7f17 100644
11967--- a/include/linux/dcache.h
11968+++ b/include/linux/dcache.h
11969@@ -107,7 +107,7 @@ struct dentry {
11970
11971 union {
11972 struct list_head d_lru; /* LRU list */
11973- wait_queue_head_t *d_wait; /* in-lookup ones only */
11974+ struct swait_queue_head *d_wait; /* in-lookup ones only */
11975 };
11976 struct list_head d_child; /* child of parent list */
11977 struct list_head d_subdirs; /* our children */
11978@@ -238,7 +238,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
11979 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
11980 extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
11981 extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
11982- wait_queue_head_t *);
11983+ struct swait_queue_head *);
11984 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
11985 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
11986 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
11987diff --git a/include/linux/delay.h b/include/linux/delay.h
11988index b78bab4395d8..7c4bc414a504 100644
11989--- a/include/linux/delay.h
11990+++ b/include/linux/delay.h
11991@@ -64,4 +64,10 @@ static inline void ssleep(unsigned int seconds)
11992 msleep(seconds * 1000);
11993 }
11994
11995+#ifdef CONFIG_PREEMPT_RT_FULL
11996+extern void cpu_chill(void);
11997+#else
11998+# define cpu_chill() cpu_relax()
11999+#endif
12000+
12001 #endif /* defined(_LINUX_DELAY_H) */
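
cpu_chill() above gives RT retry loops a replacement for cpu_relax(): instead of spinning, which can livelock against a preempted lock holder, the task sleeps briefly. A user-space sketch of the two behaviours behind one name; the RT_BUILD switch, the 1 ms delay and the compiler barrier used as a stand-in for cpu_relax() are all assumptions of the example:

#include <stdio.h>
#include <time.h>

#define RT_BUILD 1      /* flip to 0 to mimic the cpu_relax() fallback */

static void cpu_chill(void)
{
#if RT_BUILD
        /* RT flavour: really sleep for a short while (about 1 ms here). */
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000 * 1000 };

        nanosleep(&ts, NULL);
#else
        /* non-RT flavour: only a hint, the caller keeps spinning. */
        __asm__ __volatile__("" ::: "memory");
#endif
}

int main(void)
{
        int tries = 0;

        /* Typical shape of a retry loop: poll, back off, try again. */
        while (tries++ < 3)
                cpu_chill();
        printf("backed off %d times\n", tries - 1);
        return 0;
}
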
12002diff --git a/include/linux/fs.h b/include/linux/fs.h
12003index cc613f20e5a6..b806e2116f5c 100644
12004--- a/include/linux/fs.h
12005+++ b/include/linux/fs.h
12006@@ -655,7 +655,7 @@ struct inode {
12007 struct block_device *i_bdev;
12008 struct cdev *i_cdev;
12009 char *i_link;
12010- unsigned i_dir_seq;
12011+ unsigned __i_dir_seq;
12012 };
12013
12014 __u32 i_generation;
12015diff --git a/include/linux/highmem.h b/include/linux/highmem.h
12016index 776f90f3a1cd..5f0bd7a3e6a7 100644
12017--- a/include/linux/highmem.h
12018+++ b/include/linux/highmem.h
12019@@ -8,6 +8,7 @@
12020 #include <linux/mm.h>
12021 #include <linux/uaccess.h>
12022 #include <linux/hardirq.h>
12023+#include <linux/sched.h>
12024
12025 #include <asm/cacheflush.h>
12026
12027@@ -66,7 +67,7 @@ static inline void kunmap(struct page *page)
12028
12029 static inline void *kmap_atomic(struct page *page)
12030 {
12031- preempt_disable();
12032+ preempt_disable_nort();
12033 pagefault_disable();
12034 return page_address(page);
12035 }
12036@@ -75,7 +76,7 @@ static inline void *kmap_atomic(struct page *page)
12037 static inline void __kunmap_atomic(void *addr)
12038 {
12039 pagefault_enable();
12040- preempt_enable();
12041+ preempt_enable_nort();
12042 }
12043
12044 #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn))
12045@@ -87,32 +88,51 @@ static inline void __kunmap_atomic(void *addr)
12046
12047 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
12048
12049+#ifndef CONFIG_PREEMPT_RT_FULL
12050 DECLARE_PER_CPU(int, __kmap_atomic_idx);
12051+#endif
12052
12053 static inline int kmap_atomic_idx_push(void)
12054 {
12055+#ifndef CONFIG_PREEMPT_RT_FULL
12056 int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
12057
12058-#ifdef CONFIG_DEBUG_HIGHMEM
12059+# ifdef CONFIG_DEBUG_HIGHMEM
12060 WARN_ON_ONCE(in_irq() && !irqs_disabled());
12061 BUG_ON(idx >= KM_TYPE_NR);
12062-#endif
12063+# endif
12064 return idx;
12065+#else
12066+ current->kmap_idx++;
12067+ BUG_ON(current->kmap_idx > KM_TYPE_NR);
12068+ return current->kmap_idx - 1;
12069+#endif
12070 }
12071
12072 static inline int kmap_atomic_idx(void)
12073 {
12074+#ifndef CONFIG_PREEMPT_RT_FULL
12075 return __this_cpu_read(__kmap_atomic_idx) - 1;
12076+#else
12077+ return current->kmap_idx - 1;
12078+#endif
12079 }
12080
12081 static inline void kmap_atomic_idx_pop(void)
12082 {
12083-#ifdef CONFIG_DEBUG_HIGHMEM
12084+#ifndef CONFIG_PREEMPT_RT_FULL
12085+# ifdef CONFIG_DEBUG_HIGHMEM
12086 int idx = __this_cpu_dec_return(__kmap_atomic_idx);
12087
12088 BUG_ON(idx < 0);
12089-#else
12090+# else
12091 __this_cpu_dec(__kmap_atomic_idx);
12092+# endif
12093+#else
12094+ current->kmap_idx--;
12095+# ifdef CONFIG_DEBUG_HIGHMEM
12096+ BUG_ON(current->kmap_idx < 0);
12097+# endif
12098 #endif
12099 }
12100
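
On RT the kmap_atomic index above moves from a per-CPU counter to current->kmap_idx, because the mapping section becomes preemptible and the task may migrate between CPUs. A user-space sketch of the per-task push/pop discipline using a thread-local counter; KM_TYPE_NR's value here and the assert() checks are only illustrative:

#include <assert.h>
#include <stdio.h>

#define KM_TYPE_NR 20   /* depth of the slot stack, illustrative value */

/* Per-task rather than per-CPU: one counter per thread. */
static _Thread_local int kmap_idx;

static int kmap_atomic_idx_push(void)
{
        kmap_idx++;
        assert(kmap_idx <= KM_TYPE_NR); /* overflow check, like BUG_ON() */
        return kmap_idx - 1;
}

static int kmap_atomic_idx(void)
{
        return kmap_idx - 1;
}

static void kmap_atomic_idx_pop(void)
{
        kmap_idx--;
        assert(kmap_idx >= 0);
}

int main(void)
{
        int idx = kmap_atomic_idx_push();

        printf("using slot %d (current top %d)\n", idx, kmap_atomic_idx());
        kmap_atomic_idx_pop();
        return 0;
}
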
12101diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
12102index 012c37fdb688..3bd606859b0a 100644
12103--- a/include/linux/hrtimer.h
12104+++ b/include/linux/hrtimer.h
12105@@ -22,19 +22,42 @@
12106 #include <linux/percpu.h>
12107 #include <linux/timer.h>
12108 #include <linux/timerqueue.h>
12109+#include <linux/wait.h>
12110
12111 struct hrtimer_clock_base;
12112 struct hrtimer_cpu_base;
12113
12114 /*
12115 * Mode arguments of xxx_hrtimer functions:
12116+ *
12117+ * HRTIMER_MODE_ABS - Time value is absolute
12118+ * HRTIMER_MODE_REL - Time value is relative to now
12119+ * HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered
12120+ * when starting the timer)
12121+ * HRTIMER_MODE_SOFT - Timer callback function will be executed in
12122+ * soft irq context
12123 */
12124 enum hrtimer_mode {
12125- HRTIMER_MODE_ABS = 0x0, /* Time value is absolute */
12126- HRTIMER_MODE_REL = 0x1, /* Time value is relative to now */
12127- HRTIMER_MODE_PINNED = 0x02, /* Timer is bound to CPU */
12128- HRTIMER_MODE_ABS_PINNED = 0x02,
12129- HRTIMER_MODE_REL_PINNED = 0x03,
12130+ HRTIMER_MODE_ABS = 0x00,
12131+ HRTIMER_MODE_REL = 0x01,
12132+ HRTIMER_MODE_PINNED = 0x02,
12133+ HRTIMER_MODE_SOFT = 0x04,
12134+ HRTIMER_MODE_HARD = 0x08,
12135+
12136+ HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
12137+ HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
12138+
12139+ HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT,
12140+ HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT,
12141+
12142+ HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,
12143+ HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,
12144+
12145+ HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD,
12146+ HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD,
12147+
12148+ HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD,
12149+ HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
12150 };
12151
12152 /*
12153@@ -87,6 +110,7 @@ enum hrtimer_restart {
12154 * @base: pointer to the timer base (per cpu and per clock)
12155 * @state: state information (See bit values above)
12156 * @is_rel: Set if the timer was armed relative
12157+ * @is_soft: Set if hrtimer will be expired in soft interrupt context.
12158 *
12159 * The hrtimer structure must be initialized by hrtimer_init()
12160 */
12161@@ -97,6 +121,7 @@ struct hrtimer {
12162 struct hrtimer_clock_base *base;
12163 u8 state;
12164 u8 is_rel;
12165+ u8 is_soft;
12166 };
12167
12168 /**
12169@@ -112,9 +137,9 @@ struct hrtimer_sleeper {
12170 };
12171
12172 #ifdef CONFIG_64BIT
12173-# define HRTIMER_CLOCK_BASE_ALIGN 64
12174+# define __hrtimer_clock_base_align ____cacheline_aligned
12175 #else
12176-# define HRTIMER_CLOCK_BASE_ALIGN 32
12177+# define __hrtimer_clock_base_align
12178 #endif
12179
12180 /**
12181@@ -123,48 +148,57 @@ struct hrtimer_sleeper {
12182 * @index: clock type index for per_cpu support when moving a
12183 * timer to a base on another cpu.
12184 * @clockid: clock id for per_cpu support
12185+ * @seq: seqcount around __run_hrtimer
12186+ * @running: pointer to the currently running hrtimer
12187 * @active: red black tree root node for the active timers
12188 * @get_time: function to retrieve the current time of the clock
12189 * @offset: offset of this clock to the monotonic base
12190 */
12191 struct hrtimer_clock_base {
12192 struct hrtimer_cpu_base *cpu_base;
12193- int index;
12194+ unsigned int index;
12195 clockid_t clockid;
12196+ seqcount_t seq;
12197+ struct hrtimer *running;
12198 struct timerqueue_head active;
12199 ktime_t (*get_time)(void);
12200 ktime_t offset;
12201-} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
12202+} __hrtimer_clock_base_align;
12203
12204 enum hrtimer_base_type {
12205 HRTIMER_BASE_MONOTONIC,
12206 HRTIMER_BASE_REALTIME,
12207 HRTIMER_BASE_BOOTTIME,
12208 HRTIMER_BASE_TAI,
12209+ HRTIMER_BASE_MONOTONIC_SOFT,
12210+ HRTIMER_BASE_REALTIME_SOFT,
12211+ HRTIMER_BASE_BOOTTIME_SOFT,
12212+ HRTIMER_BASE_TAI_SOFT,
12213 HRTIMER_MAX_CLOCK_BASES,
12214 };
12215
12216-/*
12217+/**
12218 * struct hrtimer_cpu_base - the per cpu clock bases
12219 * @lock: lock protecting the base and associated clock bases
12220 * and timers
12221- * @seq: seqcount around __run_hrtimer
12222- * @running: pointer to the currently running hrtimer
12223 * @cpu: cpu number
12224 * @active_bases: Bitfield to mark bases with active timers
12225 * @clock_was_set_seq: Sequence counter of clock was set events
12226- * @migration_enabled: The migration of hrtimers to other cpus is enabled
12227- * @nohz_active: The nohz functionality is enabled
12228- * @expires_next: absolute time of the next event which was scheduled
12229- * via clock_set_next_event()
12230- * @next_timer: Pointer to the first expiring timer
12231- * @in_hrtirq: hrtimer_interrupt() is currently executing
12232 * @hres_active: State of high resolution mode
12233+ * @in_hrtirq: hrtimer_interrupt() is currently executing
12234 * @hang_detected: The last hrtimer interrupt detected a hang
12235+ * @softirq_activated: displays, if the softirq is raised - update of softirq
12236+ * related settings is not required then.
12237 * @nr_events: Total number of hrtimer interrupt events
12238 * @nr_retries: Total number of hrtimer interrupt retries
12239 * @nr_hangs: Total number of hrtimer interrupt hangs
12240 * @max_hang_time: Maximum time spent in hrtimer_interrupt
12241+ * @expires_next: absolute time of the next event, is required for remote
12242+ * hrtimer enqueue; it is the total first expiry time (hard
12243+ * and soft hrtimer are taken into account)
12244+ * @next_timer: Pointer to the first expiring timer
12245+ * @softirq_expires_next: Time to check, if soft queues needs also to be expired
12246+ * @softirq_next_timer: Pointer to the first expiring softirq based timer
12247 * @clock_base: array of clock bases for this cpu
12248 *
12249 * Note: next_timer is just an optimization for __remove_hrtimer().
12250@@ -173,31 +207,31 @@ enum hrtimer_base_type {
12251 */
12252 struct hrtimer_cpu_base {
12253 raw_spinlock_t lock;
12254- seqcount_t seq;
12255- struct hrtimer *running;
12256 unsigned int cpu;
12257 unsigned int active_bases;
12258 unsigned int clock_was_set_seq;
12259- bool migration_enabled;
12260- bool nohz_active;
12261+ unsigned int hres_active : 1,
12262+ in_hrtirq : 1,
12263+ hang_detected : 1,
12264+ softirq_activated : 1;
12265 #ifdef CONFIG_HIGH_RES_TIMERS
12266- unsigned int in_hrtirq : 1,
12267- hres_active : 1,
12268- hang_detected : 1;
12269- ktime_t expires_next;
12270- struct hrtimer *next_timer;
12271 unsigned int nr_events;
12272- unsigned int nr_retries;
12273- unsigned int nr_hangs;
12274+ unsigned short nr_retries;
12275+ unsigned short nr_hangs;
12276 unsigned int max_hang_time;
12277 #endif
12278+ ktime_t expires_next;
12279+ struct hrtimer *next_timer;
12280+ ktime_t softirq_expires_next;
12281+#ifdef CONFIG_PREEMPT_RT_BASE
12282+ wait_queue_head_t wait;
12283+#endif
12284+ struct hrtimer *softirq_next_timer;
12285 struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
12286 } ____cacheline_aligned;
12287
12288 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
12289 {
12290- BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN);
12291-
12292 timer->node.expires = time;
12293 timer->_softexpires = time;
12294 }
12295@@ -266,16 +300,17 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
12296 return timer->base->get_time();
12297 }
12298
12299+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
12300+{
12301+ return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
12302+ timer->base->cpu_base->hres_active : 0;
12303+}
12304+
12305 #ifdef CONFIG_HIGH_RES_TIMERS
12306 struct clock_event_device;
12307
12308 extern void hrtimer_interrupt(struct clock_event_device *dev);
12309
12310-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
12311-{
12312- return timer->base->cpu_base->hres_active;
12313-}
12314-
12315 /*
12316 * The resolution of the clocks. The resolution value is returned in
12317 * the clock_getres() system call to give application programmers an
12318@@ -298,11 +333,6 @@ extern unsigned int hrtimer_resolution;
12319
12320 #define hrtimer_resolution (unsigned int)LOW_RES_NSEC
12321
12322-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
12323-{
12324- return 0;
12325-}
12326-
12327 static inline void clock_was_set_delayed(void) { }
12328
12329 #endif
12330@@ -344,10 +374,17 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
12331 /* Initialize timers: */
12332 extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
12333 enum hrtimer_mode mode);
12334+extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
12335+ enum hrtimer_mode mode,
12336+ struct task_struct *task);
12337
12338 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
12339 extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
12340 enum hrtimer_mode mode);
12341+extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
12342+ clockid_t clock_id,
12343+ enum hrtimer_mode mode,
12344+ struct task_struct *task);
12345
12346 extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
12347 #else
12348@@ -357,6 +394,15 @@ static inline void hrtimer_init_on_stack(struct hrtimer *timer,
12349 {
12350 hrtimer_init(timer, which_clock, mode);
12351 }
12352+
12353+static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
12354+ clockid_t clock_id,
12355+ enum hrtimer_mode mode,
12356+ struct task_struct *task)
12357+{
12358+ hrtimer_init_sleeper(sl, clock_id, mode, task);
12359+}
12360+
12361 static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
12362 #endif
12363
12364@@ -365,11 +411,12 @@ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
12365 u64 range_ns, const enum hrtimer_mode mode);
12366
12367 /**
12368- * hrtimer_start - (re)start an hrtimer on the current CPU
12369+ * hrtimer_start - (re)start an hrtimer
12370 * @timer: the timer to be added
12371 * @tim: expiry time
12372- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
12373- * relative (HRTIMER_MODE_REL)
12374+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
12375+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
12376+ * softirq based mode is considered for debug purpose only!
12377 */
12378 static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
12379 const enum hrtimer_mode mode)
12380@@ -396,6 +443,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
12381 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
12382 }
12383
12384+/* Softirq preemption could deadlock timer removal */
12385+#ifdef CONFIG_PREEMPT_RT_BASE
12386+ extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
12387+#else
12388+# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
12389+#endif
12390+
12391 /* Query timers: */
12392 extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
12393
12394@@ -420,9 +474,9 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
12395 * Helper function to check, whether the timer is running the callback
12396 * function
12397 */
12398-static inline int hrtimer_callback_running(struct hrtimer *timer)
12399+static inline int hrtimer_callback_running(const struct hrtimer *timer)
12400 {
12401- return timer->base->cpu_base->running == timer;
12402+ return timer->base->running == timer;
12403 }
12404
12405 /* Forward a hrtimer so it expires after now: */
12406@@ -458,15 +512,12 @@ extern long hrtimer_nanosleep(const struct timespec64 *rqtp,
12407 const enum hrtimer_mode mode,
12408 const clockid_t clockid);
12409
12410-extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
12411- struct task_struct *tsk);
12412-
12413 extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
12414 const enum hrtimer_mode mode);
12415 extern int schedule_hrtimeout_range_clock(ktime_t *expires,
12416 u64 delta,
12417 const enum hrtimer_mode mode,
12418- int clock);
12419+ clockid_t clock_id);
12420 extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
12421
12422 /* Soft interrupt function to run the hrtimer queues: */
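
The hrtimer_mode rework above turns the mode argument into independent bits (ABS/REL, PINNED, SOFT, HARD), so combined modes are OR-ed flags instead of hand-numbered constants. A small compilable check of the composition, mirroring the values from the hunk (only a subset of the combined modes is reproduced here):

#include <stdio.h>

/* Values as introduced by the hunk above (subset). */
enum hrtimer_mode {
        HRTIMER_MODE_ABS        = 0x00,
        HRTIMER_MODE_REL        = 0x01,
        HRTIMER_MODE_PINNED     = 0x02,
        HRTIMER_MODE_SOFT       = 0x04,
        HRTIMER_MODE_HARD       = 0x08,

        HRTIMER_MODE_REL_PINNED      = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
        HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,
};

int main(void)
{
        enum hrtimer_mode mode = HRTIMER_MODE_REL_PINNED_SOFT;

        /* Each property can now be tested independently with a mask. */
        printf("relative: %d\n", !!(mode & HRTIMER_MODE_REL));
        printf("pinned:   %d\n", !!(mode & HRTIMER_MODE_PINNED));
        printf("softirq:  %d\n", !!(mode & HRTIMER_MODE_SOFT));
        return 0;
}

A single mask test is how the SOFT bit documented in the hunk selects softirq-context expiry for a timer.
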
12423diff --git a/include/linux/idr.h b/include/linux/idr.h
12424index 7c3a365f7e12..a922d984d9b6 100644
12425--- a/include/linux/idr.h
12426+++ b/include/linux/idr.h
12427@@ -167,10 +167,7 @@ static inline bool idr_is_empty(const struct idr *idr)
12428 * Each idr_preload() should be matched with an invocation of this
12429 * function. See idr_preload() for details.
12430 */
12431-static inline void idr_preload_end(void)
12432-{
12433- preempt_enable();
12434-}
12435+void idr_preload_end(void);
12436
12437 /**
12438 * idr_find - return pointer for given id
12439diff --git a/include/linux/init_task.h b/include/linux/init_task.h
12440index 8062e6cc607c..ee3ff961b84c 100644
12441--- a/include/linux/init_task.h
12442+++ b/include/linux/init_task.h
12443@@ -163,6 +163,12 @@ extern struct cred init_cred;
12444 # define INIT_PERF_EVENTS(tsk)
12445 #endif
12446
12447+#if defined(CONFIG_POSIX_TIMERS) && defined(CONFIG_PREEMPT_RT_BASE)
12448+# define INIT_TIMER_LIST .posix_timer_list = NULL,
12449+#else
12450+# define INIT_TIMER_LIST
12451+#endif
12452+
12453 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
12454 # define INIT_VTIME(tsk) \
12455 .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount), \
12456@@ -234,7 +240,8 @@ extern struct cred init_cred;
12457 .static_prio = MAX_PRIO-20, \
12458 .normal_prio = MAX_PRIO-20, \
12459 .policy = SCHED_NORMAL, \
12460- .cpus_allowed = CPU_MASK_ALL, \
12461+ .cpus_ptr = &tsk.cpus_mask, \
12462+ .cpus_mask = CPU_MASK_ALL, \
12463 .nr_cpus_allowed= NR_CPUS, \
12464 .mm = NULL, \
12465 .active_mm = &init_mm, \
12466@@ -276,6 +283,7 @@ extern struct cred init_cred;
12467 INIT_CPU_TIMERS(tsk) \
12468 .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
12469 .timer_slack_ns = 50000, /* 50 usec default slack */ \
12470+ INIT_TIMER_LIST \
12471 .pids = { \
12472 [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
12473 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
12474diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
12475index 69c238210325..0f25fa19b2d8 100644
12476--- a/include/linux/interrupt.h
12477+++ b/include/linux/interrupt.h
12478@@ -15,6 +15,7 @@
12479 #include <linux/hrtimer.h>
12480 #include <linux/kref.h>
12481 #include <linux/workqueue.h>
12482+#include <linux/swork.h>
12483
12484 #include <linux/atomic.h>
12485 #include <asm/ptrace.h>
12486@@ -63,6 +64,7 @@
12487 * interrupt handler after suspending interrupts. For system
12488 * wakeup devices users need to implement wakeup detection in
12489 * their interrupt handlers.
12490+ * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
12491 */
12492 #define IRQF_SHARED 0x00000080
12493 #define IRQF_PROBE_SHARED 0x00000100
12494@@ -76,6 +78,7 @@
12495 #define IRQF_NO_THREAD 0x00010000
12496 #define IRQF_EARLY_RESUME 0x00020000
12497 #define IRQF_COND_SUSPEND 0x00040000
12498+#define IRQF_NO_SOFTIRQ_CALL 0x00080000
12499
12500 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
12501
12502@@ -207,7 +210,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
12503 #ifdef CONFIG_LOCKDEP
12504 # define local_irq_enable_in_hardirq() do { } while (0)
12505 #else
12506-# define local_irq_enable_in_hardirq() local_irq_enable()
12507+# define local_irq_enable_in_hardirq() local_irq_enable_nort()
12508 #endif
12509
12510 extern void disable_irq_nosync(unsigned int irq);
12511@@ -227,6 +230,7 @@ extern void resume_device_irqs(void);
12512 * struct irq_affinity_notify - context for notification of IRQ affinity changes
12513 * @irq: Interrupt to which notification applies
12514 * @kref: Reference count, for internal use
12515+ * @swork: Swork item, for internal use
12516 * @work: Work item, for internal use
12517 * @notify: Function to be called on change. This will be
12518 * called in process context.
12519@@ -238,7 +242,11 @@ extern void resume_device_irqs(void);
12520 struct irq_affinity_notify {
12521 unsigned int irq;
12522 struct kref kref;
12523+#ifdef CONFIG_PREEMPT_RT_BASE
12524+ struct swork_event swork;
12525+#else
12526 struct work_struct work;
12527+#endif
12528 void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
12529 void (*release)(struct kref *ref);
12530 };
12531@@ -429,9 +437,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
12532 bool state);
12533
12534 #ifdef CONFIG_IRQ_FORCED_THREADING
12535+# ifndef CONFIG_PREEMPT_RT_BASE
12536 extern bool force_irqthreads;
12537+# else
12538+# define force_irqthreads (true)
12539+# endif
12540 #else
12541-#define force_irqthreads (0)
12542+#define force_irqthreads (false)
12543 #endif
12544
12545 #ifndef __ARCH_SET_SOFTIRQ_PENDING
12546@@ -488,9 +500,10 @@ struct softirq_action
12547 void (*action)(struct softirq_action *);
12548 };
12549
12550+#ifndef CONFIG_PREEMPT_RT_FULL
12551 asmlinkage void do_softirq(void);
12552 asmlinkage void __do_softirq(void);
12553-
12554+static inline void thread_do_softirq(void) { do_softirq(); }
12555 #ifdef __ARCH_HAS_DO_SOFTIRQ
12556 void do_softirq_own_stack(void);
12557 #else
12558@@ -499,13 +512,25 @@ static inline void do_softirq_own_stack(void)
12559 __do_softirq();
12560 }
12561 #endif
12562+#else
12563+extern void thread_do_softirq(void);
12564+#endif
12565
12566 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
12567 extern void softirq_init(void);
12568 extern void __raise_softirq_irqoff(unsigned int nr);
12569+#ifdef CONFIG_PREEMPT_RT_FULL
12570+extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
12571+#else
12572+static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
12573+{
12574+ __raise_softirq_irqoff(nr);
12575+}
12576+#endif
12577
12578 extern void raise_softirq_irqoff(unsigned int nr);
12579 extern void raise_softirq(unsigned int nr);
12580+extern void softirq_check_pending_idle(void);
12581
12582 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
12583
12584@@ -527,8 +552,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
12585 to be executed on some cpu at least once after this.
12586 * If the tasklet is already scheduled, but its execution is still not
12587 started, it will be executed only once.
12588- * If this tasklet is already running on another CPU (or schedule is called
12589- from tasklet itself), it is rescheduled for later.
12590+ * If this tasklet is already running on another CPU, it is rescheduled
12591+ for later.
12592+ * Schedule must not be called from the tasklet itself (a lockup occurs)
12593 * Tasklet is strictly serialized wrt itself, but not
12594 wrt another tasklets. If client needs some intertask synchronization,
12595 he makes it with spinlocks.
12596@@ -553,27 +579,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
12597 enum
12598 {
12599 TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */
12600- TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
12601+ TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */
12602+ TASKLET_STATE_PENDING /* Tasklet is pending */
12603 };
12604
12605-#ifdef CONFIG_SMP
12606+#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED)
12607+#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN)
12608+#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
12609+
12610+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
12611 static inline int tasklet_trylock(struct tasklet_struct *t)
12612 {
12613 return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
12614 }
12615
12616+static inline int tasklet_tryunlock(struct tasklet_struct *t)
12617+{
12618+ return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
12619+}
12620+
12621 static inline void tasklet_unlock(struct tasklet_struct *t)
12622 {
12623 smp_mb__before_atomic();
12624 clear_bit(TASKLET_STATE_RUN, &(t)->state);
12625 }
12626
12627-static inline void tasklet_unlock_wait(struct tasklet_struct *t)
12628-{
12629- while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
12630-}
12631+extern void tasklet_unlock_wait(struct tasklet_struct *t);
12632+
12633 #else
12634 #define tasklet_trylock(t) 1
12635+#define tasklet_tryunlock(t) 1
12636 #define tasklet_unlock_wait(t) do { } while (0)
12637 #define tasklet_unlock(t) do { } while (0)
12638 #endif
12639@@ -607,41 +642,17 @@ static inline void tasklet_disable(struct tasklet_struct *t)
12640 smp_mb();
12641 }
12642
12643-static inline void tasklet_enable(struct tasklet_struct *t)
12644-{
12645- smp_mb__before_atomic();
12646- atomic_dec(&t->count);
12647-}
12648-
12649+extern void tasklet_enable(struct tasklet_struct *t);
12650 extern void tasklet_kill(struct tasklet_struct *t);
12651 extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
12652 extern void tasklet_init(struct tasklet_struct *t,
12653 void (*func)(unsigned long), unsigned long data);
12654
12655-struct tasklet_hrtimer {
12656- struct hrtimer timer;
12657- struct tasklet_struct tasklet;
12658- enum hrtimer_restart (*function)(struct hrtimer *);
12659-};
12660-
12661-extern void
12662-tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
12663- enum hrtimer_restart (*function)(struct hrtimer *),
12664- clockid_t which_clock, enum hrtimer_mode mode);
12665-
12666-static inline
12667-void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
12668- const enum hrtimer_mode mode)
12669-{
12670- hrtimer_start(&ttimer->timer, time, mode);
12671-}
12672-
12673-static inline
12674-void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
12675-{
12676- hrtimer_cancel(&ttimer->timer);
12677- tasklet_kill(&ttimer->tasklet);
12678-}
12679+#ifdef CONFIG_PREEMPT_RT_FULL
12680+extern void softirq_early_init(void);
12681+#else
12682+static inline void softirq_early_init(void) { }
12683+#endif
12684
12685 /*
12686 * Autoprobing for irqs:
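
Among the tasklet changes above, tasklet_tryunlock() clears the RUN bit only if no other state bit was set in the meantime, by doing a compare-and-swap against the whole state word. A user-space sketch of that check with C11 atomics; the state bit values mirror the hunk, everything else is invented:

#include <stdatomic.h>
#include <stdio.h>

#define TASKLET_STATEF_SCHED    (1u << 0)
#define TASKLET_STATEF_RUN      (1u << 1)

/* Minimal stand-in for struct tasklet_struct. */
struct fake_tasklet {
        atomic_uint state;
};

/* Succeeds only when RUN is the sole bit set: nobody re-scheduled us. */
static int tasklet_tryunlock(struct fake_tasklet *t)
{
        unsigned int expected = TASKLET_STATEF_RUN;

        return atomic_compare_exchange_strong(&t->state, &expected, 0);
}

int main(void)
{
        struct fake_tasklet t;

        atomic_init(&t.state, TASKLET_STATEF_RUN);
        printf("plain unlock: %d\n", tasklet_tryunlock(&t));    /* 1 */

        /* Someone set SCHED while we were running: tryunlock must fail. */
        atomic_store(&t.state, TASKLET_STATEF_RUN | TASKLET_STATEF_SCHED);
        printf("re-scheduled: %d\n", tasklet_tryunlock(&t));    /* 0 */
        return 0;
}

When the exchange fails, the caller knows the state changed under it (typically a re-schedule) and can react instead of blindly clearing RUN.
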
12687diff --git a/include/linux/irq.h b/include/linux/irq.h
12688index 0d53626405bf..ddd23c6e2e55 100644
12689--- a/include/linux/irq.h
12690+++ b/include/linux/irq.h
12691@@ -74,6 +74,7 @@ enum irqchip_irq_state;
12692 * IRQ_IS_POLLED - Always polled by another interrupt. Exclude
12693 * it from the spurious interrupt detection
12694 * mechanism and from core side polling.
12695+ * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT)
12696 * IRQ_DISABLE_UNLAZY - Disable lazy irq disable
12697 */
12698 enum {
12699@@ -101,13 +102,14 @@ enum {
12700 IRQ_PER_CPU_DEVID = (1 << 17),
12701 IRQ_IS_POLLED = (1 << 18),
12702 IRQ_DISABLE_UNLAZY = (1 << 19),
12703+ IRQ_NO_SOFTIRQ_CALL = (1 << 20),
12704 };
12705
12706 #define IRQF_MODIFY_MASK \
12707 (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
12708 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
12709 IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
12710- IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
12711+ IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
12712
12713 #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
12714
12715diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
12716index 9270d73ea682..1e66fac6f1d2 100644
12717--- a/include/linux/irq_work.h
12718+++ b/include/linux/irq_work.h
12719@@ -17,6 +17,7 @@
12720 #define IRQ_WORK_BUSY 2UL
12721 #define IRQ_WORK_FLAGS 3UL
12722 #define IRQ_WORK_LAZY 4UL /* Doesn't want IPI, wait for tick */
12723+#define IRQ_WORK_HARD_IRQ 8UL /* Run hard IRQ context, even on RT */
12724
12725 struct irq_work {
12726 unsigned long flags;
12727@@ -52,4 +53,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
12728 static inline void irq_work_run(void) { }
12729 #endif
12730
12731+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12732+void irq_work_tick_soft(void);
12733+#else
12734+static inline void irq_work_tick_soft(void) { }
12735+#endif
12736+
12737 #endif /* _LINUX_IRQ_WORK_H */
12738diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
12739index bacb499c512c..688f2565294c 100644
12740--- a/include/linux/irqchip/arm-gic-v3.h
12741+++ b/include/linux/irqchip/arm-gic-v3.h
12742@@ -568,6 +568,7 @@ struct rdists {
12743 void __iomem *rd_base;
12744 struct page *pend_page;
12745 phys_addr_t phys_base;
12746+ bool lpi_enabled;
12747 } __percpu *rdist;
12748 struct page *prop_page;
12749 int id_bits;
12750diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
12751index b6084898d330..d334476cdca6 100644
12752--- a/include/linux/irqdesc.h
12753+++ b/include/linux/irqdesc.h
12754@@ -70,6 +70,7 @@ struct irq_desc {
12755 unsigned int irqs_unhandled;
12756 atomic_t threads_handled;
12757 int threads_handled_last;
12758+ u64 random_ip;
12759 raw_spinlock_t lock;
12760 struct cpumask *percpu_enabled;
12761 const struct cpumask *percpu_affinity;
12762diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
12763index 46cb57d5eb13..2e023bfe45af 100644
12764--- a/include/linux/irqflags.h
12765+++ b/include/linux/irqflags.h
12766@@ -34,16 +34,6 @@ do { \
12767 current->hardirq_context--; \
12768 crossrelease_hist_end(XHLOCK_HARD); \
12769 } while (0)
12770-# define lockdep_softirq_enter() \
12771-do { \
12772- current->softirq_context++; \
12773- crossrelease_hist_start(XHLOCK_SOFT); \
12774-} while (0)
12775-# define lockdep_softirq_exit() \
12776-do { \
12777- current->softirq_context--; \
12778- crossrelease_hist_end(XHLOCK_SOFT); \
12779-} while (0)
12780 # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
12781 #else
12782 # define trace_hardirqs_on() do { } while (0)
12783@@ -56,9 +46,23 @@ do { \
12784 # define trace_softirqs_enabled(p) 0
12785 # define trace_hardirq_enter() do { } while (0)
12786 # define trace_hardirq_exit() do { } while (0)
12787+# define INIT_TRACE_IRQFLAGS
12788+#endif
12789+
12790+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
12791+# define lockdep_softirq_enter() \
12792+do { \
12793+ current->softirq_context++; \
12794+ crossrelease_hist_start(XHLOCK_SOFT); \
12795+} while (0)
12796+# define lockdep_softirq_exit() \
12797+do { \
12798+ current->softirq_context--; \
12799+ crossrelease_hist_end(XHLOCK_SOFT); \
12800+} while (0)
12801+#else
12802 # define lockdep_softirq_enter() do { } while (0)
12803 # define lockdep_softirq_exit() do { } while (0)
12804-# define INIT_TRACE_IRQFLAGS
12805 #endif
12806
12807 #if defined(CONFIG_IRQSOFF_TRACER) || \
12808@@ -165,4 +169,23 @@ do { \
12809
12810 #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
12811
12812+/*
12813+ * local_irq* variants depending on RT/!RT
12814+ */
12815+#ifdef CONFIG_PREEMPT_RT_FULL
12816+# define local_irq_disable_nort() do { } while (0)
12817+# define local_irq_enable_nort() do { } while (0)
12818+# define local_irq_save_nort(flags) local_save_flags(flags)
12819+# define local_irq_restore_nort(flags) (void)(flags)
12820+# define local_irq_disable_rt() local_irq_disable()
12821+# define local_irq_enable_rt() local_irq_enable()
12822+#else
12823+# define local_irq_disable_nort() local_irq_disable()
12824+# define local_irq_enable_nort() local_irq_enable()
12825+# define local_irq_save_nort(flags) local_irq_save(flags)
12826+# define local_irq_restore_nort(flags) local_irq_restore(flags)
12827+# define local_irq_disable_rt() do { } while (0)
12828+# define local_irq_enable_rt() do { } while (0)
12829+#endif
12830+
12831 #endif
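
The local_irq_*_nort()/_rt() macros above let a single code path disable interrupts on mainline while becoming a no-op on RT (or the reverse), where such sections are protected by sleeping locks instead. A tiny compilable sketch of the pattern; the PREEMPT_RT_FULL define and the irqs_enabled flag standing in for the real interrupt state are assumptions of the example:

#include <stdio.h>

#define PREEMPT_RT_FULL 1       /* flip to 0 to mimic a non-RT build */

int irqs_enabled = 1;           /* stand-in for the real interrupt state */

void local_irq_disable(void) { irqs_enabled = 0; }
void local_irq_enable(void)  { irqs_enabled = 1; }

#if PREEMPT_RT_FULL
# define local_irq_disable_nort()       do { } while (0)    /* no-op on RT */
# define local_irq_enable_nort()        do { } while (0)
#else
# define local_irq_disable_nort()       local_irq_disable()
# define local_irq_enable_nort()        local_irq_enable()
#endif

int main(void)
{
        local_irq_disable_nort();
        printf("irqs_enabled=%d (the RT build leaves them on)\n", irqs_enabled);
        local_irq_enable_nort();
        return 0;
}
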
12832diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
12833index 29290bfb94a8..32379bfab9f0 100644
12834--- a/include/linux/jbd2.h
12835+++ b/include/linux/jbd2.h
12836@@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
12837
12838 static inline void jbd_lock_bh_state(struct buffer_head *bh)
12839 {
12840+#ifndef CONFIG_PREEMPT_RT_BASE
12841 bit_spin_lock(BH_State, &bh->b_state);
12842+#else
12843+ spin_lock(&bh->b_state_lock);
12844+#endif
12845 }
12846
12847 static inline int jbd_trylock_bh_state(struct buffer_head *bh)
12848 {
12849+#ifndef CONFIG_PREEMPT_RT_BASE
12850 return bit_spin_trylock(BH_State, &bh->b_state);
12851+#else
12852+ return spin_trylock(&bh->b_state_lock);
12853+#endif
12854 }
12855
12856 static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
12857 {
12858+#ifndef CONFIG_PREEMPT_RT_BASE
12859 return bit_spin_is_locked(BH_State, &bh->b_state);
12860+#else
12861+ return spin_is_locked(&bh->b_state_lock);
12862+#endif
12863 }
12864
12865 static inline void jbd_unlock_bh_state(struct buffer_head *bh)
12866 {
12867+#ifndef CONFIG_PREEMPT_RT_BASE
12868 bit_spin_unlock(BH_State, &bh->b_state);
12869+#else
12870+ spin_unlock(&bh->b_state_lock);
12871+#endif
12872 }
12873
12874 static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
12875 {
12876+#ifndef CONFIG_PREEMPT_RT_BASE
12877 bit_spin_lock(BH_JournalHead, &bh->b_state);
12878+#else
12879+ spin_lock(&bh->b_journal_head_lock);
12880+#endif
12881 }
12882
12883 static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
12884 {
12885+#ifndef CONFIG_PREEMPT_RT_BASE
12886 bit_spin_unlock(BH_JournalHead, &bh->b_state);
12887+#else
12888+ spin_unlock(&bh->b_journal_head_lock);
12889+#endif
12890 }
12891
12892 #define J_ASSERT(assert) BUG_ON(!(assert))
12893diff --git a/include/linux/kdb.h b/include/linux/kdb.h
12894index 68bd88223417..e033b25b0b72 100644
12895--- a/include/linux/kdb.h
12896+++ b/include/linux/kdb.h
12897@@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
12898 extern __printf(1, 2) int kdb_printf(const char *, ...);
12899 typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
12900
12901+#define in_kdb_printk() (kdb_trap_printk)
12902 extern void kdb_init(int level);
12903
12904 /* Access to kdb specific polling devices */
12905@@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
12906 extern int kdb_unregister(char *);
12907 #else /* ! CONFIG_KGDB_KDB */
12908 static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
12909+#define in_kdb_printk() (0)
12910 static inline void kdb_init(int level) {}
12911 static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
12912 char *help, short minlen) { return 0; }
12913diff --git a/include/linux/kernel.h b/include/linux/kernel.h
12914index 4b484ab9e163..74feebf9d82c 100644
12915--- a/include/linux/kernel.h
12916+++ b/include/linux/kernel.h
12917@@ -225,6 +225,9 @@ extern int _cond_resched(void);
12918 */
12919 # define might_sleep() \
12920 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12921+
12922+# define might_sleep_no_state_check() \
12923+ do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12924 # define sched_annotate_sleep() (current->task_state_change = 0)
12925 #else
12926 static inline void ___might_sleep(const char *file, int line,
12927@@ -232,6 +235,7 @@ extern int _cond_resched(void);
12928 static inline void __might_sleep(const char *file, int line,
12929 int preempt_offset) { }
12930 # define might_sleep() do { might_resched(); } while (0)
12931+# define might_sleep_no_state_check() do { might_resched(); } while (0)
12932 # define sched_annotate_sleep() do { } while (0)
12933 #endif
12934
12935@@ -531,6 +535,7 @@ extern enum system_states {
12936 SYSTEM_HALT,
12937 SYSTEM_POWER_OFF,
12938 SYSTEM_RESTART,
12939+ SYSTEM_SUSPEND,
12940 } system_state;
12941
12942 #define TAINT_PROPRIETARY_MODULE 0
12943diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
12944index 3fc2cc57ba1b..0b5de7d9ffcf 100644
12945--- a/include/linux/list_bl.h
12946+++ b/include/linux/list_bl.h
12947@@ -3,6 +3,7 @@
12948 #define _LINUX_LIST_BL_H
12949
12950 #include <linux/list.h>
12951+#include <linux/spinlock.h>
12952 #include <linux/bit_spinlock.h>
12953
12954 /*
12955@@ -33,13 +34,24 @@
12956
12957 struct hlist_bl_head {
12958 struct hlist_bl_node *first;
12959+#ifdef CONFIG_PREEMPT_RT_BASE
12960+ raw_spinlock_t lock;
12961+#endif
12962 };
12963
12964 struct hlist_bl_node {
12965 struct hlist_bl_node *next, **pprev;
12966 };
12967-#define INIT_HLIST_BL_HEAD(ptr) \
12968- ((ptr)->first = NULL)
12969+
12970+#ifdef CONFIG_PREEMPT_RT_BASE
12971+#define INIT_HLIST_BL_HEAD(h) \
12972+do { \
12973+ (h)->first = NULL; \
12974+ raw_spin_lock_init(&(h)->lock); \
12975+} while (0)
12976+#else
12977+#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
12978+#endif
12979
12980 static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
12981 {
12982@@ -119,12 +131,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
12983
12984 static inline void hlist_bl_lock(struct hlist_bl_head *b)
12985 {
12986+#ifndef CONFIG_PREEMPT_RT_BASE
12987 bit_spin_lock(0, (unsigned long *)b);
12988+#else
12989+ raw_spin_lock(&b->lock);
12990+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12991+ __set_bit(0, (unsigned long *)b);
12992+#endif
12993+#endif
12994 }
12995
12996 static inline void hlist_bl_unlock(struct hlist_bl_head *b)
12997 {
12998+#ifndef CONFIG_PREEMPT_RT_BASE
12999 __bit_spin_unlock(0, (unsigned long *)b);
13000+#else
13001+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
13002+ __clear_bit(0, (unsigned long *)b);
13003+#endif
13004+ raw_spin_unlock(&b->lock);
13005+#endif
13006 }
13007
13008 static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
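
hlist_bl normally folds its lock into bit 0 of the head pointer; the hunk above adds a raw spinlock for RT and keeps bit 0 only as a "locked" marker. A user-space sketch of the mainline bit-in-the-pointer trick on a pointer-sized word, using C11 atomics (the structure and helper names are invented):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of the mainline hlist_bl trick: bit 0 of the head word doubles
 * as the lock, the remaining bits are the first-node pointer. */
struct bl_head {
        atomic_uintptr_t first;
};

static void bl_lock(struct bl_head *h)
{
        while (atomic_fetch_or_explicit(&h->first, 1u,
                                        memory_order_acquire) & 1u)
                ;       /* spin until bit 0 was clear before we set it */
}

static void bl_unlock(struct bl_head *h)
{
        atomic_fetch_and_explicit(&h->first, ~(uintptr_t)1,
                                  memory_order_release);
}

static void *bl_first(struct bl_head *h)
{
        /* Mask the lock bit off before using the value as a pointer. */
        return (void *)(atomic_load(&h->first) & ~(uintptr_t)1);
}

int main(void)
{
        static int node;
        struct bl_head head;

        atomic_init(&head.first, (uintptr_t)&node);
        bl_lock(&head);
        printf("first=%p locked\n", bl_first(&head));
        bl_unlock(&head);
        return 0;
}

Bit 0 is free to carry the lock because the node addresses stored in the head are always at least 2-byte aligned.
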
13009diff --git a/include/linux/locallock.h b/include/linux/locallock.h
13010new file mode 100644
13011index 000000000000..921eab83cd34
13012--- /dev/null
13013+++ b/include/linux/locallock.h
13014@@ -0,0 +1,281 @@
13015+#ifndef _LINUX_LOCALLOCK_H
13016+#define _LINUX_LOCALLOCK_H
13017+
13018+#include <linux/percpu.h>
13019+#include <linux/spinlock.h>
13020+
13021+#ifdef CONFIG_PREEMPT_RT_BASE
13022+
13023+#ifdef CONFIG_DEBUG_SPINLOCK
13024+# define LL_WARN(cond) WARN_ON(cond)
13025+#else
13026+# define LL_WARN(cond) do { } while (0)
13027+#endif
13028+
13029+/*
13030+ * per cpu lock based substitute for local_irq_*()
13031+ */
13032+struct local_irq_lock {
13033+ spinlock_t lock;
13034+ struct task_struct *owner;
13035+ int nestcnt;
13036+ unsigned long flags;
13037+};
13038+
13039+#define DEFINE_LOCAL_IRQ_LOCK(lvar) \
13040+ DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \
13041+ .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
13042+
13043+#define DECLARE_LOCAL_IRQ_LOCK(lvar) \
13044+ DECLARE_PER_CPU(struct local_irq_lock, lvar)
13045+
13046+#define local_irq_lock_init(lvar) \
13047+ do { \
13048+ int __cpu; \
13049+ for_each_possible_cpu(__cpu) \
13050+ spin_lock_init(&per_cpu(lvar, __cpu).lock); \
13051+ } while (0)
13052+
13053+static inline void __local_lock(struct local_irq_lock *lv)
13054+{
13055+ if (lv->owner != current) {
13056+ spin_lock(&lv->lock);
13057+ LL_WARN(lv->owner);
13058+ LL_WARN(lv->nestcnt);
13059+ lv->owner = current;
13060+ }
13061+ lv->nestcnt++;
13062+}
13063+
13064+#define local_lock(lvar) \
13065+ do { __local_lock(&get_local_var(lvar)); } while (0)
13066+
13067+#define local_lock_on(lvar, cpu) \
13068+ do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
13069+
13070+static inline int __local_trylock(struct local_irq_lock *lv)
13071+{
13072+ if (lv->owner != current && spin_trylock(&lv->lock)) {
13073+ LL_WARN(lv->owner);
13074+ LL_WARN(lv->nestcnt);
13075+ lv->owner = current;
13076+ lv->nestcnt = 1;
13077+ return 1;
13078+ } else if (lv->owner == current) {
13079+ lv->nestcnt++;
13080+ return 1;
13081+ }
13082+ return 0;
13083+}
13084+
13085+#define local_trylock(lvar) \
13086+ ({ \
13087+ int __locked; \
13088+ __locked = __local_trylock(&get_local_var(lvar)); \
13089+ if (!__locked) \
13090+ put_local_var(lvar); \
13091+ __locked; \
13092+ })
13093+
13094+static inline void __local_unlock(struct local_irq_lock *lv)
13095+{
13096+ LL_WARN(lv->nestcnt == 0);
13097+ LL_WARN(lv->owner != current);
13098+ if (--lv->nestcnt)
13099+ return;
13100+
13101+ lv->owner = NULL;
13102+ spin_unlock(&lv->lock);
13103+}
13104+
13105+#define local_unlock(lvar) \
13106+ do { \
13107+ __local_unlock(this_cpu_ptr(&lvar)); \
13108+ put_local_var(lvar); \
13109+ } while (0)
13110+
13111+#define local_unlock_on(lvar, cpu) \
13112+ do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
13113+
13114+static inline void __local_lock_irq(struct local_irq_lock *lv)
13115+{
13116+ spin_lock_irqsave(&lv->lock, lv->flags);
13117+ LL_WARN(lv->owner);
13118+ LL_WARN(lv->nestcnt);
13119+ lv->owner = current;
13120+ lv->nestcnt = 1;
13121+}
13122+
13123+#define local_lock_irq(lvar) \
13124+ do { __local_lock_irq(&get_local_var(lvar)); } while (0)
13125+
13126+#define local_lock_irq_on(lvar, cpu) \
13127+ do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
13128+
13129+static inline void __local_unlock_irq(struct local_irq_lock *lv)
13130+{
13131+ LL_WARN(!lv->nestcnt);
13132+ LL_WARN(lv->owner != current);
13133+ lv->owner = NULL;
13134+ lv->nestcnt = 0;
13135+ spin_unlock_irq(&lv->lock);
13136+}
13137+
13138+#define local_unlock_irq(lvar) \
13139+ do { \
13140+ __local_unlock_irq(this_cpu_ptr(&lvar)); \
13141+ put_local_var(lvar); \
13142+ } while (0)
13143+
13144+#define local_unlock_irq_on(lvar, cpu) \
13145+ do { \
13146+ __local_unlock_irq(&per_cpu(lvar, cpu)); \
13147+ } while (0)
13148+
13149+static inline int __local_lock_irqsave(struct local_irq_lock *lv)
13150+{
13151+ if (lv->owner != current) {
13152+ __local_lock_irq(lv);
13153+ return 0;
13154+ } else {
13155+ lv->nestcnt++;
13156+ return 1;
13157+ }
13158+}
13159+
13160+#define local_lock_irqsave(lvar, _flags) \
13161+ do { \
13162+ if (__local_lock_irqsave(&get_local_var(lvar))) \
13163+ put_local_var(lvar); \
13164+ _flags = __this_cpu_read(lvar.flags); \
13165+ } while (0)
13166+
13167+#define local_lock_irqsave_on(lvar, _flags, cpu) \
13168+ do { \
13169+ __local_lock_irqsave(&per_cpu(lvar, cpu)); \
13170+ _flags = per_cpu(lvar, cpu).flags; \
13171+ } while (0)
13172+
13173+static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
13174+ unsigned long flags)
13175+{
13176+ LL_WARN(!lv->nestcnt);
13177+ LL_WARN(lv->owner != current);
13178+ if (--lv->nestcnt)
13179+ return 0;
13180+
13181+ lv->owner = NULL;
13182+ spin_unlock_irqrestore(&lv->lock, lv->flags);
13183+ return 1;
13184+}
13185+
13186+#define local_unlock_irqrestore(lvar, flags) \
13187+ do { \
13188+ if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
13189+ put_local_var(lvar); \
13190+ } while (0)
13191+
13192+#define local_unlock_irqrestore_on(lvar, flags, cpu) \
13193+ do { \
13194+ __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \
13195+ } while (0)
13196+
13197+#define local_spin_trylock_irq(lvar, lock) \
13198+ ({ \
13199+ int __locked; \
13200+ local_lock_irq(lvar); \
13201+ __locked = spin_trylock(lock); \
13202+ if (!__locked) \
13203+ local_unlock_irq(lvar); \
13204+ __locked; \
13205+ })
13206+
13207+#define local_spin_lock_irq(lvar, lock) \
13208+ do { \
13209+ local_lock_irq(lvar); \
13210+ spin_lock(lock); \
13211+ } while (0)
13212+
13213+#define local_spin_unlock_irq(lvar, lock) \
13214+ do { \
13215+ spin_unlock(lock); \
13216+ local_unlock_irq(lvar); \
13217+ } while (0)
13218+
13219+#define local_spin_lock_irqsave(lvar, lock, flags) \
13220+ do { \
13221+ local_lock_irqsave(lvar, flags); \
13222+ spin_lock(lock); \
13223+ } while (0)
13224+
13225+#define local_spin_unlock_irqrestore(lvar, lock, flags) \
13226+ do { \
13227+ spin_unlock(lock); \
13228+ local_unlock_irqrestore(lvar, flags); \
13229+ } while (0)
13230+
13231+#define get_locked_var(lvar, var) \
13232+ (*({ \
13233+ local_lock(lvar); \
13234+ this_cpu_ptr(&var); \
13235+ }))
13236+
13237+#define put_locked_var(lvar, var) local_unlock(lvar);
13238+
13239+#define get_locked_ptr(lvar, var) \
13240+ ({ \
13241+ local_lock(lvar); \
13242+ this_cpu_ptr(var); \
13243+ })
13244+
13245+#define put_locked_ptr(lvar, var) local_unlock(lvar);
13246+
13247+#define local_lock_cpu(lvar) \
13248+ ({ \
13249+ local_lock(lvar); \
13250+ smp_processor_id(); \
13251+ })
13252+
13253+#define local_unlock_cpu(lvar) local_unlock(lvar)
13254+
13255+#else /* PREEMPT_RT_BASE */
13256+
13257+#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar
13258+#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar
13259+
13260+static inline void local_irq_lock_init(int lvar) { }
13261+
13262+#define local_trylock(lvar) \
13263+ ({ \
13264+ preempt_disable(); \
13265+ 1; \
13266+ })
13267+
13268+#define local_lock(lvar) preempt_disable()
13269+#define local_unlock(lvar) preempt_enable()
13270+#define local_lock_irq(lvar) local_irq_disable()
13271+#define local_lock_irq_on(lvar, cpu) local_irq_disable()
13272+#define local_unlock_irq(lvar) local_irq_enable()
13273+#define local_unlock_irq_on(lvar, cpu) local_irq_enable()
13274+#define local_lock_irqsave(lvar, flags) local_irq_save(flags)
13275+#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags)
13276+
13277+#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock)
13278+#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock)
13279+#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock)
13280+#define local_spin_lock_irqsave(lvar, lock, flags) \
13281+ spin_lock_irqsave(lock, flags)
13282+#define local_spin_unlock_irqrestore(lvar, lock, flags) \
13283+ spin_unlock_irqrestore(lock, flags)
13284+
13285+#define get_locked_var(lvar, var) get_cpu_var(var)
13286+#define put_locked_var(lvar, var) put_cpu_var(var)
13287+#define get_locked_ptr(lvar, var) get_cpu_ptr(var)
13288+#define put_locked_ptr(lvar, var) put_cpu_ptr(var)
13289+
13290+#define local_lock_cpu(lvar) get_cpu()
13291+#define local_unlock_cpu(lvar) put_cpu()
13292+
13293+#endif
13294+
13295+#endif
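
The new locallock.h above gives RT a sleeping, owner-tracked substitute for the local_irq_*()/preempt_disable() sections it replaces: each lock records its owner and a nesting count, so the same task can re-acquire it without deadlocking while other tasks block. A user-space sketch of the owner/nestcnt scheme on top of a pthread mutex (illustrative only; the kernel version also manages per-CPU placement and saved flags, which are not modeled here):

#include <pthread.h>
#include <stdio.h>

/* User-space analogue of struct local_irq_lock. */
struct local_irq_lock {
        pthread_mutex_t lock;
        pthread_t owner;
        int owned;      /* is the owner field valid?         */
        int nestcnt;    /* recursion depth of the owner      */
};

static void __local_lock(struct local_irq_lock *lv)
{
        if (!lv->owned || !pthread_equal(lv->owner, pthread_self())) {
                pthread_mutex_lock(&lv->lock);  /* may sleep, as on RT */
                lv->owner = pthread_self();
                lv->owned = 1;
        }
        lv->nestcnt++;
}

static void __local_unlock(struct local_irq_lock *lv)
{
        if (--lv->nestcnt)
                return;         /* still nested, keep holding the lock */
        lv->owned = 0;
        pthread_mutex_unlock(&lv->lock);
}

int main(void)
{
        struct local_irq_lock lv = { .lock = PTHREAD_MUTEX_INITIALIZER };

        __local_lock(&lv);
        __local_lock(&lv);      /* nested acquisition by the same task */
        __local_unlock(&lv);
        __local_unlock(&lv);    /* lock actually released here         */
        puts("nested local lock released");
        return 0;
}
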
13296diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
13297index e41ef532c4ce..63317710311e 100644
13298--- a/include/linux/mm_types.h
13299+++ b/include/linux/mm_types.h
13300@@ -12,6 +12,7 @@
13301 #include <linux/completion.h>
13302 #include <linux/cpumask.h>
13303 #include <linux/uprobes.h>
13304+#include <linux/rcupdate.h>
13305 #include <linux/page-flags-layout.h>
13306 #include <linux/workqueue.h>
13307
13308@@ -496,6 +497,9 @@ struct mm_struct {
13309 bool tlb_flush_batched;
13310 #endif
13311 struct uprobes_state uprobes_state;
13312+#ifdef CONFIG_PREEMPT_RT_BASE
13313+ struct rcu_head delayed_drop;
13314+#endif
13315 #ifdef CONFIG_HUGETLB_PAGE
13316 atomic_long_t hugetlb_usage;
13317 #endif
13318diff --git a/include/linux/mutex.h b/include/linux/mutex.h
13319index 153274f78402..dbb52857b25b 100644
13320--- a/include/linux/mutex.h
13321+++ b/include/linux/mutex.h
13322@@ -23,6 +23,17 @@
13323
13324 struct ww_acquire_ctx;
13325
13326+#ifdef CONFIG_DEBUG_LOCK_ALLOC
13327+# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13328+ , .dep_map = { .name = #lockname }
13329+#else
13330+# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13331+#endif
13332+
13333+#ifdef CONFIG_PREEMPT_RT_FULL
13334+# include <linux/mutex_rt.h>
13335+#else
13336+
13337 /*
13338 * Simple, straightforward mutexes with strict semantics:
13339 *
13340@@ -114,13 +125,6 @@ do { \
13341 __mutex_init((mutex), #mutex, &__key); \
13342 } while (0)
13343
13344-#ifdef CONFIG_DEBUG_LOCK_ALLOC
13345-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13346- , .dep_map = { .name = #lockname }
13347-#else
13348-# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13349-#endif
13350-
13351 #define __MUTEX_INITIALIZER(lockname) \
13352 { .owner = ATOMIC_LONG_INIT(0) \
13353 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
13354@@ -228,4 +232,6 @@ mutex_trylock_recursive(struct mutex *lock)
13355 return mutex_trylock(lock);
13356 }
13357
13358+#endif /* !PREEMPT_RT_FULL */
13359+
13360 #endif /* __LINUX_MUTEX_H */
13361diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
13362new file mode 100644
13363index 000000000000..3fcb5edb1d2b
13364--- /dev/null
13365+++ b/include/linux/mutex_rt.h
13366@@ -0,0 +1,130 @@
13367+#ifndef __LINUX_MUTEX_RT_H
13368+#define __LINUX_MUTEX_RT_H
13369+
13370+#ifndef __LINUX_MUTEX_H
13371+#error "Please include mutex.h"
13372+#endif
13373+
13374+#include <linux/rtmutex.h>
13375+
13376+/* FIXME: Just for __lockfunc */
13377+#include <linux/spinlock.h>
13378+
13379+struct mutex {
13380+ struct rt_mutex lock;
13381+#ifdef CONFIG_DEBUG_LOCK_ALLOC
13382+ struct lockdep_map dep_map;
13383+#endif
13384+};
13385+
13386+#define __MUTEX_INITIALIZER(mutexname) \
13387+ { \
13388+ .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \
13389+ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \
13390+ }
13391+
13392+#define DEFINE_MUTEX(mutexname) \
13393+ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
13394+
13395+extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
13396+extern void __lockfunc _mutex_lock(struct mutex *lock);
13397+extern void __lockfunc _mutex_lock_io(struct mutex *lock);
13398+extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass);
13399+extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
13400+extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
13401+extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
13402+extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
13403+extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
13404+extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
13405+extern int __lockfunc _mutex_trylock(struct mutex *lock);
13406+extern void __lockfunc _mutex_unlock(struct mutex *lock);
13407+
13408+#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock)
13409+#define mutex_lock(l) _mutex_lock(l)
13410+#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l)
13411+#define mutex_lock_killable(l) _mutex_lock_killable(l)
13412+#define mutex_trylock(l) _mutex_trylock(l)
13413+#define mutex_unlock(l) _mutex_unlock(l)
13414+#define mutex_lock_io(l)		_mutex_lock_io(l)
13415+
13416+#define __mutex_owner(l) ((l)->lock.owner)
13417+
13418+#ifdef CONFIG_DEBUG_MUTEXES
13419+#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock)
13420+#else
13421+static inline void mutex_destroy(struct mutex *lock) {}
13422+#endif
13423+
13424+#ifdef CONFIG_DEBUG_LOCK_ALLOC
13425+# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s)
13426+# define mutex_lock_interruptible_nested(l, s) \
13427+ _mutex_lock_interruptible_nested(l, s)
13428+# define mutex_lock_killable_nested(l, s) \
13429+ _mutex_lock_killable_nested(l, s)
13430+# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s)
13431+
13432+# define mutex_lock_nest_lock(lock, nest_lock) \
13433+do { \
13434+ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
13435+ _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \
13436+} while (0)
13437+
13438+#else
13439+# define mutex_lock_nested(l, s) _mutex_lock(l)
13440+# define mutex_lock_interruptible_nested(l, s) \
13441+ _mutex_lock_interruptible(l)
13442+# define mutex_lock_killable_nested(l, s) \
13443+ _mutex_lock_killable(l)
13444+# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
13445+# define mutex_lock_io_nested(l, s) _mutex_lock_io(l)
13446+#endif
13447+
13448+# define mutex_init(mutex) \
13449+do { \
13450+ static struct lock_class_key __key; \
13451+ \
13452+ rt_mutex_init(&(mutex)->lock); \
13453+ __mutex_do_init((mutex), #mutex, &__key); \
13454+} while (0)
13455+
13456+# define __mutex_init(mutex, name, key) \
13457+do { \
13458+ rt_mutex_init(&(mutex)->lock); \
13459+ __mutex_do_init((mutex), name, key); \
13460+} while (0)
13461+
13462+/*
13463+ * These values are chosen such that FAIL and SUCCESS match the
13464+ * values of the regular mutex_trylock().
13465+ */
13466+enum mutex_trylock_recursive_enum {
13467+ MUTEX_TRYLOCK_FAILED = 0,
13468+ MUTEX_TRYLOCK_SUCCESS = 1,
13469+ MUTEX_TRYLOCK_RECURSIVE,
13470+};
13471+/**
13472+ * mutex_trylock_recursive - trylock variant that allows recursive locking
13473+ * @lock: mutex to be locked
13474+ *
13475+ * This function should not be used, _ever_. It is purely for hysterical GEM
13476+ * raisins, and once those are gone this will be removed.
13477+ *
13478+ * Returns:
13479+ * MUTEX_TRYLOCK_FAILED - trylock failed,
13480+ * MUTEX_TRYLOCK_SUCCESS - lock acquired,
13481+ * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock.
13482+ */
13483+int __rt_mutex_owner_current(struct rt_mutex *lock);
13484+
13485+static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum
13486+mutex_trylock_recursive(struct mutex *lock)
13487+{
13488+ if (unlikely(__rt_mutex_owner_current(&lock->lock)))
13489+ return MUTEX_TRYLOCK_RECURSIVE;
13490+
13491+ return mutex_trylock(lock);
13492+}
13493+
13494+extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
13495+
13496+#endif
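
The RT mutex wrapper above keeps the names and semantics of the regular mutex API, so existing call sites compile unchanged whether or not PREEMPT_RT_FULL is set. A short sketch of such a call site, with the hypothetical names foo_lock and foo_count:

    #include <linux/mutex.h>

    static DEFINE_MUTEX(foo_lock);
    static unsigned int foo_count;

    static unsigned int foo_inc(void)
    {
            unsigned int v;

            mutex_lock(&foo_lock);          /* rt_mutex based, PI aware on RT */
            v = ++foo_count;
            mutex_unlock(&foo_lock);
            return v;
    }
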
13497diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
13498index a516dbe5869f..3ceccf72757e 100644
13499--- a/include/linux/netdevice.h
13500+++ b/include/linux/netdevice.h
13501@@ -409,7 +409,19 @@ typedef enum rx_handler_result rx_handler_result_t;
13502 typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
13503
13504 void __napi_schedule(struct napi_struct *n);
13505+
13506+/*
13507+ * When PREEMPT_RT_FULL is defined, all device interrupt handlers
13508+ * run as threads and can themselves be preempted (without PREEMPT_RT,
13509+ * interrupt threads cannot be preempted). This means that a call to
13510+ * __napi_schedule_irqoff() from an interrupt handler can be preempted
13511+ * mid-update and corrupt the napi->poll_list.
13512+ */
13513+#ifdef CONFIG_PREEMPT_RT_FULL
13514+#define __napi_schedule_irqoff(n) __napi_schedule(n)
13515+#else
13516 void __napi_schedule_irqoff(struct napi_struct *n);
13517+#endif
13518
13519 static inline bool napi_disable_pending(struct napi_struct *n)
13520 {
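
Because the irqoff variant is simply redirected to __napi_schedule() on RT, drivers keep calling napi_schedule_irqoff() from their (now threaded) interrupt handlers. A hypothetical driver sketch, where struct foo_priv and foo_isr() are placeholders:

    #include <linux/interrupt.h>
    #include <linux/netdevice.h>

    struct foo_priv {
            struct napi_struct napi;
            /* ... device specific state ... */
    };

    static irqreturn_t foo_isr(int irq, void *dev_id)
    {
            struct foo_priv *priv = dev_id;

            /* On RT this ends up in __napi_schedule(), which handles the
             * interrupt state itself; on !RT it stays the irqoff fast path. */
            napi_schedule_irqoff(&priv->napi);
            return IRQ_HANDLED;
    }
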
13521@@ -571,7 +583,11 @@ struct netdev_queue {
13522 * write-mostly part
13523 */
13524 spinlock_t _xmit_lock ____cacheline_aligned_in_smp;
13525+#ifdef CONFIG_PREEMPT_RT_FULL
13526+ struct task_struct *xmit_lock_owner;
13527+#else
13528 int xmit_lock_owner;
13529+#endif
13530 /*
13531 * Time (in jiffies) of last Tx
13532 */
13533@@ -2440,14 +2456,53 @@ void netdev_freemem(struct net_device *dev);
13534 void synchronize_net(void);
13535 int init_dummy_netdev(struct net_device *dev);
13536
13537-DECLARE_PER_CPU(int, xmit_recursion);
13538 #define XMIT_RECURSION_LIMIT 10
13539+#ifdef CONFIG_PREEMPT_RT_FULL
13540+static inline int dev_recursion_level(void)
13541+{
13542+ return current->xmit_recursion;
13543+}
13544+
13545+static inline int xmit_rec_read(void)
13546+{
13547+ return current->xmit_recursion;
13548+}
13549+
13550+static inline void xmit_rec_inc(void)
13551+{
13552+ current->xmit_recursion++;
13553+}
13554+
13555+static inline void xmit_rec_dec(void)
13556+{
13557+ current->xmit_recursion--;
13558+}
13559+
13560+#else
13561+
13562+DECLARE_PER_CPU(int, xmit_recursion);
13563
13564 static inline int dev_recursion_level(void)
13565 {
13566 return this_cpu_read(xmit_recursion);
13567 }
13568
13569+static inline int xmit_rec_read(void)
13570+{
13571+ return __this_cpu_read(xmit_recursion);
13572+}
13573+
13574+static inline void xmit_rec_inc(void)
13575+{
13576+ __this_cpu_inc(xmit_recursion);
13577+}
13578+
13579+static inline void xmit_rec_dec(void)
13580+{
13581+ __this_cpu_dec(xmit_recursion);
13582+}
13583+#endif
13584+
13585 struct net_device *dev_get_by_index(struct net *net, int ifindex);
13586 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
13587 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
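
Only the storage of the recursion counter changes (per task on RT, per CPU otherwise); the guard pattern built on xmit_rec_read()/xmit_rec_inc()/xmit_rec_dec() stays the same in both cases. A sketch of that pattern, with the hypothetical helper foo_xmit():

    #include <linux/netdevice.h>
    #include <linux/skbuff.h>

    static int foo_xmit(struct sk_buff *skb)
    {
            int ret;

            if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
                    kfree_skb(skb);
                    return NET_XMIT_DROP;
            }

            xmit_rec_inc();
            ret = dev_queue_xmit(skb);      /* may recurse into this path */
            xmit_rec_dec();
            return ret;
    }
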
13588@@ -2799,6 +2854,7 @@ struct softnet_data {
13589 unsigned int dropped;
13590 struct sk_buff_head input_pkt_queue;
13591 struct napi_struct backlog;
13592+ struct sk_buff_head tofree_queue;
13593
13594 };
13595
13596@@ -3522,10 +3578,48 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
13597 return (1 << debug_value) - 1;
13598 }
13599
13600+#ifdef CONFIG_PREEMPT_RT_FULL
13601+static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13602+{
13603+ txq->xmit_lock_owner = current;
13604+}
13605+
13606+static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13607+{
13608+ txq->xmit_lock_owner = NULL;
13609+}
13610+
13611+static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13612+{
13613+ if (txq->xmit_lock_owner != NULL)
13614+ return true;
13615+ return false;
13616+}
13617+
13618+#else
13619+
13620+static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13621+{
13622+ txq->xmit_lock_owner = cpu;
13623+}
13624+
13625+static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13626+{
13627+ txq->xmit_lock_owner = -1;
13628+}
13629+
13630+static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13631+{
13632+ if (txq->xmit_lock_owner != -1)
13633+ return true;
13634+ return false;
13635+}
13636+#endif
13637+
13638 static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
13639 {
13640 spin_lock(&txq->_xmit_lock);
13641- txq->xmit_lock_owner = cpu;
13642+ netdev_queue_set_owner(txq, cpu);
13643 }
13644
13645 static inline bool __netif_tx_acquire(struct netdev_queue *txq)
13646@@ -3542,32 +3636,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq)
13647 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
13648 {
13649 spin_lock_bh(&txq->_xmit_lock);
13650- txq->xmit_lock_owner = smp_processor_id();
13651+ netdev_queue_set_owner(txq, smp_processor_id());
13652 }
13653
13654 static inline bool __netif_tx_trylock(struct netdev_queue *txq)
13655 {
13656 bool ok = spin_trylock(&txq->_xmit_lock);
13657 if (likely(ok))
13658- txq->xmit_lock_owner = smp_processor_id();
13659+ netdev_queue_set_owner(txq, smp_processor_id());
13660 return ok;
13661 }
13662
13663 static inline void __netif_tx_unlock(struct netdev_queue *txq)
13664 {
13665- txq->xmit_lock_owner = -1;
13666+ netdev_queue_clear_owner(txq);
13667 spin_unlock(&txq->_xmit_lock);
13668 }
13669
13670 static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
13671 {
13672- txq->xmit_lock_owner = -1;
13673+ netdev_queue_clear_owner(txq);
13674 spin_unlock_bh(&txq->_xmit_lock);
13675 }
13676
13677 static inline void txq_trans_update(struct netdev_queue *txq)
13678 {
13679- if (txq->xmit_lock_owner != -1)
13680+ if (netdev_queue_has_owner(txq))
13681 txq->trans_start = jiffies;
13682 }
13683
13684diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
13685index 54f346a45cd0..79723e76af66 100644
13686--- a/include/linux/netfilter/x_tables.h
13687+++ b/include/linux/netfilter/x_tables.h
13688@@ -6,6 +6,7 @@
13689 #include <linux/netdevice.h>
13690 #include <linux/static_key.h>
13691 #include <linux/netfilter.h>
13692+#include <linux/locallock.h>
13693 #include <uapi/linux/netfilter/x_tables.h>
13694
13695 /* Test a struct->invflags and a boolean for inequality */
13696@@ -341,6 +342,8 @@ void xt_free_table_info(struct xt_table_info *info);
13697 */
13698 DECLARE_PER_CPU(seqcount_t, xt_recseq);
13699
13700+DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
13701+
13702 /* xt_tee_enabled - true if x_tables needs to handle reentrancy
13703 *
13704 * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
13705@@ -361,6 +364,9 @@ static inline unsigned int xt_write_recseq_begin(void)
13706 {
13707 unsigned int addend;
13708
13709+ /* RT protection */
13710+ local_lock(xt_write_lock);
13711+
13712 /*
13713 * Low order bit of sequence is set if we already
13714 * called xt_write_recseq_begin().
13715@@ -391,6 +397,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
13716 /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
13717 smp_wmb();
13718 __this_cpu_add(xt_recseq.sequence, addend);
13719+ local_unlock(xt_write_lock);
13720 }
13721
13722 /*
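
With the local lock taken inside xt_write_recseq_begin() and released in xt_write_recseq_end(), the writer-side pattern used by the iptables table walkers does not change; the lock only matters on RT, where the section is preemptible. A sketch of that pattern (foo_update_counters() is hypothetical):

    #include <linux/netfilter/x_tables.h>

    static void foo_update_counters(void)
    {
            unsigned int addend;

            local_bh_disable();
            addend = xt_write_recseq_begin();   /* takes xt_write_lock on RT */
            /* ... walk the ruleset and update per-CPU counters ... */
            xt_write_recseq_end(addend);        /* releases xt_write_lock */
            local_bh_enable();
    }
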
13723diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
13724index f0015f801a78..c38288622819 100644
13725--- a/include/linux/nfs_fs.h
13726+++ b/include/linux/nfs_fs.h
13727@@ -162,7 +162,11 @@ struct nfs_inode {
13728
13729 /* Readers: in-flight sillydelete RPC calls */
13730 /* Writers: rmdir */
13731+#ifdef CONFIG_PREEMPT_RT_BASE
13732+ struct semaphore rmdir_sem;
13733+#else
13734 struct rw_semaphore rmdir_sem;
13735+#endif
13736 struct mutex commit_mutex;
13737
13738 #if IS_ENABLED(CONFIG_NFS_V4)
13739diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
13740index 6959968dc36a..802e849b57ac 100644
13741--- a/include/linux/nfs_xdr.h
13742+++ b/include/linux/nfs_xdr.h
13743@@ -1530,7 +1530,7 @@ struct nfs_unlinkdata {
13744 struct nfs_removeargs args;
13745 struct nfs_removeres res;
13746 struct dentry *dentry;
13747- wait_queue_head_t wq;
13748+ struct swait_queue_head wq;
13749 struct rpc_cred *cred;
13750 struct nfs_fattr dir_attr;
13751 long timeout;
13752diff --git a/include/linux/notifier.h b/include/linux/notifier.h
13753index 6d731110e0db..e758627da14d 100644
13754--- a/include/linux/notifier.h
13755+++ b/include/linux/notifier.h
13756@@ -7,7 +7,7 @@
13757 *
13758 * Alan Cox <Alan.Cox@linux.org>
13759 */
13760-
13761+
13762 #ifndef _LINUX_NOTIFIER_H
13763 #define _LINUX_NOTIFIER_H
13764 #include <linux/errno.h>
13765@@ -43,9 +43,7 @@
13766 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
13767 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
13768 * SRCU notifier chains should be used when the chain will be called very
13769- * often but notifier_blocks will seldom be removed. Also, SRCU notifier
13770- * chains are slightly more difficult to use because they require special
13771- * runtime initialization.
13772+ * often but notifier_blocks will seldom be removed.
13773 */
13774
13775 struct notifier_block;
13776@@ -91,7 +89,7 @@ struct srcu_notifier_head {
13777 (name)->head = NULL; \
13778 } while (0)
13779
13780-/* srcu_notifier_heads must be initialized and cleaned up dynamically */
13781+/* srcu_notifier_heads must be cleaned up dynamically */
13782 extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13783 #define srcu_cleanup_notifier_head(name) \
13784 cleanup_srcu_struct(&(name)->srcu);
13785@@ -104,7 +102,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13786 .head = NULL }
13787 #define RAW_NOTIFIER_INIT(name) { \
13788 .head = NULL }
13789-/* srcu_notifier_heads cannot be initialized statically */
13790+
13791+#define SRCU_NOTIFIER_INIT(name, pcpu) \
13792+ { \
13793+ .mutex = __MUTEX_INITIALIZER(name.mutex), \
13794+ .head = NULL, \
13795+ .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \
13796+ }
13797
13798 #define ATOMIC_NOTIFIER_HEAD(name) \
13799 struct atomic_notifier_head name = \
13800@@ -116,6 +120,26 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13801 struct raw_notifier_head name = \
13802 RAW_NOTIFIER_INIT(name)
13803
13804+#ifdef CONFIG_TREE_SRCU
13805+#define _SRCU_NOTIFIER_HEAD(name, mod) \
13806+ static DEFINE_PER_CPU(struct srcu_data, \
13807+ name##_head_srcu_data); \
13808+ mod struct srcu_notifier_head name = \
13809+ SRCU_NOTIFIER_INIT(name, name##_head_srcu_data)
13810+
13811+#else
13812+#define _SRCU_NOTIFIER_HEAD(name, mod) \
13813+ mod struct srcu_notifier_head name = \
13814+ SRCU_NOTIFIER_INIT(name, name)
13815+
13816+#endif
13817+
13818+#define SRCU_NOTIFIER_HEAD(name) \
13819+ _SRCU_NOTIFIER_HEAD(name, )
13820+
13821+#define SRCU_NOTIFIER_HEAD_STATIC(name) \
13822+ _SRCU_NOTIFIER_HEAD(name, static)
13823+
13824 #ifdef __KERNEL__
13825
13826 extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
13827@@ -185,12 +209,12 @@ static inline int notifier_to_errno(int ret)
13828
13829 /*
13830 * Declared notifiers so far. I can imagine quite a few more chains
13831- * over time (eg laptop power reset chains, reboot chain (to clean
13832+ * over time (eg laptop power reset chains, reboot chain (to clean
13833 * device units up), device [un]mount chain, module load/unload chain,
13834- * low memory chain, screenblank chain (for plug in modular screenblankers)
13835+ * low memory chain, screenblank chain (for plug in modular screenblankers)
13836 * VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
13837 */
13838-
13839+
13840 /* CPU notfiers are defined in include/linux/cpu.h. */
13841
13842 /* netdevice notifiers are defined in include/linux/netdevice.h */
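
With SRCU_NOTIFIER_INIT() and the per-CPU srcu_data added above, an SRCU notifier chain can now be defined statically instead of requiring srcu_init_notifier_head() at runtime. A sketch using the existing srcu notifier helpers; the chain name foo_chain and its wrappers are hypothetical:

    #include <linux/notifier.h>

    static SRCU_NOTIFIER_HEAD_STATIC(foo_chain);

    int foo_register_notifier(struct notifier_block *nb)
    {
            return srcu_notifier_chain_register(&foo_chain, nb);
    }

    void foo_notify(unsigned long event, void *data)
    {
            srcu_notifier_call_chain(&foo_chain, event, data);
    }
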
13843diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
13844index 79b99d653e03..fb44e237316d 100644
13845--- a/include/linux/percpu-rwsem.h
13846+++ b/include/linux/percpu-rwsem.h
13847@@ -29,7 +29,7 @@ static struct percpu_rw_semaphore name = { \
13848 extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
13849 extern void __percpu_up_read(struct percpu_rw_semaphore *);
13850
13851-static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
13852+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13853 {
13854 might_sleep();
13855
13856@@ -47,16 +47,10 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *
13857 __this_cpu_inc(*sem->read_count);
13858 if (unlikely(!rcu_sync_is_idle(&sem->rss)))
13859 __percpu_down_read(sem, false); /* Unconditional memory barrier */
13860- barrier();
13861 /*
13862- * The barrier() prevents the compiler from
13863+ * The preempt_enable() prevents the compiler from
13864 * bleeding the critical section out.
13865 */
13866-}
13867-
13868-static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13869-{
13870- percpu_down_read_preempt_disable(sem);
13871 preempt_enable();
13872 }
13873
13874@@ -83,13 +77,9 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
13875 return ret;
13876 }
13877
13878-static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
13879+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
13880 {
13881- /*
13882- * The barrier() prevents the compiler from
13883- * bleeding the critical section out.
13884- */
13885- barrier();
13886+ preempt_disable();
13887 /*
13888 * Same as in percpu_down_read().
13889 */
13890@@ -102,12 +92,6 @@ static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem
13891 rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
13892 }
13893
13894-static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
13895-{
13896- preempt_disable();
13897- percpu_up_read_preempt_enable(sem);
13898-}
13899-
13900 extern void percpu_down_write(struct percpu_rw_semaphore *);
13901 extern void percpu_up_write(struct percpu_rw_semaphore *);
13902
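
After this change the read side of a per-CPU rwsem no longer needs the *_preempt_disable()/*_preempt_enable() variants; percpu_down_read() and percpu_up_read() manage preemption themselves. A reader sketch with a hypothetical semaphore foo_rwsem:

    #include <linux/percpu-rwsem.h>

    static DEFINE_STATIC_PERCPU_RWSEM(foo_rwsem);

    static void foo_read_side(void)
    {
            percpu_down_read(&foo_rwsem);   /* re-enables preemption internally */
            /* ... read state protected against percpu_down_write() ... */
            percpu_up_read(&foo_rwsem);
    }
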
13903diff --git a/include/linux/percpu.h b/include/linux/percpu.h
13904index 296bbe49d5d1..4414796e3941 100644
13905--- a/include/linux/percpu.h
13906+++ b/include/linux/percpu.h
13907@@ -19,6 +19,35 @@
13908 #define PERCPU_MODULE_RESERVE 0
13909 #endif
13910
13911+#ifdef CONFIG_PREEMPT_RT_FULL
13912+
13913+#define get_local_var(var) (*({ \
13914+ migrate_disable(); \
13915+ this_cpu_ptr(&var); }))
13916+
13917+#define put_local_var(var) do { \
13918+ (void)&(var); \
13919+ migrate_enable(); \
13920+} while (0)
13921+
13922+# define get_local_ptr(var) ({ \
13923+ migrate_disable(); \
13924+ this_cpu_ptr(var); })
13925+
13926+# define put_local_ptr(var) do { \
13927+ (void)(var); \
13928+ migrate_enable(); \
13929+} while (0)
13930+
13931+#else
13932+
13933+#define get_local_var(var) get_cpu_var(var)
13934+#define put_local_var(var) put_cpu_var(var)
13935+#define get_local_ptr(var) get_cpu_ptr(var)
13936+#define put_local_ptr(var) put_cpu_ptr(var)
13937+
13938+#endif
13939+
13940 /* minimum unit size, also is the maximum supported allocation size */
13941 #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
13942
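
get_local_var() disables only migration on RT, so the per-CPU reference stays valid while the task remains preemptible (and may even sleep); on !RT it falls back to get_cpu_var(). A small sketch with a hypothetical per-CPU counter:

    #include <linux/percpu.h>

    static DEFINE_PER_CPU(unsigned long, foo_hits);

    static void foo_hit(void)
    {
            unsigned long *p;

            p = &get_local_var(foo_hits);   /* migrate_disable() on RT */
            (*p)++;                         /* see the note below */
            put_local_var(foo_hits);
    }

Note that on RT this only pins the task to its CPU; if several tasks can enter foo_hit() on the same CPU, the increment still needs same-CPU serialization, typically a local lock from locallock.h.
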
13943diff --git a/include/linux/pid.h b/include/linux/pid.h
13944index dfd684ce0787..bc954a99aa70 100644
13945--- a/include/linux/pid.h
13946+++ b/include/linux/pid.h
13947@@ -3,6 +3,7 @@
13948 #define _LINUX_PID_H
13949
13950 #include <linux/rculist.h>
13951+#include <linux/atomic.h>
13952
13953 enum pid_type
13954 {
13955diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
13956index 437a539898ae..de5c49b0dccf 100644
13957--- a/include/linux/posix-timers.h
13958+++ b/include/linux/posix-timers.h
13959@@ -101,8 +101,8 @@ struct k_itimer {
13960 struct {
13961 struct alarm alarmtimer;
13962 } alarm;
13963- struct rcu_head rcu;
13964 } it;
13965+ struct rcu_head rcu;
13966 };
13967
13968 void run_posix_cpu_timers(struct task_struct *task);
13969diff --git a/include/linux/preempt.h b/include/linux/preempt.h
13970index 5bd3f151da78..6728662a81e8 100644
13971--- a/include/linux/preempt.h
13972+++ b/include/linux/preempt.h
13973@@ -51,7 +51,11 @@
13974 #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
13975 #define NMI_OFFSET (1UL << NMI_SHIFT)
13976
13977-#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13978+#ifndef CONFIG_PREEMPT_RT_FULL
13979+# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13980+#else
13981+# define SOFTIRQ_DISABLE_OFFSET (0)
13982+#endif
13983
13984 /* We use the MSB mostly because its available */
13985 #define PREEMPT_NEED_RESCHED 0x80000000
13986@@ -81,9 +85,15 @@
13987 #include <asm/preempt.h>
13988
13989 #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
13990-#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
13991 #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
13992 | NMI_MASK))
13993+#ifndef CONFIG_PREEMPT_RT_FULL
13994+# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
13995+# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
13996+#else
13997+# define softirq_count() (0UL)
13998+extern int in_serving_softirq(void);
13999+#endif
14000
14001 /*
14002 * Are we doing bottom half or hardware interrupt processing?
14003@@ -101,7 +111,6 @@
14004 #define in_irq() (hardirq_count())
14005 #define in_softirq() (softirq_count())
14006 #define in_interrupt() (irq_count())
14007-#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
14008 #define in_nmi() (preempt_count() & NMI_MASK)
14009 #define in_task() (!(preempt_count() & \
14010 (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
14011@@ -118,7 +127,11 @@
14012 /*
14013 * The preempt_count offset after spin_lock()
14014 */
14015+#if !defined(CONFIG_PREEMPT_RT_FULL)
14016 #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET
14017+#else
14018+#define PREEMPT_LOCK_OFFSET 0
14019+#endif
14020
14021 /*
14022 * The preempt_count offset needed for things like:
14023@@ -167,6 +180,20 @@ extern void preempt_count_sub(int val);
14024 #define preempt_count_inc() preempt_count_add(1)
14025 #define preempt_count_dec() preempt_count_sub(1)
14026
14027+#ifdef CONFIG_PREEMPT_LAZY
14028+#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
14029+#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
14030+#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
14031+#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
14032+#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
14033+#else
14034+#define add_preempt_lazy_count(val) do { } while (0)
14035+#define sub_preempt_lazy_count(val) do { } while (0)
14036+#define inc_preempt_lazy_count() do { } while (0)
14037+#define dec_preempt_lazy_count() do { } while (0)
14038+#define preempt_lazy_count() (0)
14039+#endif
14040+
14041 #ifdef CONFIG_PREEMPT_COUNT
14042
14043 #define preempt_disable() \
14044@@ -175,16 +202,53 @@ do { \
14045 barrier(); \
14046 } while (0)
14047
14048+#define preempt_lazy_disable() \
14049+do { \
14050+ inc_preempt_lazy_count(); \
14051+ barrier(); \
14052+} while (0)
14053+
14054 #define sched_preempt_enable_no_resched() \
14055 do { \
14056 barrier(); \
14057 preempt_count_dec(); \
14058 } while (0)
14059
14060-#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
14061+#ifdef CONFIG_PREEMPT_RT_BASE
14062+# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
14063+# define preempt_check_resched_rt() preempt_check_resched()
14064+#else
14065+# define preempt_enable_no_resched() preempt_enable()
14066+# define preempt_check_resched_rt()	barrier()
14067+#endif
14068
14069 #define preemptible() (preempt_count() == 0 && !irqs_disabled())
14070
14071+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14072+
14073+extern void migrate_disable(void);
14074+extern void migrate_enable(void);
14075+
14076+int __migrate_disabled(struct task_struct *p);
14077+
14078+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14079+
14080+extern void migrate_disable(void);
14081+extern void migrate_enable(void);
14082+static inline int __migrate_disabled(struct task_struct *p)
14083+{
14084+ return 0;
14085+}
14086+
14087+#else
14088+#define migrate_disable() preempt_disable()
14089+#define migrate_enable() preempt_enable()
14090+static inline int __migrate_disabled(struct task_struct *p)
14091+{
14092+ return 0;
14093+}
14094+#endif
14095+
14096 #ifdef CONFIG_PREEMPT
14097 #define preempt_enable() \
14098 do { \
14099@@ -206,6 +270,13 @@ do { \
14100 __preempt_schedule(); \
14101 } while (0)
14102
14103+#define preempt_lazy_enable() \
14104+do { \
14105+ dec_preempt_lazy_count(); \
14106+ barrier(); \
14107+ preempt_check_resched(); \
14108+} while (0)
14109+
14110 #else /* !CONFIG_PREEMPT */
14111 #define preempt_enable() \
14112 do { \
14113@@ -213,6 +284,12 @@ do { \
14114 preempt_count_dec(); \
14115 } while (0)
14116
14117+#define preempt_lazy_enable() \
14118+do { \
14119+ dec_preempt_lazy_count(); \
14120+ barrier(); \
14121+} while (0)
14122+
14123 #define preempt_enable_notrace() \
14124 do { \
14125 barrier(); \
14126@@ -251,8 +328,16 @@ do { \
14127 #define preempt_disable_notrace() barrier()
14128 #define preempt_enable_no_resched_notrace() barrier()
14129 #define preempt_enable_notrace() barrier()
14130+#define preempt_check_resched_rt() barrier()
14131 #define preemptible() 0
14132
14133+#define migrate_disable() barrier()
14134+#define migrate_enable() barrier()
14135+
14136+static inline int __migrate_disabled(struct task_struct *p)
14137+{
14138+ return 0;
14139+}
14140 #endif /* CONFIG_PREEMPT_COUNT */
14141
14142 #ifdef MODULE
14143@@ -271,10 +356,22 @@ do { \
14144 } while (0)
14145 #define preempt_fold_need_resched() \
14146 do { \
14147- if (tif_need_resched()) \
14148+ if (tif_need_resched_now()) \
14149 set_preempt_need_resched(); \
14150 } while (0)
14151
14152+#ifdef CONFIG_PREEMPT_RT_FULL
14153+# define preempt_disable_rt() preempt_disable()
14154+# define preempt_enable_rt() preempt_enable()
14155+# define preempt_disable_nort() barrier()
14156+# define preempt_enable_nort() barrier()
14157+#else
14158+# define preempt_disable_rt() barrier()
14159+# define preempt_enable_rt() barrier()
14160+# define preempt_disable_nort() preempt_disable()
14161+# define preempt_enable_nort() preempt_enable()
14162+#endif
14163+
14164 #ifdef CONFIG_PREEMPT_NOTIFIERS
14165
14166 struct preempt_notifier;
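
The new migrate_disable()/migrate_enable() pair pins the current task to its CPU while leaving it preemptible on RT; on !RT (or !SMP without RT_BASE) it simply maps to preempt_disable()/preempt_enable(). A small sketch where CPU stability, not atomicity, is what matters (foo_report_cpu() is hypothetical):

    #include <linux/preempt.h>
    #include <linux/printk.h>
    #include <linux/smp.h>

    static void foo_report_cpu(void)
    {
            migrate_disable();      /* pinned to this CPU, still preemptible on RT */
            pr_info("foo: running on CPU %d\n", smp_processor_id());
            migrate_enable();
    }
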
14167diff --git a/include/linux/printk.h b/include/linux/printk.h
14168index 6106befed756..1dba9cb7b91b 100644
14169--- a/include/linux/printk.h
14170+++ b/include/linux/printk.h
14171@@ -142,9 +142,11 @@ struct va_format {
14172 #ifdef CONFIG_EARLY_PRINTK
14173 extern asmlinkage __printf(1, 2)
14174 void early_printk(const char *fmt, ...);
14175+extern void printk_kill(void);
14176 #else
14177 static inline __printf(1, 2) __cold
14178 void early_printk(const char *s, ...) { }
14179+static inline void printk_kill(void) { }
14180 #endif
14181
14182 #ifdef CONFIG_PRINTK_NMI
14183diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
14184index 567ebb5eaab0..9da7ea957399 100644
14185--- a/include/linux/radix-tree.h
14186+++ b/include/linux/radix-tree.h
14187@@ -328,6 +328,8 @@ unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *,
14188 int radix_tree_preload(gfp_t gfp_mask);
14189 int radix_tree_maybe_preload(gfp_t gfp_mask);
14190 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
14191+void radix_tree_preload_end(void);
14192+
14193 void radix_tree_init(void);
14194 void *radix_tree_tag_set(struct radix_tree_root *,
14195 unsigned long index, unsigned int tag);
14196@@ -347,11 +349,6 @@ unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
14197 unsigned int max_items, unsigned int tag);
14198 int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);
14199
14200-static inline void radix_tree_preload_end(void)
14201-{
14202- preempt_enable();
14203-}
14204-
14205 int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
14206 int radix_tree_split(struct radix_tree_root *, unsigned long index,
14207 unsigned new_order);
14208diff --git a/include/linux/random.h b/include/linux/random.h
14209index 4024f7d9c77d..462d752a739b 100644
14210--- a/include/linux/random.h
14211+++ b/include/linux/random.h
14212@@ -32,7 +32,7 @@ static inline void add_latent_entropy(void) {}
14213
14214 extern void add_input_randomness(unsigned int type, unsigned int code,
14215 unsigned int value) __latent_entropy;
14216-extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
14217+extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
14218
14219 extern void get_random_bytes(void *buf, int nbytes);
14220 extern int wait_for_random_bytes(void);
14221diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
14222index d574361943ea..0a9f442409b9 100644
14223--- a/include/linux/rbtree.h
14224+++ b/include/linux/rbtree.h
14225@@ -31,7 +31,7 @@
14226
14227 #include <linux/kernel.h>
14228 #include <linux/stddef.h>
14229-#include <linux/rcupdate.h>
14230+#include <linux/rcu_assign_pointer.h>
14231
14232 struct rb_node {
14233 unsigned long __rb_parent_color;
14234diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
14235index 6bfd2b581f75..af8a61be2d8d 100644
14236--- a/include/linux/rbtree_augmented.h
14237+++ b/include/linux/rbtree_augmented.h
14238@@ -26,6 +26,7 @@
14239
14240 #include <linux/compiler.h>
14241 #include <linux/rbtree.h>
14242+#include <linux/rcupdate.h>
14243
14244 /*
14245 * Please note - only struct rb_augment_callbacks and the prototypes for
14246diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h
14247index ece43e882b56..7d012faa509a 100644
14248--- a/include/linux/rbtree_latch.h
14249+++ b/include/linux/rbtree_latch.h
14250@@ -35,6 +35,7 @@
14251
14252 #include <linux/rbtree.h>
14253 #include <linux/seqlock.h>
14254+#include <linux/rcupdate.h>
14255
14256 struct latch_tree_node {
14257 struct rb_node node[2];
14258diff --git a/include/linux/rcu_assign_pointer.h b/include/linux/rcu_assign_pointer.h
14259new file mode 100644
14260index 000000000000..7066962a4379
14261--- /dev/null
14262+++ b/include/linux/rcu_assign_pointer.h
14263@@ -0,0 +1,54 @@
14264+#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
14265+#define __LINUX_RCU_ASSIGN_POINTER_H__
14266+#include <linux/compiler.h>
14267+#include <asm/barrier.h>
14268+
14269+/**
14270+ * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
14271+ * @v: The value to statically initialize with.
14272+ */
14273+#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
14274+
14275+/**
14276+ * rcu_assign_pointer() - assign to RCU-protected pointer
14277+ * @p: pointer to assign to
14278+ * @v: value to assign (publish)
14279+ *
14280+ * Assigns the specified value to the specified RCU-protected
14281+ * pointer, ensuring that any concurrent RCU readers will see
14282+ * any prior initialization.
14283+ *
14284+ * Inserts memory barriers on architectures that require them
14285+ * (which is most of them), and also prevents the compiler from
14286+ * reordering the code that initializes the structure after the pointer
14287+ * assignment. More importantly, this call documents which pointers
14288+ * will be dereferenced by RCU read-side code.
14289+ *
14290+ * In some special cases, you may use RCU_INIT_POINTER() instead
14291+ * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
14292+ * to the fact that it does not constrain either the CPU or the compiler.
14293+ * That said, using RCU_INIT_POINTER() when you should have used
14294+ * rcu_assign_pointer() is a very bad thing that results in
14295+ * impossible-to-diagnose memory corruption. So please be careful.
14296+ * See the RCU_INIT_POINTER() comment header for details.
14297+ *
14298+ * Note that rcu_assign_pointer() evaluates each of its arguments only
14299+ * once, appearances notwithstanding. One of the "extra" evaluations
14300+ * is in typeof() and the other visible only to sparse (__CHECKER__),
14301+ * neither of which actually execute the argument. As with most cpp
14302+ * macros, this execute-arguments-only-once property is important, so
14303+ * please be careful when making changes to rcu_assign_pointer() and the
14304+ * other macros that it invokes.
14305+ */
14306+#define rcu_assign_pointer(p, v) \
14307+({ \
14308+ uintptr_t _r_a_p__v = (uintptr_t)(v); \
14309+ \
14310+ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
14311+ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
14312+ else \
14313+ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
14314+ _r_a_p__v; \
14315+})
14316+
14317+#endif
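
The macro itself is unchanged; it only moves into its own header so that low-level headers such as rbtree.h can use it without pulling in all of rcupdate.h. The usual publish/read pattern therefore still looks like this (foo_cfg and global_foo are hypothetical):

    #include <linux/rcupdate.h>

    struct foo_cfg {
            int threshold;
    };

    static struct foo_cfg __rcu *global_foo;

    static void foo_publish(struct foo_cfg *newcfg)
    {
            rcu_assign_pointer(global_foo, newcfg); /* pairs with rcu_dereference() */
    }

    static int foo_read_threshold(void)
    {
            struct foo_cfg *cfg;
            int val = 0;

            rcu_read_lock();
            cfg = rcu_dereference(global_foo);
            if (cfg)
                    val = cfg->threshold;
            rcu_read_unlock();
            return val;
    }
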
14318diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
14319index a6ddc42f87a5..70996e134818 100644
14320--- a/include/linux/rcupdate.h
14321+++ b/include/linux/rcupdate.h
14322@@ -42,6 +42,7 @@
14323 #include <linux/lockdep.h>
14324 #include <asm/processor.h>
14325 #include <linux/cpumask.h>
14326+#include <linux/rcu_assign_pointer.h>
14327
14328 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
14329 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
14330@@ -55,7 +56,11 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
14331 #define call_rcu call_rcu_sched
14332 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14333
14334+#ifdef CONFIG_PREEMPT_RT_FULL
14335+#define call_rcu_bh call_rcu
14336+#else
14337 void call_rcu_bh(struct rcu_head *head, rcu_callback_t func);
14338+#endif
14339 void call_rcu_sched(struct rcu_head *head, rcu_callback_t func);
14340 void synchronize_sched(void);
14341 void rcu_barrier_tasks(void);
14342@@ -74,6 +79,11 @@ void synchronize_rcu(void);
14343 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
14344 */
14345 #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
14346+#ifndef CONFIG_PREEMPT_RT_FULL
14347+#define sched_rcu_preempt_depth() rcu_preempt_depth()
14348+#else
14349+static inline int sched_rcu_preempt_depth(void) { return 0; }
14350+#endif
14351
14352 #else /* #ifdef CONFIG_PREEMPT_RCU */
14353
14354@@ -99,6 +109,8 @@ static inline int rcu_preempt_depth(void)
14355 return 0;
14356 }
14357
14358+#define sched_rcu_preempt_depth() rcu_preempt_depth()
14359+
14360 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14361
14362 /* Internal to kernel */
14363@@ -255,7 +267,14 @@ extern struct lockdep_map rcu_sched_lock_map;
14364 extern struct lockdep_map rcu_callback_map;
14365 int debug_lockdep_rcu_enabled(void);
14366 int rcu_read_lock_held(void);
14367+#ifdef CONFIG_PREEMPT_RT_FULL
14368+static inline int rcu_read_lock_bh_held(void)
14369+{
14370+ return rcu_read_lock_held();
14371+}
14372+#else
14373 int rcu_read_lock_bh_held(void);
14374+#endif
14375 int rcu_read_lock_sched_held(void);
14376
14377 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
14378@@ -364,54 +383,6 @@ static inline void rcu_preempt_sleep_check(void) { }
14379 ((typeof(*p) __force __kernel *)(________p1)); \
14380 })
14381
14382-/**
14383- * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
14384- * @v: The value to statically initialize with.
14385- */
14386-#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
14387-
14388-/**
14389- * rcu_assign_pointer() - assign to RCU-protected pointer
14390- * @p: pointer to assign to
14391- * @v: value to assign (publish)
14392- *
14393- * Assigns the specified value to the specified RCU-protected
14394- * pointer, ensuring that any concurrent RCU readers will see
14395- * any prior initialization.
14396- *
14397- * Inserts memory barriers on architectures that require them
14398- * (which is most of them), and also prevents the compiler from
14399- * reordering the code that initializes the structure after the pointer
14400- * assignment. More importantly, this call documents which pointers
14401- * will be dereferenced by RCU read-side code.
14402- *
14403- * In some special cases, you may use RCU_INIT_POINTER() instead
14404- * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
14405- * to the fact that it does not constrain either the CPU or the compiler.
14406- * That said, using RCU_INIT_POINTER() when you should have used
14407- * rcu_assign_pointer() is a very bad thing that results in
14408- * impossible-to-diagnose memory corruption. So please be careful.
14409- * See the RCU_INIT_POINTER() comment header for details.
14410- *
14411- * Note that rcu_assign_pointer() evaluates each of its arguments only
14412- * once, appearances notwithstanding. One of the "extra" evaluations
14413- * is in typeof() and the other visible only to sparse (__CHECKER__),
14414- * neither of which actually execute the argument. As with most cpp
14415- * macros, this execute-arguments-only-once property is important, so
14416- * please be careful when making changes to rcu_assign_pointer() and the
14417- * other macros that it invokes.
14418- */
14419-#define rcu_assign_pointer(p, v) \
14420-({ \
14421- uintptr_t _r_a_p__v = (uintptr_t)(v); \
14422- \
14423- if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
14424- WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
14425- else \
14426- smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
14427- _r_a_p__v; \
14428-})
14429-
14430 /**
14431 * rcu_swap_protected() - swap an RCU and a regular pointer
14432 * @rcu_ptr: RCU pointer
14433@@ -707,10 +678,14 @@ static inline void rcu_read_unlock(void)
14434 static inline void rcu_read_lock_bh(void)
14435 {
14436 local_bh_disable();
14437+#ifdef CONFIG_PREEMPT_RT_FULL
14438+ rcu_read_lock();
14439+#else
14440 __acquire(RCU_BH);
14441 rcu_lock_acquire(&rcu_bh_lock_map);
14442 RCU_LOCKDEP_WARN(!rcu_is_watching(),
14443 "rcu_read_lock_bh() used illegally while idle");
14444+#endif
14445 }
14446
14447 /*
14448@@ -720,10 +695,14 @@ static inline void rcu_read_lock_bh(void)
14449 */
14450 static inline void rcu_read_unlock_bh(void)
14451 {
14452+#ifdef CONFIG_PREEMPT_RT_FULL
14453+ rcu_read_unlock();
14454+#else
14455 RCU_LOCKDEP_WARN(!rcu_is_watching(),
14456 "rcu_read_unlock_bh() used illegally while idle");
14457 rcu_lock_release(&rcu_bh_lock_map);
14458 __release(RCU_BH);
14459+#endif
14460 local_bh_enable();
14461 }
14462
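
Callers of rcu_read_lock_bh()/rcu_read_unlock_bh() do not change; on RT a regular rcu_read_lock() is taken underneath because a BH-disabled section alone is preemptible and no longer implies an RCU read-side critical section. A reader sketch over a hypothetical RCU-protected hlist:

    #include <linux/rculist.h>
    #include <linux/rcupdate.h>

    struct foo_entry {
            struct hlist_node node;
            int val;
    };

    static int foo_count(struct hlist_head *head)
    {
            struct foo_entry *e;
            int n = 0;

            rcu_read_lock_bh();     /* also takes rcu_read_lock() on RT */
            hlist_for_each_entry_rcu(e, head, node)
                    n++;
            rcu_read_unlock_bh();
            return n;
    }
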
14463diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
14464index 37d6fd3b7ff8..a082fde7d6bc 100644
14465--- a/include/linux/rcutree.h
14466+++ b/include/linux/rcutree.h
14467@@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
14468 rcu_note_context_switch(false);
14469 }
14470
14471+#ifdef CONFIG_PREEMPT_RT_FULL
14472+# define synchronize_rcu_bh synchronize_rcu
14473+#else
14474 void synchronize_rcu_bh(void);
14475+#endif
14476 void synchronize_sched_expedited(void);
14477 void synchronize_rcu_expedited(void);
14478
14479@@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
14480 }
14481
14482 void rcu_barrier(void);
14483+#ifdef CONFIG_PREEMPT_RT_FULL
14484+# define rcu_barrier_bh rcu_barrier
14485+#else
14486 void rcu_barrier_bh(void);
14487+#endif
14488 void rcu_barrier_sched(void);
14489 unsigned long get_state_synchronize_rcu(void);
14490 void cond_synchronize_rcu(unsigned long oldstate);
14491diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
14492index 5caa062a02b2..abce5f5325e1 100644
14493--- a/include/linux/ring_buffer.h
14494+++ b/include/linux/ring_buffer.h
14495@@ -34,10 +34,12 @@ struct ring_buffer_event {
14496 * array[0] = time delta (28 .. 59)
14497 * size = 8 bytes
14498 *
14499- * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock
14500- * array[0] = tv_nsec
14501- * array[1..2] = tv_sec
14502- * size = 16 bytes
14503+ * @RINGBUF_TYPE_TIME_STAMP: Absolute timestamp
14504+ * Same format as TIME_EXTEND except that the
14505+ * value is an absolute timestamp, not a delta
14506+ * event.time_delta contains bottom 27 bits
14507+ * array[0] = top (28 .. 59) bits
14508+ * size = 8 bytes
14509 *
14510 * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
14511 * Data record
14512@@ -54,12 +56,12 @@ enum ring_buffer_type {
14513 RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
14514 RINGBUF_TYPE_PADDING,
14515 RINGBUF_TYPE_TIME_EXTEND,
14516- /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
14517 RINGBUF_TYPE_TIME_STAMP,
14518 };
14519
14520 unsigned ring_buffer_event_length(struct ring_buffer_event *event);
14521 void *ring_buffer_event_data(struct ring_buffer_event *event);
14522+u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event);
14523
14524 /*
14525 * ring_buffer_discard_commit will remove an event that has not
14526@@ -115,6 +117,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
14527 int ring_buffer_write(struct ring_buffer *buffer,
14528 unsigned long length, void *data);
14529
14530+void ring_buffer_nest_start(struct ring_buffer *buffer);
14531+void ring_buffer_nest_end(struct ring_buffer *buffer);
14532+
14533 struct ring_buffer_event *
14534 ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
14535 unsigned long *lost_events);
14536@@ -179,6 +184,8 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
14537 int cpu, u64 *ts);
14538 void ring_buffer_set_clock(struct ring_buffer *buffer,
14539 u64 (*clock)(void));
14540+void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs);
14541+bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer);
14542
14543 size_t ring_buffer_page_len(void *page);
14544
14545diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
14546index 6fd615a0eea9..138bd1e183e0 100644
14547--- a/include/linux/rtmutex.h
14548+++ b/include/linux/rtmutex.h
14549@@ -14,11 +14,15 @@
14550 #define __LINUX_RT_MUTEX_H
14551
14552 #include <linux/linkage.h>
14553+#include <linux/spinlock_types_raw.h>
14554 #include <linux/rbtree.h>
14555-#include <linux/spinlock_types.h>
14556
14557 extern int max_lock_depth; /* for sysctl */
14558
14559+#ifdef CONFIG_DEBUG_MUTEXES
14560+#include <linux/debug_locks.h>
14561+#endif
14562+
14563 /**
14564 * The rt_mutex structure
14565 *
14566@@ -31,8 +35,8 @@ struct rt_mutex {
14567 raw_spinlock_t wait_lock;
14568 struct rb_root_cached waiters;
14569 struct task_struct *owner;
14570-#ifdef CONFIG_DEBUG_RT_MUTEXES
14571 int save_state;
14572+#ifdef CONFIG_DEBUG_RT_MUTEXES
14573 const char *name, *file;
14574 int line;
14575 void *magic;
14576@@ -82,16 +86,23 @@ do { \
14577 #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14578 #endif
14579
14580-#define __RT_MUTEX_INITIALIZER(mutexname) \
14581- { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14582+#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14583+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14584 , .waiters = RB_ROOT_CACHED \
14585 , .owner = NULL \
14586 __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
14587- __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
14588+ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14589+
14590+#define __RT_MUTEX_INITIALIZER(mutexname) \
14591+ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
14592
14593 #define DEFINE_RT_MUTEX(mutexname) \
14594 struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
14595
14596+#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
14597+ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14598+ , .save_state = 1 }
14599+
14600 /**
14601 * rt_mutex_is_locked - is the mutex locked
14602 * @lock: the mutex to be queried
14603@@ -115,6 +126,7 @@ extern void rt_mutex_lock(struct rt_mutex *lock);
14604 #endif
14605
14606 extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
14607+extern int rt_mutex_lock_killable(struct rt_mutex *lock);
14608 extern int rt_mutex_timed_lock(struct rt_mutex *lock,
14609 struct hrtimer_sleeper *timeout);
14610
14611diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
14612new file mode 100644
14613index 000000000000..a9c4c2ac4d1f
14614--- /dev/null
14615+++ b/include/linux/rwlock_rt.h
14616@@ -0,0 +1,119 @@
14617+#ifndef __LINUX_RWLOCK_RT_H
14618+#define __LINUX_RWLOCK_RT_H
14619+
14620+#ifndef __LINUX_SPINLOCK_H
14621+#error Do not include directly. Use spinlock.h
14622+#endif
14623+
14624+extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
14625+extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
14626+extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
14627+extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
14628+extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
14629+extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
14630+extern int __lockfunc rt_read_can_lock(rwlock_t *rwlock);
14631+extern int __lockfunc rt_write_can_lock(rwlock_t *rwlock);
14632+extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
14633+
14634+#define read_can_lock(rwlock) rt_read_can_lock(rwlock)
14635+#define write_can_lock(rwlock) rt_write_can_lock(rwlock)
14636+
14637+#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock))
14638+#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock))
14639+
14640+static inline int __write_trylock_rt_irqsave(rwlock_t *lock, unsigned long *flags)
14641+{
14642+ /* XXX ARCH_IRQ_ENABLED */
14643+ *flags = 0;
14644+ return rt_write_trylock(lock);
14645+}
14646+
14647+#define write_trylock_irqsave(lock, flags) \
14648+ __cond_lock(lock, __write_trylock_rt_irqsave(lock, &(flags)))
14649+
14650+#define read_lock_irqsave(lock, flags) \
14651+ do { \
14652+ typecheck(unsigned long, flags); \
14653+ rt_read_lock(lock); \
14654+ flags = 0; \
14655+ } while (0)
14656+
14657+#define write_lock_irqsave(lock, flags) \
14658+ do { \
14659+ typecheck(unsigned long, flags); \
14660+ rt_write_lock(lock); \
14661+ flags = 0; \
14662+ } while (0)
14663+
14664+#define read_lock(lock) rt_read_lock(lock)
14665+
14666+#define read_lock_bh(lock) \
14667+ do { \
14668+ local_bh_disable(); \
14669+ rt_read_lock(lock); \
14670+ } while (0)
14671+
14672+#define read_lock_irq(lock) read_lock(lock)
14673+
14674+#define write_lock(lock) rt_write_lock(lock)
14675+
14676+#define write_lock_bh(lock) \
14677+ do { \
14678+ local_bh_disable(); \
14679+ rt_write_lock(lock); \
14680+ } while (0)
14681+
14682+#define write_lock_irq(lock) write_lock(lock)
14683+
14684+#define read_unlock(lock) rt_read_unlock(lock)
14685+
14686+#define read_unlock_bh(lock) \
14687+ do { \
14688+ rt_read_unlock(lock); \
14689+ local_bh_enable(); \
14690+ } while (0)
14691+
14692+#define read_unlock_irq(lock) read_unlock(lock)
14693+
14694+#define write_unlock(lock) rt_write_unlock(lock)
14695+
14696+#define write_unlock_bh(lock) \
14697+ do { \
14698+ rt_write_unlock(lock); \
14699+ local_bh_enable(); \
14700+ } while (0)
14701+
14702+#define write_unlock_irq(lock) write_unlock(lock)
14703+
14704+#define read_unlock_irqrestore(lock, flags) \
14705+ do { \
14706+ typecheck(unsigned long, flags); \
14707+ (void) flags; \
14708+ rt_read_unlock(lock); \
14709+ } while (0)
14710+
14711+#define write_unlock_irqrestore(lock, flags) \
14712+ do { \
14713+ typecheck(unsigned long, flags); \
14714+ (void) flags; \
14715+ rt_write_unlock(lock); \
14716+ } while (0)
14717+
14718+#define rwlock_init(rwl) \
14719+do { \
14720+ static struct lock_class_key __key; \
14721+ \
14722+ __rt_rwlock_init(rwl, #rwl, &__key); \
14723+} while (0)
14724+
14725+/*
14726+ * Internal functions made global for CPU pinning
14727+ */
14728+void __read_rt_lock(struct rt_rw_lock *lock);
14729+int __read_rt_trylock(struct rt_rw_lock *lock);
14730+void __write_rt_lock(struct rt_rw_lock *lock);
14731+int __write_rt_trylock(struct rt_rw_lock *lock);
14732+void __read_rt_unlock(struct rt_rw_lock *lock);
14733+void __write_rt_unlock(struct rt_rw_lock *lock);
14734+
14735+#endif
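
On RT an rwlock_t becomes a sleeping, reader-biased rt_rw_lock, but the locking API keeps its names; as the macros above show, the irqsave variants only carry a dummy flags value. A sketch with a hypothetical table, where the call sites are identical on RT and !RT:

    #include <linux/spinlock.h>

    static DEFINE_RWLOCK(foo_tbl_lock);
    static int foo_tbl[16];

    static int foo_lookup(unsigned int idx)
    {
            int val;

            read_lock(&foo_tbl_lock);       /* sleeping, reader biased on RT */
            val = foo_tbl[idx & 15];
            read_unlock(&foo_tbl_lock);
            return val;
    }

    static void foo_store(unsigned int idx, int val)
    {
            unsigned long flags;

            write_lock_irqsave(&foo_tbl_lock, flags);   /* flags stays 0 on RT */
            foo_tbl[idx & 15] = val;
            write_unlock_irqrestore(&foo_tbl_lock, flags);
    }
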
14736diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
14737index cc0072e93e36..5317cd957292 100644
14738--- a/include/linux/rwlock_types.h
14739+++ b/include/linux/rwlock_types.h
14740@@ -1,6 +1,10 @@
14741 #ifndef __LINUX_RWLOCK_TYPES_H
14742 #define __LINUX_RWLOCK_TYPES_H
14743
14744+#if !defined(__LINUX_SPINLOCK_TYPES_H)
14745+# error "Do not include directly, include spinlock_types.h"
14746+#endif
14747+
14748 /*
14749 * include/linux/rwlock_types.h - generic rwlock type definitions
14750 * and initializers
14751diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
14752new file mode 100644
14753index 000000000000..546a1f8f1274
14754--- /dev/null
14755+++ b/include/linux/rwlock_types_rt.h
14756@@ -0,0 +1,55 @@
14757+#ifndef __LINUX_RWLOCK_TYPES_RT_H
14758+#define __LINUX_RWLOCK_TYPES_RT_H
14759+
14760+#ifndef __LINUX_SPINLOCK_TYPES_H
14761+#error "Do not include directly. Include spinlock_types.h instead"
14762+#endif
14763+
14764+#ifdef CONFIG_DEBUG_LOCK_ALLOC
14765+# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
14766+#else
14767+# define RW_DEP_MAP_INIT(lockname)
14768+#endif
14769+
14770+typedef struct rt_rw_lock rwlock_t;
14771+
14772+#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name)
14773+
14774+#define DEFINE_RWLOCK(name) \
14775+ rwlock_t name = __RW_LOCK_UNLOCKED(name)
14776+
14777+/*
14778+ * A reader-biased implementation, primarily for CPU pinning.
14779+ *
14780+ * Can be selected as general replacement for the single reader RT rwlock
14781+ * variant
14782+ */
14783+struct rt_rw_lock {
14784+ struct rt_mutex rtmutex;
14785+ atomic_t readers;
14786+#ifdef CONFIG_DEBUG_LOCK_ALLOC
14787+ struct lockdep_map dep_map;
14788+#endif
14789+};
14790+
14791+#define READER_BIAS (1U << 31)
14792+#define WRITER_BIAS (1U << 30)
14793+
14794+#define __RWLOCK_RT_INITIALIZER(name) \
14795+{ \
14796+ .readers = ATOMIC_INIT(READER_BIAS), \
14797+ .rtmutex = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.rtmutex), \
14798+ RW_DEP_MAP_INIT(name) \
14799+}
14800+
14801+void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
14802+ struct lock_class_key *key);
14803+
14804+#define rwlock_biased_rt_init(rwlock) \
14805+ do { \
14806+ static struct lock_class_key __key; \
14807+ \
14808+ __rwlock_biased_rt_init((rwlock), #rwlock, &__key); \
14809+ } while (0)
14810+
14811+#endif
14812diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
14813index c427ffaa4904..513df11a364e 100644
14814--- a/include/linux/rwsem.h
14815+++ b/include/linux/rwsem.h
14816@@ -20,6 +20,10 @@
14817 #include <linux/osq_lock.h>
14818 #endif
14819
14820+#ifdef CONFIG_PREEMPT_RT_FULL
14821+#include <linux/rwsem_rt.h>
14822+#else /* PREEMPT_RT_FULL */
14823+
14824 struct rw_semaphore;
14825
14826 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
14827@@ -114,6 +118,13 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
14828 return !list_empty(&sem->wait_list);
14829 }
14830
14831+#endif /* !PREEMPT_RT_FULL */
14832+
14833+/*
14834+ * The functions below are the same for all rwsem implementations including
14835+ * the RT specific variant.
14836+ */
14837+
14838 /*
14839 * lock for reading
14840 */
14841diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
14842new file mode 100644
14843index 000000000000..2ffbf093ae92
14844--- /dev/null
14845+++ b/include/linux/rwsem_rt.h
14846@@ -0,0 +1,67 @@
14847+#ifndef _LINUX_RWSEM_RT_H
14848+#define _LINUX_RWSEM_RT_H
14849+
14850+#ifndef _LINUX_RWSEM_H
14851+#error "Include rwsem.h"
14852+#endif
14853+
14854+#include <linux/rtmutex.h>
14855+#include <linux/swait.h>
14856+
14857+#define READER_BIAS (1U << 31)
14858+#define WRITER_BIAS (1U << 30)
14859+
14860+struct rw_semaphore {
14861+ atomic_t readers;
14862+ struct rt_mutex rtmutex;
14863+#ifdef CONFIG_DEBUG_LOCK_ALLOC
14864+ struct lockdep_map dep_map;
14865+#endif
14866+};
14867+
14868+#define __RWSEM_INITIALIZER(name) \
14869+{ \
14870+ .readers = ATOMIC_INIT(READER_BIAS), \
14871+ .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex), \
14872+ RW_DEP_MAP_INIT(name) \
14873+}
14874+
14875+#define DECLARE_RWSEM(lockname) \
14876+ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
14877+
14878+extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name,
14879+ struct lock_class_key *key);
14880+
14881+#define __init_rwsem(sem, name, key) \
14882+do { \
14883+ rt_mutex_init(&(sem)->rtmutex); \
14884+ __rwsem_init((sem), (name), (key)); \
14885+} while (0)
14886+
14887+#define init_rwsem(sem) \
14888+do { \
14889+ static struct lock_class_key __key; \
14890+ \
14891+ __init_rwsem((sem), #sem, &__key); \
14892+} while (0)
14893+
14894+static inline int rwsem_is_locked(struct rw_semaphore *sem)
14895+{
14896+ return atomic_read(&sem->readers) != READER_BIAS;
14897+}
14898+
14899+static inline int rwsem_is_contended(struct rw_semaphore *sem)
14900+{
14901+ return atomic_read(&sem->readers) > 0;
14902+}
14903+
14904+extern void __down_read(struct rw_semaphore *sem);
14905+extern int __down_read_trylock(struct rw_semaphore *sem);
14906+extern void __down_write(struct rw_semaphore *sem);
14907+extern int __must_check __down_write_killable(struct rw_semaphore *sem);
14908+extern int __down_write_trylock(struct rw_semaphore *sem);
14909+extern void __up_read(struct rw_semaphore *sem);
14910+extern void __up_write(struct rw_semaphore *sem);
14911+extern void __downgrade_write(struct rw_semaphore *sem);
14912+
14913+#endif
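
The RT rw_semaphore keeps the regular down_read()/down_write() interface; rwsem_is_locked() above is just a check of the reader count against READER_BIAS. A sketch with a hypothetical semaphore guarding a single value:

    #include <linux/rwsem.h>

    static DECLARE_RWSEM(foo_cfg_sem);
    static int foo_cfg_value;

    static int foo_cfg_get(void)
    {
            int v;

            down_read(&foo_cfg_sem);        /* rt_mutex plus reader count on RT */
            v = foo_cfg_value;
            up_read(&foo_cfg_sem);
            return v;
    }

    static void foo_cfg_set(int v)
    {
            down_write(&foo_cfg_sem);
            foo_cfg_value = v;
            up_write(&foo_cfg_sem);
    }
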
14914diff --git a/include/linux/sched.h b/include/linux/sched.h
14915index e04919aa8201..a6ffb552be01 100644
14916--- a/include/linux/sched.h
14917+++ b/include/linux/sched.h
14918@@ -27,6 +27,7 @@
14919 #include <linux/signal_types.h>
14920 #include <linux/mm_types_task.h>
14921 #include <linux/task_io_accounting.h>
14922+#include <asm/kmap_types.h>
14923
14924 /* task_struct member predeclarations (sorted alphabetically): */
14925 struct audit_context;
14926@@ -93,7 +94,6 @@ struct task_group;
14927
14928 /* Convenience macros for the sake of wake_up(): */
14929 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
14930-#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
14931
14932 /* get_task_state(): */
14933 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
14934@@ -101,12 +101,8 @@ struct task_group;
14935 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
14936 TASK_PARKED)
14937
14938-#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
14939-
14940 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
14941
14942-#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
14943-
14944 #define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
14945 (task->flags & PF_FROZEN) == 0 && \
14946 (task->state & TASK_NOLOAD) == 0)
14947@@ -134,6 +130,11 @@ struct task_group;
14948 smp_store_mb(current->state, (state_value)); \
14949 } while (0)
14950
14951+#define __set_current_state_no_track(state_value) \
14952+ current->state = (state_value)
14953+#define set_current_state_no_track(state_value) \
14954+ smp_store_mb(current->state, (state_value))
14955+
14956 #define set_special_state(state_value) \
14957 do { \
14958 unsigned long flags; /* may shadow */ \
14959@@ -187,6 +188,9 @@ struct task_group;
14960 #define set_current_state(state_value) \
14961 smp_store_mb(current->state, (state_value))
14962
14963+#define __set_current_state_no_track(state_value) __set_current_state(state_value)
14964+#define set_current_state_no_track(state_value) set_current_state(state_value)
14965+
14966 /*
14967 * set_special_state() should be used for those states when the blocking task
14968 * can not use the regular condition based wait-loop. In that case we must
14969@@ -566,6 +570,8 @@ struct task_struct {
14970 #endif
14971 /* -1 unrunnable, 0 runnable, >0 stopped: */
14972 volatile long state;
14973+ /* saved state for "spinlock sleepers" */
14974+ volatile long saved_state;
14975
14976 /*
14977 * This begins the randomizable portion of task_struct. Only
14978@@ -618,7 +624,25 @@ struct task_struct {
14979
14980 unsigned int policy;
14981 int nr_cpus_allowed;
14982- cpumask_t cpus_allowed;
14983+ const cpumask_t *cpus_ptr;
14984+ cpumask_t cpus_mask;
14985+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14986+ int migrate_disable;
14987+ int migrate_disable_update;
14988+ int pinned_on_cpu;
14989+# ifdef CONFIG_SCHED_DEBUG
14990+ int migrate_disable_atomic;
14991+# endif
14992+
14993+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14994+# ifdef CONFIG_SCHED_DEBUG
14995+ int migrate_disable;
14996+ int migrate_disable_atomic;
14997+# endif
14998+#endif
14999+#ifdef CONFIG_PREEMPT_RT_FULL
15000+ int sleeping_lock;
15001+#endif
15002
15003 #ifdef CONFIG_PREEMPT_RCU
15004 int rcu_read_lock_nesting;
15005@@ -777,6 +801,9 @@ struct task_struct {
15006 #ifdef CONFIG_POSIX_TIMERS
15007 struct task_cputime cputime_expires;
15008 struct list_head cpu_timers[3];
15009+#ifdef CONFIG_PREEMPT_RT_BASE
15010+ struct task_struct *posix_timer_list;
15011+#endif
15012 #endif
15013
15014 /* Process credentials: */
15015@@ -820,11 +847,17 @@ struct task_struct {
15016 /* Signal handlers: */
15017 struct signal_struct *signal;
15018 struct sighand_struct *sighand;
15019+ struct sigqueue *sigqueue_cache;
15020+
15021 sigset_t blocked;
15022 sigset_t real_blocked;
15023 /* Restored if set_restore_sigmask() was used: */
15024 sigset_t saved_sigmask;
15025 struct sigpending pending;
15026+#ifdef CONFIG_PREEMPT_RT_FULL
15027+ /* TODO: move me into ->restart_block ? */
15028+ struct siginfo forced_info;
15029+#endif
15030 unsigned long sas_ss_sp;
15031 size_t sas_ss_size;
15032 unsigned int sas_ss_flags;
15033@@ -849,6 +882,7 @@ struct task_struct {
15034 raw_spinlock_t pi_lock;
15035
15036 struct wake_q_node wake_q;
15037+ struct wake_q_node wake_q_sleeper;
15038
15039 #ifdef CONFIG_RT_MUTEXES
15040 /* PI waiters blocked on a rt_mutex held by this task: */
15041@@ -1116,8 +1150,22 @@ struct task_struct {
15042 unsigned int sequential_io;
15043 unsigned int sequential_io_avg;
15044 #endif
15045+#ifdef CONFIG_PREEMPT_RT_BASE
15046+ struct rcu_head put_rcu;
15047+ int softirq_nestcnt;
15048+ unsigned int softirqs_raised;
15049+#endif
15050+#ifdef CONFIG_PREEMPT_RT_FULL
15051+# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
15052+ int kmap_idx;
15053+ pte_t kmap_pte[KM_TYPE_NR];
15054+# endif
15055+#endif
15056 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
15057 unsigned long task_state_change;
15058+#endif
15059+#ifdef CONFIG_PREEMPT_RT_FULL
15060+ int xmit_recursion;
15061 #endif
15062 int pagefault_disabled;
15063 #ifdef CONFIG_MMU
15064@@ -1332,6 +1380,7 @@ extern struct pid *cad_pid;
15065 /*
15066 * Per process flags
15067 */
15068+#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */
15069 #define PF_IDLE 0x00000002 /* I am an IDLE thread */
15070 #define PF_EXITING 0x00000004 /* Getting shut down */
15071 #define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */
15072@@ -1355,7 +1404,7 @@ extern struct pid *cad_pid;
15073 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
15074 #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
15075 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
15076-#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
15077+#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
15078 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
15079 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
15080 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
15081@@ -1535,6 +1584,7 @@ extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *n
15082
15083 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
15084 extern int wake_up_process(struct task_struct *tsk);
15085+extern int wake_up_lock_sleeper(struct task_struct *tsk);
15086 extern void wake_up_new_task(struct task_struct *tsk);
15087
15088 #ifdef CONFIG_SMP
15089@@ -1611,6 +1661,89 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
15090 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
15091 }
15092
15093+#ifdef CONFIG_PREEMPT_LAZY
15094+static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
15095+{
15096+ set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
15097+}
15098+
15099+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
15100+{
15101+ clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
15102+}
15103+
15104+static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
15105+{
15106+ return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
15107+}
15108+
15109+static inline int need_resched_lazy(void)
15110+{
15111+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
15112+}
15113+
15114+static inline int need_resched_now(void)
15115+{
15116+ return test_thread_flag(TIF_NEED_RESCHED);
15117+}
15118+
15119+#else
15120+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
15121+static inline int need_resched_lazy(void) { return 0; }
15122+
15123+static inline int need_resched_now(void)
15124+{
15125+ return test_thread_flag(TIF_NEED_RESCHED);
15126+}
15127+
15128+#endif
15129+
15130+
15131+static inline bool __task_is_stopped_or_traced(struct task_struct *task)
15132+{
15133+ if (task->state & (__TASK_STOPPED | __TASK_TRACED))
15134+ return true;
15135+#ifdef CONFIG_PREEMPT_RT_FULL
15136+ if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
15137+ return true;
15138+#endif
15139+ return false;
15140+}
15141+
15142+static inline bool task_is_stopped_or_traced(struct task_struct *task)
15143+{
15144+ bool traced_stopped;
15145+
15146+#ifdef CONFIG_PREEMPT_RT_FULL
15147+ unsigned long flags;
15148+
15149+ raw_spin_lock_irqsave(&task->pi_lock, flags);
15150+ traced_stopped = __task_is_stopped_or_traced(task);
15151+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15152+#else
15153+ traced_stopped = __task_is_stopped_or_traced(task);
15154+#endif
15155+ return traced_stopped;
15156+}
15157+
15158+static inline bool task_is_traced(struct task_struct *task)
15159+{
15160+ bool traced = false;
15161+
15162+ if (task->state & __TASK_TRACED)
15163+ return true;
15164+#ifdef CONFIG_PREEMPT_RT_FULL
15165+ /* in case the task is sleeping on tasklist_lock */
15166+ raw_spin_lock_irq(&task->pi_lock);
15167+ if (task->state & __TASK_TRACED)
15168+ traced = true;
15169+ else if (task->saved_state & __TASK_TRACED)
15170+ traced = true;
15171+ raw_spin_unlock_irq(&task->pi_lock);
15172+#endif
15173+ return traced;
15174+}
15175+
15176 /*
15177 * cond_resched() and cond_resched_lock(): latency reduction via
15178 * explicit rescheduling in places that are safe. The return
15179@@ -1636,12 +1769,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
15180 __cond_resched_lock(lock); \
15181 })
15182
15183+#ifndef CONFIG_PREEMPT_RT_FULL
15184 extern int __cond_resched_softirq(void);
15185
15186 #define cond_resched_softirq() ({ \
15187 ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
15188 __cond_resched_softirq(); \
15189 })
15190+#else
15191+# define cond_resched_softirq() cond_resched()
15192+#endif
15193
15194 static inline void cond_resched_rcu(void)
15195 {
15196@@ -1671,6 +1808,23 @@ static __always_inline bool need_resched(void)
15197 return unlikely(tif_need_resched());
15198 }
15199
15200+#ifdef CONFIG_PREEMPT_RT_FULL
15201+static inline void sleeping_lock_inc(void)
15202+{
15203+ current->sleeping_lock++;
15204+}
15205+
15206+static inline void sleeping_lock_dec(void)
15207+{
15208+ current->sleeping_lock--;
15209+}
15210+
15211+#else
15212+
15213+static inline void sleeping_lock_inc(void) { }
15214+static inline void sleeping_lock_dec(void) { }
15215+#endif
15216+
15217 /*
15218 * Wrappers for p->thread_info->cpu access. No-op on UP.
15219 */
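
A brief aside (illustrative, not part of the patch; the wait queue and flag are made up): the *_no_track variants added above change ->state exactly like the ordinary helpers but skip the state-tracking/debug hook, which the RT sleeping-lock core relies on while it juggles ->state and ->saved_state. Ordinary kernel code keeps using the classic wait pattern, in which helpers such as prepare_to_wait() call set_current_state() internally:

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(done_wq);
static bool done_flag;

static void wait_for_done(void)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait(&done_wq, &wait, TASK_UNINTERRUPTIBLE);
                if (done_flag)
                        break;
                schedule();
        }
        finish_wait(&done_wq, &wait);
}

static void signal_done(void)
{
        done_flag = true;
        wake_up(&done_wq);
}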
15220diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
15221index 3d49b91b674d..d8f2fa8f500c 100644
15222--- a/include/linux/sched/mm.h
15223+++ b/include/linux/sched/mm.h
15224@@ -43,6 +43,17 @@ static inline void mmdrop(struct mm_struct *mm)
15225 __mmdrop(mm);
15226 }
15227
15228+#ifdef CONFIG_PREEMPT_RT_BASE
15229+extern void __mmdrop_delayed(struct rcu_head *rhp);
15230+static inline void mmdrop_delayed(struct mm_struct *mm)
15231+{
15232+ if (atomic_dec_and_test(&mm->mm_count))
15233+ call_rcu(&mm->delayed_drop, __mmdrop_delayed);
15234+}
15235+#else
15236+# define mmdrop_delayed(mm) mmdrop(mm)
15237+#endif
15238+
15239 static inline void mmdrop_async_fn(struct work_struct *work)
15240 {
15241 struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
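
A sketch of the intended call pattern (illustrative; the helper below is made up, only mmdrop_delayed() comes from the patch): a caller that may drop the last mm_count reference from a context that must stay cheap and non-sleeping, such as the tail of a context switch, uses the delayed variant so that on PREEMPT_RT_BASE the actual teardown runs from RCU callback context instead:

#include <linux/sched/mm.h>

static void release_prev_mm(struct mm_struct *mm)       /* hypothetical helper */
{
        if (!mm)
                return;
        /*
         * On PREEMPT_RT_BASE the final reference queues __mmdrop_delayed()
         * via call_rcu(); otherwise this is plain mmdrop().
         */
        mmdrop_delayed(mm);
}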
15242diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
15243index a74ec619ac51..8e7f741370c5 100644
15244--- a/include/linux/sched/task.h
15245+++ b/include/linux/sched/task.h
15246@@ -88,6 +88,15 @@ extern void sched_exec(void);
15247
15248 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
15249
15250+#ifdef CONFIG_PREEMPT_RT_BASE
15251+extern void __put_task_struct_cb(struct rcu_head *rhp);
15252+
15253+static inline void put_task_struct(struct task_struct *t)
15254+{
15255+ if (atomic_dec_and_test(&t->usage))
15256+ call_rcu(&t->put_rcu, __put_task_struct_cb);
15257+}
15258+#else
15259 extern void __put_task_struct(struct task_struct *t);
15260
15261 static inline void put_task_struct(struct task_struct *t)
15262@@ -95,7 +104,7 @@ static inline void put_task_struct(struct task_struct *t)
15263 if (atomic_dec_and_test(&t->usage))
15264 __put_task_struct(t);
15265 }
15266-
15267+#endif
15268 struct task_struct *task_rcu_dereference(struct task_struct **ptask);
15269
15270 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
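
Similarly (illustrative sketch, not from the patch; the helpers are made up): callers of put_task_struct() need no change, since on PREEMPT_RT_BASE the final free is simply routed through call_rcu(&t->put_rcu, __put_task_struct_cb) rather than calling __put_task_struct() directly:

#include <linux/sched/task.h>

static struct task_struct *pin_task(struct task_struct *p)      /* hypothetical helper */
{
        get_task_struct(p);             /* plain reference count increment */
        return p;
}

static void unpin_task(struct task_struct *p)
{
        put_task_struct(p);             /* on RT the last reference defers the free to RCU */
}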
15271diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
15272index 10b19a192b2d..ce3ccff3d9d8 100644
15273--- a/include/linux/sched/wake_q.h
15274+++ b/include/linux/sched/wake_q.h
15275@@ -47,8 +47,29 @@ static inline void wake_q_init(struct wake_q_head *head)
15276 head->lastp = &head->first;
15277 }
15278
15279-extern void wake_q_add(struct wake_q_head *head,
15280- struct task_struct *task);
15281-extern void wake_up_q(struct wake_q_head *head);
15282+extern void __wake_q_add(struct wake_q_head *head,
15283+ struct task_struct *task, bool sleeper);
15284+static inline void wake_q_add(struct wake_q_head *head,
15285+ struct task_struct *task)
15286+{
15287+ __wake_q_add(head, task, false);
15288+}
15289+
15290+static inline void wake_q_add_sleeper(struct wake_q_head *head,
15291+ struct task_struct *task)
15292+{
15293+ __wake_q_add(head, task, true);
15294+}
15295+
15296+extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
15297+static inline void wake_up_q(struct wake_q_head *head)
15298+{
15299+ __wake_up_q(head, false);
15300+}
15301+
15302+static inline void wake_up_q_sleeper(struct wake_q_head *head)
15303+{
15304+ __wake_up_q(head, true);
15305+}
15306
15307 #endif /* _LINUX_SCHED_WAKE_Q_H */
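
A usage sketch (illustrative; the function is made up): wake_q batches wake-ups so they can be issued after a lock is dropped. The separate wake_q_sleeper node and the *_sleeper wrappers let the rtmutex code queue a "sleeping spinlock" wake-up for a task without colliding with a regular wake_q user that may have queued the same task concurrently.

#include <linux/sched/wake_q.h>

static void wake_two(struct task_struct *a, struct task_struct *b)
{
        DEFINE_WAKE_Q(wake_q);

        /* typically done while holding the lock that protects the wait list */
        wake_q_add(&wake_q, a);
        wake_q_add(&wake_q, b);

        /* ... drop the lock, then perform the actual wake-ups */
        wake_up_q(&wake_q);
}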
15308diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
15309index f189a8a3bbb8..107079a2d7ed 100644
15310--- a/include/linux/seqlock.h
15311+++ b/include/linux/seqlock.h
15312@@ -221,20 +221,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
15313 return __read_seqcount_retry(s, start);
15314 }
15315
15316-
15317-
15318-static inline void raw_write_seqcount_begin(seqcount_t *s)
15319+static inline void __raw_write_seqcount_begin(seqcount_t *s)
15320 {
15321 s->sequence++;
15322 smp_wmb();
15323 }
15324
15325-static inline void raw_write_seqcount_end(seqcount_t *s)
15326+static inline void raw_write_seqcount_begin(seqcount_t *s)
15327+{
15328+ preempt_disable_rt();
15329+ __raw_write_seqcount_begin(s);
15330+}
15331+
15332+static inline void __raw_write_seqcount_end(seqcount_t *s)
15333 {
15334 smp_wmb();
15335 s->sequence++;
15336 }
15337
15338+static inline void raw_write_seqcount_end(seqcount_t *s)
15339+{
15340+ __raw_write_seqcount_end(s);
15341+ preempt_enable_rt();
15342+}
15343+
15344 /**
15345 * raw_write_seqcount_barrier - do a seq write barrier
15346 * @s: pointer to seqcount_t
15347@@ -429,10 +439,33 @@ typedef struct {
15348 /*
15349 * Read side functions for starting and finalizing a read side section.
15350 */
15351+#ifndef CONFIG_PREEMPT_RT_FULL
15352 static inline unsigned read_seqbegin(const seqlock_t *sl)
15353 {
15354 return read_seqcount_begin(&sl->seqcount);
15355 }
15356+#else
15357+/*
15358+ * Starvation safe read side for RT
15359+ */
15360+static inline unsigned read_seqbegin(seqlock_t *sl)
15361+{
15362+ unsigned ret;
15363+
15364+repeat:
15365+ ret = ACCESS_ONCE(sl->seqcount.sequence);
15366+ if (unlikely(ret & 1)) {
15367+ /*
15368+ * Take the lock and let the writer proceed (i.e. possibly
15369+ * boost it), otherwise we could loop here forever.
15370+ */
15371+ spin_unlock_wait(&sl->lock);
15372+ goto repeat;
15373+ }
15374+ smp_rmb();
15375+ return ret;
15376+}
15377+#endif
15378
15379 static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
15380 {
15381@@ -447,36 +480,45 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
15382 static inline void write_seqlock(seqlock_t *sl)
15383 {
15384 spin_lock(&sl->lock);
15385- write_seqcount_begin(&sl->seqcount);
15386+ __raw_write_seqcount_begin(&sl->seqcount);
15387+}
15388+
15389+static inline int try_write_seqlock(seqlock_t *sl)
15390+{
15391+ if (spin_trylock(&sl->lock)) {
15392+ __raw_write_seqcount_begin(&sl->seqcount);
15393+ return 1;
15394+ }
15395+ return 0;
15396 }
15397
15398 static inline void write_sequnlock(seqlock_t *sl)
15399 {
15400- write_seqcount_end(&sl->seqcount);
15401+ __raw_write_seqcount_end(&sl->seqcount);
15402 spin_unlock(&sl->lock);
15403 }
15404
15405 static inline void write_seqlock_bh(seqlock_t *sl)
15406 {
15407 spin_lock_bh(&sl->lock);
15408- write_seqcount_begin(&sl->seqcount);
15409+ __raw_write_seqcount_begin(&sl->seqcount);
15410 }
15411
15412 static inline void write_sequnlock_bh(seqlock_t *sl)
15413 {
15414- write_seqcount_end(&sl->seqcount);
15415+ __raw_write_seqcount_end(&sl->seqcount);
15416 spin_unlock_bh(&sl->lock);
15417 }
15418
15419 static inline void write_seqlock_irq(seqlock_t *sl)
15420 {
15421 spin_lock_irq(&sl->lock);
15422- write_seqcount_begin(&sl->seqcount);
15423+ __raw_write_seqcount_begin(&sl->seqcount);
15424 }
15425
15426 static inline void write_sequnlock_irq(seqlock_t *sl)
15427 {
15428- write_seqcount_end(&sl->seqcount);
15429+ __raw_write_seqcount_end(&sl->seqcount);
15430 spin_unlock_irq(&sl->lock);
15431 }
15432
15433@@ -485,7 +527,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
15434 unsigned long flags;
15435
15436 spin_lock_irqsave(&sl->lock, flags);
15437- write_seqcount_begin(&sl->seqcount);
15438+ __raw_write_seqcount_begin(&sl->seqcount);
15439 return flags;
15440 }
15441
15442@@ -495,7 +537,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
15443 static inline void
15444 write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
15445 {
15446- write_seqcount_end(&sl->seqcount);
15447+ __raw_write_seqcount_end(&sl->seqcount);
15448 spin_unlock_irqrestore(&sl->lock, flags);
15449 }
15450
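
For orientation (illustrative example, not part of the patch; the lock and counters are made up): the caller-facing seqlock_t API is unchanged. Writers now use the __raw_* seqcount helpers internally because the enclosing spinlock already serializes them, and on PREEMPT_RT_FULL a reader that observes an odd sequence briefly takes ->lock via spin_unlock_wait() so a preempted writer can finish (and be boosted) instead of starving the reader.

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(stat_lock);
static u64 stat_bytes, stat_pkts;

static void stat_update(u64 bytes)
{
        write_seqlock(&stat_lock);              /* spin_lock() plus sequence bump */
        stat_bytes += bytes;
        stat_pkts++;
        write_sequnlock(&stat_lock);
}

static void stat_read(u64 *bytes, u64 *pkts)
{
        unsigned int seq;

        do {
                seq = read_seqbegin(&stat_lock);
                *bytes = stat_bytes;
                *pkts = stat_pkts;
        } while (read_seqretry(&stat_lock, seq));
}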
15451diff --git a/include/linux/signal.h b/include/linux/signal.h
15452index 042968dd98f0..a7d20f85cc0e 100644
15453--- a/include/linux/signal.h
15454+++ b/include/linux/signal.h
15455@@ -243,6 +243,7 @@ static inline void init_sigpending(struct sigpending *sig)
15456 }
15457
15458 extern void flush_sigqueue(struct sigpending *queue);
15459+extern void flush_task_sigqueue(struct task_struct *tsk);
15460
15461 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
15462 static inline int valid_signal(unsigned long sig)
15463diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
15464index f64e88444082..07576a062ac0 100644
15465--- a/include/linux/skbuff.h
15466+++ b/include/linux/skbuff.h
15467@@ -287,6 +287,7 @@ struct sk_buff_head {
15468
15469 __u32 qlen;
15470 spinlock_t lock;
15471+ raw_spinlock_t raw_lock;
15472 };
15473
15474 struct sk_buff;
15475@@ -1672,6 +1673,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
15476 __skb_queue_head_init(list);
15477 }
15478
15479+static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
15480+{
15481+ raw_spin_lock_init(&list->raw_lock);
15482+ __skb_queue_head_init(list);
15483+}
15484+
15485 static inline void skb_queue_head_init_class(struct sk_buff_head *list,
15486 struct lock_class_key *class)
15487 {
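
An illustrative sketch (the queue name and helpers are made up): a queue initialized with skb_queue_head_init_raw() is meant to be manipulated with the unlocked __skb_* helpers under its raw_lock, for the few paths that must remain non-sleeping even on RT:

#include <linux/skbuff.h>

static struct sk_buff_head rt_backlog;          /* hypothetical queue */

static void rt_backlog_setup(void)
{
        skb_queue_head_init_raw(&rt_backlog);   /* initializes ->raw_lock instead of ->lock */
}

static void rt_backlog_add(struct sk_buff *skb)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&rt_backlog.raw_lock, flags);
        __skb_queue_tail(&rt_backlog, skb);     /* unlocked variant; raw_lock is held */
        raw_spin_unlock_irqrestore(&rt_backlog.raw_lock, flags);
}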
15488diff --git a/include/linux/smp.h b/include/linux/smp.h
15489index 9fb239e12b82..5801e516ba63 100644
15490--- a/include/linux/smp.h
15491+++ b/include/linux/smp.h
15492@@ -202,6 +202,9 @@ static inline int get_boot_cpu_id(void)
15493 #define get_cpu() ({ preempt_disable(); smp_processor_id(); })
15494 #define put_cpu() preempt_enable()
15495
15496+#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); })
15497+#define put_cpu_light() migrate_enable()
15498+
15499 /*
15500 * Callback to arch code if there's nosmp or maxcpus=0 on the
15501 * boot command line:
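
A usage sketch (illustrative; the helper is made up): get_cpu_light() pins the task to its current CPU with migrate_disable() but, unlike get_cpu(), leaves preemption enabled, so the section may take sleeping locks on RT. It does not serialize against other tasks on the same CPU, so shared per-CPU data still needs its own lock.

#include <linux/smp.h>
#include <linux/workqueue.h>

static void kick_local_work(struct work_struct *work)   /* hypothetical helper */
{
        int cpu = get_cpu_light();      /* stable smp_processor_id(), preemption stays enabled */

        schedule_work_on(cpu, work);    /* runs on the CPU we are pinned to */
        put_cpu_light();
}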
15502diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
15503index 341e1a12bfc7..7c8f0a985b9e 100644
15504--- a/include/linux/spinlock.h
15505+++ b/include/linux/spinlock.h
15506@@ -286,7 +286,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
15507 #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock))
15508
15509 /* Include rwlock functions */
15510-#include <linux/rwlock.h>
15511+#ifdef CONFIG_PREEMPT_RT_FULL
15512+# include <linux/rwlock_rt.h>
15513+#else
15514+# include <linux/rwlock.h>
15515+#endif
15516
15517 /*
15518 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
15519@@ -297,6 +301,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
15520 # include <linux/spinlock_api_up.h>
15521 #endif
15522
15523+#ifdef CONFIG_PREEMPT_RT_FULL
15524+# include <linux/spinlock_rt.h>
15525+#else /* PREEMPT_RT_FULL */
15526+
15527 /*
15528 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
15529 */
15530@@ -421,4 +429,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
15531 #define atomic_dec_and_lock(atomic, lock) \
15532 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
15533
15534+#endif /* !PREEMPT_RT_FULL */
15535+
15536 #endif /* __LINUX_SPINLOCK_H */
15537diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
15538index 42dfab89e740..29d99ae5a8ab 100644
15539--- a/include/linux/spinlock_api_smp.h
15540+++ b/include/linux/spinlock_api_smp.h
15541@@ -187,6 +187,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
15542 return 0;
15543 }
15544
15545-#include <linux/rwlock_api_smp.h>
15546+#ifndef CONFIG_PREEMPT_RT_FULL
15547+# include <linux/rwlock_api_smp.h>
15548+#endif
15549
15550 #endif /* __LINUX_SPINLOCK_API_SMP_H */
15551diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
15552new file mode 100644
15553index 000000000000..c95e1f5145ac
15554--- /dev/null
15555+++ b/include/linux/spinlock_rt.h
15556@@ -0,0 +1,159 @@
15557+#ifndef __LINUX_SPINLOCK_RT_H
15558+#define __LINUX_SPINLOCK_RT_H
15559+
15560+#ifndef __LINUX_SPINLOCK_H
15561+#error Do not include directly. Use spinlock.h
15562+#endif
15563+
15564+#include <linux/bug.h>
15565+
15566+extern void
15567+__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key);
15568+
15569+#define spin_lock_init(slock) \
15570+do { \
15571+ static struct lock_class_key __key; \
15572+ \
15573+ rt_mutex_init(&(slock)->lock); \
15574+ __rt_spin_lock_init(slock, #slock, &__key); \
15575+} while (0)
15576+
15577+extern void __lockfunc rt_spin_lock(spinlock_t *lock);
15578+extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
15579+extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
15580+extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
15581+extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
15582+extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
15583+extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
15584+extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
15585+extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
15586+
15587+/*
15588+ * lockdep-less calls, for derived types like rwlock:
15589+ * (for trylock they can use rt_mutex_trylock() directly).
15590+ * Migrate disable handling must be done at the call site.
15591+ */
15592+extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
15593+extern void __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
15594+extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
15595+
15596+#define spin_lock(lock) rt_spin_lock(lock)
15597+
15598+#define spin_lock_bh(lock) \
15599+ do { \
15600+ local_bh_disable(); \
15601+ rt_spin_lock(lock); \
15602+ } while (0)
15603+
15604+#define spin_lock_irq(lock) spin_lock(lock)
15605+
15606+#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock))
15607+
15608+#define spin_trylock(lock) \
15609+({ \
15610+ int __locked; \
15611+ __locked = spin_do_trylock(lock); \
15612+ __locked; \
15613+})
15614+
15615+#ifdef CONFIG_LOCKDEP
15616+# define spin_lock_nested(lock, subclass) \
15617+ do { \
15618+ rt_spin_lock_nested(lock, subclass); \
15619+ } while (0)
15620+
15621+#define spin_lock_bh_nested(lock, subclass) \
15622+ do { \
15623+ local_bh_disable(); \
15624+ rt_spin_lock_nested(lock, subclass); \
15625+ } while (0)
15626+
15627+# define spin_lock_irqsave_nested(lock, flags, subclass) \
15628+ do { \
15629+ typecheck(unsigned long, flags); \
15630+ flags = 0; \
15631+ rt_spin_lock_nested(lock, subclass); \
15632+ } while (0)
15633+#else
15634+# define spin_lock_nested(lock, subclass) spin_lock(lock)
15635+# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(lock)
15636+
15637+# define spin_lock_irqsave_nested(lock, flags, subclass) \
15638+ do { \
15639+ typecheck(unsigned long, flags); \
15640+ flags = 0; \
15641+ spin_lock(lock); \
15642+ } while (0)
15643+#endif
15644+
15645+#define spin_lock_irqsave(lock, flags) \
15646+ do { \
15647+ typecheck(unsigned long, flags); \
15648+ flags = 0; \
15649+ spin_lock(lock); \
15650+ } while (0)
15651+
15652+static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
15653+{
15654+ unsigned long flags = 0;
15655+#ifdef CONFIG_TRACE_IRQFLAGS
15656+ flags = rt_spin_lock_trace_flags(lock);
15657+#else
15658+ spin_lock(lock); /* lock_local */
15659+#endif
15660+ return flags;
15661+}
15662+
15663+/* FIXME: we need rt_spin_lock_nest_lock */
15664+#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
15665+
15666+#define spin_unlock(lock) rt_spin_unlock(lock)
15667+
15668+#define spin_unlock_bh(lock) \
15669+ do { \
15670+ rt_spin_unlock(lock); \
15671+ local_bh_enable(); \
15672+ } while (0)
15673+
15674+#define spin_unlock_irq(lock) spin_unlock(lock)
15675+
15676+#define spin_unlock_irqrestore(lock, flags) \
15677+ do { \
15678+ typecheck(unsigned long, flags); \
15679+ (void) flags; \
15680+ spin_unlock(lock); \
15681+ } while (0)
15682+
15683+#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock))
15684+#define spin_trylock_irq(lock) spin_trylock(lock)
15685+
15686+#define spin_trylock_irqsave(lock, flags) \
15687+ rt_spin_trylock_irqsave(lock, &(flags))
15688+
15689+#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock)
15690+
15691+#ifdef CONFIG_GENERIC_LOCKBREAK
15692+# define spin_is_contended(lock) ((lock)->break_lock)
15693+#else
15694+# define spin_is_contended(lock) (((void)(lock), 0))
15695+#endif
15696+
15697+static inline int spin_can_lock(spinlock_t *lock)
15698+{
15699+ return !rt_mutex_is_locked(&lock->lock);
15700+}
15701+
15702+static inline int spin_is_locked(spinlock_t *lock)
15703+{
15704+ return rt_mutex_is_locked(&lock->lock);
15705+}
15706+
15707+static inline void assert_spin_locked(spinlock_t *lock)
15708+{
15709+ BUG_ON(!spin_is_locked(lock));
15710+}
15711+
15712+#define atomic_dec_and_lock(atomic, lock) \
15713+ atomic_dec_and_spin_lock(atomic, lock)
15714+
15715+#endif
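
For orientation (illustrative, not part of the patch; the lock and list are made up): existing callers compile unchanged against this header. spin_lock_irqsave() degenerates to rt_spin_lock() with flags forced to 0, so interrupts are not actually disabled and the holder may sleep when contended; data touched from genuine hard-interrupt context or with interrupts disabled needs a raw_spinlock_t instead.

#include <linux/spinlock.h>
#include <linux/list.h>

static DEFINE_SPINLOCK(obj_lock);               /* a sleeping lock on PREEMPT_RT_FULL */
static LIST_HEAD(obj_list);

static void obj_add(struct list_head *entry)
{
        unsigned long flags;

        spin_lock_irqsave(&obj_lock, flags);    /* RT: rt_spin_lock(), flags == 0 */
        list_add_tail(entry, &obj_list);
        spin_unlock_irqrestore(&obj_lock, flags);
}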
15716diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
15717index 73548eb13a5d..10bac715ea96 100644
15718--- a/include/linux/spinlock_types.h
15719+++ b/include/linux/spinlock_types.h
15720@@ -9,80 +9,15 @@
15721 * Released under the General Public License (GPL).
15722 */
15723
15724-#if defined(CONFIG_SMP)
15725-# include <asm/spinlock_types.h>
15726-#else
15727-# include <linux/spinlock_types_up.h>
15728-#endif
15729-
15730-#include <linux/lockdep.h>
15731-
15732-typedef struct raw_spinlock {
15733- arch_spinlock_t raw_lock;
15734-#ifdef CONFIG_GENERIC_LOCKBREAK
15735- unsigned int break_lock;
15736-#endif
15737-#ifdef CONFIG_DEBUG_SPINLOCK
15738- unsigned int magic, owner_cpu;
15739- void *owner;
15740-#endif
15741-#ifdef CONFIG_DEBUG_LOCK_ALLOC
15742- struct lockdep_map dep_map;
15743-#endif
15744-} raw_spinlock_t;
15745-
15746-#define SPINLOCK_MAGIC 0xdead4ead
15747-
15748-#define SPINLOCK_OWNER_INIT ((void *)-1L)
15749-
15750-#ifdef CONFIG_DEBUG_LOCK_ALLOC
15751-# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
15752-#else
15753-# define SPIN_DEP_MAP_INIT(lockname)
15754-#endif
15755+#include <linux/spinlock_types_raw.h>
15756
15757-#ifdef CONFIG_DEBUG_SPINLOCK
15758-# define SPIN_DEBUG_INIT(lockname) \
15759- .magic = SPINLOCK_MAGIC, \
15760- .owner_cpu = -1, \
15761- .owner = SPINLOCK_OWNER_INIT,
15762+#ifndef CONFIG_PREEMPT_RT_FULL
15763+# include <linux/spinlock_types_nort.h>
15764+# include <linux/rwlock_types.h>
15765 #else
15766-# define SPIN_DEBUG_INIT(lockname)
15767+# include <linux/rtmutex.h>
15768+# include <linux/spinlock_types_rt.h>
15769+# include <linux/rwlock_types_rt.h>
15770 #endif
15771
15772-#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
15773- { \
15774- .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
15775- SPIN_DEBUG_INIT(lockname) \
15776- SPIN_DEP_MAP_INIT(lockname) }
15777-
15778-#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
15779- (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15780-
15781-#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15782-
15783-typedef struct spinlock {
15784- union {
15785- struct raw_spinlock rlock;
15786-
15787-#ifdef CONFIG_DEBUG_LOCK_ALLOC
15788-# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15789- struct {
15790- u8 __padding[LOCK_PADSIZE];
15791- struct lockdep_map dep_map;
15792- };
15793-#endif
15794- };
15795-} spinlock_t;
15796-
15797-#define __SPIN_LOCK_INITIALIZER(lockname) \
15798- { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15799-
15800-#define __SPIN_LOCK_UNLOCKED(lockname) \
15801- (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15802-
15803-#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15804-
15805-#include <linux/rwlock_types.h>
15806-
15807 #endif /* __LINUX_SPINLOCK_TYPES_H */
15808diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
15809new file mode 100644
15810index 000000000000..f1dac1fb1d6a
15811--- /dev/null
15812+++ b/include/linux/spinlock_types_nort.h
15813@@ -0,0 +1,33 @@
15814+#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
15815+#define __LINUX_SPINLOCK_TYPES_NORT_H
15816+
15817+#ifndef __LINUX_SPINLOCK_TYPES_H
15818+#error "Do not include directly. Include spinlock_types.h instead"
15819+#endif
15820+
15821+/*
15822+ * The non-RT version maps spinlocks to raw_spinlocks
15823+ */
15824+typedef struct spinlock {
15825+ union {
15826+ struct raw_spinlock rlock;
15827+
15828+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15829+# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15830+ struct {
15831+ u8 __padding[LOCK_PADSIZE];
15832+ struct lockdep_map dep_map;
15833+ };
15834+#endif
15835+ };
15836+} spinlock_t;
15837+
15838+#define __SPIN_LOCK_INITIALIZER(lockname) \
15839+ { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15840+
15841+#define __SPIN_LOCK_UNLOCKED(lockname) \
15842+ (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15843+
15844+#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15845+
15846+#endif
15847diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
15848new file mode 100644
15849index 000000000000..03235b475b77
15850--- /dev/null
15851+++ b/include/linux/spinlock_types_raw.h
15852@@ -0,0 +1,58 @@
15853+#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
15854+#define __LINUX_SPINLOCK_TYPES_RAW_H
15855+
15856+#include <linux/types.h>
15857+
15858+#if defined(CONFIG_SMP)
15859+# include <asm/spinlock_types.h>
15860+#else
15861+# include <linux/spinlock_types_up.h>
15862+#endif
15863+
15864+#include <linux/lockdep.h>
15865+
15866+typedef struct raw_spinlock {
15867+ arch_spinlock_t raw_lock;
15868+#ifdef CONFIG_GENERIC_LOCKBREAK
15869+ unsigned int break_lock;
15870+#endif
15871+#ifdef CONFIG_DEBUG_SPINLOCK
15872+ unsigned int magic, owner_cpu;
15873+ void *owner;
15874+#endif
15875+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15876+ struct lockdep_map dep_map;
15877+#endif
15878+} raw_spinlock_t;
15879+
15880+#define SPINLOCK_MAGIC 0xdead4ead
15881+
15882+#define SPINLOCK_OWNER_INIT ((void *)-1L)
15883+
15884+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15885+# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
15886+#else
15887+# define SPIN_DEP_MAP_INIT(lockname)
15888+#endif
15889+
15890+#ifdef CONFIG_DEBUG_SPINLOCK
15891+# define SPIN_DEBUG_INIT(lockname) \
15892+ .magic = SPINLOCK_MAGIC, \
15893+ .owner_cpu = -1, \
15894+ .owner = SPINLOCK_OWNER_INIT,
15895+#else
15896+# define SPIN_DEBUG_INIT(lockname)
15897+#endif
15898+
15899+#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
15900+ { \
15901+ .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
15902+ SPIN_DEBUG_INIT(lockname) \
15903+ SPIN_DEP_MAP_INIT(lockname) }
15904+
15905+#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
15906+ (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15907+
15908+#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15909+
15910+#endif
15911diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
15912new file mode 100644
15913index 000000000000..3e3d8c5f7a9a
15914--- /dev/null
15915+++ b/include/linux/spinlock_types_rt.h
15916@@ -0,0 +1,48 @@
15917+#ifndef __LINUX_SPINLOCK_TYPES_RT_H
15918+#define __LINUX_SPINLOCK_TYPES_RT_H
15919+
15920+#ifndef __LINUX_SPINLOCK_TYPES_H
15921+#error "Do not include directly. Include spinlock_types.h instead"
15922+#endif
15923+
15924+#include <linux/cache.h>
15925+
15926+/*
15927+ * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
15928+ */
15929+typedef struct spinlock {
15930+ struct rt_mutex lock;
15931+ unsigned int break_lock;
15932+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15933+ struct lockdep_map dep_map;
15934+#endif
15935+} spinlock_t;
15936+
15937+#ifdef CONFIG_DEBUG_RT_MUTEXES
15938+# define __RT_SPIN_INITIALIZER(name) \
15939+ { \
15940+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15941+ .save_state = 1, \
15942+ .file = __FILE__, \
15943+ .line = __LINE__ , \
15944+ }
15945+#else
15946+# define __RT_SPIN_INITIALIZER(name) \
15947+ { \
15948+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15949+ .save_state = 1, \
15950+ }
15951+#endif
15952+
15953+/*
15954+.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
15955+*/
15956+
15957+#define __SPIN_LOCK_UNLOCKED(name) \
15958+ { .lock = __RT_SPIN_INITIALIZER(name.lock), \
15959+ SPIN_DEP_MAP_INIT(name) }
15960+
15961+#define DEFINE_SPINLOCK(name) \
15962+ spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
15963+
15964+#endif
15965diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h
15966index c09b6407ae1b..b0243ba07fb7 100644
15967--- a/include/linux/spinlock_types_up.h
15968+++ b/include/linux/spinlock_types_up.h
15969@@ -1,10 +1,6 @@
15970 #ifndef __LINUX_SPINLOCK_TYPES_UP_H
15971 #define __LINUX_SPINLOCK_TYPES_UP_H
15972
15973-#ifndef __LINUX_SPINLOCK_TYPES_H
15974-# error "please don't include this file directly"
15975-#endif
15976-
15977 /*
15978 * include/linux/spinlock_types_up.h - spinlock type definitions for UP
15979 *
15980diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
15981index 261471f407a5..f41d2fb09f87 100644
15982--- a/include/linux/srcutiny.h
15983+++ b/include/linux/srcutiny.h
15984@@ -43,7 +43,7 @@ struct srcu_struct {
15985
15986 void srcu_drive_gp(struct work_struct *wp);
15987
15988-#define __SRCU_STRUCT_INIT(name) \
15989+#define __SRCU_STRUCT_INIT(name, __ignored) \
15990 { \
15991 .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \
15992 .srcu_cb_tail = &name.srcu_cb_head, \
15993@@ -56,9 +56,9 @@ void srcu_drive_gp(struct work_struct *wp);
15994 * Tree SRCU, which needs some per-CPU data.
15995 */
15996 #define DEFINE_SRCU(name) \
15997- struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15998+ struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
15999 #define DEFINE_STATIC_SRCU(name) \
16000- static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
16001+ static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
16002
16003 void synchronize_srcu(struct srcu_struct *sp);
16004
16005diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
16006index a949f4f9e4d7..745d4ca4dd50 100644
16007--- a/include/linux/srcutree.h
16008+++ b/include/linux/srcutree.h
16009@@ -40,7 +40,7 @@ struct srcu_data {
16010 unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */
16011
16012 /* Update-side state. */
16013- raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp;
16014+ spinlock_t __private lock ____cacheline_internodealigned_in_smp;
16015 struct rcu_segcblist srcu_cblist; /* List of callbacks.*/
16016 unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */
16017 unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */
16018@@ -58,7 +58,7 @@ struct srcu_data {
16019 * Node in SRCU combining tree, similar in function to rcu_data.
16020 */
16021 struct srcu_node {
16022- raw_spinlock_t __private lock;
16023+ spinlock_t __private lock;
16024 unsigned long srcu_have_cbs[4]; /* GP seq for children */
16025 /* having CBs, but only */
16026 /* is > ->srcu_gq_seq. */
16027@@ -78,7 +78,7 @@ struct srcu_struct {
16028 struct srcu_node *level[RCU_NUM_LVLS + 1];
16029 /* First node at each level. */
16030 struct mutex srcu_cb_mutex; /* Serialize CB preparation. */
16031- raw_spinlock_t __private lock; /* Protect counters */
16032+ spinlock_t __private lock; /* Protect counters */
16033 struct mutex srcu_gp_mutex; /* Serialize GP work. */
16034 unsigned int srcu_idx; /* Current rdr array element. */
16035 unsigned long srcu_gp_seq; /* Grace-period seq #. */
16036@@ -104,10 +104,10 @@ struct srcu_struct {
16037 #define SRCU_STATE_SCAN1 1
16038 #define SRCU_STATE_SCAN2 2
16039
16040-#define __SRCU_STRUCT_INIT(name) \
16041+#define __SRCU_STRUCT_INIT(name, pcpu_name) \
16042 { \
16043- .sda = &name##_srcu_data, \
16044- .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
16045+ .sda = &pcpu_name, \
16046+ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
16047 .srcu_gp_seq_needed = 0 - 1, \
16048 __SRCU_DEP_MAP_INIT(name) \
16049 }
16050@@ -133,7 +133,7 @@ struct srcu_struct {
16051 */
16052 #define __DEFINE_SRCU(name, is_static) \
16053 static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\
16054- is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
16055+ is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_data)
16056 #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
16057 #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
16058
16059diff --git a/include/linux/suspend.h b/include/linux/suspend.h
16060index 8544357d92d0..616ea66cd283 100644
16061--- a/include/linux/suspend.h
16062+++ b/include/linux/suspend.h
16063@@ -196,6 +196,12 @@ struct platform_s2idle_ops {
16064 void (*end)(void);
16065 };
16066
16067+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
16068+extern bool pm_in_action;
16069+#else
16070+# define pm_in_action false
16071+#endif
16072+
16073 #ifdef CONFIG_SUSPEND
16074 extern suspend_state_t mem_sleep_current;
16075 extern suspend_state_t mem_sleep_default;
16076diff --git a/include/linux/swait.h b/include/linux/swait.h
16077index c98aaf677466..853f3e61a9f4 100644
16078--- a/include/linux/swait.h
16079+++ b/include/linux/swait.h
16080@@ -5,6 +5,7 @@
16081 #include <linux/list.h>
16082 #include <linux/stddef.h>
16083 #include <linux/spinlock.h>
16084+#include <linux/wait.h>
16085 #include <asm/current.h>
16086
16087 /*
16088@@ -147,6 +148,7 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
16089 extern void swake_up(struct swait_queue_head *q);
16090 extern void swake_up_all(struct swait_queue_head *q);
16091 extern void swake_up_locked(struct swait_queue_head *q);
16092+extern void swake_up_all_locked(struct swait_queue_head *q);
16093
16094 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
16095 extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
16096diff --git a/include/linux/swap.h b/include/linux/swap.h
16097index f02fb5db8914..6c775168df67 100644
16098--- a/include/linux/swap.h
16099+++ b/include/linux/swap.h
16100@@ -12,6 +12,7 @@
16101 #include <linux/fs.h>
16102 #include <linux/atomic.h>
16103 #include <linux/page-flags.h>
16104+#include <linux/locallock.h>
16105 #include <asm/page.h>
16106
16107 struct notifier_block;
16108@@ -297,7 +298,8 @@ struct vma_swap_readahead {
16109 void *workingset_eviction(struct address_space *mapping, struct page *page);
16110 bool workingset_refault(void *shadow);
16111 void workingset_activation(struct page *page);
16112-void workingset_update_node(struct radix_tree_node *node, void *private);
16113+void __workingset_update_node(struct radix_tree_node *node, void *private);
16114+DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
16115
16116 /* linux/mm/page_alloc.c */
16117 extern unsigned long totalram_pages;
16118@@ -310,6 +312,7 @@ extern unsigned long nr_free_pagecache_pages(void);
16119
16120
16121 /* linux/mm/swap.c */
16122+DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
16123 extern void lru_cache_add(struct page *);
16124 extern void lru_cache_add_anon(struct page *page);
16125 extern void lru_cache_add_file(struct page *page);
16126diff --git a/include/linux/swork.h b/include/linux/swork.h
16127new file mode 100644
16128index 000000000000..f175fa9a6016
16129--- /dev/null
16130+++ b/include/linux/swork.h
16131@@ -0,0 +1,24 @@
16132+#ifndef _LINUX_SWORK_H
16133+#define _LINUX_SWORK_H
16134+
16135+#include <linux/list.h>
16136+
16137+struct swork_event {
16138+ struct list_head item;
16139+ unsigned long flags;
16140+ void (*func)(struct swork_event *);
16141+};
16142+
16143+static inline void INIT_SWORK(struct swork_event *event,
16144+ void (*func)(struct swork_event *))
16145+{
16146+ event->flags = 0;
16147+ event->func = func;
16148+}
16149+
16150+bool swork_queue(struct swork_event *sev);
16151+
16152+int swork_get(void);
16153+void swork_put(void);
16154+
16155+#endif /* _LINUX_SWORK_H */
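
A usage sketch of the new simple-work interface (illustrative; the event and function names are made up): a user brings up the worker thread once with swork_get(), queues events with swork_queue() wherever work must be deferred to a sleepable thread, and drops the worker with swork_put() on teardown.

#include <linux/swork.h>

static struct swork_event flush_event;          /* hypothetical event */

static void flush_fn(struct swork_event *sev)
{
        /* runs in the swork kernel thread, i.e. in a sleepable context */
}

static int flush_setup(void)
{
        int err = swork_get();                  /* create/reference the worker thread */

        if (err)
                return err;
        INIT_SWORK(&flush_event, flush_fn);
        return 0;
}

static void flush_request(void)
{
        swork_queue(&flush_event);              /* hand the event to the worker */
}

static void flush_teardown(void)
{
        swork_put();
}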
16156diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
16157index cf2862bd134a..fd05d83740df 100644
16158--- a/include/linux/thread_info.h
16159+++ b/include/linux/thread_info.h
16160@@ -86,7 +86,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
16161 #define test_thread_flag(flag) \
16162 test_ti_thread_flag(current_thread_info(), flag)
16163
16164-#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
16165+#ifdef CONFIG_PREEMPT_LAZY
16166+#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \
16167+ test_thread_flag(TIF_NEED_RESCHED_LAZY))
16168+#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
16169+#define tif_need_resched_lazy() (test_thread_flag(TIF_NEED_RESCHED_LAZY))
16170+
16171+#else
16172+#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
16173+#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
16174+#define tif_need_resched_lazy() 0
16175+#endif
16176
16177 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
16178 static inline int arch_within_stack_frames(const void * const stack,
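
A small illustrative check (not from the patch; both helpers are made up) of how the three macros differ under PREEMPT_LAZY: tif_need_resched() now reports either flag, while the _now/_lazy variants let callers separate an immediate preemption request from a lazy one that may be deferred, e.g. until the lazy preempt count drops or the task returns to user space.

static inline bool resched_is_urgent(void)      /* hypothetical helper */
{
        return tif_need_resched_now();          /* lazy-only requests are not urgent */
}

static inline bool resched_is_pending(void)     /* hypothetical helper */
{
        return tif_need_resched();              /* true for either flag under PREEMPT_LAZY */
}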
16179diff --git a/include/linux/timer.h b/include/linux/timer.h
16180index e0ea1fe87572..df3085ddf662 100644
16181--- a/include/linux/timer.h
16182+++ b/include/linux/timer.h
16183@@ -213,7 +213,7 @@ extern void add_timer(struct timer_list *timer);
16184
16185 extern int try_to_del_timer_sync(struct timer_list *timer);
16186
16187-#ifdef CONFIG_SMP
16188+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
16189 extern int del_timer_sync(struct timer_list *timer);
16190 #else
16191 # define del_timer_sync(t) del_timer(t)
16192diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
16193index 2bcb4dc6df1a..edd1e42e8a2f 100644
16194--- a/include/linux/trace_events.h
16195+++ b/include/linux/trace_events.h
16196@@ -62,6 +62,9 @@ struct trace_entry {
16197 unsigned char flags;
16198 unsigned char preempt_count;
16199 int pid;
16200+ unsigned short migrate_disable;
16201+ unsigned short padding;
16202+ unsigned char preempt_lazy_count;
16203 };
16204
16205 #define TRACE_EVENT_TYPE_MAX \
16206@@ -402,11 +405,13 @@ enum event_trigger_type {
16207
16208 extern int filter_match_preds(struct event_filter *filter, void *rec);
16209
16210-extern enum event_trigger_type event_triggers_call(struct trace_event_file *file,
16211- void *rec);
16212-extern void event_triggers_post_call(struct trace_event_file *file,
16213- enum event_trigger_type tt,
16214- void *rec);
16215+extern enum event_trigger_type
16216+event_triggers_call(struct trace_event_file *file, void *rec,
16217+ struct ring_buffer_event *event);
16218+extern void
16219+event_triggers_post_call(struct trace_event_file *file,
16220+ enum event_trigger_type tt,
16221+ void *rec, struct ring_buffer_event *event);
16222
16223 bool trace_event_ignore_this_pid(struct trace_event_file *trace_file);
16224
16225@@ -426,7 +431,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
16226
16227 if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
16228 if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
16229- event_triggers_call(file, NULL);
16230+ event_triggers_call(file, NULL, NULL);
16231 if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
16232 return true;
16233 if (eflags & EVENT_FILE_FL_PID_FILTER)
16234diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
16235index 251e655d407f..57e8e32ef2b0 100644
16236--- a/include/linux/uaccess.h
16237+++ b/include/linux/uaccess.h
16238@@ -185,6 +185,7 @@ static __always_inline void pagefault_disabled_dec(void)
16239 */
16240 static inline void pagefault_disable(void)
16241 {
16242+ migrate_disable();
16243 pagefault_disabled_inc();
16244 /*
16245 * make sure to have issued the store before a pagefault
16246@@ -201,6 +202,7 @@ static inline void pagefault_enable(void)
16247 */
16248 barrier();
16249 pagefault_disabled_dec();
16250+ migrate_enable();
16251 }
16252
16253 /*
16254diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
16255index 1e0cb72e0598..87ab0996a9b0 100644
16256--- a/include/linux/vmstat.h
16257+++ b/include/linux/vmstat.h
16258@@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
16259 */
16260 static inline void __count_vm_event(enum vm_event_item item)
16261 {
16262+ preempt_disable_rt();
16263 raw_cpu_inc(vm_event_states.event[item]);
16264+ preempt_enable_rt();
16265 }
16266
16267 static inline void count_vm_event(enum vm_event_item item)
16268@@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
16269
16270 static inline void __count_vm_events(enum vm_event_item item, long delta)
16271 {
16272+ preempt_disable_rt();
16273 raw_cpu_add(vm_event_states.event[item], delta);
16274+ preempt_enable_rt();
16275 }
16276
16277 static inline void count_vm_events(enum vm_event_item item, long delta)
16278diff --git a/include/linux/wait.h b/include/linux/wait.h
16279index 158715445ffb..3451706a3074 100644
16280--- a/include/linux/wait.h
16281+++ b/include/linux/wait.h
16282@@ -10,6 +10,7 @@
16283
16284 #include <asm/current.h>
16285 #include <uapi/linux/wait.h>
16286+#include <linux/atomic.h>
16287
16288 typedef struct wait_queue_entry wait_queue_entry_t;
16289
16290@@ -486,8 +487,8 @@ do { \
16291 int __ret = 0; \
16292 struct hrtimer_sleeper __t; \
16293 \
16294- hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \
16295- hrtimer_init_sleeper(&__t, current); \
16296+ hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, HRTIMER_MODE_REL, \
16297+ current); \
16298 if ((timeout) != KTIME_MAX) \
16299 hrtimer_start_range_ns(&__t.timer, timeout, \
16300 current->timer_slack_ns, \
16301diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
16302index 304f7aa9cc01..00d3813cef26 100644
16303--- a/include/net/gen_stats.h
16304+++ b/include/net/gen_stats.h
16305@@ -6,6 +6,7 @@
16306 #include <linux/socket.h>
16307 #include <linux/rtnetlink.h>
16308 #include <linux/pkt_sched.h>
16309+#include <net/net_seq_lock.h>
16310
16311 struct gnet_stats_basic_cpu {
16312 struct gnet_stats_basic_packed bstats;
16313@@ -36,11 +37,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
16314 spinlock_t *lock, struct gnet_dump *d,
16315 int padattr);
16316
16317-int gnet_stats_copy_basic(const seqcount_t *running,
16318+int gnet_stats_copy_basic(net_seqlock_t *running,
16319 struct gnet_dump *d,
16320 struct gnet_stats_basic_cpu __percpu *cpu,
16321 struct gnet_stats_basic_packed *b);
16322-void __gnet_stats_copy_basic(const seqcount_t *running,
16323+void __gnet_stats_copy_basic(net_seqlock_t *running,
16324 struct gnet_stats_basic_packed *bstats,
16325 struct gnet_stats_basic_cpu __percpu *cpu,
16326 struct gnet_stats_basic_packed *b);
16327@@ -57,13 +58,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
16328 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
16329 struct net_rate_estimator __rcu **rate_est,
16330 spinlock_t *stats_lock,
16331- seqcount_t *running, struct nlattr *opt);
16332+ net_seqlock_t *running, struct nlattr *opt);
16333 void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
16334 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
16335 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
16336 struct net_rate_estimator __rcu **ptr,
16337 spinlock_t *stats_lock,
16338- seqcount_t *running, struct nlattr *opt);
16339+ net_seqlock_t *running, struct nlattr *opt);
16340 bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
16341 bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
16342 struct gnet_stats_rate_est64 *sample);
16343diff --git a/include/net/neighbour.h b/include/net/neighbour.h
16344index a964366a7ef5..51c854583987 100644
16345--- a/include/net/neighbour.h
16346+++ b/include/net/neighbour.h
16347@@ -450,7 +450,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
16348 }
16349 #endif
16350
16351-static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
16352+static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
16353 {
16354 unsigned int seq;
16355 unsigned int hh_len;
16356@@ -474,7 +474,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb
16357
16358 static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
16359 {
16360- const struct hh_cache *hh = &n->hh;
16361+ struct hh_cache *hh = &n->hh;
16362
16363 if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
16364 return neigh_hh_output(hh, skb);
16365@@ -515,7 +515,7 @@ struct neighbour_cb {
16366
16367 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb)
16368
16369-static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
16370+static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
16371 const struct net_device *dev)
16372 {
16373 unsigned int seq;
16374diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
16375new file mode 100644
16376index 000000000000..a7034298a82a
16377--- /dev/null
16378+++ b/include/net/net_seq_lock.h
16379@@ -0,0 +1,15 @@
16380+#ifndef __NET_NET_SEQ_LOCK_H__
16381+#define __NET_NET_SEQ_LOCK_H__
16382+
16383+#ifdef CONFIG_PREEMPT_RT_BASE
16384+# define net_seqlock_t seqlock_t
16385+# define net_seq_begin(__r) read_seqbegin(__r)
16386+# define net_seq_retry(__r, __s) read_seqretry(__r, __s)
16387+
16388+#else
16389+# define net_seqlock_t seqcount_t
16390+# define net_seq_begin(__r) read_seqcount_begin(__r)
16391+# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s)
16392+#endif
16393+
16394+#endif
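
An illustrative reader, loosely modelled on __gnet_stats_copy_basic() (the function below is made up): statistics readers sample the qdisc's running sequence through the net_seq_* wrappers, which resolve to seqlock operations on CONFIG_PREEMPT_RT_BASE and to plain seqcount operations otherwise.

#include <net/sch_generic.h>

static u64 qdisc_sample_bytes(struct Qdisc *q)  /* hypothetical helper */
{
        net_seqlock_t *running = qdisc_root_sleeping_running(q);
        unsigned int seq;
        u64 bytes;

        do {
                seq = net_seq_begin(running);
                bytes = q->bstats.bytes;
        } while (net_seq_retry(running, seq));

        return bytes;
}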
16395diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
16396index f59acacaa265..6ac7c3659973 100644
16397--- a/include/net/sch_generic.h
16398+++ b/include/net/sch_generic.h
16399@@ -10,6 +10,7 @@
16400 #include <linux/percpu.h>
16401 #include <linux/dynamic_queue_limits.h>
16402 #include <linux/list.h>
16403+#include <net/net_seq_lock.h>
16404 #include <linux/refcount.h>
16405 #include <linux/workqueue.h>
16406 #include <net/gen_stats.h>
16407@@ -90,7 +91,7 @@ struct Qdisc {
16408 struct sk_buff *gso_skb ____cacheline_aligned_in_smp;
16409 struct qdisc_skb_head q;
16410 struct gnet_stats_basic_packed bstats;
16411- seqcount_t running;
16412+ net_seqlock_t running;
16413 struct gnet_stats_queue qstats;
16414 unsigned long state;
16415 struct Qdisc *next_sched;
16416@@ -109,13 +110,22 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
16417 refcount_inc(&qdisc->refcnt);
16418 }
16419
16420-static inline bool qdisc_is_running(const struct Qdisc *qdisc)
16421+static inline bool qdisc_is_running(struct Qdisc *qdisc)
16422 {
16423+#ifdef CONFIG_PREEMPT_RT_BASE
16424+ return spin_is_locked(&qdisc->running.lock) ? true : false;
16425+#else
16426 return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
16427+#endif
16428 }
16429
16430 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
16431 {
16432+#ifdef CONFIG_PREEMPT_RT_BASE
16433+ if (try_write_seqlock(&qdisc->running))
16434+ return true;
16435+ return false;
16436+#else
16437 if (qdisc_is_running(qdisc))
16438 return false;
16439 /* Variant of write_seqcount_begin() telling lockdep a trylock
16440@@ -124,11 +134,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
16441 raw_write_seqcount_begin(&qdisc->running);
16442 seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
16443 return true;
16444+#endif
16445 }
16446
16447 static inline void qdisc_run_end(struct Qdisc *qdisc)
16448 {
16449+#ifdef CONFIG_PREEMPT_RT_BASE
16450+ write_sequnlock(&qdisc->running);
16451+#else
16452 write_seqcount_end(&qdisc->running);
16453+#endif
16454 }
16455
16456 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
16457@@ -337,7 +352,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
16458 return qdisc_lock(root);
16459 }
16460
16461-static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
16462+static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
16463 {
16464 struct Qdisc *root = qdisc_root_sleeping(qdisc);
16465
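
A sketch of the caller side (illustrative; it mirrors how qdisc_run() uses these helpers): qdisc_run_begin() acts as a trylock so only one CPU dequeues a given qdisc at a time, and on RT it is backed by try_write_seqlock() on the converted running lock.

static void run_one_qdisc(struct Qdisc *q)      /* hypothetical helper */
{
        if (qdisc_run_begin(q)) {
                /* dequeue and transmit packets here, e.g. via __qdisc_run(q) */
                qdisc_run_end(q);
        }
}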
16466diff --git a/include/net/xfrm.h b/include/net/xfrm.h
16467index db99efb2d1d0..a7b95ffbbf8b 100644
16468--- a/include/net/xfrm.h
16469+++ b/include/net/xfrm.h
16470@@ -217,7 +217,7 @@ struct xfrm_state {
16471 struct xfrm_stats stats;
16472
16473 struct xfrm_lifetime_cur curlft;
16474- struct tasklet_hrtimer mtimer;
16475+ struct hrtimer mtimer;
16476
16477 struct xfrm_state_offload xso;
16478
16479diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
16480index c6f728037c53..a57e4ee989d6 100644
16481--- a/include/trace/events/timer.h
16482+++ b/include/trace/events/timer.h
16483@@ -148,7 +148,11 @@ DEFINE_EVENT(timer_class, timer_cancel,
16484 { HRTIMER_MODE_ABS, "ABS" }, \
16485 { HRTIMER_MODE_REL, "REL" }, \
16486 { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \
16487- { HRTIMER_MODE_REL_PINNED, "REL|PINNED" })
16488+ { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \
16489+ { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \
16490+ { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \
16491+ { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \
16492+ { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" })
16493
16494 /**
16495 * hrtimer_init - called when the hrtimer is initialized
16496@@ -186,15 +190,16 @@ TRACE_EVENT(hrtimer_init,
16497 */
16498 TRACE_EVENT(hrtimer_start,
16499
16500- TP_PROTO(struct hrtimer *hrtimer),
16501+ TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),
16502
16503- TP_ARGS(hrtimer),
16504+ TP_ARGS(hrtimer, mode),
16505
16506 TP_STRUCT__entry(
16507 __field( void *, hrtimer )
16508 __field( void *, function )
16509 __field( s64, expires )
16510 __field( s64, softexpires )
16511+ __field( enum hrtimer_mode, mode )
16512 ),
16513
16514 TP_fast_assign(
16515@@ -202,12 +207,14 @@ TRACE_EVENT(hrtimer_start,
16516 __entry->function = hrtimer->function;
16517 __entry->expires = hrtimer_get_expires(hrtimer);
16518 __entry->softexpires = hrtimer_get_softexpires(hrtimer);
16519+ __entry->mode = mode;
16520 ),
16521
16522- TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu",
16523- __entry->hrtimer, __entry->function,
16524+ TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu "
16525+ "mode=%s", __entry->hrtimer, __entry->function,
16526 (unsigned long long) __entry->expires,
16527- (unsigned long long) __entry->softexpires)
16528+ (unsigned long long) __entry->softexpires,
16529+ decode_hrtimer_mode(__entry->mode))
16530 );
16531
16532 /**
16533diff --git a/init/Kconfig b/init/Kconfig
16534index 46075327c165..a7aff2c1a203 100644
16535--- a/init/Kconfig
16536+++ b/init/Kconfig
16537@@ -744,6 +744,7 @@ config CFS_BANDWIDTH
16538 config RT_GROUP_SCHED
16539 bool "Group scheduling for SCHED_RR/FIFO"
16540 depends on CGROUP_SCHED
16541+ depends on !PREEMPT_RT_FULL
16542 default n
16543 help
16544 This feature lets you explicitly allocate real CPU bandwidth
16545@@ -1533,6 +1534,7 @@ choice
16546
16547 config SLAB
16548 bool "SLAB"
16549+ depends on !PREEMPT_RT_FULL
16550 select HAVE_HARDENED_USERCOPY_ALLOCATOR
16551 help
16552 The regular slab allocator that is established and known to work
16553@@ -1553,6 +1555,7 @@ config SLUB
16554 config SLOB
16555 depends on EXPERT
16556 bool "SLOB (Simple Allocator)"
16557+ depends on !PREEMPT_RT_FULL
16558 help
16559 SLOB replaces the stock allocator with a drastically simpler
16560 allocator. SLOB is generally more space efficient but
16561@@ -1594,7 +1597,7 @@ config SLAB_FREELIST_HARDENED
16562
16563 config SLUB_CPU_PARTIAL
16564 default y
16565- depends on SLUB && SMP
16566+ depends on SLUB && SMP && !PREEMPT_RT_FULL
16567 bool "SLUB per cpu partial cache"
16568 help
16569 Per cpu partial caches accellerate objects allocation and freeing
16570diff --git a/init/Makefile b/init/Makefile
16571index 1dbb23787290..eabf3f1b14be 100644
16572--- a/init/Makefile
16573+++ b/init/Makefile
16574@@ -36,4 +36,4 @@ silent_chk_compile.h = :
16575 include/generated/compile.h: FORCE
16576 @$($(quiet)chk_compile.h)
16577 $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
16578- "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
16579+ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
16580diff --git a/init/main.c b/init/main.c
16581index c4a45145e102..c86f3d3b9a72 100644
16582--- a/init/main.c
16583+++ b/init/main.c
16584@@ -543,6 +543,7 @@ asmlinkage __visible void __init start_kernel(void)
16585 setup_command_line(command_line);
16586 setup_nr_cpu_ids();
16587 setup_per_cpu_areas();
16588+ softirq_early_init();
16589 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
16590 boot_cpu_hotplug_init();
16591
16592diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
16593index 84d882f3e299..af27c4000812 100644
16594--- a/kernel/Kconfig.locks
16595+++ b/kernel/Kconfig.locks
16596@@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
16597
16598 config MUTEX_SPIN_ON_OWNER
16599 def_bool y
16600- depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
16601+ depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16602
16603 config RWSEM_SPIN_ON_OWNER
16604 def_bool y
16605- depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
16606+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16607
16608 config LOCK_SPIN_ON_OWNER
16609 def_bool y
16610diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
16611index 3f9c97419f02..11dbe26a8279 100644
16612--- a/kernel/Kconfig.preempt
16613+++ b/kernel/Kconfig.preempt
16614@@ -1,3 +1,16 @@
16615+config PREEMPT
16616+ bool
16617+ select PREEMPT_COUNT
16618+
16619+config PREEMPT_RT_BASE
16620+ bool
16621+ select PREEMPT
16622+
16623+config HAVE_PREEMPT_LAZY
16624+ bool
16625+
16626+config PREEMPT_LAZY
16627+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
16628
16629 choice
16630 prompt "Preemption Model"
16631@@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
16632
16633 Select this if you are building a kernel for a desktop system.
16634
16635-config PREEMPT
16636+config PREEMPT__LL
16637 bool "Preemptible Kernel (Low-Latency Desktop)"
16638- select PREEMPT_COUNT
16639+ select PREEMPT
16640 select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
16641 help
16642 This option reduces the latency of the kernel by making
16643@@ -52,6 +65,22 @@ config PREEMPT
16644 embedded system with latency requirements in the milliseconds
16645 range.
16646
16647+config PREEMPT_RTB
16648+ bool "Preemptible Kernel (Basic RT)"
16649+ select PREEMPT_RT_BASE
16650+ help
16651+ This option is basically the same as (Low-Latency Desktop) but
16652+ enables changes which are preliminary for the full preemptible
16653+ RT kernel.
16654+
16655+config PREEMPT_RT_FULL
16656+ bool "Fully Preemptible Kernel (RT)"
16657+ depends on IRQ_FORCED_THREADING
16658+ select PREEMPT_RT_BASE
16659+ select PREEMPT_RCU
16660+ help
16661+	  All and everything: spinlocks, interrupt handlers and most other kernel code become fully preemptible.
16662+
16663 endchoice
16664
16665 config PREEMPT_COUNT
16666diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
16667index 3fc11b8851ac..a04c3aded76b 100644
16668--- a/kernel/cgroup/cgroup.c
16669+++ b/kernel/cgroup/cgroup.c
16670@@ -4515,10 +4515,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
16671 queue_work(cgroup_destroy_wq, &css->destroy_work);
16672 }
16673
16674-static void css_release_work_fn(struct work_struct *work)
16675+static void css_release_work_fn(struct swork_event *sev)
16676 {
16677 struct cgroup_subsys_state *css =
16678- container_of(work, struct cgroup_subsys_state, destroy_work);
16679+ container_of(sev, struct cgroup_subsys_state, destroy_swork);
16680 struct cgroup_subsys *ss = css->ss;
16681 struct cgroup *cgrp = css->cgroup;
16682
16683@@ -4569,8 +4569,8 @@ static void css_release(struct percpu_ref *ref)
16684 struct cgroup_subsys_state *css =
16685 container_of(ref, struct cgroup_subsys_state, refcnt);
16686
16687- INIT_WORK(&css->destroy_work, css_release_work_fn);
16688- queue_work(cgroup_destroy_wq, &css->destroy_work);
16689+ INIT_SWORK(&css->destroy_swork, css_release_work_fn);
16690+ swork_queue(&css->destroy_swork);
16691 }
16692
16693 static void init_and_link_css(struct cgroup_subsys_state *css,
16694@@ -5276,6 +5276,7 @@ static int __init cgroup_wq_init(void)
16695 */
16696 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
16697 BUG_ON(!cgroup_destroy_wq);
16698+ BUG_ON(swork_get());
16699 return 0;
16700 }
16701 core_initcall(cgroup_wq_init);
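The cgroup hunks above move css release handling from the system workqueue to an swork item, the simple kthread-based work facility this series adds, so the release handler always runs in fully preemptible task context on RT. The conversion pattern is generic; a minimal sketch with a hypothetical object (names are illustrative, not taken from the patch):

	struct foo {
		struct swork_event release_swork;	/* replaces struct work_struct */
		/* ... payload ... */
	};

	static void foo_release_fn(struct swork_event *sev)
	{
		struct foo *f = container_of(sev, struct foo, release_swork);

		/* tear down f; sleeping is fine here */
	}

	static void foo_release(struct foo *f)
	{
		INIT_SWORK(&f->release_swork, foo_release_fn);
		swork_queue(&f->release_swork);
	}

Note that swork_get() has to have been called once beforehand (cgroup_wq_init() above does it from an initcall) so the worker thread exists before the first swork_queue().
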
16702diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
16703index 4657e2924ecb..bda2af78277a 100644
16704--- a/kernel/cgroup/cpuset.c
16705+++ b/kernel/cgroup/cpuset.c
16706@@ -288,7 +288,7 @@ static struct cpuset top_cpuset = {
16707 */
16708
16709 static DEFINE_MUTEX(cpuset_mutex);
16710-static DEFINE_SPINLOCK(callback_lock);
16711+static DEFINE_RAW_SPINLOCK(callback_lock);
16712
16713 static struct workqueue_struct *cpuset_migrate_mm_wq;
16714
16715@@ -926,9 +926,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
16716 continue;
16717 rcu_read_unlock();
16718
16719- spin_lock_irq(&callback_lock);
16720+ raw_spin_lock_irq(&callback_lock);
16721 cpumask_copy(cp->effective_cpus, new_cpus);
16722- spin_unlock_irq(&callback_lock);
16723+ raw_spin_unlock_irq(&callback_lock);
16724
16725 WARN_ON(!is_in_v2_mode() &&
16726 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
16727@@ -993,9 +993,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
16728 if (retval < 0)
16729 return retval;
16730
16731- spin_lock_irq(&callback_lock);
16732+ raw_spin_lock_irq(&callback_lock);
16733 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
16734- spin_unlock_irq(&callback_lock);
16735+ raw_spin_unlock_irq(&callback_lock);
16736
16737 /* use trialcs->cpus_allowed as a temp variable */
16738 update_cpumasks_hier(cs, trialcs->cpus_allowed);
16739@@ -1179,9 +1179,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
16740 continue;
16741 rcu_read_unlock();
16742
16743- spin_lock_irq(&callback_lock);
16744+ raw_spin_lock_irq(&callback_lock);
16745 cp->effective_mems = *new_mems;
16746- spin_unlock_irq(&callback_lock);
16747+ raw_spin_unlock_irq(&callback_lock);
16748
16749 WARN_ON(!is_in_v2_mode() &&
16750 !nodes_equal(cp->mems_allowed, cp->effective_mems));
16751@@ -1249,9 +1249,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
16752 if (retval < 0)
16753 goto done;
16754
16755- spin_lock_irq(&callback_lock);
16756+ raw_spin_lock_irq(&callback_lock);
16757 cs->mems_allowed = trialcs->mems_allowed;
16758- spin_unlock_irq(&callback_lock);
16759+ raw_spin_unlock_irq(&callback_lock);
16760
16761 /* use trialcs->mems_allowed as a temp variable */
16762 update_nodemasks_hier(cs, &trialcs->mems_allowed);
16763@@ -1342,9 +1342,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
16764 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
16765 || (is_spread_page(cs) != is_spread_page(trialcs)));
16766
16767- spin_lock_irq(&callback_lock);
16768+ raw_spin_lock_irq(&callback_lock);
16769 cs->flags = trialcs->flags;
16770- spin_unlock_irq(&callback_lock);
16771+ raw_spin_unlock_irq(&callback_lock);
16772
16773 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
16774 rebuild_sched_domains_locked();
16775@@ -1759,7 +1759,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
16776 cpuset_filetype_t type = seq_cft(sf)->private;
16777 int ret = 0;
16778
16779- spin_lock_irq(&callback_lock);
16780+ raw_spin_lock_irq(&callback_lock);
16781
16782 switch (type) {
16783 case FILE_CPULIST:
16784@@ -1778,7 +1778,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
16785 ret = -EINVAL;
16786 }
16787
16788- spin_unlock_irq(&callback_lock);
16789+ raw_spin_unlock_irq(&callback_lock);
16790 return ret;
16791 }
16792
16793@@ -1993,12 +1993,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
16794
16795 cpuset_inc();
16796
16797- spin_lock_irq(&callback_lock);
16798+ raw_spin_lock_irq(&callback_lock);
16799 if (is_in_v2_mode()) {
16800 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
16801 cs->effective_mems = parent->effective_mems;
16802 }
16803- spin_unlock_irq(&callback_lock);
16804+ raw_spin_unlock_irq(&callback_lock);
16805
16806 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
16807 goto out_unlock;
16808@@ -2025,12 +2025,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
16809 }
16810 rcu_read_unlock();
16811
16812- spin_lock_irq(&callback_lock);
16813+ raw_spin_lock_irq(&callback_lock);
16814 cs->mems_allowed = parent->mems_allowed;
16815 cs->effective_mems = parent->mems_allowed;
16816 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
16817 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
16818- spin_unlock_irq(&callback_lock);
16819+ raw_spin_unlock_irq(&callback_lock);
16820 out_unlock:
16821 mutex_unlock(&cpuset_mutex);
16822 return 0;
16823@@ -2069,7 +2069,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
16824 static void cpuset_bind(struct cgroup_subsys_state *root_css)
16825 {
16826 mutex_lock(&cpuset_mutex);
16827- spin_lock_irq(&callback_lock);
16828+ raw_spin_lock_irq(&callback_lock);
16829
16830 if (is_in_v2_mode()) {
16831 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
16832@@ -2080,7 +2080,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
16833 top_cpuset.mems_allowed = top_cpuset.effective_mems;
16834 }
16835
16836- spin_unlock_irq(&callback_lock);
16837+ raw_spin_unlock_irq(&callback_lock);
16838 mutex_unlock(&cpuset_mutex);
16839 }
16840
16841@@ -2094,7 +2094,7 @@ static void cpuset_fork(struct task_struct *task)
16842 if (task_css_is_root(task, cpuset_cgrp_id))
16843 return;
16844
16845- set_cpus_allowed_ptr(task, &current->cpus_allowed);
16846+ set_cpus_allowed_ptr(task, current->cpus_ptr);
16847 task->mems_allowed = current->mems_allowed;
16848 }
16849
16850@@ -2178,12 +2178,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
16851 {
16852 bool is_empty;
16853
16854- spin_lock_irq(&callback_lock);
16855+ raw_spin_lock_irq(&callback_lock);
16856 cpumask_copy(cs->cpus_allowed, new_cpus);
16857 cpumask_copy(cs->effective_cpus, new_cpus);
16858 cs->mems_allowed = *new_mems;
16859 cs->effective_mems = *new_mems;
16860- spin_unlock_irq(&callback_lock);
16861+ raw_spin_unlock_irq(&callback_lock);
16862
16863 /*
16864 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
16865@@ -2220,10 +2220,10 @@ hotplug_update_tasks(struct cpuset *cs,
16866 if (nodes_empty(*new_mems))
16867 *new_mems = parent_cs(cs)->effective_mems;
16868
16869- spin_lock_irq(&callback_lock);
16870+ raw_spin_lock_irq(&callback_lock);
16871 cpumask_copy(cs->effective_cpus, new_cpus);
16872 cs->effective_mems = *new_mems;
16873- spin_unlock_irq(&callback_lock);
16874+ raw_spin_unlock_irq(&callback_lock);
16875
16876 if (cpus_updated)
16877 update_tasks_cpumask(cs);
16878@@ -2316,21 +2316,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
16879
16880 /* synchronize cpus_allowed to cpu_active_mask */
16881 if (cpus_updated) {
16882- spin_lock_irq(&callback_lock);
16883+ raw_spin_lock_irq(&callback_lock);
16884 if (!on_dfl)
16885 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
16886 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
16887- spin_unlock_irq(&callback_lock);
16888+ raw_spin_unlock_irq(&callback_lock);
16889 /* we don't mess with cpumasks of tasks in top_cpuset */
16890 }
16891
16892 /* synchronize mems_allowed to N_MEMORY */
16893 if (mems_updated) {
16894- spin_lock_irq(&callback_lock);
16895+ raw_spin_lock_irq(&callback_lock);
16896 if (!on_dfl)
16897 top_cpuset.mems_allowed = new_mems;
16898 top_cpuset.effective_mems = new_mems;
16899- spin_unlock_irq(&callback_lock);
16900+ raw_spin_unlock_irq(&callback_lock);
16901 update_tasks_nodemask(&top_cpuset);
16902 }
16903
16904@@ -2429,11 +2429,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
16905 {
16906 unsigned long flags;
16907
16908- spin_lock_irqsave(&callback_lock, flags);
16909+ raw_spin_lock_irqsave(&callback_lock, flags);
16910 rcu_read_lock();
16911 guarantee_online_cpus(task_cs(tsk), pmask);
16912 rcu_read_unlock();
16913- spin_unlock_irqrestore(&callback_lock, flags);
16914+ raw_spin_unlock_irqrestore(&callback_lock, flags);
16915 }
16916
16917 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
16918@@ -2481,11 +2481,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
16919 nodemask_t mask;
16920 unsigned long flags;
16921
16922- spin_lock_irqsave(&callback_lock, flags);
16923+ raw_spin_lock_irqsave(&callback_lock, flags);
16924 rcu_read_lock();
16925 guarantee_online_mems(task_cs(tsk), &mask);
16926 rcu_read_unlock();
16927- spin_unlock_irqrestore(&callback_lock, flags);
16928+ raw_spin_unlock_irqrestore(&callback_lock, flags);
16929
16930 return mask;
16931 }
16932@@ -2577,14 +2577,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
16933 return true;
16934
16935 /* Not hardwall and node outside mems_allowed: scan up cpusets */
16936- spin_lock_irqsave(&callback_lock, flags);
16937+ raw_spin_lock_irqsave(&callback_lock, flags);
16938
16939 rcu_read_lock();
16940 cs = nearest_hardwall_ancestor(task_cs(current));
16941 allowed = node_isset(node, cs->mems_allowed);
16942 rcu_read_unlock();
16943
16944- spin_unlock_irqrestore(&callback_lock, flags);
16945+ raw_spin_unlock_irqrestore(&callback_lock, flags);
16946 return allowed;
16947 }
16948
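On PREEMPT_RT_FULL an ordinary spinlock_t is substituted by a sleeping rtmutex-based lock, so a lock that is taken with interrupts disabled or from other contexts that must not sleep has to become a raw_spinlock_t, which keeps real spinning semantics. The callback_lock conversion above is the standard mechanical pattern; a minimal generic sketch (hypothetical lock and data names):

	struct some_state {
		int value;
	};

	static DEFINE_RAW_SPINLOCK(my_callback_lock);

	static void update_state(struct some_state *s, int val)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&my_callback_lock, flags);
		s->value = val;		/* keep the critical section short and bounded */
		raw_spin_unlock_irqrestore(&my_callback_lock, flags);
	}

The price is that raw spinlock sections stay non-preemptible even on RT, so only short, bounded critical sections should be converted this way.
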
16949diff --git a/kernel/cpu.c b/kernel/cpu.c
16950index f3f389e33343..7d777b62e4eb 100644
16951--- a/kernel/cpu.c
16952+++ b/kernel/cpu.c
16953@@ -74,6 +74,11 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
16954 .fail = CPUHP_INVALID,
16955 };
16956
16957+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PREEMPT_RT_FULL)
16958+static DEFINE_PER_CPU(struct rt_rw_lock, cpuhp_pin_lock) = \
16959+ __RWLOCK_RT_INITIALIZER(cpuhp_pin_lock);
16960+#endif
16961+
16962 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
16963 static struct lockdep_map cpuhp_state_up_map =
16964 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
16965@@ -287,6 +292,55 @@ static int cpu_hotplug_disabled;
16966
16967 #ifdef CONFIG_HOTPLUG_CPU
16968
16969+/**
16970+ * pin_current_cpu - Prevent the current cpu from being unplugged
16971+ */
16972+void pin_current_cpu(void)
16973+{
16974+#ifdef CONFIG_PREEMPT_RT_FULL
16975+ struct rt_rw_lock *cpuhp_pin;
16976+ unsigned int cpu;
16977+ int ret;
16978+
16979+again:
16980+ cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
16981+ ret = __read_rt_trylock(cpuhp_pin);
16982+ if (ret) {
16983+ current->pinned_on_cpu = smp_processor_id();
16984+ return;
16985+ }
16986+ cpu = smp_processor_id();
16987+ preempt_lazy_enable();
16988+ preempt_enable();
16989+
16990+ __read_rt_lock(cpuhp_pin);
16991+
16992+ preempt_disable();
16993+ preempt_lazy_disable();
16994+ if (cpu != smp_processor_id()) {
16995+ __read_rt_unlock(cpuhp_pin);
16996+ goto again;
16997+ }
16998+ current->pinned_on_cpu = cpu;
16999+#endif
17000+}
17001+
17002+/**
17003+ * unpin_current_cpu - Allow unplug of current cpu
17004+ */
17005+void unpin_current_cpu(void)
17006+{
17007+#ifdef CONFIG_PREEMPT_RT_FULL
17008+ struct rt_rw_lock *cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
17009+
17010+ if (WARN_ON(current->pinned_on_cpu != smp_processor_id()))
17011+ cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, current->pinned_on_cpu);
17012+
17013+ current->pinned_on_cpu = -1;
17014+ __read_rt_unlock(cpuhp_pin);
17015+#endif
17016+}
17017+
17018 DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
17019
17020 void cpus_read_lock(void)
17021@@ -843,6 +897,9 @@ static int take_cpu_down(void *_param)
17022
17023 static int takedown_cpu(unsigned int cpu)
17024 {
17025+#ifdef CONFIG_PREEMPT_RT_FULL
17026+ struct rt_rw_lock *cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, cpu);
17027+#endif
17028 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
17029 int err;
17030
17031@@ -855,11 +912,18 @@ static int takedown_cpu(unsigned int cpu)
17032 */
17033 irq_lock_sparse();
17034
17035+#ifdef CONFIG_PREEMPT_RT_FULL
17036+ __write_rt_lock(cpuhp_pin);
17037+#endif
17038+
17039 /*
17040 * So now all preempt/rcu users must observe !cpu_active().
17041 */
17042 err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
17043 if (err) {
17044+#ifdef CONFIG_PREEMPT_RT_FULL
17045+ __write_rt_unlock(cpuhp_pin);
17046+#endif
17047 /* CPU refused to die */
17048 irq_unlock_sparse();
17049 /* Unpark the hotplug thread so we can rollback there */
17050@@ -878,6 +942,9 @@ static int takedown_cpu(unsigned int cpu)
17051 wait_for_ap_thread(st, false);
17052 BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
17053
17054+#ifdef CONFIG_PREEMPT_RT_FULL
17055+ __write_rt_unlock(cpuhp_pin);
17056+#endif
17057 /* Interrupts are moved away from the dying cpu, reenable alloc/free */
17058 irq_unlock_sparse();
17059
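pin_current_cpu() and unpin_current_cpu() give RT a sleepable way to hold off CPU unplug: the pinning side takes the per-CPU cpuhp_pin_lock for reading, while takedown_cpu() takes it for writing around stop_machine_cpuslocked(). In this series the intended callers are the migrate-disable/enable paths (hunks not shown in this excerpt); conceptually a pinned region looks like the sketch below (illustrative only):

	static void do_per_cpu_work(void)
	{
		pin_current_cpu();	/* takedown_cpu() now blocks on this CPU */

		/* work that must not race with unplug of the current CPU */

		unpin_current_cpu();	/* unplug may proceed again */
	}
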
17060diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
17061index ed5d34925ad0..c0d4c24fc241 100644
17062--- a/kernel/debug/kdb/kdb_io.c
17063+++ b/kernel/debug/kdb/kdb_io.c
17064@@ -854,9 +854,11 @@ int kdb_printf(const char *fmt, ...)
17065 va_list ap;
17066 int r;
17067
17068+ kdb_trap_printk++;
17069 va_start(ap, fmt);
17070 r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
17071 va_end(ap);
17072+ kdb_trap_printk--;
17073
17074 return r;
17075 }
17076diff --git a/kernel/events/core.c b/kernel/events/core.c
17077index 4dbce29a9313..de3d23bae9bf 100644
17078--- a/kernel/events/core.c
17079+++ b/kernel/events/core.c
17080@@ -1065,7 +1065,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
17081 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
17082
17083 raw_spin_lock_init(&cpuctx->hrtimer_lock);
17084- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
17085+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
17086 timer->function = perf_mux_hrtimer_handler;
17087 }
17088
17089@@ -8760,7 +8760,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
17090 if (!is_sampling_event(event))
17091 return;
17092
17093- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17094+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
17095 hwc->hrtimer.function = perf_swevent_hrtimer;
17096
17097 /*
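The perf timers above switch to the *_HARD hrtimer modes added by this series: on RT, ordinary hrtimers are expired from softirq context (the ktimersoftd threads), and a timer whose callback must keep firing from hard interrupt context has to say so at init time. A minimal sketch with hypothetical names:

	static struct hrtimer my_timer;

	static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
	{
		/* runs in hard interrupt context, even with PREEMPT_RT_FULL */
		return HRTIMER_NORESTART;
	}

	static void my_timer_setup(void)
	{
		hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
		my_timer.function = my_timer_fn;
		hrtimer_start(&my_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
	}

The callback then must not take sleeping locks (including converted spinlock_t locks), which is why only latency-critical users such as the perf mux timer are switched over.
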
17098diff --git a/kernel/exit.c b/kernel/exit.c
17099index e3a08761eb40..26f3b352b37a 100644
17100--- a/kernel/exit.c
17101+++ b/kernel/exit.c
17102@@ -159,7 +159,7 @@ static void __exit_signal(struct task_struct *tsk)
17103 * Do this under ->siglock, we can race with another thread
17104 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
17105 */
17106- flush_sigqueue(&tsk->pending);
17107+ flush_task_sigqueue(tsk);
17108 tsk->sighand = NULL;
17109 spin_unlock(&sighand->siglock);
17110
17111diff --git a/kernel/fork.c b/kernel/fork.c
17112index 6a219fea4926..bc849ac60aa6 100644
17113--- a/kernel/fork.c
17114+++ b/kernel/fork.c
17115@@ -40,6 +40,7 @@
17116 #include <linux/hmm.h>
17117 #include <linux/fs.h>
17118 #include <linux/mm.h>
17119+#include <linux/kprobes.h>
17120 #include <linux/vmacache.h>
17121 #include <linux/nsproxy.h>
17122 #include <linux/capability.h>
17123@@ -407,13 +408,24 @@ static inline void put_signal_struct(struct signal_struct *sig)
17124 if (atomic_dec_and_test(&sig->sigcnt))
17125 free_signal_struct(sig);
17126 }
17127-
17128+#ifdef CONFIG_PREEMPT_RT_BASE
17129+static
17130+#endif
17131 void __put_task_struct(struct task_struct *tsk)
17132 {
17133 WARN_ON(!tsk->exit_state);
17134 WARN_ON(atomic_read(&tsk->usage));
17135 WARN_ON(tsk == current);
17136
17137+ /*
17138+ * Remove function-return probe instances associated with this
17139+ * task and put them back on the free list.
17140+ */
17141+ kprobe_flush_task(tsk);
17142+
17143+ /* Task is done with its stack. */
17144+ put_task_stack(tsk);
17145+
17146 cgroup_free(tsk);
17147 task_numa_free(tsk);
17148 security_task_free(tsk);
17149@@ -424,7 +436,18 @@ void __put_task_struct(struct task_struct *tsk)
17150 if (!profile_handoff_task(tsk))
17151 free_task(tsk);
17152 }
17153+#ifndef CONFIG_PREEMPT_RT_BASE
17154 EXPORT_SYMBOL_GPL(__put_task_struct);
17155+#else
17156+void __put_task_struct_cb(struct rcu_head *rhp)
17157+{
17158+ struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
17159+
17160+ __put_task_struct(tsk);
17161+
17162+}
17163+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
17164+#endif
17165
17166 void __init __weak arch_task_cache_init(void) { }
17167
17168@@ -563,7 +586,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
17169 #ifdef CONFIG_CC_STACKPROTECTOR
17170 tsk->stack_canary = get_random_canary();
17171 #endif
17172-
17173+ if (orig->cpus_ptr == &orig->cpus_mask)
17174+ tsk->cpus_ptr = &tsk->cpus_mask;
17175 /*
17176 * One for us, one for whoever does the "release_task()" (usually
17177 * parent)
17178@@ -575,6 +599,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
17179 tsk->splice_pipe = NULL;
17180 tsk->task_frag.page = NULL;
17181 tsk->wake_q.next = NULL;
17182+ tsk->wake_q_sleeper.next = NULL;
17183
17184 account_kernel_stack(tsk, 1);
17185
17186@@ -915,6 +940,19 @@ void __mmdrop(struct mm_struct *mm)
17187 }
17188 EXPORT_SYMBOL_GPL(__mmdrop);
17189
17190+#ifdef CONFIG_PREEMPT_RT_BASE
17191+/*
17192+ * RCU callback for delayed mm drop. Not strictly rcu, but we don't
17193+ * want another facility to make this work.
17194+ */
17195+void __mmdrop_delayed(struct rcu_head *rhp)
17196+{
17197+ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
17198+
17199+ __mmdrop(mm);
17200+}
17201+#endif
17202+
17203 static inline void __mmput(struct mm_struct *mm)
17204 {
17205 VM_BUG_ON(atomic_read(&mm->mm_users));
17206@@ -1496,6 +1534,9 @@ static void rt_mutex_init_task(struct task_struct *p)
17207 */
17208 static void posix_cpu_timers_init(struct task_struct *tsk)
17209 {
17210+#ifdef CONFIG_PREEMPT_RT_BASE
17211+ tsk->posix_timer_list = NULL;
17212+#endif
17213 tsk->cputime_expires.prof_exp = 0;
17214 tsk->cputime_expires.virt_exp = 0;
17215 tsk->cputime_expires.sched_exp = 0;
17216@@ -1648,6 +1689,7 @@ static __latent_entropy struct task_struct *copy_process(
17217 spin_lock_init(&p->alloc_lock);
17218
17219 init_sigpending(&p->pending);
17220+ p->sigqueue_cache = NULL;
17221
17222 p->utime = p->stime = p->gtime = 0;
17223 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
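With PREEMPT_RT_BASE the final __put_task_struct() is no longer called directly from the put path: it becomes static here and the exported symbol is __put_task_struct_cb(), an RCU callback, so the actual freeing is deferred out of contexts where it may not sleep on RT. The matching header change is not part of this excerpt; assuming the put_rcu rcu_head that the callback above uses with container_of(), the deferred put looks roughly like:

	static inline void put_task_struct(struct task_struct *t)
	{
		if (atomic_dec_and_test(&t->usage))
			call_rcu(&t->put_rcu, __put_task_struct_cb);
	}
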
17224diff --git a/kernel/futex.c b/kernel/futex.c
17225index 046cd780d057..2ba7fb04a107 100644
17226--- a/kernel/futex.c
17227+++ b/kernel/futex.c
17228@@ -936,7 +936,9 @@ void exit_pi_state_list(struct task_struct *curr)
17229 if (head->next != next) {
17230 /* retain curr->pi_lock for the loop invariant */
17231 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
17232+ raw_spin_unlock_irq(&curr->pi_lock);
17233 spin_unlock(&hb->lock);
17234+ raw_spin_lock_irq(&curr->pi_lock);
17235 put_pi_state(pi_state);
17236 continue;
17237 }
17238@@ -1430,6 +1432,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
17239 struct task_struct *new_owner;
17240 bool postunlock = false;
17241 DEFINE_WAKE_Q(wake_q);
17242+ DEFINE_WAKE_Q(wake_sleeper_q);
17243 int ret = 0;
17244
17245 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
17246@@ -1491,13 +1494,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
17247 pi_state->owner = new_owner;
17248 raw_spin_unlock(&new_owner->pi_lock);
17249
17250- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
17251-
17252+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
17253+ &wake_sleeper_q);
17254 out_unlock:
17255 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
17256
17257 if (postunlock)
17258- rt_mutex_postunlock(&wake_q);
17259+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
17260
17261 return ret;
17262 }
17263@@ -2104,6 +2107,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
17264 requeue_pi_wake_futex(this, &key2, hb2);
17265 drop_count++;
17266 continue;
17267+ } else if (ret == -EAGAIN) {
17268+ /*
17269+ * Waiter was woken by timeout or
17270+ * signal and has set pi_blocked_on to
17271+ * PI_WAKEUP_INPROGRESS before we
17272+ * tried to enqueue it on the rtmutex.
17273+ */
17274+ this->pi_state = NULL;
17275+ put_pi_state(pi_state);
17276+ continue;
17277 } else if (ret) {
17278 /*
17279 * rt_mutex_start_proxy_lock() detected a
17280@@ -2642,10 +2655,9 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
17281 if (abs_time) {
17282 to = &timeout;
17283
17284- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
17285- CLOCK_REALTIME : CLOCK_MONOTONIC,
17286- HRTIMER_MODE_ABS);
17287- hrtimer_init_sleeper(to, current);
17288+ hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
17289+ CLOCK_REALTIME : CLOCK_MONOTONIC,
17290+ HRTIMER_MODE_ABS, current);
17291 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
17292 current->timer_slack_ns);
17293 }
17294@@ -2744,9 +2756,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
17295
17296 if (time) {
17297 to = &timeout;
17298- hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
17299- HRTIMER_MODE_ABS);
17300- hrtimer_init_sleeper(to, current);
17301+ hrtimer_init_sleeper_on_stack(to, CLOCK_REALTIME,
17302+ HRTIMER_MODE_ABS, current);
17303 hrtimer_set_expires(&to->timer, *time);
17304 }
17305
17306@@ -2801,7 +2812,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
17307 goto no_block;
17308 }
17309
17310- rt_mutex_init_waiter(&rt_waiter);
17311+ rt_mutex_init_waiter(&rt_waiter, false);
17312
17313 /*
17314 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
17315@@ -2816,9 +2827,18 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
17316 * lock handoff sequence.
17317 */
17318 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
17319+ /*
17320+ * the migrate_disable() here disables migration in the in_atomic() fast
17321+ * path which is enabled again in the following spin_unlock(). We have
17322+ * one migrate_disable() pending in the slow-path which is reversed
17323+ * after the raw_spin_unlock_irq() where we leave the atomic context.
17324+ */
17325+ migrate_disable();
17326+
17327 spin_unlock(q.lock_ptr);
17328 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
17329 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
17330+ migrate_enable();
17331
17332 if (ret) {
17333 if (ret == 1)
17334@@ -2965,11 +2985,21 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
17335 * observed.
17336 */
17337 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
17338+ /*
17339+ * Magic trickery for now to make the RT migrate disable
17340+ * logic happy. The following spin_unlock() happens with
17341+ * interrupts disabled so the internal migrate_enable()
17342+ * won't undo the migrate_disable() which was issued when
17343+ * locking hb->lock.
17344+ */
17345+ migrate_disable();
17346 spin_unlock(&hb->lock);
17347
17348 /* drops pi_state->pi_mutex.wait_lock */
17349 ret = wake_futex_pi(uaddr, uval, pi_state);
17350
17351+ migrate_enable();
17352+
17353 put_pi_state(pi_state);
17354
17355 /*
17356@@ -3127,7 +3157,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17357 struct hrtimer_sleeper timeout, *to = NULL;
17358 struct futex_pi_state *pi_state = NULL;
17359 struct rt_mutex_waiter rt_waiter;
17360- struct futex_hash_bucket *hb;
17361+ struct futex_hash_bucket *hb, *hb2;
17362 union futex_key key2 = FUTEX_KEY_INIT;
17363 struct futex_q q = futex_q_init;
17364 int res, ret;
17365@@ -3143,10 +3173,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17366
17367 if (abs_time) {
17368 to = &timeout;
17369- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
17370- CLOCK_REALTIME : CLOCK_MONOTONIC,
17371- HRTIMER_MODE_ABS);
17372- hrtimer_init_sleeper(to, current);
17373+ hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
17374+ CLOCK_REALTIME : CLOCK_MONOTONIC,
17375+ HRTIMER_MODE_ABS, current);
17376 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
17377 current->timer_slack_ns);
17378 }
17379@@ -3155,7 +3184,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17380 * The waiter is allocated on our stack, manipulated by the requeue
17381 * code while we sleep on uaddr.
17382 */
17383- rt_mutex_init_waiter(&rt_waiter);
17384+ rt_mutex_init_waiter(&rt_waiter, false);
17385
17386 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
17387 if (unlikely(ret != 0))
17388@@ -3186,20 +3215,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17389 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
17390 futex_wait_queue_me(hb, &q, to);
17391
17392- spin_lock(&hb->lock);
17393- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17394- spin_unlock(&hb->lock);
17395- if (ret)
17396- goto out_put_keys;
17397+ /*
17398+ * On RT we must avoid races with requeue and trying to block
17399+ * on two mutexes (hb->lock and uaddr2's rtmutex) by
17400+ * serializing access to pi_blocked_on with pi_lock.
17401+ */
17402+ raw_spin_lock_irq(&current->pi_lock);
17403+ if (current->pi_blocked_on) {
17404+ /*
17405+ * We have been requeued or are in the process of
17406+ * being requeued.
17407+ */
17408+ raw_spin_unlock_irq(&current->pi_lock);
17409+ } else {
17410+ /*
17411+ * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
17412+ * prevents a concurrent requeue from moving us to the
17413+ * uaddr2 rtmutex. After that we can safely acquire
17414+ * (and possibly block on) hb->lock.
17415+ */
17416+ current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
17417+ raw_spin_unlock_irq(&current->pi_lock);
17418+
17419+ spin_lock(&hb->lock);
17420+
17421+ /*
17422+ * Clean up pi_blocked_on. We might leak it otherwise
17423+ * when we succeeded with the hb->lock in the fast
17424+ * path.
17425+ */
17426+ raw_spin_lock_irq(&current->pi_lock);
17427+ current->pi_blocked_on = NULL;
17428+ raw_spin_unlock_irq(&current->pi_lock);
17429+
17430+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17431+ spin_unlock(&hb->lock);
17432+ if (ret)
17433+ goto out_put_keys;
17434+ }
17435
17436 /*
17437- * In order for us to be here, we know our q.key == key2, and since
17438- * we took the hb->lock above, we also know that futex_requeue() has
17439- * completed and we no longer have to concern ourselves with a wakeup
17440- * race with the atomic proxy lock acquisition by the requeue code. The
17441- * futex_requeue dropped our key1 reference and incremented our key2
17442- * reference count.
17443+ * In order to be here, we have either been requeued, are in
17444+ * the process of being requeued, or requeue successfully
17445+ * acquired uaddr2 on our behalf. If pi_blocked_on was
17446+ * non-null above, we may be racing with a requeue. Do not
17447+ * rely on q->lock_ptr to be hb2->lock until after blocking on
17448+ * hb->lock or hb2->lock. The futex_requeue dropped our key1
17449+ * reference and incremented our key2 reference count.
17450 */
17451+ hb2 = hash_futex(&key2);
17452
17453 /* Check if the requeue code acquired the second futex for us. */
17454 if (!q.rt_waiter) {
17455@@ -3208,7 +3272,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17456 * did a lock-steal - fix up the PI-state in that case.
17457 */
17458 if (q.pi_state && (q.pi_state->owner != current)) {
17459- spin_lock(q.lock_ptr);
17460+ spin_lock(&hb2->lock);
17461+ BUG_ON(&hb2->lock != q.lock_ptr);
17462 ret = fixup_pi_state_owner(uaddr2, &q, current);
17463 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
17464 pi_state = q.pi_state;
17465@@ -3219,7 +3284,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17466 * the requeue_pi() code acquired for us.
17467 */
17468 put_pi_state(q.pi_state);
17469- spin_unlock(q.lock_ptr);
17470+ spin_unlock(&hb2->lock);
17471 }
17472 } else {
17473 struct rt_mutex *pi_mutex;
17474@@ -3233,7 +3298,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17475 pi_mutex = &q.pi_state->pi_mutex;
17476 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
17477
17478- spin_lock(q.lock_ptr);
17479+ spin_lock(&hb2->lock);
17480+ BUG_ON(&hb2->lock != q.lock_ptr);
17481 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
17482 ret = 0;
17483
17484diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
17485index 79f987b942b8..d1dbacc29941 100644
17486--- a/kernel/irq/handle.c
17487+++ b/kernel/irq/handle.c
17488@@ -183,10 +183,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
17489 {
17490 irqreturn_t retval;
17491 unsigned int flags = 0;
17492+ struct pt_regs *regs = get_irq_regs();
17493+ u64 ip = regs ? instruction_pointer(regs) : 0;
17494
17495 retval = __handle_irq_event_percpu(desc, &flags);
17496
17497- add_interrupt_randomness(desc->irq_data.irq, flags);
17498+#ifdef CONFIG_PREEMPT_RT_FULL
17499+ desc->random_ip = ip;
17500+#else
17501+ add_interrupt_randomness(desc->irq_data.irq, flags, ip);
17502+#endif
17503
17504 if (!noirqdebug)
17505 note_interrupt(desc, retval);
17506diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
17507index 069311541577..f82dcca81712 100644
17508--- a/kernel/irq/manage.c
17509+++ b/kernel/irq/manage.c
17510@@ -24,6 +24,7 @@
17511 #include "internals.h"
17512
17513 #ifdef CONFIG_IRQ_FORCED_THREADING
17514+# ifndef CONFIG_PREEMPT_RT_BASE
17515 __read_mostly bool force_irqthreads;
17516
17517 static int __init setup_forced_irqthreads(char *arg)
17518@@ -32,6 +33,7 @@ static int __init setup_forced_irqthreads(char *arg)
17519 return 0;
17520 }
17521 early_param("threadirqs", setup_forced_irqthreads);
17522+# endif
17523 #endif
17524
17525 static void __synchronize_hardirq(struct irq_desc *desc)
17526@@ -224,7 +226,12 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
17527
17528 if (desc->affinity_notify) {
17529 kref_get(&desc->affinity_notify->kref);
17530+
17531+#ifdef CONFIG_PREEMPT_RT_BASE
17532+ swork_queue(&desc->affinity_notify->swork);
17533+#else
17534 schedule_work(&desc->affinity_notify->work);
17535+#endif
17536 }
17537 irqd_set(data, IRQD_AFFINITY_SET);
17538
17539@@ -262,10 +269,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
17540 }
17541 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
17542
17543-static void irq_affinity_notify(struct work_struct *work)
17544+static void _irq_affinity_notify(struct irq_affinity_notify *notify)
17545 {
17546- struct irq_affinity_notify *notify =
17547- container_of(work, struct irq_affinity_notify, work);
17548 struct irq_desc *desc = irq_to_desc(notify->irq);
17549 cpumask_var_t cpumask;
17550 unsigned long flags;
17551@@ -287,6 +292,35 @@ static void irq_affinity_notify(struct work_struct *work)
17552 kref_put(&notify->kref, notify->release);
17553 }
17554
17555+#ifdef CONFIG_PREEMPT_RT_BASE
17556+static void init_helper_thread(void)
17557+{
17558+ static int init_sworker_once;
17559+
17560+ if (init_sworker_once)
17561+ return;
17562+ if (WARN_ON(swork_get()))
17563+ return;
17564+ init_sworker_once = 1;
17565+}
17566+
17567+static void irq_affinity_notify(struct swork_event *swork)
17568+{
17569+ struct irq_affinity_notify *notify =
17570+ container_of(swork, struct irq_affinity_notify, swork);
17571+ _irq_affinity_notify(notify);
17572+}
17573+
17574+#else
17575+
17576+static void irq_affinity_notify(struct work_struct *work)
17577+{
17578+ struct irq_affinity_notify *notify =
17579+ container_of(work, struct irq_affinity_notify, work);
17580+ _irq_affinity_notify(notify);
17581+}
17582+#endif
17583+
17584 /**
17585 * irq_set_affinity_notifier - control notification of IRQ affinity changes
17586 * @irq: Interrupt for which to enable/disable notification
17587@@ -315,7 +349,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
17588 if (notify) {
17589 notify->irq = irq;
17590 kref_init(&notify->kref);
17591+#ifdef CONFIG_PREEMPT_RT_BASE
17592+ INIT_SWORK(&notify->swork, irq_affinity_notify);
17593+ init_helper_thread();
17594+#else
17595 INIT_WORK(&notify->work, irq_affinity_notify);
17596+#endif
17597 }
17598
17599 raw_spin_lock_irqsave(&desc->lock, flags);
17600@@ -883,7 +922,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
17601 local_bh_disable();
17602 ret = action->thread_fn(action->irq, action->dev_id);
17603 irq_finalize_oneshot(desc, action);
17604- local_bh_enable();
17605+ /*
17606+ * Interrupts which have real time requirements can be set up
17607+ * to avoid softirq processing in the thread handler. This is
17608+ * safe as these interrupts do not raise soft interrupts.
17609+ */
17610+ if (irq_settings_no_softirq_call(desc))
17611+ _local_bh_enable();
17612+ else
17613+ local_bh_enable();
17614 return ret;
17615 }
17616
17617@@ -980,6 +1027,12 @@ static int irq_thread(void *data)
17618 if (action_ret == IRQ_WAKE_THREAD)
17619 irq_wake_secondary(desc, action);
17620
17621+#ifdef CONFIG_PREEMPT_RT_FULL
17622+ migrate_disable();
17623+ add_interrupt_randomness(action->irq, 0,
17624+ desc->random_ip ^ (unsigned long) action);
17625+ migrate_enable();
17626+#endif
17627 wake_threads_waitq(desc);
17628 }
17629
17630@@ -1378,6 +1431,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
17631 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
17632 }
17633
17634+ if (new->flags & IRQF_NO_SOFTIRQ_CALL)
17635+ irq_settings_set_no_softirq_call(desc);
17636+
17637 if (irq_settings_can_autoenable(desc)) {
17638 irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
17639 } else {
17640@@ -2159,7 +2215,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
17641 * This call sets the internal irqchip state of an interrupt,
17642 * depending on the value of @which.
17643 *
17644- * This function should be called with preemption disabled if the
17645+ * This function should be called with migration disabled if the
17646 * interrupt controller has per-cpu registers.
17647 */
17648 int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
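The new IRQ_NO_SOFTIRQ_CALL setting lets a forced-threaded handler finish with _local_bh_enable() instead of local_bh_enable(), i.e. without processing pending softirqs in the handler thread. That is only safe for interrupts which never raise softirqs themselves, but for those it keeps softirq load out of a latency-critical IRQ thread. A driver opts in with the IRQF_NO_SOFTIRQ_CALL flag added by this series; a minimal sketch with hypothetical names:

	struct my_dev;	/* hypothetical driver instance */

	static irqreturn_t my_dev_irq_thread(int irq, void *data)
	{
		/* handler work; pending softirqs are not run when this returns */
		return IRQ_HANDLED;
	}

	static int my_dev_request_irq(struct my_dev *dev, int irq)
	{
		return request_threaded_irq(irq, NULL, my_dev_irq_thread,
					    IRQF_ONESHOT | IRQF_NO_SOFTIRQ_CALL,
					    "my-rt-device", dev);
	}
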
17649diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
17650index e43795cd2ccf..47e2f9e23586 100644
17651--- a/kernel/irq/settings.h
17652+++ b/kernel/irq/settings.h
17653@@ -17,6 +17,7 @@ enum {
17654 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17655 _IRQ_IS_POLLED = IRQ_IS_POLLED,
17656 _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
17657+ _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL,
17658 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17659 };
17660
17661@@ -31,6 +32,7 @@ enum {
17662 #define IRQ_PER_CPU_DEVID GOT_YOU_MORON
17663 #define IRQ_IS_POLLED GOT_YOU_MORON
17664 #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
17665+#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON
17666 #undef IRQF_MODIFY_MASK
17667 #define IRQF_MODIFY_MASK GOT_YOU_MORON
17668
17669@@ -41,6 +43,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
17670 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
17671 }
17672
17673+static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
17674+{
17675+ return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
17676+}
17677+
17678+static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
17679+{
17680+ desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
17681+}
17682+
17683 static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
17684 {
17685 return desc->status_use_accessors & _IRQ_PER_CPU;
17686diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
17687index 987d7bca4864..75347fb1dfea 100644
17688--- a/kernel/irq/spurious.c
17689+++ b/kernel/irq/spurious.c
17690@@ -445,6 +445,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
17691
17692 static int __init irqfixup_setup(char *str)
17693 {
17694+#ifdef CONFIG_PREEMPT_RT_BASE
17695+ pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17696+ return 1;
17697+#endif
17698 irqfixup = 1;
17699 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
17700 printk(KERN_WARNING "This may impact system performance.\n");
17701@@ -457,6 +461,10 @@ module_param(irqfixup, int, 0644);
17702
17703 static int __init irqpoll_setup(char *str)
17704 {
17705+#ifdef CONFIG_PREEMPT_RT_BASE
17706+ pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17707+ return 1;
17708+#endif
17709 irqfixup = 2;
17710 printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
17711 "enabled\n");
17712diff --git a/kernel/irq_work.c b/kernel/irq_work.c
17713index bcf107ce0854..2899ba0d23d1 100644
17714--- a/kernel/irq_work.c
17715+++ b/kernel/irq_work.c
17716@@ -17,6 +17,7 @@
17717 #include <linux/cpu.h>
17718 #include <linux/notifier.h>
17719 #include <linux/smp.h>
17720+#include <linux/interrupt.h>
17721 #include <asm/processor.h>
17722
17723
17724@@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
17725 */
17726 bool irq_work_queue_on(struct irq_work *work, int cpu)
17727 {
17728+ struct llist_head *list;
17729+
17730 /* All work should have been flushed before going offline */
17731 WARN_ON_ONCE(cpu_is_offline(cpu));
17732
17733@@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
17734 if (!irq_work_claim(work))
17735 return false;
17736
17737- if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
17738+ if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
17739+ list = &per_cpu(lazy_list, cpu);
17740+ else
17741+ list = &per_cpu(raised_list, cpu);
17742+
17743+ if (llist_add(&work->llnode, list))
17744 arch_send_call_function_single_ipi(cpu);
17745
17746 return true;
17747@@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
17748 /* Enqueue the irq work @work on the current CPU */
17749 bool irq_work_queue(struct irq_work *work)
17750 {
17751+ struct llist_head *list;
17752+ bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17753+
17754 /* Only queue if not already pending */
17755 if (!irq_work_claim(work))
17756 return false;
17757@@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
17758 /* Queue the entry and raise the IPI if needed. */
17759 preempt_disable();
17760
17761- /* If the work is "lazy", handle it from next tick if any */
17762- if (work->flags & IRQ_WORK_LAZY) {
17763- if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
17764- tick_nohz_tick_stopped())
17765- arch_irq_work_raise();
17766- } else {
17767- if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
17768+ lazy_work = work->flags & IRQ_WORK_LAZY;
17769+
17770+ if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
17771+ list = this_cpu_ptr(&lazy_list);
17772+ else
17773+ list = this_cpu_ptr(&raised_list);
17774+
17775+ if (llist_add(&work->llnode, list)) {
17776+ if (!lazy_work || tick_nohz_tick_stopped())
17777 arch_irq_work_raise();
17778 }
17779
17780@@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
17781 raised = this_cpu_ptr(&raised_list);
17782 lazy = this_cpu_ptr(&lazy_list);
17783
17784- if (llist_empty(raised) || arch_irq_work_has_interrupt())
17785- if (llist_empty(lazy))
17786- return false;
17787+ if (llist_empty(raised) && llist_empty(lazy))
17788+ return false;
17789
17790 /* All work should have been flushed before going offline */
17791 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
17792@@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
17793 struct irq_work *work;
17794 struct llist_node *llnode;
17795
17796- BUG_ON(!irqs_disabled());
17797+ BUG_ON_NONRT(!irqs_disabled());
17798
17799 if (llist_empty(list))
17800 return;
17801@@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
17802 void irq_work_run(void)
17803 {
17804 irq_work_run_list(this_cpu_ptr(&raised_list));
17805- irq_work_run_list(this_cpu_ptr(&lazy_list));
17806+ if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
17807+ /*
17808+ * NOTE: we raise softirq via IPI for safety,
17809+ * and execute in irq_work_tick() to move the
17810+ * overhead from hard to soft irq context.
17811+ */
17812+ if (!llist_empty(this_cpu_ptr(&lazy_list)))
17813+ raise_softirq(TIMER_SOFTIRQ);
17814+ } else
17815+ irq_work_run_list(this_cpu_ptr(&lazy_list));
17816 }
17817 EXPORT_SYMBOL_GPL(irq_work_run);
17818
17819@@ -179,8 +200,17 @@ void irq_work_tick(void)
17820
17821 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
17822 irq_work_run_list(raised);
17823+
17824+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
17825+ irq_work_run_list(this_cpu_ptr(&lazy_list));
17826+}
17827+
17828+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
17829+void irq_work_tick_soft(void)
17830+{
17831 irq_work_run_list(this_cpu_ptr(&lazy_list));
17832 }
17833+#endif
17834
17835 /*
17836 * Synchronize against the irq_work @entry, ensures the entry is not
17837diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
17838index 46ba853656f6..9a23632b6294 100644
17839--- a/kernel/ksysfs.c
17840+++ b/kernel/ksysfs.c
17841@@ -140,6 +140,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
17842
17843 #endif /* CONFIG_CRASH_CORE */
17844
17845+#if defined(CONFIG_PREEMPT_RT_FULL)
17846+static ssize_t realtime_show(struct kobject *kobj,
17847+ struct kobj_attribute *attr, char *buf)
17848+{
17849+ return sprintf(buf, "%d\n", 1);
17850+}
17851+KERNEL_ATTR_RO(realtime);
17852+#endif
17853+
17854 /* whether file capabilities are enabled */
17855 static ssize_t fscaps_show(struct kobject *kobj,
17856 struct kobj_attribute *attr, char *buf)
17857@@ -230,6 +239,9 @@ static struct attribute * kernel_attrs[] = {
17858 #ifndef CONFIG_TINY_RCU
17859 &rcu_expedited_attr.attr,
17860 &rcu_normal_attr.attr,
17861+#endif
17862+#ifdef CONFIG_PREEMPT_RT_FULL
17863+ &realtime_attr.attr,
17864 #endif
17865 NULL
17866 };
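The ksysfs hunk adds a read-only /sys/kernel/realtime attribute that reads as 1 and is only present when CONFIG_PREEMPT_RT_FULL is set, giving userspace a cheap way to detect an RT kernel. A small userspace sketch (illustrative, not part of the patch):

	#include <stdio.h>

	/* Returns 1 on a PREEMPT_RT_FULL kernel, 0 otherwise. */
	static int kernel_is_rt(void)
	{
		FILE *f = fopen("/sys/kernel/realtime", "r");
		int val = 0;

		if (!f)
			return 0;	/* the attribute only exists on RT kernels */
		if (fscanf(f, "%d", &val) != 1)
			val = 0;
		fclose(f);
		return val == 1;
	}
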
17867diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
17868index 392c7f23af76..c0bf04b6b965 100644
17869--- a/kernel/locking/Makefile
17870+++ b/kernel/locking/Makefile
17871@@ -3,7 +3,7 @@
17872 # and is generally not a function of system call inputs.
17873 KCOV_INSTRUMENT := n
17874
17875-obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
17876+obj-y += semaphore.o percpu-rwsem.o
17877
17878 ifdef CONFIG_FUNCTION_TRACER
17879 CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
17880@@ -12,7 +12,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
17881 CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
17882 endif
17883
17884+ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17885+obj-y += mutex.o
17886 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
17887+endif
17888+obj-y += rwsem.o
17889 obj-$(CONFIG_LOCKDEP) += lockdep.o
17890 ifeq ($(CONFIG_PROC_FS),y)
17891 obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
17892@@ -25,8 +29,11 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
17893 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
17894 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
17895 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
17896+ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17897 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
17898 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
17899+endif
17900+obj-$(CONFIG_PREEMPT_RT_FULL) += mutex-rt.o rwsem-rt.o rwlock-rt.o
17901 obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
17902 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
17903 obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
17904diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
17905index d7c155048ea9..def51a27f20f 100644
17906--- a/kernel/locking/lockdep.c
17907+++ b/kernel/locking/lockdep.c
17908@@ -3914,6 +3914,7 @@ static void check_flags(unsigned long flags)
17909 }
17910 }
17911
17912+#ifndef CONFIG_PREEMPT_RT_FULL
17913 /*
17914 * We dont accurately track softirq state in e.g.
17915 * hardirq contexts (such as on 4KSTACKS), so only
17916@@ -3928,6 +3929,7 @@ static void check_flags(unsigned long flags)
17917 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
17918 }
17919 }
17920+#endif
17921
17922 if (!debug_locks)
17923 print_irqtrace_events(current);
17924diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
17925index 6dca260eeccf..5d01ac590d4c 100644
17926--- a/kernel/locking/locktorture.c
17927+++ b/kernel/locking/locktorture.c
17928@@ -26,7 +26,6 @@
17929 #include <linux/kthread.h>
17930 #include <linux/sched/rt.h>
17931 #include <linux/spinlock.h>
17932-#include <linux/rwlock.h>
17933 #include <linux/mutex.h>
17934 #include <linux/rwsem.h>
17935 #include <linux/smp.h>
17936diff --git a/kernel/locking/mutex-rt.c b/kernel/locking/mutex-rt.c
17937new file mode 100644
17938index 000000000000..4f81595c0f52
17939--- /dev/null
17940+++ b/kernel/locking/mutex-rt.c
17941@@ -0,0 +1,223 @@
17942+/*
17943+ * kernel/rt.c
17944+ *
17945+ * Real-Time Preemption Support
17946+ *
17947+ * started by Ingo Molnar:
17948+ *
17949+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17950+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17951+ *
17952+ * historic credit for proving that Linux spinlocks can be implemented via
17953+ * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
17954+ * and others) who prototyped it on 2.4 and did lots of comparative
17955+ * research and analysis; TimeSys, for proving that you can implement a
17956+ * fully preemptible kernel via the use of IRQ threading and mutexes;
17957+ * Bill Huey for persuasively arguing on lkml that the mutex model is the
17958+ * right one; and to MontaVista, who ported pmutexes to 2.6.
17959+ *
17960+ * This code is a from-scratch implementation and is not based on pmutexes,
17961+ * but the idea of converting spinlocks to mutexes is used here too.
17962+ *
17963+ * lock debugging, locking tree, deadlock detection:
17964+ *
17965+ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
17966+ * Released under the General Public License (GPL).
17967+ *
17968+ * Includes portions of the generic R/W semaphore implementation from:
17969+ *
17970+ * Copyright (c) 2001 David Howells (dhowells@redhat.com).
17971+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
17972+ * - Derived also from comments by Linus
17973+ *
17974+ * Pending ownership of locks and ownership stealing:
17975+ *
17976+ * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
17977+ *
17978+ * (also by Steven Rostedt)
17979+ * - Converted single pi_lock to individual task locks.
17980+ *
17981+ * By Esben Nielsen:
17982+ * Doing priority inheritance with help of the scheduler.
17983+ *
17984+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17985+ * - major rework based on Esben Nielsens initial patch
17986+ * - replaced thread_info references by task_struct refs
17987+ * - removed task->pending_owner dependency
17988+ * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
17989+ * in the scheduler return path as discussed with Steven Rostedt
17990+ *
17991+ * Copyright (C) 2006, Kihon Technologies Inc.
17992+ * Steven Rostedt <rostedt@goodmis.org>
17993+ * - debugged and patched Thomas Gleixner's rework.
17994+ * - added back the cmpxchg to the rework.
17995+ * - turned atomic require back on for SMP.
17996+ */
17997+
17998+#include <linux/spinlock.h>
17999+#include <linux/rtmutex.h>
18000+#include <linux/sched.h>
18001+#include <linux/delay.h>
18002+#include <linux/module.h>
18003+#include <linux/kallsyms.h>
18004+#include <linux/syscalls.h>
18005+#include <linux/interrupt.h>
18006+#include <linux/plist.h>
18007+#include <linux/fs.h>
18008+#include <linux/futex.h>
18009+#include <linux/hrtimer.h>
18010+
18011+#include "rtmutex_common.h"
18012+
18013+/*
18014+ * struct mutex functions
18015+ */
18016+void __mutex_do_init(struct mutex *mutex, const char *name,
18017+ struct lock_class_key *key)
18018+{
18019+#ifdef CONFIG_DEBUG_LOCK_ALLOC
18020+ /*
18021+ * Make sure we are not reinitializing a held lock:
18022+ */
18023+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
18024+ lockdep_init_map(&mutex->dep_map, name, key, 0);
18025+#endif
18026+ mutex->lock.save_state = 0;
18027+}
18028+EXPORT_SYMBOL(__mutex_do_init);
18029+
18030+void __lockfunc _mutex_lock(struct mutex *lock)
18031+{
18032+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18033+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
18034+}
18035+EXPORT_SYMBOL(_mutex_lock);
18036+
18037+void __lockfunc _mutex_lock_io(struct mutex *lock)
18038+{
18039+ int token;
18040+
18041+ token = io_schedule_prepare();
18042+ _mutex_lock(lock);
18043+ io_schedule_finish(token);
18044+}
18045+EXPORT_SYMBOL_GPL(_mutex_lock_io);
18046+
18047+int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
18048+{
18049+ int ret;
18050+
18051+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18052+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
18053+ if (ret)
18054+ mutex_release(&lock->dep_map, 1, _RET_IP_);
18055+ return ret;
18056+}
18057+EXPORT_SYMBOL(_mutex_lock_interruptible);
18058+
18059+int __lockfunc _mutex_lock_killable(struct mutex *lock)
18060+{
18061+ int ret;
18062+
18063+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18064+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
18065+ if (ret)
18066+ mutex_release(&lock->dep_map, 1, _RET_IP_);
18067+ return ret;
18068+}
18069+EXPORT_SYMBOL(_mutex_lock_killable);
18070+
18071+#ifdef CONFIG_DEBUG_LOCK_ALLOC
18072+void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
18073+{
18074+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18075+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
18076+}
18077+EXPORT_SYMBOL(_mutex_lock_nested);
18078+
18079+void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass)
18080+{
18081+ int token;
18082+
18083+ token = io_schedule_prepare();
18084+
18085+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18086+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
18087+
18088+ io_schedule_finish(token);
18089+}
18090+EXPORT_SYMBOL_GPL(_mutex_lock_io_nested);
18091+
18092+void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
18093+{
18094+ mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
18095+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
18096+}
18097+EXPORT_SYMBOL(_mutex_lock_nest_lock);
18098+
18099+int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
18100+{
18101+ int ret;
18102+
18103+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18104+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
18105+ if (ret)
18106+ mutex_release(&lock->dep_map, 1, _RET_IP_);
18107+ return ret;
18108+}
18109+EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
18110+
18111+int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
18112+{
18113+ int ret;
18114+
18115+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
18116+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
18117+ if (ret)
18118+ mutex_release(&lock->dep_map, 1, _RET_IP_);
18119+ return ret;
18120+}
18121+EXPORT_SYMBOL(_mutex_lock_killable_nested);
18122+#endif
18123+
18124+int __lockfunc _mutex_trylock(struct mutex *lock)
18125+{
18126+ int ret = __rt_mutex_trylock(&lock->lock);
18127+
18128+ if (ret)
18129+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18130+
18131+ return ret;
18132+}
18133+EXPORT_SYMBOL(_mutex_trylock);
18134+
18135+void __lockfunc _mutex_unlock(struct mutex *lock)
18136+{
18137+ mutex_release(&lock->dep_map, 1, _RET_IP_);
18138+ __rt_mutex_unlock(&lock->lock);
18139+}
18140+EXPORT_SYMBOL(_mutex_unlock);
18141+
18142+/**
18143+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
18144+ * @cnt: the atomic which we are to dec
18145+ * @lock: the mutex to return holding if we dec to 0
18146+ *
18147+ * return true and hold lock if we dec to 0, return false otherwise
18148+ */
18149+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
18150+{
18151+ /* dec if we can't possibly hit 0 */
18152+ if (atomic_add_unless(cnt, -1, 1))
18153+ return 0;
18154+ /* we might hit 0, so take the lock */
18155+ mutex_lock(lock);
18156+ if (!atomic_dec_and_test(cnt)) {
18157+ /* when we actually did the dec, we didn't hit 0 */
18158+ mutex_unlock(lock);
18159+ return 0;
18160+ }
18161+ /* we hit 0, and we hold the lock */
18162+ return 1;
18163+}
18164+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
18165diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
18166index 4ad35718f123..08e233b7dc21 100644
18167--- a/kernel/locking/rtmutex.c
18168+++ b/kernel/locking/rtmutex.c
18169@@ -7,6 +7,11 @@
18170 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18171 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
18172 * Copyright (C) 2006 Esben Nielsen
18173+ * Adaptive Spinlocks:
18174+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
18175+ * and Peter Morreale,
18176+ * Adaptive Spinlocks simplification:
18177+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
18178 *
18179 * See Documentation/locking/rt-mutex-design.txt for details.
18180 */
18181@@ -18,6 +23,8 @@
18182 #include <linux/sched/wake_q.h>
18183 #include <linux/sched/debug.h>
18184 #include <linux/timer.h>
18185+#include <linux/ww_mutex.h>
18186+#include <linux/blkdev.h>
18187
18188 #include "rtmutex_common.h"
18189
18190@@ -135,6 +142,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
18191 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
18192 }
18193
18194+static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
18195+{
18196+ return waiter && waiter != PI_WAKEUP_INPROGRESS &&
18197+ waiter != PI_REQUEUE_INPROGRESS;
18198+}
18199+
18200 /*
18201 * We can speed up the acquire/release, if there's no debugging state to be
18202 * set up.
18203@@ -228,7 +241,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
18204 * Only use with rt_mutex_waiter_{less,equal}()
18205 */
18206 #define task_to_waiter(p) \
18207- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
18208+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) }
18209
18210 static inline int
18211 rt_mutex_waiter_less(struct rt_mutex_waiter *left,
18212@@ -268,6 +281,27 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
18213 return 1;
18214 }
18215
18216+#define STEAL_NORMAL 0
18217+#define STEAL_LATERAL 1
18218+
18219+static inline int
18220+rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode)
18221+{
18222+ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
18223+
18224+ if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter))
18225+ return 1;
18226+
18227+ /*
18228+ * Note that RT tasks are excluded from lateral-steals
18229+ * to prevent the introduction of an unbounded latency.
18230+ */
18231+ if (mode == STEAL_NORMAL || rt_task(waiter->task))
18232+ return 0;
18233+
18234+ return rt_mutex_waiter_equal(waiter, top_waiter);
18235+}
18236+
18237 static void
18238 rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
18239 {
18240@@ -372,6 +406,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
18241 return debug_rt_mutex_detect_deadlock(waiter, chwalk);
18242 }
18243
18244+static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
18245+{
18246+ if (waiter->savestate)
18247+ wake_up_lock_sleeper(waiter->task);
18248+ else
18249+ wake_up_process(waiter->task);
18250+}
18251+
18252 /*
18253 * Max number of times we'll walk the boosting chain:
18254 */
18255@@ -379,7 +421,8 @@ int max_lock_depth = 1024;
18256
18257 static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
18258 {
18259- return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
18260+ return rt_mutex_real_waiter(p->pi_blocked_on) ?
18261+ p->pi_blocked_on->lock : NULL;
18262 }
18263
18264 /*
18265@@ -515,7 +558,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18266 * reached or the state of the chain has changed while we
18267 * dropped the locks.
18268 */
18269- if (!waiter)
18270+ if (!rt_mutex_real_waiter(waiter))
18271 goto out_unlock_pi;
18272
18273 /*
18274@@ -696,13 +739,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18275 * follow here. This is the end of the chain we are walking.
18276 */
18277 if (!rt_mutex_owner(lock)) {
18278+ struct rt_mutex_waiter *lock_top_waiter;
18279+
18280 /*
18281 * If the requeue [7] above changed the top waiter,
18282 * then we need to wake the new top waiter up to try
18283 * to get the lock.
18284 */
18285- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
18286- wake_up_process(rt_mutex_top_waiter(lock)->task);
18287+ lock_top_waiter = rt_mutex_top_waiter(lock);
18288+ if (prerequeue_top_waiter != lock_top_waiter)
18289+ rt_mutex_wake_waiter(lock_top_waiter);
18290 raw_spin_unlock_irq(&lock->wait_lock);
18291 return 0;
18292 }
18293@@ -804,9 +850,11 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18294 * @task: The task which wants to acquire the lock
18295 * @waiter: The waiter that is queued to the lock's wait tree if the
18296 * callsite called task_blocked_on_lock(), otherwise NULL
18297+ * @mode: Lock steal mode (STEAL_NORMAL, STEAL_LATERAL)
18298 */
18299-static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18300- struct rt_mutex_waiter *waiter)
18301+static int __try_to_take_rt_mutex(struct rt_mutex *lock,
18302+ struct task_struct *task,
18303+ struct rt_mutex_waiter *waiter, int mode)
18304 {
18305 lockdep_assert_held(&lock->wait_lock);
18306
18307@@ -842,12 +890,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18308 */
18309 if (waiter) {
18310 /*
18311- * If waiter is not the highest priority waiter of
18312- * @lock, give up.
18313+ * If waiter is not the highest priority waiter of @lock,
18314+ * or its peer when lateral steal is allowed, give up.
18315 */
18316- if (waiter != rt_mutex_top_waiter(lock))
18317+ if (!rt_mutex_steal(lock, waiter, mode))
18318 return 0;
18319-
18320 /*
18321 * We can acquire the lock. Remove the waiter from the
18322 * lock waiters tree.
18323@@ -865,14 +912,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18324 */
18325 if (rt_mutex_has_waiters(lock)) {
18326 /*
18327- * If @task->prio is greater than or equal to
18328- * the top waiter priority (kernel view),
18329- * @task lost.
18330+ * If @task->prio is greater than the top waiter
18331+ * priority (kernel view), or equal to it when a
18332+ * lateral steal is forbidden, @task lost.
18333 */
18334- if (!rt_mutex_waiter_less(task_to_waiter(task),
18335- rt_mutex_top_waiter(lock)))
18336+ if (!rt_mutex_steal(lock, task_to_waiter(task), mode))
18337 return 0;
18338-
18339 /*
18340 * The current top waiter stays enqueued. We
18341 * don't have to change anything in the lock
18342@@ -919,6 +964,351 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18343 return 1;
18344 }
18345
18346+#ifdef CONFIG_PREEMPT_RT_FULL
18347+/*
18348+ * preemptible spin_lock functions:
18349+ */
18350+static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
18351+ void (*slowfn)(struct rt_mutex *lock))
18352+{
18353+ might_sleep_no_state_check();
18354+
18355+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18356+ return;
18357+ else
18358+ slowfn(lock);
18359+}
18360+
18361+static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
18362+ void (*slowfn)(struct rt_mutex *lock))
18363+{
18364+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
18365+ return;
18366+ else
18367+ slowfn(lock);
18368+}
18369+#ifdef CONFIG_SMP
18370+/*
18371+ * Note that owner is a speculative pointer and dereferencing relies
18372+ * on rcu_read_lock() and the check against the lock owner.
18373+ */
18374+static int adaptive_wait(struct rt_mutex *lock,
18375+ struct task_struct *owner)
18376+{
18377+ int res = 0;
18378+
18379+ rcu_read_lock();
18380+ for (;;) {
18381+ if (owner != rt_mutex_owner(lock))
18382+ break;
18383+ /*
18384+ * Ensure that owner->on_cpu is dereferenced _after_
18385+ * checking the above to be valid.
18386+ */
18387+ barrier();
18388+ if (!owner->on_cpu) {
18389+ res = 1;
18390+ break;
18391+ }
18392+ cpu_relax();
18393+ }
18394+ rcu_read_unlock();
18395+ return res;
18396+}
18397+#else
18398+static int adaptive_wait(struct rt_mutex *lock,
18399+ struct task_struct *orig_owner)
18400+{
18401+ return 1;
18402+}
18403+#endif
18404+
18405+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18406+ struct rt_mutex_waiter *waiter,
18407+ struct task_struct *task,
18408+ enum rtmutex_chainwalk chwalk);
18409+/*
18410+ * Slow path lock function spin_lock style: this variant is very
18411+ * careful not to miss any non-lock wakeups.
18412+ *
18413+ * We store the current state under p->pi_lock in p->saved_state and
18414+ * the try_to_wake_up() code handles this accordingly.
18415+ */
18416+void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
18417+ struct rt_mutex_waiter *waiter,
18418+ unsigned long flags)
18419+{
18420+ struct task_struct *lock_owner, *self = current;
18421+ struct rt_mutex_waiter *top_waiter;
18422+ int ret;
18423+
18424+ if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL))
18425+ return;
18426+
18427+ BUG_ON(rt_mutex_owner(lock) == self);
18428+
18429+ /*
18430+ * We save whatever state the task is in and we'll restore it
18431+ * after acquiring the lock taking real wakeups into account
18432+ * as well. We are serialized via pi_lock against wakeups. See
18433+ * try_to_wake_up().
18434+ */
18435+ raw_spin_lock(&self->pi_lock);
18436+ self->saved_state = self->state;
18437+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
18438+ raw_spin_unlock(&self->pi_lock);
18439+
18440+ ret = task_blocks_on_rt_mutex(lock, waiter, self, RT_MUTEX_MIN_CHAINWALK);
18441+ BUG_ON(ret);
18442+
18443+ for (;;) {
18444+ /* Try to acquire the lock again. */
18445+ if (__try_to_take_rt_mutex(lock, self, waiter, STEAL_LATERAL))
18446+ break;
18447+
18448+ top_waiter = rt_mutex_top_waiter(lock);
18449+ lock_owner = rt_mutex_owner(lock);
18450+
18451+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18452+
18453+ debug_rt_mutex_print_deadlock(waiter);
18454+
18455+ if (top_waiter != waiter || adaptive_wait(lock, lock_owner))
18456+ schedule();
18457+
18458+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
18459+
18460+ raw_spin_lock(&self->pi_lock);
18461+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
18462+ raw_spin_unlock(&self->pi_lock);
18463+ }
18464+
18465+ /*
18466+ * Restore the task state to current->saved_state. We set it
18467+ * to the original state above and the try_to_wake_up() code
18468+ * has possibly updated it when a real (non-rtmutex) wakeup
18469+ * happened while we were blocked. Clear saved_state so
18470+ * try_to_wake_up() does not get confused.
18471+ */
18472+ raw_spin_lock(&self->pi_lock);
18473+ __set_current_state_no_track(self->saved_state);
18474+ self->saved_state = TASK_RUNNING;
18475+ raw_spin_unlock(&self->pi_lock);
18476+
18477+ /*
18478+ * try_to_take_rt_mutex() sets the waiter bit
18479+ * unconditionally. We might have to fix that up:
18480+ */
18481+ fixup_rt_mutex_waiters(lock);
18482+
18483+ BUG_ON(rt_mutex_has_waiters(lock) && waiter == rt_mutex_top_waiter(lock));
18484+ BUG_ON(!RB_EMPTY_NODE(&waiter->tree_entry));
18485+}
18486+
18487+static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
18488+{
18489+ struct rt_mutex_waiter waiter;
18490+ unsigned long flags;
18491+
18492+ rt_mutex_init_waiter(&waiter, true);
18493+
18494+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
18495+ rt_spin_lock_slowlock_locked(lock, &waiter, flags);
18496+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18497+ debug_rt_mutex_free_waiter(&waiter);
18498+}
18499+
18500+static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
18501+ struct wake_q_head *wake_q,
18502+ struct wake_q_head *wq_sleeper);
18503+/*
18504+ * Slow path to release a rt_mutex spin_lock style
18505+ */
18506+void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
18507+{
18508+ unsigned long flags;
18509+ DEFINE_WAKE_Q(wake_q);
18510+ DEFINE_WAKE_Q(wake_sleeper_q);
18511+ bool postunlock;
18512+
18513+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
18514+ postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q);
18515+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18516+
18517+ if (postunlock)
18518+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
18519+}
18520+
18521+void __lockfunc rt_spin_lock(spinlock_t *lock)
18522+{
18523+ sleeping_lock_inc();
18524+ migrate_disable();
18525+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18526+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
18527+}
18528+EXPORT_SYMBOL(rt_spin_lock);
18529+
18530+void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
18531+{
18532+ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
18533+}
18534+
18535+#ifdef CONFIG_DEBUG_LOCK_ALLOC
18536+void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
18537+{
18538+ sleeping_lock_inc();
18539+ migrate_disable();
18540+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
18541+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
18542+}
18543+EXPORT_SYMBOL(rt_spin_lock_nested);
18544+#endif
18545+
18546+void __lockfunc rt_spin_unlock(spinlock_t *lock)
18547+{
18548+ /* NOTE: we always pass in '1' for nested, for simplicity */
18549+ spin_release(&lock->dep_map, 1, _RET_IP_);
18550+ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
18551+ migrate_enable();
18552+ sleeping_lock_dec();
18553+}
18554+EXPORT_SYMBOL(rt_spin_unlock);
18555+
18556+void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
18557+{
18558+ rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
18559+}
18560+EXPORT_SYMBOL(__rt_spin_unlock);
18561+
18562+/*
18563+ * Wait for the lock to get unlocked: instead of polling for an unlock
18564+ * (like raw spinlocks do), we lock and unlock, to force the kernel to
18565+ * schedule if there's contention:
18566+ */
18567+void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
18568+{
18569+ spin_lock(lock);
18570+ spin_unlock(lock);
18571+}
18572+EXPORT_SYMBOL(rt_spin_unlock_wait);
18573+
18574+int __lockfunc rt_spin_trylock(spinlock_t *lock)
18575+{
18576+ int ret;
18577+
18578+ sleeping_lock_inc();
18579+ migrate_disable();
18580+ ret = __rt_mutex_trylock(&lock->lock);
18581+ if (ret) {
18582+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18583+ } else {
18584+ migrate_enable();
18585+ sleeping_lock_dec();
18586+ }
18587+ return ret;
18588+}
18589+EXPORT_SYMBOL(rt_spin_trylock);
18590+
18591+int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
18592+{
18593+ int ret;
18594+
18595+ local_bh_disable();
18596+ ret = __rt_mutex_trylock(&lock->lock);
18597+ if (ret) {
18598+ sleeping_lock_inc();
18599+ migrate_disable();
18600+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18601+ } else
18602+ local_bh_enable();
18603+ return ret;
18604+}
18605+EXPORT_SYMBOL(rt_spin_trylock_bh);
18606+
18607+int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
18608+{
18609+ int ret;
18610+
18611+ *flags = 0;
18612+ ret = __rt_mutex_trylock(&lock->lock);
18613+ if (ret) {
18614+ sleeping_lock_inc();
18615+ migrate_disable();
18616+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18617+ }
18618+ return ret;
18619+}
18620+EXPORT_SYMBOL(rt_spin_trylock_irqsave);
18621+
18622+int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
18623+{
18624+ /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
18625+ if (atomic_add_unless(atomic, -1, 1))
18626+ return 0;
18627+ rt_spin_lock(lock);
18628+ if (atomic_dec_and_test(atomic))
18629+ return 1;
18630+ rt_spin_unlock(lock);
18631+ return 0;
18632+}
18633+EXPORT_SYMBOL(atomic_dec_and_spin_lock);
18634+
18635+void
18636+__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key)
18637+{
18638+#ifdef CONFIG_DEBUG_LOCK_ALLOC
18639+ /*
18640+ * Make sure we are not reinitializing a held lock:
18641+ */
18642+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
18643+ lockdep_init_map(&lock->dep_map, name, key, 0);
18644+#endif
18645+}
18646+EXPORT_SYMBOL(__rt_spin_lock_init);
18647+
18648+#endif /* PREEMPT_RT_FULL */
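To put the rt_spin_lock()/rt_spin_unlock() family above in context, here is a hedged sketch of a caller. Under PREEMPT_RT_FULL an ordinary spinlock_t section like this is expected to resolve to the functions above: it may sleep under contention, migration is disabled instead of preemption, and the task's original sleep state is preserved in saved_state. The lock and counter names are illustrative only.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(stats_lock);
static unsigned long stats_counter;

static void stats_inc(void)
{
	/* On PREEMPT_RT_FULL this takes the rtmutex-backed sleeping spinlock. */
	spin_lock(&stats_lock);
	stats_counter++;
	spin_unlock(&stats_lock);
}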
18649+
18650+#ifdef CONFIG_PREEMPT_RT_FULL
18651+static inline int __sched
18652+__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18653+{
18654+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18655+ struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
18656+
18657+ if (!hold_ctx)
18658+ return 0;
18659+
18660+ if (unlikely(ctx == hold_ctx))
18661+ return -EALREADY;
18662+
18663+ if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
18664+ (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
18665+#ifdef CONFIG_DEBUG_MUTEXES
18666+ DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
18667+ ctx->contending_lock = ww;
18668+#endif
18669+ return -EDEADLK;
18670+ }
18671+
18672+ return 0;
18673+}
18674+#else
18675+static inline int __sched
18676+__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18677+{
18678+ BUG();
18679+ return 0;
18680+}
18681+
18682+#endif
18683+
18684+static inline int
18685+try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18686+ struct rt_mutex_waiter *waiter)
18687+{
18688+ return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
18689+}
18690+
18691 /*
18692 * Task blocks on lock.
18693 *
18694@@ -951,6 +1341,22 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18695 return -EDEADLK;
18696
18697 raw_spin_lock(&task->pi_lock);
18698+ /*
18699+ * In the case of futex requeue PI, this will be a proxy
18700+ * lock. The task will wake unaware that it is enqueued on
18701+ * this lock. Avoid blocking on two locks and corrupting
18702+ * pi_blocked_on via the PI_WAKEUP_INPROGRESS
18703+ * flag. futex_wait_requeue_pi() sets this when it wakes up
18704+ * before requeue (due to a signal or timeout). Do not enqueue
18705+ * the task if PI_WAKEUP_INPROGRESS is set.
18706+ */
18707+ if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
18708+ raw_spin_unlock(&task->pi_lock);
18709+ return -EAGAIN;
18710+ }
18711+
18712+ BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
18713+
18714 waiter->task = task;
18715 waiter->lock = lock;
18716 waiter->prio = task->prio;
18717@@ -974,7 +1380,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18718 rt_mutex_enqueue_pi(owner, waiter);
18719
18720 rt_mutex_adjust_prio(owner);
18721- if (owner->pi_blocked_on)
18722+ if (rt_mutex_real_waiter(owner->pi_blocked_on))
18723 chain_walk = 1;
18724 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
18725 chain_walk = 1;
18726@@ -1016,6 +1422,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18727 * Called with lock->wait_lock held and interrupts disabled.
18728 */
18729 static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
18730+ struct wake_q_head *wake_sleeper_q,
18731 struct rt_mutex *lock)
18732 {
18733 struct rt_mutex_waiter *waiter;
18734@@ -1055,7 +1462,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
18735 * Pairs with preempt_enable() in rt_mutex_postunlock();
18736 */
18737 preempt_disable();
18738- wake_q_add(wake_q, waiter->task);
18739+ if (waiter->savestate)
18740+ wake_q_add_sleeper(wake_sleeper_q, waiter->task);
18741+ else
18742+ wake_q_add(wake_q, waiter->task);
18743 raw_spin_unlock(&current->pi_lock);
18744 }
18745
18746@@ -1070,7 +1480,7 @@ static void remove_waiter(struct rt_mutex *lock,
18747 {
18748 bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
18749 struct task_struct *owner = rt_mutex_owner(lock);
18750- struct rt_mutex *next_lock;
18751+ struct rt_mutex *next_lock = NULL;
18752
18753 lockdep_assert_held(&lock->wait_lock);
18754
18755@@ -1096,7 +1506,8 @@ static void remove_waiter(struct rt_mutex *lock,
18756 rt_mutex_adjust_prio(owner);
18757
18758 /* Store the lock on which owner is blocked or NULL */
18759- next_lock = task_blocked_on_lock(owner);
18760+ if (rt_mutex_real_waiter(owner->pi_blocked_on))
18761+ next_lock = task_blocked_on_lock(owner);
18762
18763 raw_spin_unlock(&owner->pi_lock);
18764
18765@@ -1132,26 +1543,28 @@ void rt_mutex_adjust_pi(struct task_struct *task)
18766 raw_spin_lock_irqsave(&task->pi_lock, flags);
18767
18768 waiter = task->pi_blocked_on;
18769- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18770+ if (!rt_mutex_real_waiter(waiter) ||
18771+ rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18772 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18773 return;
18774 }
18775 next_lock = waiter->lock;
18776- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18777
18778 /* gets dropped in rt_mutex_adjust_prio_chain()! */
18779 get_task_struct(task);
18780
18781+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18782 rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
18783 next_lock, NULL, task);
18784 }
18785
18786-void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
18787+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
18788 {
18789 debug_rt_mutex_init_waiter(waiter);
18790 RB_CLEAR_NODE(&waiter->pi_tree_entry);
18791 RB_CLEAR_NODE(&waiter->tree_entry);
18792 waiter->task = NULL;
18793+ waiter->savestate = savestate;
18794 }
18795
18796 /**
18797@@ -1167,7 +1580,8 @@ void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
18798 static int __sched
18799 __rt_mutex_slowlock(struct rt_mutex *lock, int state,
18800 struct hrtimer_sleeper *timeout,
18801- struct rt_mutex_waiter *waiter)
18802+ struct rt_mutex_waiter *waiter,
18803+ struct ww_acquire_ctx *ww_ctx)
18804 {
18805 int ret = 0;
18806
18807@@ -1176,16 +1590,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
18808 if (try_to_take_rt_mutex(lock, current, waiter))
18809 break;
18810
18811- /*
18812- * TASK_INTERRUPTIBLE checks for signals and
18813- * timeout. Ignored otherwise.
18814- */
18815- if (likely(state == TASK_INTERRUPTIBLE)) {
18816- /* Signal pending? */
18817- if (signal_pending(current))
18818- ret = -EINTR;
18819- if (timeout && !timeout->task)
18820- ret = -ETIMEDOUT;
18821+ if (timeout && !timeout->task) {
18822+ ret = -ETIMEDOUT;
18823+ break;
18824+ }
18825+ if (signal_pending_state(state, current)) {
18826+ ret = -EINTR;
18827+ break;
18828+ }
18829+
18830+ if (ww_ctx && ww_ctx->acquired > 0) {
18831+ ret = __mutex_lock_check_stamp(lock, ww_ctx);
18832 if (ret)
18833 break;
18834 }
18835@@ -1224,33 +1639,104 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
18836 }
18837 }
18838
18839-/*
18840- * Slow path lock function:
18841- */
18842-static int __sched
18843-rt_mutex_slowlock(struct rt_mutex *lock, int state,
18844- struct hrtimer_sleeper *timeout,
18845- enum rtmutex_chainwalk chwalk)
18846+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
18847+ struct ww_acquire_ctx *ww_ctx)
18848 {
18849- struct rt_mutex_waiter waiter;
18850- unsigned long flags;
18851- int ret = 0;
18852+#ifdef CONFIG_DEBUG_MUTEXES
18853+ /*
18854+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
18855+ * but released with a normal mutex_unlock in this call.
18856+ *
18857+ * This should never happen, always use ww_mutex_unlock.
18858+ */
18859+ DEBUG_LOCKS_WARN_ON(ww->ctx);
18860
18861- rt_mutex_init_waiter(&waiter);
18862+ /*
18863+ * Not quite done after calling ww_acquire_done() ?
18864+ */
18865+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
18866+
18867+ if (ww_ctx->contending_lock) {
18868+ /*
18869+ * After -EDEADLK you tried to
18870+ * acquire a different ww_mutex? Bad!
18871+ */
18872+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
18873+
18874+ /*
18875+ * You called ww_mutex_lock after receiving -EDEADLK,
18876+ * but 'forgot' to unlock everything else first?
18877+ */
18878+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
18879+ ww_ctx->contending_lock = NULL;
18880+ }
18881
18882 /*
18883- * Technically we could use raw_spin_[un]lock_irq() here, but this can
18884- * be called in early boot if the cmpxchg() fast path is disabled
18885- * (debug, no architecture support). In this case we will acquire the
18886- * rtmutex with lock->wait_lock held. But we cannot unconditionally
18887- * enable interrupts in that early boot case. So we need to use the
18888- * irqsave/restore variants.
18889+ * Naughty, using a different class will lead to undefined behavior!
18890 */
18891- raw_spin_lock_irqsave(&lock->wait_lock, flags);
18892+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
18893+#endif
18894+ ww_ctx->acquired++;
18895+}
18896+
18897+#ifdef CONFIG_PREEMPT_RT_FULL
18898+static void ww_mutex_account_lock(struct rt_mutex *lock,
18899+ struct ww_acquire_ctx *ww_ctx)
18900+{
18901+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18902+ struct rt_mutex_waiter *waiter, *n;
18903+
18904+ /*
18905+ * This branch gets optimized out for the common case,
18906+ * and is only important for ww_mutex_lock.
18907+ */
18908+ ww_mutex_lock_acquired(ww, ww_ctx);
18909+ ww->ctx = ww_ctx;
18910+
18911+ /*
18912+ * Give any possible sleeping processes the chance to wake up,
18913+ * so they can recheck if they have to back off.
18914+ */
18915+ rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters.rb_root,
18916+ tree_entry) {
18917+ /* XXX debug rt mutex waiter wakeup */
18918+
18919+ BUG_ON(waiter->lock != lock);
18920+ rt_mutex_wake_waiter(waiter);
18921+ }
18922+}
18923+
18924+#else
18925+
18926+static void ww_mutex_account_lock(struct rt_mutex *lock,
18927+ struct ww_acquire_ctx *ww_ctx)
18928+{
18929+ BUG();
18930+}
18931+#endif
18932+
18933+int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
18934+ struct hrtimer_sleeper *timeout,
18935+ enum rtmutex_chainwalk chwalk,
18936+ struct ww_acquire_ctx *ww_ctx,
18937+ struct rt_mutex_waiter *waiter)
18938+{
18939+ int ret;
18940+
18941+#ifdef CONFIG_PREEMPT_RT_FULL
18942+ if (ww_ctx) {
18943+ struct ww_mutex *ww;
18944+
18945+ ww = container_of(lock, struct ww_mutex, base.lock);
18946+ if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
18947+ return -EALREADY;
18948+ }
18949+#endif
18950
18951 /* Try to acquire the lock again: */
18952 if (try_to_take_rt_mutex(lock, current, NULL)) {
18953- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18954+ if (ww_ctx)
18955+ ww_mutex_account_lock(lock, ww_ctx);
18956 return 0;
18957 }
18958
18959@@ -1260,17 +1746,27 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
18960 if (unlikely(timeout))
18961 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
18962
18963- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
18964+ ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
18965
18966- if (likely(!ret))
18967+ if (likely(!ret)) {
18968 /* sleep on the mutex */
18969- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
18970+ ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
18971+ ww_ctx);
18972+ } else if (ww_ctx) {
18973+ /* ww_mutex received EDEADLK, let it become EALREADY */
18974+ ret = __mutex_lock_check_stamp(lock, ww_ctx);
18975+ BUG_ON(!ret);
18976+ }
18977
18978 if (unlikely(ret)) {
18979 __set_current_state(TASK_RUNNING);
18980 if (rt_mutex_has_waiters(lock))
18981- remove_waiter(lock, &waiter);
18982- rt_mutex_handle_deadlock(ret, chwalk, &waiter);
18983+ remove_waiter(lock, waiter);
18984+ /* ww_mutex want to report EDEADLK/EALREADY, let them */
18985+ /* ww_mutex wants to report EDEADLK/EALREADY, let it */
18986+ rt_mutex_handle_deadlock(ret, chwalk, waiter);
18987+ } else if (ww_ctx) {
18988+ ww_mutex_account_lock(lock, ww_ctx);
18989 }
18990
18991 /*
18992@@ -1278,6 +1774,36 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
18993 * unconditionally. We might have to fix that up.
18994 */
18995 fixup_rt_mutex_waiters(lock);
18996+ return ret;
18997+}
18998+
18999+/*
19000+ * Slow path lock function:
19001+ */
19002+static int __sched
19003+rt_mutex_slowlock(struct rt_mutex *lock, int state,
19004+ struct hrtimer_sleeper *timeout,
19005+ enum rtmutex_chainwalk chwalk,
19006+ struct ww_acquire_ctx *ww_ctx)
19007+{
19008+ struct rt_mutex_waiter waiter;
19009+ unsigned long flags;
19010+ int ret = 0;
19011+
19012+ rt_mutex_init_waiter(&waiter, false);
19013+
19014+ /*
19015+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
19016+ * be called in early boot if the cmpxchg() fast path is disabled
19017+ * (debug, no architecture support). In this case we will acquire the
19018+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
19019+ * enable interrupts in that early boot case. So we need to use the
19020+ * irqsave/restore variants.
19021+ */
19022+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
19023+
19024+ ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
19025+ &waiter);
19026
19027 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19028
19029@@ -1338,7 +1864,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19030 * Return whether the current task needs to call rt_mutex_postunlock().
19031 */
19032 static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19033- struct wake_q_head *wake_q)
19034+ struct wake_q_head *wake_q,
19035+ struct wake_q_head *wake_sleeper_q)
19036 {
19037 unsigned long flags;
19038
19039@@ -1392,7 +1919,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19040 *
19041 * Queue the next waiter for wakeup once we release the wait_lock.
19042 */
19043- mark_wakeup_next_waiter(wake_q, lock);
19044+ mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
19045 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19046
19047 return true; /* call rt_mutex_postunlock() */
19048@@ -1406,29 +1933,45 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19049 */
19050 static inline int
19051 rt_mutex_fastlock(struct rt_mutex *lock, int state,
19052+ struct ww_acquire_ctx *ww_ctx,
19053 int (*slowfn)(struct rt_mutex *lock, int state,
19054 struct hrtimer_sleeper *timeout,
19055- enum rtmutex_chainwalk chwalk))
19056+ enum rtmutex_chainwalk chwalk,
19057+ struct ww_acquire_ctx *ww_ctx))
19058 {
19059 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
19060 return 0;
19061
19062- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
19063+ /*
19064+ * If rt_mutex blocks, the function sched_submit_work will not call
19065+ * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true).
19066+ * We must call blk_schedule_flush_plug here, if we don't call it,
19067+ * a deadlock in device mapper may happen.
19068+ */
19069+ if (unlikely(blk_needs_flush_plug(current)))
19070+ blk_schedule_flush_plug(current);
19071+
19072+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
19073 }
19074
19075 static inline int
19076 rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
19077 struct hrtimer_sleeper *timeout,
19078 enum rtmutex_chainwalk chwalk,
19079+ struct ww_acquire_ctx *ww_ctx,
19080 int (*slowfn)(struct rt_mutex *lock, int state,
19081 struct hrtimer_sleeper *timeout,
19082- enum rtmutex_chainwalk chwalk))
19083+ enum rtmutex_chainwalk chwalk,
19084+ struct ww_acquire_ctx *ww_ctx))
19085 {
19086 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
19087 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
19088 return 0;
19089
19090- return slowfn(lock, state, timeout, chwalk);
19091+ if (unlikely(blk_needs_flush_plug(current)))
19092+ blk_schedule_flush_plug(current);
19093+
19094+ return slowfn(lock, state, timeout, chwalk, ww_ctx);
19095 }
19096
19097 static inline int
19098@@ -1444,9 +1987,11 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
19099 /*
19100 * Performs the wakeup of the top-waiter and re-enables preemption.
19101 */
19102-void rt_mutex_postunlock(struct wake_q_head *wake_q)
19103+void rt_mutex_postunlock(struct wake_q_head *wake_q,
19104+ struct wake_q_head *wake_sleeper_q)
19105 {
19106 wake_up_q(wake_q);
19107+ wake_up_q_sleeper(wake_sleeper_q);
19108
19109 /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
19110 preempt_enable();
19111@@ -1455,23 +2000,40 @@ void rt_mutex_postunlock(struct wake_q_head *wake_q)
19112 static inline void
19113 rt_mutex_fastunlock(struct rt_mutex *lock,
19114 bool (*slowfn)(struct rt_mutex *lock,
19115- struct wake_q_head *wqh))
19116+ struct wake_q_head *wqh,
19117+ struct wake_q_head *wq_sleeper))
19118 {
19119 DEFINE_WAKE_Q(wake_q);
19120+ DEFINE_WAKE_Q(wake_sleeper_q);
19121
19122 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
19123 return;
19124
19125- if (slowfn(lock, &wake_q))
19126- rt_mutex_postunlock(&wake_q);
19127+ if (slowfn(lock, &wake_q, &wake_sleeper_q))
19128+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
19129 }
19130
19131-static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass)
19132+int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state)
19133 {
19134 might_sleep();
19135+ return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
19136+}
19137+
19138+/**
19139+ * rt_mutex_lock_state - lock a rt_mutex with a given state
19140+ *
19141+ * @lock: The rt_mutex to be locked
19142+ * @state: The state to set when blocking on the rt_mutex
19143+ */
19144+static int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state, unsigned int subclass)
19145+{
19146+ int ret;
19147
19148 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
19149- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
19150+ ret = __rt_mutex_lock_state(lock, state);
19151+ if (ret)
19152+ mutex_release(&lock->dep_map, 1, _RET_IP_);
19153+ return ret;
19154 }
19155
19156 #ifdef CONFIG_DEBUG_LOCK_ALLOC
19157@@ -1483,7 +2045,7 @@ static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass)
19158 */
19159 void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
19160 {
19161- __rt_mutex_lock(lock, subclass);
19162+ rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE, subclass);
19163 }
19164 EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
19165 #endif
19166@@ -1496,7 +2058,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
19167 */
19168 void __sched rt_mutex_lock(struct rt_mutex *lock)
19169 {
19170- __rt_mutex_lock(lock, 0);
19171+ rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE, 0);
19172 }
19173 EXPORT_SYMBOL_GPL(rt_mutex_lock);
19174 #endif
19175@@ -1512,16 +2074,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
19176 */
19177 int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
19178 {
19179- int ret;
19180-
19181- might_sleep();
19182-
19183- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19184- ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
19185- if (ret)
19186- mutex_release(&lock->dep_map, 1, _RET_IP_);
19187-
19188- return ret;
19189+ return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE, 0);
19190 }
19191 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
19192
19193@@ -1538,6 +2091,22 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
19194 return __rt_mutex_slowtrylock(lock);
19195 }
19196
19197+/**
19198+ * rt_mutex_lock_killable - lock a rt_mutex killable
19199+ *
19200+ * @lock: the rt_mutex to be locked
19202+ *
19203+ * Returns:
19204+ * 0 on success
19205+ * -EINTR when interrupted by a signal
19206+ */
19207+int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
19208+{
19209+ return rt_mutex_lock_state(lock, TASK_KILLABLE, 0);
19210+}
19211+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
19212+
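A minimal, hedged sketch of a caller of the killable variant added above; the lock and function names are hypothetical. The only extra obligation compared to rt_mutex_lock() is handling -EINTR when a fatal signal arrives while blocking.

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(conv_lock);

static int do_conversion(void)
{
	int ret;

	ret = rt_mutex_lock_killable(&conv_lock);
	if (ret)
		return ret;	/* -EINTR: fatal signal while waiting */

	/* ... critical section ... */

	rt_mutex_unlock(&conv_lock);
	return 0;
}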
19213 /**
19214 * rt_mutex_timed_lock - lock a rt_mutex interruptible
19215 * the timeout structure is provided
19216@@ -1561,6 +2130,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
19217 mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19218 ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
19219 RT_MUTEX_MIN_CHAINWALK,
19220+ NULL,
19221 rt_mutex_slowlock);
19222 if (ret)
19223 mutex_release(&lock->dep_map, 1, _RET_IP_);
19224@@ -1569,6 +2139,18 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
19225 }
19226 EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
19227
19228+int __sched __rt_mutex_trylock(struct rt_mutex *lock)
19229+{
19230+#ifdef CONFIG_PREEMPT_RT_FULL
19231+ if (WARN_ON_ONCE(in_irq() || in_nmi()))
19232+#else
19233+ if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
19234+#endif
19235+ return 0;
19236+
19237+ return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
19238+}
19239+
19240 /**
19241 * rt_mutex_trylock - try to lock a rt_mutex
19242 *
19243@@ -1584,10 +2166,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock)
19244 {
19245 int ret;
19246
19247- if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
19248- return 0;
19249-
19250- ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
19251+ ret = __rt_mutex_trylock(lock);
19252 if (ret)
19253 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19254
19255@@ -1595,6 +2174,11 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock)
19256 }
19257 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
19258
19259+void __sched __rt_mutex_unlock(struct rt_mutex *lock)
19260+{
19261+ rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
19262+}
19263+
19264 /**
19265 * rt_mutex_unlock - unlock a rt_mutex
19266 *
19267@@ -1603,16 +2187,13 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock);
19268 void __sched rt_mutex_unlock(struct rt_mutex *lock)
19269 {
19270 mutex_release(&lock->dep_map, 1, _RET_IP_);
19271- rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
19272+ __rt_mutex_unlock(lock);
19273 }
19274 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
19275
19276-/**
19277- * Futex variant, that since futex variants do not use the fast-path, can be
19278- * simple and will not need to retry.
19279- */
19280-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
19281- struct wake_q_head *wake_q)
19282+static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
19283+ struct wake_q_head *wake_q,
19284+ struct wake_q_head *wq_sleeper)
19285 {
19286 lockdep_assert_held(&lock->wait_lock);
19287
19288@@ -1629,22 +2210,35 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
19289 * avoid inversion prior to the wakeup. preempt_disable()
19290 * therein pairs with rt_mutex_postunlock().
19291 */
19292- mark_wakeup_next_waiter(wake_q, lock);
19293+ mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
19294
19295 return true; /* call postunlock() */
19296 }
19297
19298+/**
19299+ * Futex variant, that since futex variants do not use the fast-path, can be
19300+ * simple and will not need to retry.
19301+ */
19302+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
19303+ struct wake_q_head *wake_q,
19304+ struct wake_q_head *wq_sleeper)
19305+{
19306+ return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper);
19307+}
19308+
19309 void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
19310 {
19311 DEFINE_WAKE_Q(wake_q);
19312+ DEFINE_WAKE_Q(wake_sleeper_q);
19313+ unsigned long flags;
19314 bool postunlock;
19315
19316- raw_spin_lock_irq(&lock->wait_lock);
19317- postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
19318- raw_spin_unlock_irq(&lock->wait_lock);
19319+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
19320+ postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
19321+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19322
19323 if (postunlock)
19324- rt_mutex_postunlock(&wake_q);
19325+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
19326 }
19327
19328 /**
19329@@ -1683,7 +2277,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name,
19330 if (name && key)
19331 debug_rt_mutex_init(lock, name, key);
19332 }
19333-EXPORT_SYMBOL_GPL(__rt_mutex_init);
19334+EXPORT_SYMBOL(__rt_mutex_init);
19335
19336 /**
19337 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
19338@@ -1703,6 +2297,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
19339 struct task_struct *proxy_owner)
19340 {
19341 __rt_mutex_init(lock, NULL, NULL);
19342+#ifdef CONFIG_DEBUG_SPINLOCK
19343+ /*
19344+ * get another key class for the wait_lock. LOCK_PI and UNLOCK_PI
19345+ * hold the ->wait_lock of the proxy_lock while unlocking a sleeping
19346+ * lock.
19347+ */
19348+ raw_spin_lock_init(&lock->wait_lock);
19349+#endif
19350 debug_rt_mutex_proxy_lock(lock, proxy_owner);
19351 rt_mutex_set_owner(lock, proxy_owner);
19352 }
19353@@ -1735,6 +2337,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
19354 if (try_to_take_rt_mutex(lock, task, NULL))
19355 return 1;
19356
19357+#ifdef CONFIG_PREEMPT_RT_FULL
19358+ /*
19359+ * In PREEMPT_RT there's an added race.
19360+ * If the task that we are about to requeue times out,
19361+ * it can set PI_WAKEUP_INPROGRESS. This tells the requeue
19362+ * to skip this task. But right after the task sets
19363+ * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
19364+ * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
19365+ * This will replace the PI_WAKEUP_INPROGRESS with the actual
19366+ * lock that it blocks on. We *must not* place this task
19367+ * on this proxy lock in that case.
19368+ *
19369+ * To prevent this race, we first take the task's pi_lock
19370+ * and check if it has updated its pi_blocked_on. If it has,
19371+ * we assume that it woke up and we return -EAGAIN.
19372+ * Otherwise, we set the task's pi_blocked_on to
19373+ * PI_REQUEUE_INPROGRESS, so that if the task is waking up
19374+ * it will know that we are in the process of requeuing it.
19375+ */
19376+ raw_spin_lock(&task->pi_lock);
19377+ if (task->pi_blocked_on) {
19378+ raw_spin_unlock(&task->pi_lock);
19379+ return -EAGAIN;
19380+ }
19381+ task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
19382+ raw_spin_unlock(&task->pi_lock);
19383+#endif
19384+
19385 /* We enforce deadlock detection for futexes */
19386 ret = task_blocks_on_rt_mutex(lock, waiter, task,
19387 RT_MUTEX_FULL_CHAINWALK);
19388@@ -1749,7 +2379,7 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
19389 ret = 0;
19390 }
19391
19392- if (unlikely(ret))
19393+ if (ret && rt_mutex_has_waiters(lock))
19394 remove_waiter(lock, waiter);
19395
19396 debug_rt_mutex_print_deadlock(waiter);
19397@@ -1824,17 +2454,36 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
19398 struct hrtimer_sleeper *to,
19399 struct rt_mutex_waiter *waiter)
19400 {
19401+ struct task_struct *tsk = current;
19402 int ret;
19403
19404 raw_spin_lock_irq(&lock->wait_lock);
19405 /* sleep on the mutex */
19406 set_current_state(TASK_INTERRUPTIBLE);
19407- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
19408+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
19409 /*
19410 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
19411 * have to fix that up.
19412 */
19413 fixup_rt_mutex_waiters(lock);
19414+ /*
19415+ * RT has a problem here when the wait got interrupted by a timeout
19416+ * or a signal. task->pi_blocked_on is still set. The task must
19417+ * acquire the hash bucket lock when returning from this function.
19418+ *
19419+ * If the hash bucket lock is contended then the
19420+ * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in
19421+ * task_blocks_on_rt_mutex() will trigger. This can be avoided by
19422+ * clearing task->pi_blocked_on which removes the task from the
19423+ * boosting chain of the rtmutex. That's correct because the task
19424+ * is no longer blocked on it.
19425+ */
19426+ if (ret) {
19427+ raw_spin_lock(&tsk->pi_lock);
19428+ tsk->pi_blocked_on = NULL;
19429+ raw_spin_unlock(&tsk->pi_lock);
19430+ }
19431+
19432 raw_spin_unlock_irq(&lock->wait_lock);
19433
19434 return ret;
19435@@ -1895,3 +2544,99 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
19436
19437 return cleanup;
19438 }
19439+
19440+static inline int
19441+ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
19442+{
19443+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
19444+ unsigned tmp;
19445+
19446+ if (ctx->deadlock_inject_countdown-- == 0) {
19447+ tmp = ctx->deadlock_inject_interval;
19448+ if (tmp > UINT_MAX/4)
19449+ tmp = UINT_MAX;
19450+ else
19451+ tmp = tmp*2 + tmp + tmp/2;
19452+
19453+ ctx->deadlock_inject_interval = tmp;
19454+ ctx->deadlock_inject_countdown = tmp;
19455+ ctx->contending_lock = lock;
19456+
19457+ ww_mutex_unlock(lock);
19458+
19459+ return -EDEADLK;
19460+ }
19461+#endif
19462+
19463+ return 0;
19464+}
19465+
19466+#ifdef CONFIG_PREEMPT_RT_FULL
19467+int __sched
19468+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
19469+{
19470+ int ret;
19471+
19472+ might_sleep();
19473+
19474+ mutex_acquire_nest(&lock->base.dep_map, 0, 0,
19475+ ctx ? &ctx->dep_map : NULL, _RET_IP_);
19476+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0,
19477+ ctx);
19478+ if (ret)
19479+ mutex_release(&lock->base.dep_map, 1, _RET_IP_);
19480+ else if (!ret && ctx && ctx->acquired > 1)
19481+ return ww_mutex_deadlock_injection(lock, ctx);
19482+
19483+ return ret;
19484+}
19485+EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
19486+
19487+int __sched
19488+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
19489+{
19490+ int ret;
19491+
19492+ might_sleep();
19493+
19494+ mutex_acquire_nest(&lock->base.dep_map, 0, 0,
19495+ ctx ? &ctx->dep_map : NULL, _RET_IP_);
19496+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0,
19497+ ctx);
19498+ if (ret)
19499+ mutex_release(&lock->base.dep_map, 1, _RET_IP_);
19500+ else if (!ret && ctx && ctx->acquired > 1)
19501+ return ww_mutex_deadlock_injection(lock, ctx);
19502+
19503+ return ret;
19504+}
19505+EXPORT_SYMBOL_GPL(ww_mutex_lock);
19506+
19507+void __sched ww_mutex_unlock(struct ww_mutex *lock)
19508+{
19509+ int nest = !!lock->ctx;
19510+
19511+ /*
19512+ * The unlocking fastpath is the 0->1 transition from 'locked'
19513+ * into 'unlocked' state:
19514+ */
19515+ if (nest) {
19516+#ifdef CONFIG_DEBUG_MUTEXES
19517+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
19518+#endif
19519+ if (lock->ctx->acquired > 0)
19520+ lock->ctx->acquired--;
19521+ lock->ctx = NULL;
19522+ }
19523+
19524+ mutex_release(&lock->base.dep_map, nest, _RET_IP_);
19525+ __rt_mutex_unlock(&lock->base.lock);
19526+}
19527+EXPORT_SYMBOL(ww_mutex_unlock);
19528+
19529+int __rt_mutex_owner_current(struct rt_mutex *lock)
19530+{
19531+ return rt_mutex_owner(lock) == current;
19532+}
19533+EXPORT_SYMBOL(__rt_mutex_owner_current);
19534+#endif
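The RT implementations of ww_mutex_lock()/ww_mutex_lock_interruptible()/ww_mutex_unlock() above keep the usual wait/wound contract, so callers look the same as with the plain mutex-based ww_mutex. A hedged single-lock sketch follows (the names are invented here; multi-lock callers additionally back off and retry on -EDEADLK as described in Documentation/locking/ww-mutex-design.txt):

#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(demo_ww_class);

static int touch_object(struct ww_mutex *lock)
{
	struct ww_acquire_ctx ctx;
	int ret;

	ww_acquire_init(&ctx, &demo_ww_class);

	ret = ww_mutex_lock_interruptible(lock, &ctx);
	if (ret) {			/* -EINTR here; -EDEADLK/-EALREADY with more locks */
		ww_acquire_fini(&ctx);
		return ret;
	}
	ww_acquire_done(&ctx);		/* no further locks will be taken in this ctx */

	/* ... touch the object ... */

	ww_mutex_unlock(lock);
	ww_acquire_fini(&ctx);
	return 0;
}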
19535diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
19536index 68686b3ec3c1..2a157c78e18c 100644
19537--- a/kernel/locking/rtmutex_common.h
19538+++ b/kernel/locking/rtmutex_common.h
19539@@ -15,6 +15,7 @@
19540
19541 #include <linux/rtmutex.h>
19542 #include <linux/sched/wake_q.h>
19543+#include <linux/sched/debug.h>
19544
19545 /*
19546 * This is the control structure for tasks blocked on a rt_mutex,
19547@@ -29,6 +30,7 @@ struct rt_mutex_waiter {
19548 struct rb_node pi_tree_entry;
19549 struct task_struct *task;
19550 struct rt_mutex *lock;
19551+ bool savestate;
19552 #ifdef CONFIG_DEBUG_RT_MUTEXES
19553 unsigned long ip;
19554 struct pid *deadlock_task_pid;
19555@@ -129,12 +131,15 @@ enum rtmutex_chainwalk {
19556 /*
19557 * PI-futex support (proxy locking functions, etc.):
19558 */
19559+#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)
19560+#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2)
19561+
19562 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
19563 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
19564 struct task_struct *proxy_owner);
19565 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
19566 struct task_struct *proxy_owner);
19567-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
19568+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
19569 extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
19570 struct rt_mutex_waiter *waiter,
19571 struct task_struct *task);
19572@@ -152,9 +157,27 @@ extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
19573
19574 extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
19575 extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
19576- struct wake_q_head *wqh);
19577-
19578-extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
19579+ struct wake_q_head *wqh,
19580+ struct wake_q_head *wq_sleeper);
19581+
19582+extern void rt_mutex_postunlock(struct wake_q_head *wake_q,
19583+ struct wake_q_head *wake_sleeper_q);
19584+
19585+/* RW semaphore special interface */
19586+struct ww_acquire_ctx;
19587+
19588+extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state);
19589+extern int __rt_mutex_trylock(struct rt_mutex *lock);
19590+extern void __rt_mutex_unlock(struct rt_mutex *lock);
19591+int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
19592+ struct hrtimer_sleeper *timeout,
19593+ enum rtmutex_chainwalk chwalk,
19594+ struct ww_acquire_ctx *ww_ctx,
19595+ struct rt_mutex_waiter *waiter);
19596+void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
19597+ struct rt_mutex_waiter *waiter,
19598+ unsigned long flags);
19599+void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock);
19600
19601 #ifdef CONFIG_DEBUG_RT_MUTEXES
19602 # include "rtmutex-debug.h"
19603diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c
19604new file mode 100644
19605index 000000000000..f2e155b2c4a8
19606--- /dev/null
19607+++ b/kernel/locking/rwlock-rt.c
19608@@ -0,0 +1,378 @@
19609+/*
19610+ */
19611+#include <linux/sched/debug.h>
19612+#include <linux/export.h>
19613+
19614+#include "rtmutex_common.h"
19615+#include <linux/rwlock_types_rt.h>
19616+
19617+/*
19618+ * RT-specific reader/writer locks
19619+ *
19620+ * write_lock()
19621+ * 1) Lock lock->rtmutex
19622+ * 2) Remove the reader BIAS to force readers into the slow path
19623+ * 3) Wait until all readers have left the critical region
19624+ * 4) Mark it write locked
19625+ *
19626+ * write_unlock()
19627+ * 1) Remove the write locked marker
19628+ * 2) Set the reader BIAS so readers can use the fast path again
19629+ * 3) Unlock lock->rtmutex to release blocked readers
19630+ *
19631+ * read_lock()
19632+ * 1) Try fast path acquisition (reader BIAS is set)
19633+ * 2) Take lock->rtmutex.wait_lock which protects the writelocked flag
19634+ * 3) If !writelocked, acquire it for read
19635+ * 4) If writelocked, block on lock->rtmutex
19636+ * 5) unlock lock->rtmutex, goto 1)
19637+ *
19638+ * read_unlock()
19639+ * 1) Try fast path release (reader count != 1)
19640+ * 2) Wake the writer waiting in write_lock()#3
19641+ *
19642+ * read_lock()#3 has the consequence that rw locks on RT are not writer
19643+ * fair, but writers, which should be avoided in RT tasks (think tasklist
19644+ * lock), are subject to the rtmutex priority/DL inheritance mechanism.
19645+ *
19646+ * It's possible to make the rw locks writer fair by keeping a list of
19647+ * active readers. A blocked writer would force all newly incoming readers
19648+ * to block on the rtmutex, but the rtmutex would have to be proxy locked
19649+ * for one reader after the other. We can't use multi-reader inheritance
19650+ * because there is no way to support that with
19651+ * SCHED_DEADLINE. Implementing the one by one reader boosting/handover
19652+ * mechanism is a major surgery for a very dubious value.
19653+ *
19654+ * The risk of writer starvation is there, but the pathological use cases
19655+ * which trigger it are not necessarily the typical RT workloads.
19656+ */
19657+
19658+void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
19659+ struct lock_class_key *key)
19660+{
19661+#ifdef CONFIG_DEBUG_LOCK_ALLOC
19662+ /*
19663+ * Make sure we are not reinitializing a held semaphore:
19664+ */
19665+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
19666+ lockdep_init_map(&lock->dep_map, name, key, 0);
19667+#endif
19668+ atomic_set(&lock->readers, READER_BIAS);
19669+ rt_mutex_init(&lock->rtmutex);
19670+ lock->rtmutex.save_state = 1;
19671+}
19672+
19673+int __read_rt_trylock(struct rt_rw_lock *lock)
19674+{
19675+ int r, old;
19676+
19677+ /*
19678+ * Increment reader count, if lock->readers < 0, i.e. READER_BIAS is
19679+ * set.
19680+ */
19681+ for (r = atomic_read(&lock->readers); r < 0;) {
19682+ old = atomic_cmpxchg(&lock->readers, r, r + 1);
19683+ if (likely(old == r))
19684+ return 1;
19685+ r = old;
19686+ }
19687+ return 0;
19688+}
19689+
19690+void __sched __read_rt_lock(struct rt_rw_lock *lock)
19691+{
19692+ struct rt_mutex *m = &lock->rtmutex;
19693+ struct rt_mutex_waiter waiter;
19694+ unsigned long flags;
19695+
19696+ if (__read_rt_trylock(lock))
19697+ return;
19698+
19699+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19700+ /*
19701+ * Allow readers as long as the writer has not completely
19702+ * acquired the semaphore for write.
19703+ */
19704+ if (atomic_read(&lock->readers) != WRITER_BIAS) {
19705+ atomic_inc(&lock->readers);
19706+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19707+ return;
19708+ }
19709+
19710+ /*
19711+ * Call into the slow lock path with the rtmutex->wait_lock
19712+ * held, so this can't result in the following race:
19713+ *
19714+ * Reader1 Reader2 Writer
19715+ * read_lock()
19716+ * write_lock()
19717+ * rtmutex_lock(m)
19718+ * swait()
19719+ * read_lock()
19720+ * unlock(m->wait_lock)
19721+ * read_unlock()
19722+ * swake()
19723+ * lock(m->wait_lock)
19724+ * lock->writelocked=true
19725+ * unlock(m->wait_lock)
19726+ *
19727+ * write_unlock()
19728+ * lock->writelocked=false
19729+ * rtmutex_unlock(m)
19730+ * read_lock()
19731+ * write_lock()
19732+ * rtmutex_lock(m)
19733+ * swait()
19734+ * rtmutex_lock(m)
19735+ *
19736+ * That would put Reader1 behind the writer waiting on
19737+ * Reader2 to call read_unlock() which might be unbound.
19738+ */
19739+ rt_mutex_init_waiter(&waiter, false);
19740+ rt_spin_lock_slowlock_locked(m, &waiter, flags);
19741+ /*
19742+ * The slowlock() above is guaranteed to return with the rtmutex
19743+ * now held, so there can't be a writer active. Increment the reader
19744+ * count and immediately drop the rtmutex again.
19745+ */
19746+ atomic_inc(&lock->readers);
19747+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19748+ rt_spin_lock_slowunlock(m);
19749+
19750+ debug_rt_mutex_free_waiter(&waiter);
19751+}
19752+
19753+void __read_rt_unlock(struct rt_rw_lock *lock)
19754+{
19755+ struct rt_mutex *m = &lock->rtmutex;
19756+ struct task_struct *tsk;
19757+
19758+ /*
19759+ * lock->readers can only hit 0 when a writer is waiting for the
19760+ * active readers to leave the critical region.
19761+ */
19762+ if (!atomic_dec_and_test(&lock->readers))
19763+ return;
19764+
19765+ raw_spin_lock_irq(&m->wait_lock);
19766+ /*
19767+ * Wake the writer, i.e. the rtmutex owner. It might release the
19768+ * rtmutex concurrently in the fast path, but to clean up the rw
19769+ * lock it needs to acquire m->wait_lock. The worst case which can
19770+ * happen is a spurious wakeup.
19771+ */
19772+ tsk = rt_mutex_owner(m);
19773+ if (tsk)
19774+ wake_up_process(tsk);
19775+
19776+ raw_spin_unlock_irq(&m->wait_lock);
19777+}
19778+
19779+static void __write_unlock_common(struct rt_rw_lock *lock, int bias,
19780+ unsigned long flags)
19781+{
19782+ struct rt_mutex *m = &lock->rtmutex;
19783+
19784+ atomic_add(READER_BIAS - bias, &lock->readers);
19785+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19786+ rt_spin_lock_slowunlock(m);
19787+}
19788+
19789+void __sched __write_rt_lock(struct rt_rw_lock *lock)
19790+{
19791+ struct rt_mutex *m = &lock->rtmutex;
19792+ struct task_struct *self = current;
19793+ unsigned long flags;
19794+
19795+ /* Take the rtmutex as a first step */
19796+ __rt_spin_lock(m);
19797+
19798+ /* Force readers into slow path */
19799+ atomic_sub(READER_BIAS, &lock->readers);
19800+
19801+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19802+
19803+ raw_spin_lock(&self->pi_lock);
19804+ self->saved_state = self->state;
19805+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19806+ raw_spin_unlock(&self->pi_lock);
19807+
19808+ for (;;) {
19809+ /* Have all readers left the critical region? */
19810+ if (!atomic_read(&lock->readers)) {
19811+ atomic_set(&lock->readers, WRITER_BIAS);
19812+ raw_spin_lock(&self->pi_lock);
19813+ __set_current_state_no_track(self->saved_state);
19814+ self->saved_state = TASK_RUNNING;
19815+ raw_spin_unlock(&self->pi_lock);
19816+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19817+ return;
19818+ }
19819+
19820+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19821+
19822+ if (atomic_read(&lock->readers) != 0)
19823+ schedule();
19824+
19825+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19826+
19827+ raw_spin_lock(&self->pi_lock);
19828+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19829+ raw_spin_unlock(&self->pi_lock);
19830+ }
19831+}
19832+
19833+int __write_rt_trylock(struct rt_rw_lock *lock)
19834+{
19835+ struct rt_mutex *m = &lock->rtmutex;
19836+ unsigned long flags;
19837+
19838+ if (!__rt_mutex_trylock(m))
19839+ return 0;
19840+
19841+ atomic_sub(READER_BIAS, &lock->readers);
19842+
19843+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19844+ if (!atomic_read(&lock->readers)) {
19845+ atomic_set(&lock->readers, WRITER_BIAS);
19846+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19847+ return 1;
19848+ }
19849+ __write_unlock_common(lock, 0, flags);
19850+ return 0;
19851+}
19852+
19853+void __write_rt_unlock(struct rt_rw_lock *lock)
19854+{
19855+ struct rt_mutex *m = &lock->rtmutex;
19856+ unsigned long flags;
19857+
19858+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19859+ __write_unlock_common(lock, WRITER_BIAS, flags);
19860+}
19861+
19862+/* Map the reader biased implementation */
19863+static inline int do_read_rt_trylock(rwlock_t *rwlock)
19864+{
19865+ return __read_rt_trylock(rwlock);
19866+}
19867+
19868+static inline int do_write_rt_trylock(rwlock_t *rwlock)
19869+{
19870+ return __write_rt_trylock(rwlock);
19871+}
19872+
19873+static inline void do_read_rt_lock(rwlock_t *rwlock)
19874+{
19875+ __read_rt_lock(rwlock);
19876+}
19877+
19878+static inline void do_write_rt_lock(rwlock_t *rwlock)
19879+{
19880+ __write_rt_lock(rwlock);
19881+}
19882+
19883+static inline void do_read_rt_unlock(rwlock_t *rwlock)
19884+{
19885+ __read_rt_unlock(rwlock);
19886+}
19887+
19888+static inline void do_write_rt_unlock(rwlock_t *rwlock)
19889+{
19890+ __write_rt_unlock(rwlock);
19891+}
19892+
19893+static inline void do_rwlock_rt_init(rwlock_t *rwlock, const char *name,
19894+ struct lock_class_key *key)
19895+{
19896+ __rwlock_biased_rt_init(rwlock, name, key);
19897+}
19898+
19899+int __lockfunc rt_read_can_lock(rwlock_t *rwlock)
19900+{
19901+ return atomic_read(&rwlock->readers) < 0;
19902+}
19903+
19904+int __lockfunc rt_write_can_lock(rwlock_t *rwlock)
19905+{
19906+ return atomic_read(&rwlock->readers) == READER_BIAS;
19907+}
19908+
19909+/*
19910+ * The common functions which get wrapped into the rwlock API.
19911+ */
19912+int __lockfunc rt_read_trylock(rwlock_t *rwlock)
19913+{
19914+ int ret;
19915+
19916+ sleeping_lock_inc();
19917+ migrate_disable();
19918+ ret = do_read_rt_trylock(rwlock);
19919+ if (ret) {
19920+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
19921+ } else {
19922+ migrate_enable();
19923+ sleeping_lock_dec();
19924+ }
19925+ return ret;
19926+}
19927+EXPORT_SYMBOL(rt_read_trylock);
19928+
19929+int __lockfunc rt_write_trylock(rwlock_t *rwlock)
19930+{
19931+ int ret;
19932+
19933+ sleeping_lock_inc();
19934+ migrate_disable();
19935+ ret = do_write_rt_trylock(rwlock);
19936+ if (ret) {
19937+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
19938+ } else {
19939+ migrate_enable();
19940+ sleeping_lock_dec();
19941+ }
19942+ return ret;
19943+}
19944+EXPORT_SYMBOL(rt_write_trylock);
19945+
19946+void __lockfunc rt_read_lock(rwlock_t *rwlock)
19947+{
19948+ sleeping_lock_inc();
19949+ migrate_disable();
19950+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
19951+ do_read_rt_lock(rwlock);
19952+}
19953+EXPORT_SYMBOL(rt_read_lock);
19954+
19955+void __lockfunc rt_write_lock(rwlock_t *rwlock)
19956+{
19957+ sleeping_lock_inc();
19958+ migrate_disable();
19959+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
19960+ do_write_rt_lock(rwlock);
19961+}
19962+EXPORT_SYMBOL(rt_write_lock);
19963+
19964+void __lockfunc rt_read_unlock(rwlock_t *rwlock)
19965+{
19966+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19967+ do_read_rt_unlock(rwlock);
19968+ migrate_enable();
19969+ sleeping_lock_dec();
19970+}
19971+EXPORT_SYMBOL(rt_read_unlock);
19972+
19973+void __lockfunc rt_write_unlock(rwlock_t *rwlock)
19974+{
19975+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19976+ do_write_rt_unlock(rwlock);
19977+ migrate_enable();
19978+ sleeping_lock_dec();
19979+}
19980+EXPORT_SYMBOL(rt_write_unlock);
19981+
19982+void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
19983+{
19984+ do_rwlock_rt_init(rwlock, name, key);
19985+}
19986+EXPORT_SYMBOL(__rt_rwlock_init);
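The write-side functions above (__write_rt_lock(), __write_rt_trylock(),
__write_rt_unlock()) all revolve around the same reader-bias counter: the
counter starts out at a large negative bias, readers increment it on the fast
path while it is negative, a writer removes the bias so the remaining value is
the number of active readers, waits for that to drain to zero, and re-adds the
bias on unlock. Below is a minimal userspace sketch of that counting scheme
only; it is not kernel code. C11 atomics stand in for atomic_t, a pthread
mutex stands in for the rtmutex, sched_yield() replaces the schedule()-based
wait, the reader slow path is omitted, and the constants and names are
illustrative.

#include <stdatomic.h>
#include <pthread.h>
#include <sched.h>

#define SK_READER_BIAS	(-1000000)	/* counter stays negative while no writer is pending */
#define SK_WRITER_BIAS	1		/* positive marker meaning "write locked" */

struct sketch_rwlock {
	atomic_int readers;		/* SK_READER_BIAS + number of active readers */
	pthread_mutex_t wlock;		/* serializes writers, playing the rtmutex role */
};

#define SKETCH_RWLOCK_INIT	{ SK_READER_BIAS, PTHREAD_MUTEX_INITIALIZER }

static int sketch_read_trylock(struct sketch_rwlock *l)
{
	int r = atomic_load(&l->readers);

	/* Fast path: a negative value means no writer has removed the bias. */
	while (r < 0) {
		if (atomic_compare_exchange_weak(&l->readers, &r, r + 1))
			return 1;
	}
	return 0;
}

static void sketch_read_unlock(struct sketch_rwlock *l)
{
	/* Dropping the count to 0 is what lets a waiting writer proceed. */
	atomic_fetch_sub(&l->readers, 1);
}

static void sketch_write_lock(struct sketch_rwlock *l)
{
	pthread_mutex_lock(&l->wlock);
	/* Remove the bias: new readers now fail the fast path above. */
	atomic_fetch_sub(&l->readers, SK_READER_BIAS);
	/* Wait for the remaining readers to drain (the kernel sleeps here). */
	while (atomic_load(&l->readers) != 0)
		sched_yield();
	atomic_store(&l->readers, SK_WRITER_BIAS);
}

static void sketch_write_unlock(struct sketch_rwlock *l)
{
	/* Restore the bias so the reader fast path works again. */
	atomic_fetch_add(&l->readers, SK_READER_BIAS - SK_WRITER_BIAS);
	pthread_mutex_unlock(&l->wlock);
}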
19987diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c
19988new file mode 100644
19989index 000000000000..26991ddb6c5a
19990--- /dev/null
19991+++ b/kernel/locking/rwsem-rt.c
19992@@ -0,0 +1,269 @@
19993+/*
19994+ */
19995+#include <linux/rwsem.h>
19996+#include <linux/sched/debug.h>
19997+#include <linux/sched/signal.h>
19998+#include <linux/export.h>
19999+
20000+#include "rtmutex_common.h"
20001+
20002+/*
20003+ * RT-specific reader/writer semaphores
20004+ *
20005+ * down_write()
20006+ * 1) Lock sem->rtmutex
20007+ * 2) Remove the reader BIAS to force readers into the slow path
20008+ * 3) Wait until all readers have left the critical region
20009+ * 4) Mark it write locked
20010+ *
20011+ * up_write()
20012+ * 1) Remove the write locked marker
20013+ * 2) Set the reader BIAS so readers can use the fast path again
20014+ * 3) Unlock sem->rtmutex to release blocked readers
20015+ *
20016+ * down_read()
20017+ * 1) Try fast path acquisition (reader BIAS is set)
20018+ * 2) Take sem->rtmutex.wait_lock which protects the writelocked flag
20019+ * 3) If !writelocked, acquire it for read
20020+ * 4) If writelocked, block on sem->rtmutex
20021+ * 5) unlock sem->rtmutex, goto 1)
20022+ *
20023+ * up_read()
20024+ * 1) Try fast path release (reader count != 1)
20025+ * 2) Wake the writer waiting in down_write()#3
20026+ *
20027+ * down_read()#3 has the consequence that rw semaphores on RT are not writer
20028+ * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
20029+ * are subject to the rtmutex priority/DL inheritance mechanism.
20030+ *
20031+ * It's possible to make the rw semaphores writer fair by keeping a list of
20032+ * active readers. A blocked writer would force all newly incoming readers to
20033+ * block on the rtmutex, but the rtmutex would have to be proxy locked for one
20034+ * reader after the other. We can't use multi-reader inheritance because there
20035+ * is no way to support that with SCHED_DEADLINE. Implementing the one by one
20036+ * reader boosting/handover mechanism is a major surgery for a very dubious
20037+ * value.
20038+ *
20039+ * The risk of writer starvation is there, but the pathological use cases
20040+ * which trigger it are not necessarily the typical RT workloads.
20041+ */
20042+
20043+void __rwsem_init(struct rw_semaphore *sem, const char *name,
20044+ struct lock_class_key *key)
20045+{
20046+#ifdef CONFIG_DEBUG_LOCK_ALLOC
20047+ /*
20048+ * Make sure we are not reinitializing a held semaphore:
20049+ */
20050+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
20051+ lockdep_init_map(&sem->dep_map, name, key, 0);
20052+#endif
20053+ atomic_set(&sem->readers, READER_BIAS);
20054+}
20055+EXPORT_SYMBOL(__rwsem_init);
20056+
20057+int __down_read_trylock(struct rw_semaphore *sem)
20058+{
20059+ int r, old;
20060+
20061+ /*
20062+ * Increment the reader count if sem->readers < 0, i.e. READER_BIAS is
20063+ * set.
20064+ */
20065+ for (r = atomic_read(&sem->readers); r < 0;) {
20066+ old = atomic_cmpxchg(&sem->readers, r, r + 1);
20067+ if (likely(old == r))
20068+ return 1;
20069+ r = old;
20070+ }
20071+ return 0;
20072+}
20073+
20074+void __sched __down_read(struct rw_semaphore *sem)
20075+{
20076+ struct rt_mutex *m = &sem->rtmutex;
20077+ struct rt_mutex_waiter waiter;
20078+
20079+ if (__down_read_trylock(sem))
20080+ return;
20081+
20082+ might_sleep();
20083+ raw_spin_lock_irq(&m->wait_lock);
20084+ /*
20085+ * Allow readers as long as the writer has not completely
20086+ * acquired the semaphore for write.
20087+ */
20088+ if (atomic_read(&sem->readers) != WRITER_BIAS) {
20089+ atomic_inc(&sem->readers);
20090+ raw_spin_unlock_irq(&m->wait_lock);
20091+ return;
20092+ }
20093+
20094+ /*
20095+ * Call into the slow lock path with the rtmutex->wait_lock
20096+ * held, so this can't result in the following race:
20097+ *
20098+ * Reader1 Reader2 Writer
20099+ * down_read()
20100+ * down_write()
20101+ * rtmutex_lock(m)
20102+ * swait()
20103+ * down_read()
20104+ * unlock(m->wait_lock)
20105+ * up_read()
20106+ * swake()
20107+ * lock(m->wait_lock)
20108+ * sem->writelocked=true
20109+ * unlock(m->wait_lock)
20110+ *
20111+ * up_write()
20112+ * sem->writelocked=false
20113+ * rtmutex_unlock(m)
20114+ * down_read()
20115+ * down_write()
20116+ * rtmutex_lock(m)
20117+ * swait()
20118+ * rtmutex_lock(m)
20119+ *
20120+ * That would put Reader1 behind the writer waiting on
20121+ * Reader2 to call up_read() which might be unbound.
20122+ */
20123+ rt_mutex_init_waiter(&waiter, false);
20124+ rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL,
20125+ RT_MUTEX_MIN_CHAINWALK, NULL,
20126+ &waiter);
20127+ /*
20128+ * The slowlock() above is guaranteed to return with the rtmutex now
20129+ * held, so there can't be a writer active. Increment the reader
20130+ * count and immediately drop the rtmutex again.
20131+ */
20132+ atomic_inc(&sem->readers);
20133+ raw_spin_unlock_irq(&m->wait_lock);
20134+ __rt_mutex_unlock(m);
20135+
20136+ debug_rt_mutex_free_waiter(&waiter);
20137+}
20138+
20139+void __up_read(struct rw_semaphore *sem)
20140+{
20141+ struct rt_mutex *m = &sem->rtmutex;
20142+ struct task_struct *tsk;
20143+
20144+ /*
20145+ * sem->readers can only hit 0 when a writer is waiting for the
20146+ * active readers to leave the critical region.
20147+ */
20148+ if (!atomic_dec_and_test(&sem->readers))
20149+ return;
20150+
20151+ might_sleep();
20152+ raw_spin_lock_irq(&m->wait_lock);
20153+ /*
20154+ * Wake the writer, i.e. the rtmutex owner. It might release the
20155+ * rtmutex concurrently in the fast path (due to a signal), but to
20156+ * clean up the rwsem it needs to acquire m->wait_lock. The worst
20157+ * case that can happen is a spurious wakeup.
20158+ */
20159+ tsk = rt_mutex_owner(m);
20160+ if (tsk)
20161+ wake_up_process(tsk);
20162+
20163+ raw_spin_unlock_irq(&m->wait_lock);
20164+}
20165+
20166+static void __up_write_unlock(struct rw_semaphore *sem, int bias,
20167+ unsigned long flags)
20168+{
20169+ struct rt_mutex *m = &sem->rtmutex;
20170+
20171+ atomic_add(READER_BIAS - bias, &sem->readers);
20172+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
20173+ __rt_mutex_unlock(m);
20174+}
20175+
20176+static int __sched __down_write_common(struct rw_semaphore *sem, int state)
20177+{
20178+ struct rt_mutex *m = &sem->rtmutex;
20179+ unsigned long flags;
20180+
20181+ /* Take the rtmutex as a first step */
20182+ if (__rt_mutex_lock_state(m, state))
20183+ return -EINTR;
20184+
20185+ /* Force readers into slow path */
20186+ atomic_sub(READER_BIAS, &sem->readers);
20187+ might_sleep();
20188+
20189+ set_current_state(state);
20190+ for (;;) {
20191+ raw_spin_lock_irqsave(&m->wait_lock, flags);
20192+ /* Have all readers left the critical region? */
20193+ if (!atomic_read(&sem->readers)) {
20194+ atomic_set(&sem->readers, WRITER_BIAS);
20195+ __set_current_state(TASK_RUNNING);
20196+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
20197+ return 0;
20198+ }
20199+
20200+ if (signal_pending_state(state, current)) {
20201+ __set_current_state(TASK_RUNNING);
20202+ __up_write_unlock(sem, 0, flags);
20203+ return -EINTR;
20204+ }
20205+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
20206+
20207+ if (atomic_read(&sem->readers) != 0) {
20208+ schedule();
20209+ set_current_state(state);
20210+ }
20211+ }
20212+}
20213+
20214+void __sched __down_write(struct rw_semaphore *sem)
20215+{
20216+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
20217+}
20218+
20219+int __sched __down_write_killable(struct rw_semaphore *sem)
20220+{
20221+ return __down_write_common(sem, TASK_KILLABLE);
20222+}
20223+
20224+int __down_write_trylock(struct rw_semaphore *sem)
20225+{
20226+ struct rt_mutex *m = &sem->rtmutex;
20227+ unsigned long flags;
20228+
20229+ if (!__rt_mutex_trylock(m))
20230+ return 0;
20231+
20232+ atomic_sub(READER_BIAS, &sem->readers);
20233+
20234+ raw_spin_lock_irqsave(&m->wait_lock, flags);
20235+ if (!atomic_read(&sem->readers)) {
20236+ atomic_set(&sem->readers, WRITER_BIAS);
20237+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
20238+ return 1;
20239+ }
20240+ __up_write_unlock(sem, 0, flags);
20241+ return 0;
20242+}
20243+
20244+void __up_write(struct rw_semaphore *sem)
20245+{
20246+ struct rt_mutex *m = &sem->rtmutex;
20247+ unsigned long flags;
20248+
20249+ raw_spin_lock_irqsave(&m->wait_lock, flags);
20250+ __up_write_unlock(sem, WRITER_BIAS, flags);
20251+}
20252+
20253+void __downgrade_write(struct rw_semaphore *sem)
20254+{
20255+ struct rt_mutex *m = &sem->rtmutex;
20256+ unsigned long flags;
20257+
20258+ raw_spin_lock_irqsave(&m->wait_lock, flags);
20259+ /* Release it and account current as reader */
20260+ __up_write_unlock(sem, WRITER_BIAS - 1, flags);
20261+}
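For reference, the bias arithmetic of __up_write_unlock() above works out as
follows (symbolic only, derived from the calls in this file):

- __up_write() passes bias == WRITER_BIAS while readers == WRITER_BIAS, so
  adding READER_BIAS - WRITER_BIAS leaves readers == READER_BIAS: the reader
  fast path is re-enabled with zero active readers.
- the failure paths in __down_write_common() and __down_write_trylock() pass
  bias == 0 while readers holds the current reader count n, so readers ends
  up at READER_BIAS + n: the bias is restored on top of the readers that
  never left.
- __downgrade_write() passes bias == WRITER_BIAS - 1, so readers goes from
  WRITER_BIAS to READER_BIAS + 1: the bias is restored with the caller
  already accounted as one reader.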
20262diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
20263index 6e40fdfba326..401bda23f786 100644
20264--- a/kernel/locking/spinlock.c
20265+++ b/kernel/locking/spinlock.c
20266@@ -125,8 +125,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
20267 * __[spin|read|write]_lock_bh()
20268 */
20269 BUILD_LOCK_OPS(spin, raw_spinlock);
20270+
20271+#ifndef CONFIG_PREEMPT_RT_FULL
20272 BUILD_LOCK_OPS(read, rwlock);
20273 BUILD_LOCK_OPS(write, rwlock);
20274+#endif
20275
20276 #endif
20277
20278@@ -210,6 +213,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
20279 EXPORT_SYMBOL(_raw_spin_unlock_bh);
20280 #endif
20281
20282+#ifndef CONFIG_PREEMPT_RT_FULL
20283+
20284 #ifndef CONFIG_INLINE_READ_TRYLOCK
20285 int __lockfunc _raw_read_trylock(rwlock_t *lock)
20286 {
20287@@ -354,6 +359,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
20288 EXPORT_SYMBOL(_raw_write_unlock_bh);
20289 #endif
20290
20291+#endif /* !PREEMPT_RT_FULL */
20292+
20293 #ifdef CONFIG_DEBUG_LOCK_ALLOC
20294
20295 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
20296diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
20297index 9aa0fccd5d43..76d0b40d9193 100644
20298--- a/kernel/locking/spinlock_debug.c
20299+++ b/kernel/locking/spinlock_debug.c
20300@@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
20301
20302 EXPORT_SYMBOL(__raw_spin_lock_init);
20303
20304+#ifndef CONFIG_PREEMPT_RT_FULL
20305 void __rwlock_init(rwlock_t *lock, const char *name,
20306 struct lock_class_key *key)
20307 {
20308@@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
20309 }
20310
20311 EXPORT_SYMBOL(__rwlock_init);
20312+#endif
20313
20314 static void spin_dump(raw_spinlock_t *lock, const char *msg)
20315 {
20316@@ -135,6 +137,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
20317 arch_spin_unlock(&lock->raw_lock);
20318 }
20319
20320+#ifndef CONFIG_PREEMPT_RT_FULL
20321 static void rwlock_bug(rwlock_t *lock, const char *msg)
20322 {
20323 if (!debug_locks_off())
20324@@ -224,3 +227,5 @@ void do_raw_write_unlock(rwlock_t *lock)
20325 debug_write_unlock(lock);
20326 arch_write_unlock(&lock->raw_lock);
20327 }
20328+
20329+#endif
20330diff --git a/kernel/panic.c b/kernel/panic.c
20331index bdd18afa19a4..5da649633795 100644
20332--- a/kernel/panic.c
20333+++ b/kernel/panic.c
20334@@ -482,9 +482,11 @@ static u64 oops_id;
20335
20336 static int init_oops_id(void)
20337 {
20338+#ifndef CONFIG_PREEMPT_RT_FULL
20339 if (!oops_id)
20340 get_random_bytes(&oops_id, sizeof(oops_id));
20341 else
20342+#endif
20343 oops_id++;
20344
20345 return 0;
20346diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
20347index a5c36e9c56a6..a4b83cb0c6e5 100644
20348--- a/kernel/power/hibernate.c
20349+++ b/kernel/power/hibernate.c
20350@@ -287,6 +287,8 @@ static int create_image(int platform_mode)
20351
20352 local_irq_disable();
20353
20354+ system_state = SYSTEM_SUSPEND;
20355+
20356 error = syscore_suspend();
20357 if (error) {
20358 pr_err("Some system devices failed to power down, aborting hibernation\n");
20359@@ -317,6 +319,7 @@ static int create_image(int platform_mode)
20360 syscore_resume();
20361
20362 Enable_irqs:
20363+ system_state = SYSTEM_RUNNING;
20364 local_irq_enable();
20365
20366 Enable_cpus:
20367@@ -445,6 +448,7 @@ static int resume_target_kernel(bool platform_mode)
20368 goto Enable_cpus;
20369
20370 local_irq_disable();
20371+ system_state = SYSTEM_SUSPEND;
20372
20373 error = syscore_suspend();
20374 if (error)
20375@@ -478,6 +482,7 @@ static int resume_target_kernel(bool platform_mode)
20376 syscore_resume();
20377
20378 Enable_irqs:
20379+ system_state = SYSTEM_RUNNING;
20380 local_irq_enable();
20381
20382 Enable_cpus:
20383@@ -563,6 +568,7 @@ int hibernation_platform_enter(void)
20384 goto Enable_cpus;
20385
20386 local_irq_disable();
20387+ system_state = SYSTEM_SUSPEND;
20388 syscore_suspend();
20389 if (pm_wakeup_pending()) {
20390 error = -EAGAIN;
20391@@ -575,6 +581,7 @@ int hibernation_platform_enter(void)
20392
20393 Power_up:
20394 syscore_resume();
20395+ system_state = SYSTEM_RUNNING;
20396 local_irq_enable();
20397
20398 Enable_cpus:
20399@@ -672,6 +679,10 @@ static int load_image_and_restore(void)
20400 return error;
20401 }
20402
20403+#ifndef CONFIG_SUSPEND
20404+bool pm_in_action;
20405+#endif
20406+
20407 /**
20408 * hibernate - Carry out system hibernation, including saving the image.
20409 */
20410@@ -685,6 +696,8 @@ int hibernate(void)
20411 return -EPERM;
20412 }
20413
20414+ pm_in_action = true;
20415+
20416 lock_system_sleep();
20417 /* The snapshot device should not be opened while we're running */
20418 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
20419@@ -763,6 +776,7 @@ int hibernate(void)
20420 atomic_inc(&snapshot_device_available);
20421 Unlock:
20422 unlock_system_sleep();
20423+ pm_in_action = false;
20424 pr_info("hibernation exit\n");
20425
20426 return error;
20427diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
20428index c0bc2c89697a..b89605fe0e88 100644
20429--- a/kernel/power/suspend.c
20430+++ b/kernel/power/suspend.c
20431@@ -27,6 +27,7 @@
20432 #include <linux/export.h>
20433 #include <linux/suspend.h>
20434 #include <linux/syscore_ops.h>
20435+#include <linux/swait.h>
20436 #include <linux/ftrace.h>
20437 #include <trace/events/power.h>
20438 #include <linux/compiler.h>
20439@@ -57,7 +58,7 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
20440
20441 static const struct platform_suspend_ops *suspend_ops;
20442 static const struct platform_s2idle_ops *s2idle_ops;
20443-static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head);
20444+static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
20445
20446 enum s2idle_states __read_mostly s2idle_state;
20447 static DEFINE_RAW_SPINLOCK(s2idle_lock);
20448@@ -91,8 +92,8 @@ static void s2idle_enter(void)
20449 /* Push all the CPUs into the idle loop. */
20450 wake_up_all_idle_cpus();
20451 /* Make the current CPU wait so it can enter the idle loop too. */
20452- wait_event(s2idle_wait_head,
20453- s2idle_state == S2IDLE_STATE_WAKE);
20454+ swait_event(s2idle_wait_head,
20455+ s2idle_state == S2IDLE_STATE_WAKE);
20456
20457 cpuidle_pause();
20458 put_online_cpus();
20459@@ -159,7 +160,7 @@ void s2idle_wake(void)
20460 raw_spin_lock_irqsave(&s2idle_lock, flags);
20461 if (s2idle_state > S2IDLE_STATE_NONE) {
20462 s2idle_state = S2IDLE_STATE_WAKE;
20463- wake_up(&s2idle_wait_head);
20464+ swake_up(&s2idle_wait_head);
20465 }
20466 raw_spin_unlock_irqrestore(&s2idle_lock, flags);
20467 }
20468@@ -428,6 +429,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20469 arch_suspend_disable_irqs();
20470 BUG_ON(!irqs_disabled());
20471
20472+ system_state = SYSTEM_SUSPEND;
20473+
20474 error = syscore_suspend();
20475 if (!error) {
20476 *wakeup = pm_wakeup_pending();
20477@@ -443,6 +446,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20478 syscore_resume();
20479 }
20480
20481+ system_state = SYSTEM_RUNNING;
20482+
20483 arch_suspend_enable_irqs();
20484 BUG_ON(irqs_disabled());
20485
20486@@ -589,6 +594,8 @@ static int enter_state(suspend_state_t state)
20487 return error;
20488 }
20489
20490+bool pm_in_action;
20491+
20492 /**
20493 * pm_suspend - Externally visible function for suspending the system.
20494 * @state: System sleep state to enter.
20495@@ -603,6 +610,7 @@ int pm_suspend(suspend_state_t state)
20496 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
20497 return -EINVAL;
20498
20499+ pm_in_action = true;
20500 pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
20501 error = enter_state(state);
20502 if (error) {
20503@@ -612,6 +620,7 @@ int pm_suspend(suspend_state_t state)
20504 suspend_stats.success++;
20505 }
20506 pr_info("suspend exit\n");
20507+ pm_in_action = false;
20508 return error;
20509 }
20510 EXPORT_SYMBOL(pm_suspend);
20511diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
20512index f0223a7d9ed1..13fd0bcf2367 100644
20513--- a/kernel/printk/printk.c
20514+++ b/kernel/printk/printk.c
20515@@ -1348,6 +1348,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20516 {
20517 char *text;
20518 int len = 0;
20519+ int attempts = 0;
20520+ int num_msg;
20521
20522 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
20523 if (!text)
20524@@ -1359,6 +1361,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20525 u64 seq;
20526 u32 idx;
20527
20528+try_again:
20529+ attempts++;
20530+ if (attempts > 10) {
20531+ len = -EBUSY;
20532+ goto out;
20533+ }
20534+ num_msg = 0;
20535+
20536 /*
20537 * Find first record that fits, including all following records,
20538 * into the user-provided buffer for this dump.
20539@@ -1371,6 +1381,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20540 len += msg_print_text(msg, true, NULL, 0);
20541 idx = log_next(idx);
20542 seq++;
20543+ num_msg++;
20544+ if (num_msg > 5) {
20545+ num_msg = 0;
20546+ logbuf_unlock_irq();
20547+ logbuf_lock_irq();
20548+ if (clear_seq < log_first_seq)
20549+ goto try_again;
20550+ }
20551 }
20552
20553 /* move first record forward until length fits into the buffer */
20554@@ -1382,6 +1400,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20555 len -= msg_print_text(msg, true, NULL, 0);
20556 idx = log_next(idx);
20557 seq++;
20558+ num_msg++;
20559+ if (num_msg > 5) {
20560+ num_msg = 0;
20561+ logbuf_unlock_irq();
20562+ logbuf_lock_irq();
20563+ if (clear_seq < log_first_seq)
20564+ goto try_again;
20565+ }
20566 }
20567
20568 /* last message fitting into this dump */
20569@@ -1420,6 +1446,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20570 clear_seq = log_next_seq;
20571 clear_idx = log_next_idx;
20572 }
20573+out:
20574 logbuf_unlock_irq();
20575
20576 kfree(text);
20577@@ -1558,6 +1585,12 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
20578 if (!console_drivers)
20579 return;
20580
20581+ if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20582+ if (in_irq() || in_nmi())
20583+ return;
20584+ }
20585+
20586+ migrate_disable();
20587 for_each_console(con) {
20588 if (exclusive_console && con != exclusive_console)
20589 continue;
20590@@ -1573,6 +1606,7 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
20591 else
20592 con->write(con, text, len);
20593 }
20594+ migrate_enable();
20595 }
20596
20597 int printk_delay_msec __read_mostly;
20598@@ -1757,12 +1791,22 @@ asmlinkage int vprintk_emit(int facility, int level,
20599
20600 /* If called from the scheduler, we can not call up(). */
20601 if (!in_sched) {
20602+ int may_trylock = 1;
20603+
20604+#ifdef CONFIG_PREEMPT_RT_FULL
20605+ /*
20606+ * we can't take a sleeping lock with IRQs or preemption disabled
20607+ * so we can't print in these contexts
20608+ */
20609+ if (!(preempt_count() == 0 && !irqs_disabled()))
20610+ may_trylock = 0;
20611+#endif
20612 /*
20613 * Try to acquire and then immediately release the console
20614 * semaphore. The release will print out buffers and wake up
20615 * /dev/kmsg and syslog() users.
20616 */
20617- if (console_trylock())
20618+ if (may_trylock && console_trylock())
20619 console_unlock();
20620 }
20621
20622@@ -1872,26 +1916,6 @@ static bool suppress_message_printing(int level) { return false; }
20623
20624 #endif /* CONFIG_PRINTK */
20625
20626-#ifdef CONFIG_EARLY_PRINTK
20627-struct console *early_console;
20628-
20629-asmlinkage __visible void early_printk(const char *fmt, ...)
20630-{
20631- va_list ap;
20632- char buf[512];
20633- int n;
20634-
20635- if (!early_console)
20636- return;
20637-
20638- va_start(ap, fmt);
20639- n = vscnprintf(buf, sizeof(buf), fmt, ap);
20640- va_end(ap);
20641-
20642- early_console->write(early_console, buf, n);
20643-}
20644-#endif
20645-
20646 static int __add_preferred_console(char *name, int idx, char *options,
20647 char *brl_options)
20648 {
20649@@ -2238,10 +2262,15 @@ void console_unlock(void)
20650 console_seq++;
20651 raw_spin_unlock(&logbuf_lock);
20652
20653+#ifdef CONFIG_PREEMPT_RT_FULL
20654+ printk_safe_exit_irqrestore(flags);
20655+ call_console_drivers(ext_text, ext_len, text, len);
20656+#else
20657 stop_critical_timings(); /* don't trace print latency */
20658 call_console_drivers(ext_text, ext_len, text, len);
20659 start_critical_timings();
20660 printk_safe_exit_irqrestore(flags);
20661+#endif
20662
20663 if (do_cond_resched)
20664 cond_resched();
20665@@ -2295,6 +2324,11 @@ void console_unblank(void)
20666 {
20667 struct console *c;
20668
20669+ if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20670+ if (in_irq() || in_nmi())
20671+ return;
20672+ }
20673+
20674 /*
20675 * console_unblank can no longer be called in interrupt context unless
20676 * oops_in_progress is set to 1..
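The syslog_print_all() change above bounds the time spent under logbuf_lock:
every few records the lock is dropped and retaken so other CPUs (and, on RT,
preempting tasks) can make progress, and if the log buffer advanced past the
scan's starting point in the meantime the whole scan restarts, giving up with
-EBUSY after a fixed number of attempts. A rough userspace sketch of that
lock-dropping retry pattern, with hypothetical names and a pthread mutex in
place of logbuf_lock:

#include <pthread.h>
#include <stdbool.h>

/* Hypothetical shared state standing in for the kernel log buffer. */
static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long first_seq;		/* oldest record still in the buffer */

/* Scan records [start, end) under buf_lock; false if the buffer moved past start. */
static bool scan_range(unsigned long start, unsigned long end)
{
	int batch = 0;

	for (unsigned long seq = start; seq < end; seq++) {
		/* ... process record 'seq' here ... */
		if (++batch > 5) {
			batch = 0;
			/* Breathe: let writers and higher-priority tasks in. */
			pthread_mutex_unlock(&buf_lock);
			pthread_mutex_lock(&buf_lock);
			if (first_seq > start)
				return false;	/* records were lost, caller retries */
		}
	}
	return true;
}

static int dump_all(unsigned long start, unsigned long end)
{
	int attempts = 0;

	pthread_mutex_lock(&buf_lock);
	while (!scan_range(start, end)) {
		if (++attempts > 10) {
			pthread_mutex_unlock(&buf_lock);
			return -1;	/* the kernel version returns -EBUSY */
		}
		start = first_seq;	/* restart from the new oldest record */
	}
	pthread_mutex_unlock(&buf_lock);
	return 0;
}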
20677diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
20678index 64f8046586b6..a24e16bef51c 100644
20679--- a/kernel/printk/printk_safe.c
20680+++ b/kernel/printk/printk_safe.c
20681@@ -22,6 +22,7 @@
20682 #include <linux/cpumask.h>
20683 #include <linux/irq_work.h>
20684 #include <linux/printk.h>
20685+#include <linux/console.h>
20686
20687 #include "internal.h"
20688
20689@@ -373,8 +374,74 @@ void __printk_safe_exit(void)
20690 this_cpu_dec(printk_context);
20691 }
20692
20693+#ifdef CONFIG_EARLY_PRINTK
20694+struct console *early_console;
20695+
20696+static void early_vprintk(const char *fmt, va_list ap)
20697+{
20698+ if (early_console) {
20699+ char buf[512];
20700+ int n = vscnprintf(buf, sizeof(buf), fmt, ap);
20701+
20702+ early_console->write(early_console, buf, n);
20703+ }
20704+}
20705+
20706+asmlinkage void early_printk(const char *fmt, ...)
20707+{
20708+ va_list ap;
20709+
20710+ va_start(ap, fmt);
20711+ early_vprintk(fmt, ap);
20712+ va_end(ap);
20713+}
20714+
20715+/*
20716+ * This is independent of any log levels - a global
20717+ * kill switch that turns off all of printk.
20718+ *
20719+ * Used by the NMI watchdog if early-printk is enabled.
20720+ */
20721+static bool __read_mostly printk_killswitch;
20722+
20723+static int __init force_early_printk_setup(char *str)
20724+{
20725+ printk_killswitch = true;
20726+ return 0;
20727+}
20728+early_param("force_early_printk", force_early_printk_setup);
20729+
20730+void printk_kill(void)
20731+{
20732+ printk_killswitch = true;
20733+}
20734+
20735+#ifdef CONFIG_PRINTK
20736+static int forced_early_printk(const char *fmt, va_list ap)
20737+{
20738+ if (!printk_killswitch)
20739+ return 0;
20740+ early_vprintk(fmt, ap);
20741+ return 1;
20742+}
20743+#endif
20744+
20745+#else
20746+static inline int forced_early_printk(const char *fmt, va_list ap)
20747+{
20748+ return 0;
20749+}
20750+#endif
20751+
20752 __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
20753 {
20754+ /*
20755+ * Fall back to early_printk if a debugging subsystem has
20756+ * killed printk output
20757+ */
20758+ if (unlikely(forced_early_printk(fmt, args)))
20759+ return 1;
20760+
20761 /*
20762 * Try to use the main logbuf even in NMI. But avoid calling console
20763 * drivers that might have their own locks.
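As a usage illustration of the fallback wired up above: once a debugging
facility suspects that the regular printk path can no longer make progress,
it can flip the killswitch so that all further output goes through the early
console (provided one is registered). printk_kill() and early_printk() are
the functions added in this hunk; the helper itself is hypothetical.

/* Hypothetical debug helper built on the functions added above. */
static void debug_switch_to_early_console(const char *why)
{
	/* From here on vprintk_func() falls back to early_vprintk(). */
	printk_kill();
	early_printk("printk disabled: %s\n", why);
}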
20764diff --git a/kernel/ptrace.c b/kernel/ptrace.c
20765index 84b1367935e4..b32a86f63522 100644
20766--- a/kernel/ptrace.c
20767+++ b/kernel/ptrace.c
20768@@ -175,7 +175,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
20769
20770 spin_lock_irq(&task->sighand->siglock);
20771 if (task_is_traced(task) && !__fatal_signal_pending(task)) {
20772- task->state = __TASK_TRACED;
20773+ unsigned long flags;
20774+
20775+ raw_spin_lock_irqsave(&task->pi_lock, flags);
20776+ if (task->state & __TASK_TRACED)
20777+ task->state = __TASK_TRACED;
20778+ else
20779+ task->saved_state = __TASK_TRACED;
20780+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20781 ret = true;
20782 }
20783 spin_unlock_irq(&task->sighand->siglock);
20784diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
20785index 9210379c0353..0be2c96fb640 100644
20786--- a/kernel/rcu/Kconfig
20787+++ b/kernel/rcu/Kconfig
20788@@ -36,7 +36,7 @@ config TINY_RCU
20789
20790 config RCU_EXPERT
20791 bool "Make expert-level adjustments to RCU configuration"
20792- default n
20793+ default y if PREEMPT_RT_FULL
20794 help
20795 This option needs to be enabled if you wish to make
20796 expert-level adjustments to RCU configuration. By default,
20797@@ -172,7 +172,7 @@ config RCU_FANOUT_LEAF
20798
20799 config RCU_FAST_NO_HZ
20800 bool "Accelerate last non-dyntick-idle CPU's grace periods"
20801- depends on NO_HZ_COMMON && SMP && RCU_EXPERT
20802+ depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
20803 default n
20804 help
20805 This option permits CPUs to enter dynticks-idle state even if
20806@@ -191,7 +191,7 @@ config RCU_FAST_NO_HZ
20807 config RCU_BOOST
20808 bool "Enable RCU priority boosting"
20809 depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
20810- default n
20811+ default y if PREEMPT_RT_FULL
20812 help
20813 This option boosts the priority of preempted RCU readers that
20814 block the current preemptible RCU grace period for too long.
20815diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
20816index e4b43fef89f5..0b056c30e9b1 100644
20817--- a/kernel/rcu/rcu.h
20818+++ b/kernel/rcu/rcu.h
20819@@ -462,18 +462,26 @@ static inline void show_rcu_gp_kthreads(void) { }
20820 extern unsigned long rcutorture_testseq;
20821 extern unsigned long rcutorture_vernum;
20822 unsigned long rcu_batches_started(void);
20823-unsigned long rcu_batches_started_bh(void);
20824 unsigned long rcu_batches_started_sched(void);
20825 unsigned long rcu_batches_completed(void);
20826-unsigned long rcu_batches_completed_bh(void);
20827 unsigned long rcu_batches_completed_sched(void);
20828 unsigned long rcu_exp_batches_completed(void);
20829 unsigned long rcu_exp_batches_completed_sched(void);
20830 unsigned long srcu_batches_completed(struct srcu_struct *sp);
20831 void show_rcu_gp_kthreads(void);
20832 void rcu_force_quiescent_state(void);
20833-void rcu_bh_force_quiescent_state(void);
20834 void rcu_sched_force_quiescent_state(void);
20835+
20836+#ifndef CONFIG_PREEMPT_RT_FULL
20837+void rcu_bh_force_quiescent_state(void);
20838+unsigned long rcu_batches_started_bh(void);
20839+unsigned long rcu_batches_completed_bh(void);
20840+#else
20841+# define rcu_bh_force_quiescent_state rcu_force_quiescent_state
20842+# define rcu_batches_completed_bh rcu_batches_completed
20843+# define rcu_batches_started_bh rcu_batches_completed
20844+#endif
20845+
20846 #endif /* #else #ifdef CONFIG_TINY_RCU */
20847
20848 #ifdef CONFIG_RCU_NOCB_CPU
20849diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
20850index 7649fcd2c4c7..88cba7c2956c 100644
20851--- a/kernel/rcu/rcu_segcblist.c
20852+++ b/kernel/rcu/rcu_segcblist.c
20853@@ -23,6 +23,7 @@
20854 #include <linux/types.h>
20855 #include <linux/kernel.h>
20856 #include <linux/interrupt.h>
20857+#include <linux/rcupdate.h>
20858
20859 #include "rcu_segcblist.h"
20860
20861diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
20862index 45f2ffbc1e78..2e9dbb734d5a 100644
20863--- a/kernel/rcu/rcutorture.c
20864+++ b/kernel/rcu/rcutorture.c
20865@@ -417,6 +417,7 @@ static struct rcu_torture_ops rcu_ops = {
20866 .name = "rcu"
20867 };
20868
20869+#ifndef CONFIG_PREEMPT_RT_FULL
20870 /*
20871 * Definitions for rcu_bh torture testing.
20872 */
20873@@ -456,6 +457,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
20874 .name = "rcu_bh"
20875 };
20876
20877+#else
20878+static struct rcu_torture_ops rcu_bh_ops = {
20879+ .ttype = INVALID_RCU_FLAVOR,
20880+};
20881+#endif
20882+
20883 /*
20884 * Don't even think about trying any of these in real life!!!
20885 * The names includes "busted", and they really means it!
20886diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
20887index 6d5880089ff6..0e3b2bd3f2ac 100644
20888--- a/kernel/rcu/srcutree.c
20889+++ b/kernel/rcu/srcutree.c
20890@@ -36,6 +36,8 @@
20891 #include <linux/delay.h>
20892 #include <linux/module.h>
20893 #include <linux/srcu.h>
20894+#include <linux/cpu.h>
20895+#include <linux/locallock.h>
20896
20897 #include "rcu.h"
20898 #include "rcu_segcblist.h"
20899@@ -53,6 +55,33 @@ static void srcu_invoke_callbacks(struct work_struct *work);
20900 static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
20901 static void process_srcu(struct work_struct *work);
20902
20903+/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
20904+#define spin_lock_rcu_node(p) \
20905+do { \
20906+ spin_lock(&ACCESS_PRIVATE(p, lock)); \
20907+ smp_mb__after_unlock_lock(); \
20908+} while (0)
20909+
20910+#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
20911+
20912+#define spin_lock_irq_rcu_node(p) \
20913+do { \
20914+ spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
20915+ smp_mb__after_unlock_lock(); \
20916+} while (0)
20917+
20918+#define spin_unlock_irq_rcu_node(p) \
20919+ spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
20920+
20921+#define spin_lock_irqsave_rcu_node(p, flags) \
20922+do { \
20923+ spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
20924+ smp_mb__after_unlock_lock(); \
20925+} while (0)
20926+
20927+#define spin_unlock_irqrestore_rcu_node(p, flags) \
20928+ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
20929+
20930 /*
20931 * Initialize SRCU combining tree. Note that statically allocated
20932 * srcu_struct structures might already have srcu_read_lock() and
20933@@ -77,7 +106,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
20934
20935 /* Each pass through this loop initializes one srcu_node structure. */
20936 rcu_for_each_node_breadth_first(sp, snp) {
20937- raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20938+ spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20939 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
20940 ARRAY_SIZE(snp->srcu_data_have_cbs));
20941 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
20942@@ -111,7 +140,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
20943 snp_first = sp->level[level];
20944 for_each_possible_cpu(cpu) {
20945 sdp = per_cpu_ptr(sp->sda, cpu);
20946- raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20947+ spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20948 rcu_segcblist_init(&sdp->srcu_cblist);
20949 sdp->srcu_cblist_invoking = false;
20950 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
20951@@ -170,7 +199,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
20952 /* Don't re-initialize a lock while it is held. */
20953 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
20954 lockdep_init_map(&sp->dep_map, name, key, 0);
20955- raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20956+ spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20957 return init_srcu_struct_fields(sp, false);
20958 }
20959 EXPORT_SYMBOL_GPL(__init_srcu_struct);
20960@@ -187,7 +216,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
20961 */
20962 int init_srcu_struct(struct srcu_struct *sp)
20963 {
20964- raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20965+ spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20966 return init_srcu_struct_fields(sp, false);
20967 }
20968 EXPORT_SYMBOL_GPL(init_srcu_struct);
20969@@ -210,13 +239,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp)
20970 /* The smp_load_acquire() pairs with the smp_store_release(). */
20971 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
20972 return; /* Already initialized. */
20973- raw_spin_lock_irqsave_rcu_node(sp, flags);
20974+ spin_lock_irqsave_rcu_node(sp, flags);
20975 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
20976- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20977+ spin_unlock_irqrestore_rcu_node(sp, flags);
20978 return;
20979 }
20980 init_srcu_struct_fields(sp, true);
20981- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20982+ spin_unlock_irqrestore_rcu_node(sp, flags);
20983 }
20984
20985 /*
20986@@ -424,21 +453,6 @@ static void srcu_gp_start(struct srcu_struct *sp)
20987 WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
20988 }
20989
20990-/*
20991- * Track online CPUs to guide callback workqueue placement.
20992- */
20993-DEFINE_PER_CPU(bool, srcu_online);
20994-
20995-void srcu_online_cpu(unsigned int cpu)
20996-{
20997- WRITE_ONCE(per_cpu(srcu_online, cpu), true);
20998-}
20999-
21000-void srcu_offline_cpu(unsigned int cpu)
21001-{
21002- WRITE_ONCE(per_cpu(srcu_online, cpu), false);
21003-}
21004-
21005 /*
21006 * Place the workqueue handler on the specified CPU if online, otherwise
21007 * just run it whereever. This is useful for placing workqueue handlers
21008@@ -450,12 +464,12 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
21009 {
21010 bool ret;
21011
21012- preempt_disable();
21013- if (READ_ONCE(per_cpu(srcu_online, cpu)))
21014+ cpus_read_lock();
21015+ if (cpu_online(cpu))
21016 ret = queue_delayed_work_on(cpu, wq, dwork, delay);
21017 else
21018 ret = queue_delayed_work(wq, dwork, delay);
21019- preempt_enable();
21020+ cpus_read_unlock();
21021 return ret;
21022 }
21023
21024@@ -513,7 +527,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
21025 mutex_lock(&sp->srcu_cb_mutex);
21026
21027 /* End the current grace period. */
21028- raw_spin_lock_irq_rcu_node(sp);
21029+ spin_lock_irq_rcu_node(sp);
21030 idx = rcu_seq_state(sp->srcu_gp_seq);
21031 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
21032 cbdelay = srcu_get_delay(sp);
21033@@ -522,7 +536,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
21034 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
21035 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
21036 sp->srcu_gp_seq_needed_exp = gpseq;
21037- raw_spin_unlock_irq_rcu_node(sp);
21038+ spin_unlock_irq_rcu_node(sp);
21039 mutex_unlock(&sp->srcu_gp_mutex);
21040 /* A new grace period can start at this point. But only one. */
21041
21042@@ -530,7 +544,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
21043 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
21044 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
21045 rcu_for_each_node_breadth_first(sp, snp) {
21046- raw_spin_lock_irq_rcu_node(snp);
21047+ spin_lock_irq_rcu_node(snp);
21048 cbs = false;
21049 if (snp >= sp->level[rcu_num_lvls - 1])
21050 cbs = snp->srcu_have_cbs[idx] == gpseq;
21051@@ -540,7 +554,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
21052 snp->srcu_gp_seq_needed_exp = gpseq;
21053 mask = snp->srcu_data_have_cbs[idx];
21054 snp->srcu_data_have_cbs[idx] = 0;
21055- raw_spin_unlock_irq_rcu_node(snp);
21056+ spin_unlock_irq_rcu_node(snp);
21057 if (cbs)
21058 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
21059
21060@@ -548,11 +562,11 @@ static void srcu_gp_end(struct srcu_struct *sp)
21061 if (!(gpseq & counter_wrap_check))
21062 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
21063 sdp = per_cpu_ptr(sp->sda, cpu);
21064- raw_spin_lock_irqsave_rcu_node(sdp, flags);
21065+ spin_lock_irqsave_rcu_node(sdp, flags);
21066 if (ULONG_CMP_GE(gpseq,
21067 sdp->srcu_gp_seq_needed + 100))
21068 sdp->srcu_gp_seq_needed = gpseq;
21069- raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
21070+ spin_unlock_irqrestore_rcu_node(sdp, flags);
21071 }
21072 }
21073
21074@@ -560,17 +574,17 @@ static void srcu_gp_end(struct srcu_struct *sp)
21075 mutex_unlock(&sp->srcu_cb_mutex);
21076
21077 /* Start a new grace period if needed. */
21078- raw_spin_lock_irq_rcu_node(sp);
21079+ spin_lock_irq_rcu_node(sp);
21080 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
21081 if (!rcu_seq_state(gpseq) &&
21082 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
21083 srcu_gp_start(sp);
21084- raw_spin_unlock_irq_rcu_node(sp);
21085+ spin_unlock_irq_rcu_node(sp);
21086 /* Throttle expedited grace periods: Should be rare! */
21087 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
21088 ? 0 : SRCU_INTERVAL);
21089 } else {
21090- raw_spin_unlock_irq_rcu_node(sp);
21091+ spin_unlock_irq_rcu_node(sp);
21092 }
21093 }
21094
21095@@ -590,18 +604,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
21096 if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
21097 ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
21098 return;
21099- raw_spin_lock_irqsave_rcu_node(snp, flags);
21100+ spin_lock_irqsave_rcu_node(snp, flags);
21101 if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
21102- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21103+ spin_unlock_irqrestore_rcu_node(snp, flags);
21104 return;
21105 }
21106 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
21107- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21108+ spin_unlock_irqrestore_rcu_node(snp, flags);
21109 }
21110- raw_spin_lock_irqsave_rcu_node(sp, flags);
21111+ spin_lock_irqsave_rcu_node(sp, flags);
21112 if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
21113 sp->srcu_gp_seq_needed_exp = s;
21114- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
21115+ spin_unlock_irqrestore_rcu_node(sp, flags);
21116 }
21117
21118 /*
21119@@ -623,12 +637,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
21120 for (; snp != NULL; snp = snp->srcu_parent) {
21121 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
21122 return; /* GP already done and CBs recorded. */
21123- raw_spin_lock_irqsave_rcu_node(snp, flags);
21124+ spin_lock_irqsave_rcu_node(snp, flags);
21125 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
21126 snp_seq = snp->srcu_have_cbs[idx];
21127 if (snp == sdp->mynode && snp_seq == s)
21128 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
21129- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21130+ spin_unlock_irqrestore_rcu_node(snp, flags);
21131 if (snp == sdp->mynode && snp_seq != s) {
21132 srcu_schedule_cbs_sdp(sdp, do_norm
21133 ? SRCU_INTERVAL
21134@@ -644,11 +658,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
21135 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
21136 if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
21137 snp->srcu_gp_seq_needed_exp = s;
21138- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21139+ spin_unlock_irqrestore_rcu_node(snp, flags);
21140 }
21141
21142 /* Top of tree, must ensure the grace period will be started. */
21143- raw_spin_lock_irqsave_rcu_node(sp, flags);
21144+ spin_lock_irqsave_rcu_node(sp, flags);
21145 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
21146 /*
21147 * Record need for grace period s. Pair with load
21148@@ -667,7 +681,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
21149 queue_delayed_work(system_power_efficient_wq, &sp->work,
21150 srcu_get_delay(sp));
21151 }
21152- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
21153+ spin_unlock_irqrestore_rcu_node(sp, flags);
21154 }
21155
21156 /*
21157@@ -736,6 +750,8 @@ static void srcu_flip(struct srcu_struct *sp)
21158 * negligible when amoritized over that time period, and the extra latency
21159 * of a needlessly non-expedited grace period is similarly negligible.
21160 */
21161+static DEFINE_LOCAL_IRQ_LOCK(sp_llock);
21162+
21163 static bool srcu_might_be_idle(struct srcu_struct *sp)
21164 {
21165 unsigned long curseq;
21166@@ -744,13 +760,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp)
21167 unsigned long t;
21168
21169 /* If the local srcu_data structure has callbacks, not idle. */
21170- local_irq_save(flags);
21171+ local_lock_irqsave(sp_llock, flags);
21172 sdp = this_cpu_ptr(sp->sda);
21173 if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
21174- local_irq_restore(flags);
21175+ local_unlock_irqrestore(sp_llock, flags);
21176 return false; /* Callbacks already present, so not idle. */
21177 }
21178- local_irq_restore(flags);
21179+ local_unlock_irqrestore(sp_llock, flags);
21180
21181 /*
21182 * No local callbacks, so probabalistically probe global state.
21183@@ -828,9 +844,9 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
21184 return;
21185 }
21186 rhp->func = func;
21187- local_irq_save(flags);
21188+ local_lock_irqsave(sp_llock, flags);
21189 sdp = this_cpu_ptr(sp->sda);
21190- raw_spin_lock_rcu_node(sdp);
21191+ spin_lock_rcu_node(sdp);
21192 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
21193 rcu_segcblist_advance(&sdp->srcu_cblist,
21194 rcu_seq_current(&sp->srcu_gp_seq));
21195@@ -844,7 +860,8 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
21196 sdp->srcu_gp_seq_needed_exp = s;
21197 needexp = true;
21198 }
21199- raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
21200+ spin_unlock_rcu_node(sdp);
21201+ local_unlock_irqrestore(sp_llock, flags);
21202 if (needgp)
21203 srcu_funnel_gp_start(sp, sdp, s, do_norm);
21204 else if (needexp)
21205@@ -900,7 +917,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
21206
21207 /*
21208 * Make sure that later code is ordered after the SRCU grace
21209- * period. This pairs with the raw_spin_lock_irq_rcu_node()
21210+ * period. This pairs with the spin_lock_irq_rcu_node()
21211 * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
21212 * because the current CPU might have been totally uninvolved with
21213 * (and thus unordered against) that grace period.
21214@@ -1024,7 +1041,7 @@ void srcu_barrier(struct srcu_struct *sp)
21215 */
21216 for_each_possible_cpu(cpu) {
21217 sdp = per_cpu_ptr(sp->sda, cpu);
21218- raw_spin_lock_irq_rcu_node(sdp);
21219+ spin_lock_irq_rcu_node(sdp);
21220 atomic_inc(&sp->srcu_barrier_cpu_cnt);
21221 sdp->srcu_barrier_head.func = srcu_barrier_cb;
21222 debug_rcu_head_queue(&sdp->srcu_barrier_head);
21223@@ -1033,7 +1050,7 @@ void srcu_barrier(struct srcu_struct *sp)
21224 debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
21225 atomic_dec(&sp->srcu_barrier_cpu_cnt);
21226 }
21227- raw_spin_unlock_irq_rcu_node(sdp);
21228+ spin_unlock_irq_rcu_node(sdp);
21229 }
21230
21231 /* Remove the initial count, at which point reaching zero can happen. */
21232@@ -1082,17 +1099,17 @@ static void srcu_advance_state(struct srcu_struct *sp)
21233 */
21234 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
21235 if (idx == SRCU_STATE_IDLE) {
21236- raw_spin_lock_irq_rcu_node(sp);
21237+ spin_lock_irq_rcu_node(sp);
21238 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
21239 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
21240- raw_spin_unlock_irq_rcu_node(sp);
21241+ spin_unlock_irq_rcu_node(sp);
21242 mutex_unlock(&sp->srcu_gp_mutex);
21243 return;
21244 }
21245 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
21246 if (idx == SRCU_STATE_IDLE)
21247 srcu_gp_start(sp);
21248- raw_spin_unlock_irq_rcu_node(sp);
21249+ spin_unlock_irq_rcu_node(sp);
21250 if (idx != SRCU_STATE_IDLE) {
21251 mutex_unlock(&sp->srcu_gp_mutex);
21252 return; /* Someone else started the grace period. */
21253@@ -1141,19 +1158,19 @@ static void srcu_invoke_callbacks(struct work_struct *work)
21254 sdp = container_of(work, struct srcu_data, work.work);
21255 sp = sdp->sp;
21256 rcu_cblist_init(&ready_cbs);
21257- raw_spin_lock_irq_rcu_node(sdp);
21258+ spin_lock_irq_rcu_node(sdp);
21259 rcu_segcblist_advance(&sdp->srcu_cblist,
21260 rcu_seq_current(&sp->srcu_gp_seq));
21261 if (sdp->srcu_cblist_invoking ||
21262 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
21263- raw_spin_unlock_irq_rcu_node(sdp);
21264+ spin_unlock_irq_rcu_node(sdp);
21265 return; /* Someone else on the job or nothing to do. */
21266 }
21267
21268 /* We are on the job! Extract and invoke ready callbacks. */
21269 sdp->srcu_cblist_invoking = true;
21270 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
21271- raw_spin_unlock_irq_rcu_node(sdp);
21272+ spin_unlock_irq_rcu_node(sdp);
21273 rhp = rcu_cblist_dequeue(&ready_cbs);
21274 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
21275 debug_rcu_head_unqueue(rhp);
21276@@ -1166,13 +1183,13 @@ static void srcu_invoke_callbacks(struct work_struct *work)
21277 * Update counts, accelerate new callbacks, and if needed,
21278 * schedule another round of callback invocation.
21279 */
21280- raw_spin_lock_irq_rcu_node(sdp);
21281+ spin_lock_irq_rcu_node(sdp);
21282 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
21283 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
21284 rcu_seq_snap(&sp->srcu_gp_seq));
21285 sdp->srcu_cblist_invoking = false;
21286 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
21287- raw_spin_unlock_irq_rcu_node(sdp);
21288+ spin_unlock_irq_rcu_node(sdp);
21289 if (more)
21290 srcu_schedule_cbs_sdp(sdp, 0);
21291 }
21292@@ -1185,7 +1202,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
21293 {
21294 bool pushgp = true;
21295
21296- raw_spin_lock_irq_rcu_node(sp);
21297+ spin_lock_irq_rcu_node(sp);
21298 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
21299 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
21300 /* All requests fulfilled, time to go idle. */
21301@@ -1195,7 +1212,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
21302 /* Outstanding request and no GP. Start one. */
21303 srcu_gp_start(sp);
21304 }
21305- raw_spin_unlock_irq_rcu_node(sp);
21306+ spin_unlock_irq_rcu_node(sp);
21307
21308 if (pushgp)
21309 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
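The sp_llock conversion above follows the usual RT local-lock pattern: a
local_irq_save() section that protects per-CPU data becomes a named per-CPU
lock, which still maps to local_irq_save() on non-RT kernels but turns into a
preemptible per-CPU sleeping lock on RT. A minimal sketch of that pattern,
using the locallock API this patch provides elsewhere; the my_stats structure
and helper are made up for illustration:

#include <linux/percpu.h>
#include <linux/locallock.h>

struct my_stats {
	unsigned long count;
};

static DEFINE_PER_CPU(struct my_stats, my_stats);
static DEFINE_LOCAL_IRQ_LOCK(my_stats_lock);

static void my_stats_inc(void)
{
	struct my_stats *s;
	unsigned long flags;

	/* non-RT: local_irq_save(); RT: per-CPU lock, section stays preemptible */
	local_lock_irqsave(my_stats_lock, flags);
	s = this_cpu_ptr(&my_stats);
	s->count++;
	local_unlock_irqrestore(my_stats_lock, flags);
}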
21310diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
21311index 3e3650e94ae6..0a722b56d90b 100644
21312--- a/kernel/rcu/tree.c
21313+++ b/kernel/rcu/tree.c
21314@@ -58,6 +58,11 @@
21315 #include <linux/trace_events.h>
21316 #include <linux/suspend.h>
21317 #include <linux/ftrace.h>
21318+#include <linux/delay.h>
21319+#include <linux/gfp.h>
21320+#include <linux/oom.h>
21321+#include <linux/smpboot.h>
21322+#include "../time/tick-internal.h"
21323
21324 #include "tree.h"
21325 #include "rcu.h"
21326@@ -243,6 +248,19 @@ void rcu_sched_qs(void)
21327 this_cpu_ptr(&rcu_sched_data), true);
21328 }
21329
21330+#ifdef CONFIG_PREEMPT_RT_FULL
21331+static void rcu_preempt_qs(void);
21332+
21333+void rcu_bh_qs(void)
21334+{
21335+ unsigned long flags;
21336+
21337+ /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
21338+ local_irq_save(flags);
21339+ rcu_preempt_qs();
21340+ local_irq_restore(flags);
21341+}
21342+#else
21343 void rcu_bh_qs(void)
21344 {
21345 RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
21346@@ -253,6 +271,7 @@ void rcu_bh_qs(void)
21347 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
21348 }
21349 }
21350+#endif
21351
21352 /*
21353 * Steal a bit from the bottom of ->dynticks for idle entry/exit
21354@@ -564,11 +583,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
21355 /*
21356 * Return the number of RCU BH batches started thus far for debug & stats.
21357 */
21358+#ifndef CONFIG_PREEMPT_RT_FULL
21359 unsigned long rcu_batches_started_bh(void)
21360 {
21361 return rcu_bh_state.gpnum;
21362 }
21363 EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
21364+#endif
21365
21366 /*
21367 * Return the number of RCU batches completed thus far for debug & stats.
21368@@ -588,6 +609,7 @@ unsigned long rcu_batches_completed_sched(void)
21369 }
21370 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
21371
21372+#ifndef CONFIG_PREEMPT_RT_FULL
21373 /*
21374 * Return the number of RCU BH batches completed thus far for debug & stats.
21375 */
21376@@ -596,6 +618,7 @@ unsigned long rcu_batches_completed_bh(void)
21377 return rcu_bh_state.completed;
21378 }
21379 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
21380+#endif
21381
21382 /*
21383 * Return the number of RCU expedited batches completed thus far for
21384@@ -619,6 +642,7 @@ unsigned long rcu_exp_batches_completed_sched(void)
21385 }
21386 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
21387
21388+#ifndef CONFIG_PREEMPT_RT_FULL
21389 /*
21390 * Force a quiescent state.
21391 */
21392@@ -637,6 +661,13 @@ void rcu_bh_force_quiescent_state(void)
21393 }
21394 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
21395
21396+#else
21397+void rcu_force_quiescent_state(void)
21398+{
21399+}
21400+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
21401+#endif
21402+
21403 /*
21404 * Force a quiescent state for RCU-sched.
21405 */
21406@@ -687,9 +718,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
21407 case RCU_FLAVOR:
21408 rsp = rcu_state_p;
21409 break;
21410+#ifndef CONFIG_PREEMPT_RT_FULL
21411 case RCU_BH_FLAVOR:
21412 rsp = &rcu_bh_state;
21413 break;
21414+#endif
21415 case RCU_SCHED_FLAVOR:
21416 rsp = &rcu_sched_state;
21417 break;
21418@@ -2918,18 +2951,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
21419 /*
21420 * Do RCU core processing for the current CPU.
21421 */
21422-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
21423+static __latent_entropy void rcu_process_callbacks(void)
21424 {
21425 struct rcu_state *rsp;
21426
21427 if (cpu_is_offline(smp_processor_id()))
21428 return;
21429- trace_rcu_utilization(TPS("Start RCU core"));
21430 for_each_rcu_flavor(rsp)
21431 __rcu_process_callbacks(rsp);
21432- trace_rcu_utilization(TPS("End RCU core"));
21433 }
21434
21435+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21436 /*
21437 * Schedule RCU callback invocation. If the specified type of RCU
21438 * does not support RCU priority boosting, just do a direct call,
21439@@ -2941,19 +2973,106 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
21440 {
21441 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
21442 return;
21443- if (likely(!rsp->boost)) {
21444- rcu_do_batch(rsp, rdp);
21445- return;
21446- }
21447- invoke_rcu_callbacks_kthread();
21448+ rcu_do_batch(rsp, rdp);
21449 }
21450
21451+static void rcu_wake_cond(struct task_struct *t, int status)
21452+{
21453+ /*
21454+ * If the thread is yielding, only wake it when this
21455+ * is invoked from idle
21456+ */
21457+ if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
21458+ wake_up_process(t);
21459+}
21460+
21461+/*
21462+ * Wake up this CPU's rcuc kthread to do RCU core processing.
21463+ */
21464 static void invoke_rcu_core(void)
21465 {
21466- if (cpu_online(smp_processor_id()))
21467- raise_softirq(RCU_SOFTIRQ);
21468+ unsigned long flags;
21469+ struct task_struct *t;
21470+
21471+ if (!cpu_online(smp_processor_id()))
21472+ return;
21473+ local_irq_save(flags);
21474+ __this_cpu_write(rcu_cpu_has_work, 1);
21475+ t = __this_cpu_read(rcu_cpu_kthread_task);
21476+ if (t != NULL && current != t)
21477+ rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
21478+ local_irq_restore(flags);
21479+}
21480+
21481+static void rcu_cpu_kthread_park(unsigned int cpu)
21482+{
21483+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21484+}
21485+
21486+static int rcu_cpu_kthread_should_run(unsigned int cpu)
21487+{
21488+ return __this_cpu_read(rcu_cpu_has_work);
21489 }
21490
21491+/*
21492+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
21493+ * RCU softirq used in flavors and configurations of RCU that do not
21494+ * support RCU priority boosting.
21495+ */
21496+static void rcu_cpu_kthread(unsigned int cpu)
21497+{
21498+ unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21499+ char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21500+ int spincnt;
21501+
21502+ for (spincnt = 0; spincnt < 10; spincnt++) {
21503+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21504+ local_bh_disable();
21505+ *statusp = RCU_KTHREAD_RUNNING;
21506+ this_cpu_inc(rcu_cpu_kthread_loops);
21507+ local_irq_disable();
21508+ work = *workp;
21509+ *workp = 0;
21510+ local_irq_enable();
21511+ if (work)
21512+ rcu_process_callbacks();
21513+ local_bh_enable();
21514+ if (*workp == 0) {
21515+ trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21516+ *statusp = RCU_KTHREAD_WAITING;
21517+ return;
21518+ }
21519+ }
21520+ *statusp = RCU_KTHREAD_YIELDING;
21521+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21522+ schedule_timeout_interruptible(2);
21523+ trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21524+ *statusp = RCU_KTHREAD_WAITING;
21525+}
21526+
21527+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21528+ .store = &rcu_cpu_kthread_task,
21529+ .thread_should_run = rcu_cpu_kthread_should_run,
21530+ .thread_fn = rcu_cpu_kthread,
21531+ .thread_comm = "rcuc/%u",
21532+ .setup = rcu_cpu_kthread_setup,
21533+ .park = rcu_cpu_kthread_park,
21534+};
21535+
21536+/*
21537+ * Spawn per-CPU RCU core processing kthreads.
21538+ */
21539+static int __init rcu_spawn_core_kthreads(void)
21540+{
21541+ int cpu;
21542+
21543+ for_each_possible_cpu(cpu)
21544+ per_cpu(rcu_cpu_has_work, cpu) = 0;
21545+ BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21546+ return 0;
21547+}
21548+early_initcall(rcu_spawn_core_kthreads);
21549+
21550 /*
21551 * Handle any core-RCU processing required by a call_rcu() invocation.
21552 */
21553@@ -3113,6 +3232,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
21554 }
21555 EXPORT_SYMBOL_GPL(call_rcu_sched);
21556
21557+#ifndef CONFIG_PREEMPT_RT_FULL
21558 /**
21559 * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
21560 * @head: structure to be used for queueing the RCU updates.
21561@@ -3140,6 +3260,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
21562 __call_rcu(head, func, &rcu_bh_state, -1, 0);
21563 }
21564 EXPORT_SYMBOL_GPL(call_rcu_bh);
21565+#endif
21566
21567 /*
21568 * Queue an RCU callback for lazy invocation after a grace period.
21569@@ -3225,6 +3346,7 @@ void synchronize_sched(void)
21570 }
21571 EXPORT_SYMBOL_GPL(synchronize_sched);
21572
21573+#ifndef CONFIG_PREEMPT_RT_FULL
21574 /**
21575 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
21576 *
21577@@ -3251,6 +3373,7 @@ void synchronize_rcu_bh(void)
21578 wait_rcu_gp(call_rcu_bh);
21579 }
21580 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
21581+#endif
21582
21583 /**
21584 * get_state_synchronize_rcu - Snapshot current RCU state
21585@@ -3601,6 +3724,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
21586 mutex_unlock(&rsp->barrier_mutex);
21587 }
21588
21589+#ifndef CONFIG_PREEMPT_RT_FULL
21590 /**
21591 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
21592 */
21593@@ -3609,6 +3733,7 @@ void rcu_barrier_bh(void)
21594 _rcu_barrier(&rcu_bh_state);
21595 }
21596 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
21597+#endif
21598
21599 /**
21600 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
21601@@ -3741,8 +3866,6 @@ int rcutree_online_cpu(unsigned int cpu)
21602 {
21603 sync_sched_exp_online_cleanup(cpu);
21604 rcutree_affinity_setting(cpu, -1);
21605- if (IS_ENABLED(CONFIG_TREE_SRCU))
21606- srcu_online_cpu(cpu);
21607 return 0;
21608 }
21609
21610@@ -3753,8 +3876,6 @@ int rcutree_online_cpu(unsigned int cpu)
21611 int rcutree_offline_cpu(unsigned int cpu)
21612 {
21613 rcutree_affinity_setting(cpu, cpu);
21614- if (IS_ENABLED(CONFIG_TREE_SRCU))
21615- srcu_offline_cpu(cpu);
21616 return 0;
21617 }
21618
21619@@ -4184,12 +4305,13 @@ void __init rcu_init(void)
21620
21621 rcu_bootup_announce();
21622 rcu_init_geometry();
21623+#ifndef CONFIG_PREEMPT_RT_FULL
21624 rcu_init_one(&rcu_bh_state);
21625+#endif
21626 rcu_init_one(&rcu_sched_state);
21627 if (dump_tree)
21628 rcu_dump_rcu_node_tree(&rcu_sched_state);
21629 __rcu_init_preempt();
21630- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
21631
21632 /*
21633 * We don't need protection against CPU-hotplug here because
21634@@ -4200,8 +4322,6 @@ void __init rcu_init(void)
21635 for_each_online_cpu(cpu) {
21636 rcutree_prepare_cpu(cpu);
21637 rcu_cpu_starting(cpu);
21638- if (IS_ENABLED(CONFIG_TREE_SRCU))
21639- srcu_online_cpu(cpu);
21640 }
21641 }
21642
21643diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
21644index 8e1f285f0a70..7acc23da94e2 100644
21645--- a/kernel/rcu/tree.h
21646+++ b/kernel/rcu/tree.h
21647@@ -427,7 +427,9 @@ extern struct list_head rcu_struct_flavors;
21648 */
21649 extern struct rcu_state rcu_sched_state;
21650
21651+#ifndef CONFIG_PREEMPT_RT_FULL
21652 extern struct rcu_state rcu_bh_state;
21653+#endif
21654
21655 #ifdef CONFIG_PREEMPT_RCU
21656 extern struct rcu_state rcu_preempt_state;
21657@@ -436,12 +438,10 @@ extern struct rcu_state rcu_preempt_state;
21658 int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
21659 bool rcu_eqs_special_set(int cpu);
21660
21661-#ifdef CONFIG_RCU_BOOST
21662 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21663 DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
21664 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21665 DECLARE_PER_CPU(char, rcu_cpu_has_work);
21666-#endif /* #ifdef CONFIG_RCU_BOOST */
21667
21668 #ifndef RCU_TREE_NONCORE
21669
21670@@ -461,10 +461,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
21671 static void __init __rcu_init_preempt(void);
21672 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
21673 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
21674-static void invoke_rcu_callbacks_kthread(void);
21675 static bool rcu_is_callbacks_kthread(void);
21676+static void rcu_cpu_kthread_setup(unsigned int cpu);
21677 #ifdef CONFIG_RCU_BOOST
21678-static void rcu_preempt_do_callbacks(void);
21679 static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21680 struct rcu_node *rnp);
21681 #endif /* #ifdef CONFIG_RCU_BOOST */
21682diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
21683index 8b3102d22823..17ee8d1f38c4 100644
21684--- a/kernel/rcu/tree_plugin.h
21685+++ b/kernel/rcu/tree_plugin.h
21686@@ -24,39 +24,16 @@
21687 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21688 */
21689
21690-#include <linux/delay.h>
21691-#include <linux/gfp.h>
21692-#include <linux/oom.h>
21693-#include <linux/sched/debug.h>
21694-#include <linux/smpboot.h>
21695-#include <uapi/linux/sched/types.h>
21696-#include "../time/tick-internal.h"
21697-
21698-#ifdef CONFIG_RCU_BOOST
21699-
21700 #include "../locking/rtmutex_common.h"
21701
21702 /*
21703 * Control variables for per-CPU and per-rcu_node kthreads. These
21704 * handle all flavors of RCU.
21705 */
21706-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21707 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21708 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21709 DEFINE_PER_CPU(char, rcu_cpu_has_work);
21710
21711-#else /* #ifdef CONFIG_RCU_BOOST */
21712-
21713-/*
21714- * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
21715- * all uses are in dead code. Provide a definition to keep the compiler
21716- * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
21717- * This probably needs to be excluded from -rt builds.
21718- */
21719-#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
21720-
21721-#endif /* #else #ifdef CONFIG_RCU_BOOST */
21722-
21723 #ifdef CONFIG_RCU_NOCB_CPU
21724 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
21725 static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
21726@@ -324,9 +301,13 @@ static void rcu_preempt_note_context_switch(bool preempt)
21727 struct task_struct *t = current;
21728 struct rcu_data *rdp;
21729 struct rcu_node *rnp;
21730+ int sleeping_l = 0;
21731
21732 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
21733- WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
21734+#if defined(CONFIG_PREEMPT_RT_FULL)
21735+ sleeping_l = t->sleeping_lock;
21736+#endif
21737+ WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l);
21738 if (t->rcu_read_lock_nesting > 0 &&
21739 !t->rcu_read_unlock_special.b.blocked) {
21740
21741@@ -463,7 +444,7 @@ void rcu_read_unlock_special(struct task_struct *t)
21742 }
21743
21744 /* Hardware IRQ handlers cannot block, complain if they get here. */
21745- if (in_irq() || in_serving_softirq()) {
21746+ if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
21747 lockdep_rcu_suspicious(__FILE__, __LINE__,
21748 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
21749 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
21750@@ -530,7 +511,7 @@ void rcu_read_unlock_special(struct task_struct *t)
21751
21752 /* Unboost if we were boosted. */
21753 if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
21754- rt_mutex_unlock(&rnp->boost_mtx);
21755+ rt_mutex_futex_unlock(&rnp->boost_mtx);
21756
21757 /*
21758 * If this was the last task on the expedited lists,
21759@@ -684,15 +665,6 @@ static void rcu_preempt_check_callbacks(void)
21760 t->rcu_read_unlock_special.b.need_qs = true;
21761 }
21762
21763-#ifdef CONFIG_RCU_BOOST
21764-
21765-static void rcu_preempt_do_callbacks(void)
21766-{
21767- rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
21768-}
21769-
21770-#endif /* #ifdef CONFIG_RCU_BOOST */
21771-
21772 /**
21773 * call_rcu() - Queue an RCU callback for invocation after a grace period.
21774 * @head: structure to be used for queueing the RCU updates.
21775@@ -915,20 +887,23 @@ void exit_rcu(void)
21776
21777 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
21778
21779+/*
21780+ * If boosting, set rcuc kthreads to realtime priority.
21781+ */
21782+static void rcu_cpu_kthread_setup(unsigned int cpu)
21783+{
21784+#ifdef CONFIG_RCU_BOOST
21785+ struct sched_param sp;
21786+
21787+ sp.sched_priority = kthread_prio;
21788+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21789+#endif /* #ifdef CONFIG_RCU_BOOST */
21790+}
21791+
21792 #ifdef CONFIG_RCU_BOOST
21793
21794 #include "../locking/rtmutex_common.h"
21795
21796-static void rcu_wake_cond(struct task_struct *t, int status)
21797-{
21798- /*
21799- * If the thread is yielding, only wake it when this
21800- * is invoked from idle
21801- */
21802- if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
21803- wake_up_process(t);
21804-}
21805-
21806 /*
21807 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
21808 * or ->boost_tasks, advancing the pointer to the next task in the
21809@@ -1070,23 +1045,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
21810 }
21811 }
21812
21813-/*
21814- * Wake up the per-CPU kthread to invoke RCU callbacks.
21815- */
21816-static void invoke_rcu_callbacks_kthread(void)
21817-{
21818- unsigned long flags;
21819-
21820- local_irq_save(flags);
21821- __this_cpu_write(rcu_cpu_has_work, 1);
21822- if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
21823- current != __this_cpu_read(rcu_cpu_kthread_task)) {
21824- rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
21825- __this_cpu_read(rcu_cpu_kthread_status));
21826- }
21827- local_irq_restore(flags);
21828-}
21829-
21830 /*
21831 * Is the current CPU running the RCU-callbacks kthread?
21832 * Caller must have preemption disabled.
21833@@ -1141,67 +1099,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21834 return 0;
21835 }
21836
21837-static void rcu_kthread_do_work(void)
21838-{
21839- rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
21840- rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
21841- rcu_preempt_do_callbacks();
21842-}
21843-
21844-static void rcu_cpu_kthread_setup(unsigned int cpu)
21845-{
21846- struct sched_param sp;
21847-
21848- sp.sched_priority = kthread_prio;
21849- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21850-}
21851-
21852-static void rcu_cpu_kthread_park(unsigned int cpu)
21853-{
21854- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21855-}
21856-
21857-static int rcu_cpu_kthread_should_run(unsigned int cpu)
21858-{
21859- return __this_cpu_read(rcu_cpu_has_work);
21860-}
21861-
21862-/*
21863- * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
21864- * RCU softirq used in flavors and configurations of RCU that do not
21865- * support RCU priority boosting.
21866- */
21867-static void rcu_cpu_kthread(unsigned int cpu)
21868-{
21869- unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21870- char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21871- int spincnt;
21872-
21873- for (spincnt = 0; spincnt < 10; spincnt++) {
21874- trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21875- local_bh_disable();
21876- *statusp = RCU_KTHREAD_RUNNING;
21877- this_cpu_inc(rcu_cpu_kthread_loops);
21878- local_irq_disable();
21879- work = *workp;
21880- *workp = 0;
21881- local_irq_enable();
21882- if (work)
21883- rcu_kthread_do_work();
21884- local_bh_enable();
21885- if (*workp == 0) {
21886- trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21887- *statusp = RCU_KTHREAD_WAITING;
21888- return;
21889- }
21890- }
21891- *statusp = RCU_KTHREAD_YIELDING;
21892- trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21893- schedule_timeout_interruptible(2);
21894- trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21895- *statusp = RCU_KTHREAD_WAITING;
21896-}
21897-
21898 /*
21899 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
21900 * served by the rcu_node in question. The CPU hotplug lock is still
21901@@ -1232,26 +1129,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
21902 free_cpumask_var(cm);
21903 }
21904
21905-static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21906- .store = &rcu_cpu_kthread_task,
21907- .thread_should_run = rcu_cpu_kthread_should_run,
21908- .thread_fn = rcu_cpu_kthread,
21909- .thread_comm = "rcuc/%u",
21910- .setup = rcu_cpu_kthread_setup,
21911- .park = rcu_cpu_kthread_park,
21912-};
21913-
21914 /*
21915 * Spawn boost kthreads -- called as soon as the scheduler is running.
21916 */
21917 static void __init rcu_spawn_boost_kthreads(void)
21918 {
21919 struct rcu_node *rnp;
21920- int cpu;
21921-
21922- for_each_possible_cpu(cpu)
21923- per_cpu(rcu_cpu_has_work, cpu) = 0;
21924- BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21925 rcu_for_each_leaf_node(rcu_state_p, rnp)
21926 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
21927 }
21928@@ -1274,11 +1157,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
21929 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
21930 }
21931
21932-static void invoke_rcu_callbacks_kthread(void)
21933-{
21934- WARN_ON_ONCE(1);
21935-}
21936-
21937 static bool rcu_is_callbacks_kthread(void)
21938 {
21939 return false;
21940@@ -1302,7 +1180,7 @@ static void rcu_prepare_kthreads(int cpu)
21941
21942 #endif /* #else #ifdef CONFIG_RCU_BOOST */
21943
21944-#if !defined(CONFIG_RCU_FAST_NO_HZ)
21945+#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
21946
21947 /*
21948 * Check to see if any future RCU-related work will need to be done
21949@@ -1318,7 +1196,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
21950 *nextevt = KTIME_MAX;
21951 return rcu_cpu_has_callbacks(NULL);
21952 }
21953+#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
21954
21955+#if !defined(CONFIG_RCU_FAST_NO_HZ)
21956 /*
21957 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
21958 * after it.
21959@@ -1414,6 +1294,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
21960 return cbs_ready;
21961 }
21962
21963+#ifndef CONFIG_PREEMPT_RT_FULL
21964+
21965 /*
21966 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
21967 * to invoke. If the CPU has callbacks, try to advance them. Tell the
21968@@ -1456,6 +1338,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
21969 *nextevt = basemono + dj * TICK_NSEC;
21970 return 0;
21971 }
21972+#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
21973
21974 /*
21975 * Prepare a CPU for idle from an RCU perspective. The first major task
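In the tree_plugin.h hunk, rcu_cpu_kthread_setup() raises the rcuc kthread to SCHED_FIFO at kthread_prio when CONFIG_RCU_BOOST is enabled. The user-space fragment below shows the same priority-elevation call pattern; make_fifo() and the priority value 1 are assumptions for the sketch (the kernel side uses its kthread_prio value), and the call needs CAP_SYS_NICE to succeed.

#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

static int make_fifo(int prio)
{
	struct sched_param sp;

	memset(&sp, 0, sizeof(sp));
	sp.sched_priority = prio;	/* mirrors sp.sched_priority = kthread_prio */
	if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0) {
		fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		return -1;
	}
	return 0;
}

int main(void)
{
	if (make_fifo(1) == 0)
		printf("now SCHED_FIFO, policy=%d\n", sched_getscheduler(0));
	return 0;
}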
21976diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
21977index 7a577bd989a4..2006a09680aa 100644
21978--- a/kernel/rcu/update.c
21979+++ b/kernel/rcu/update.c
21980@@ -66,7 +66,7 @@ extern int rcu_expedited; /* from sysctl */
21981 module_param(rcu_expedited, int, 0);
21982 extern int rcu_normal; /* from sysctl */
21983 module_param(rcu_normal, int, 0);
21984-static int rcu_normal_after_boot;
21985+static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
21986 module_param(rcu_normal_after_boot, int, 0);
21987 #endif /* #ifndef CONFIG_TINY_RCU */
21988
21989@@ -333,6 +333,7 @@ int rcu_read_lock_held(void)
21990 }
21991 EXPORT_SYMBOL_GPL(rcu_read_lock_held);
21992
21993+#ifndef CONFIG_PREEMPT_RT_FULL
21994 /**
21995 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
21996 *
21997@@ -359,6 +360,7 @@ int rcu_read_lock_bh_held(void)
21998 return in_softirq() || irqs_disabled();
21999 }
22000 EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
22001+#endif
22002
22003 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
22004
22005diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
22006index a9ee16bbc693..9943019095e9 100644
22007--- a/kernel/sched/Makefile
22008+++ b/kernel/sched/Makefile
22009@@ -18,7 +18,7 @@ endif
22010
22011 obj-y += core.o loadavg.o clock.o cputime.o
22012 obj-y += idle_task.o fair.o rt.o deadline.o
22013-obj-y += wait.o wait_bit.o swait.o completion.o idle.o
22014+obj-y += wait.o wait_bit.o swait.o swork.o completion.o idle.o
22015 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
22016 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
22017 obj-$(CONFIG_SCHEDSTATS) += stats.o
22018diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
22019index 2ddaec40956f..0fe2982e46a0 100644
22020--- a/kernel/sched/completion.c
22021+++ b/kernel/sched/completion.c
22022@@ -32,7 +32,7 @@ void complete(struct completion *x)
22023 {
22024 unsigned long flags;
22025
22026- spin_lock_irqsave(&x->wait.lock, flags);
22027+ raw_spin_lock_irqsave(&x->wait.lock, flags);
22028
22029 /*
22030 * Perform commit of crossrelease here.
22031@@ -41,8 +41,8 @@ void complete(struct completion *x)
22032
22033 if (x->done != UINT_MAX)
22034 x->done++;
22035- __wake_up_locked(&x->wait, TASK_NORMAL, 1);
22036- spin_unlock_irqrestore(&x->wait.lock, flags);
22037+ swake_up_locked(&x->wait);
22038+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22039 }
22040 EXPORT_SYMBOL(complete);
22041
22042@@ -66,10 +66,10 @@ void complete_all(struct completion *x)
22043 {
22044 unsigned long flags;
22045
22046- spin_lock_irqsave(&x->wait.lock, flags);
22047+ raw_spin_lock_irqsave(&x->wait.lock, flags);
22048 x->done = UINT_MAX;
22049- __wake_up_locked(&x->wait, TASK_NORMAL, 0);
22050- spin_unlock_irqrestore(&x->wait.lock, flags);
22051+ swake_up_all_locked(&x->wait);
22052+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22053 }
22054 EXPORT_SYMBOL(complete_all);
22055
22056@@ -78,20 +78,20 @@ do_wait_for_common(struct completion *x,
22057 long (*action)(long), long timeout, int state)
22058 {
22059 if (!x->done) {
22060- DECLARE_WAITQUEUE(wait, current);
22061+ DECLARE_SWAITQUEUE(wait);
22062
22063- __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
22064+ __prepare_to_swait(&x->wait, &wait);
22065 do {
22066 if (signal_pending_state(state, current)) {
22067 timeout = -ERESTARTSYS;
22068 break;
22069 }
22070 __set_current_state(state);
22071- spin_unlock_irq(&x->wait.lock);
22072+ raw_spin_unlock_irq(&x->wait.lock);
22073 timeout = action(timeout);
22074- spin_lock_irq(&x->wait.lock);
22075+ raw_spin_lock_irq(&x->wait.lock);
22076 } while (!x->done && timeout);
22077- __remove_wait_queue(&x->wait, &wait);
22078+ __finish_swait(&x->wait, &wait);
22079 if (!x->done)
22080 return timeout;
22081 }
22082@@ -108,9 +108,9 @@ __wait_for_common(struct completion *x,
22083
22084 complete_acquire(x);
22085
22086- spin_lock_irq(&x->wait.lock);
22087+ raw_spin_lock_irq(&x->wait.lock);
22088 timeout = do_wait_for_common(x, action, timeout, state);
22089- spin_unlock_irq(&x->wait.lock);
22090+ raw_spin_unlock_irq(&x->wait.lock);
22091
22092 complete_release(x);
22093
22094@@ -299,12 +299,12 @@ bool try_wait_for_completion(struct completion *x)
22095 if (!READ_ONCE(x->done))
22096 return 0;
22097
22098- spin_lock_irqsave(&x->wait.lock, flags);
22099+ raw_spin_lock_irqsave(&x->wait.lock, flags);
22100 if (!x->done)
22101 ret = 0;
22102 else if (x->done != UINT_MAX)
22103 x->done--;
22104- spin_unlock_irqrestore(&x->wait.lock, flags);
22105+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22106 return ret;
22107 }
22108 EXPORT_SYMBOL(try_wait_for_completion);
22109@@ -330,8 +330,8 @@ bool completion_done(struct completion *x)
22110 * otherwise we can end up freeing the completion before complete()
22111 * is done referencing it.
22112 */
22113- spin_lock_irqsave(&x->wait.lock, flags);
22114- spin_unlock_irqrestore(&x->wait.lock, flags);
22115+ raw_spin_lock_irqsave(&x->wait.lock, flags);
22116+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22117 return true;
22118 }
22119 EXPORT_SYMBOL(completion_done);
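The completion.c hunk converts completions from a regular waitqueue under a sleeping spinlock to a simple waitqueue under a raw spinlock, which keeps complete() usable from atomic context on PREEMPT_RT. A rough pthreads analogy of just the done-counter protocol follows; struct completion_model and its helpers are invented names, and pthreads is only a stand-in for the kernel's swait primitives. Build with -lpthread.

#include <limits.h>
#include <pthread.h>
#include <stdio.h>

struct completion_model {
	pthread_mutex_t lock;	/* plays the role of x->wait.lock */
	pthread_cond_t wait;	/* plays the role of the swait queue */
	unsigned int done;
};

static void complete_model(struct completion_model *x)
{
	pthread_mutex_lock(&x->lock);
	if (x->done != UINT_MAX)	/* UINT_MAX marks "complete_all" */
		x->done++;
	pthread_cond_signal(&x->wait);	/* wake one waiter, like swake_up_locked() */
	pthread_mutex_unlock(&x->lock);
}

static void wait_for_completion_model(struct completion_model *x)
{
	pthread_mutex_lock(&x->lock);
	while (!x->done)
		pthread_cond_wait(&x->wait, &x->lock);
	if (x->done != UINT_MAX)
		x->done--;	/* consume one completion */
	pthread_mutex_unlock(&x->lock);
}

static struct completion_model done_sig = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
};

static void *worker(void *arg)
{
	(void)arg;
	complete_model(&done_sig);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	wait_for_completion_model(&done_sig);
	printf("completion observed\n");
	pthread_join(t, NULL);
	return 0;
}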
22120diff --git a/kernel/sched/core.c b/kernel/sched/core.c
22121index 4e89ed8a0fb2..6e6bd5262f23 100644
22122--- a/kernel/sched/core.c
22123+++ b/kernel/sched/core.c
22124@@ -59,7 +59,11 @@ const_debug unsigned int sysctl_sched_features =
22125 * Number of tasks to iterate in a single balance run.
22126 * Limited because this is done with IRQs disabled.
22127 */
22128+#ifndef CONFIG_PREEMPT_RT_FULL
22129 const_debug unsigned int sysctl_sched_nr_migrate = 32;
22130+#else
22131+const_debug unsigned int sysctl_sched_nr_migrate = 8;
22132+#endif
22133
22134 /*
22135 * period over which we average the RT time consumption, measured
22136@@ -341,7 +345,7 @@ static void init_rq_hrtick(struct rq *rq)
22137 rq->hrtick_csd.info = rq;
22138 #endif
22139
22140- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22141+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
22142 rq->hrtick_timer.function = hrtick;
22143 }
22144 #else /* CONFIG_SCHED_HRTICK */
22145@@ -423,9 +427,15 @@ static bool set_nr_if_polling(struct task_struct *p)
22146 #endif
22147 #endif
22148
22149-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
22150+void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
22151+ bool sleeper)
22152 {
22153- struct wake_q_node *node = &task->wake_q;
22154+ struct wake_q_node *node;
22155+
22156+ if (sleeper)
22157+ node = &task->wake_q_sleeper;
22158+ else
22159+ node = &task->wake_q;
22160
22161 /*
22162 * Atomically grab the task, if ->wake_q is !nil already it means
22163@@ -447,24 +457,32 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
22164 head->lastp = &node->next;
22165 }
22166
22167-void wake_up_q(struct wake_q_head *head)
22168+void __wake_up_q(struct wake_q_head *head, bool sleeper)
22169 {
22170 struct wake_q_node *node = head->first;
22171
22172 while (node != WAKE_Q_TAIL) {
22173 struct task_struct *task;
22174
22175- task = container_of(node, struct task_struct, wake_q);
22176+ if (sleeper)
22177+ task = container_of(node, struct task_struct, wake_q_sleeper);
22178+ else
22179+ task = container_of(node, struct task_struct, wake_q);
22180 BUG_ON(!task);
22181 /* Task can safely be re-inserted now: */
22182 node = node->next;
22183- task->wake_q.next = NULL;
22184-
22185+ if (sleeper)
22186+ task->wake_q_sleeper.next = NULL;
22187+ else
22188+ task->wake_q.next = NULL;
22189 /*
22190 * wake_up_process() implies a wmb() to pair with the queueing
22191 * in wake_q_add() so as not to miss wakeups.
22192 */
22193- wake_up_process(task);
22194+ if (sleeper)
22195+ wake_up_lock_sleeper(task);
22196+ else
22197+ wake_up_process(task);
22198 put_task_struct(task);
22199 }
22200 }
22201@@ -500,6 +518,48 @@ void resched_curr(struct rq *rq)
22202 trace_sched_wake_idle_without_ipi(cpu);
22203 }
22204
22205+#ifdef CONFIG_PREEMPT_LAZY
22206+
22207+static int tsk_is_polling(struct task_struct *p)
22208+{
22209+#ifdef TIF_POLLING_NRFLAG
22210+ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
22211+#else
22212+ return 0;
22213+#endif
22214+}
22215+
22216+void resched_curr_lazy(struct rq *rq)
22217+{
22218+ struct task_struct *curr = rq->curr;
22219+ int cpu;
22220+
22221+ if (!sched_feat(PREEMPT_LAZY)) {
22222+ resched_curr(rq);
22223+ return;
22224+ }
22225+
22226+ lockdep_assert_held(&rq->lock);
22227+
22228+ if (test_tsk_need_resched(curr))
22229+ return;
22230+
22231+ if (test_tsk_need_resched_lazy(curr))
22232+ return;
22233+
22234+ set_tsk_need_resched_lazy(curr);
22235+
22236+ cpu = cpu_of(rq);
22237+ if (cpu == smp_processor_id())
22238+ return;
22239+
22240+ /* NEED_RESCHED_LAZY must be visible before we test polling */
22241+ smp_mb();
22242+ if (!tsk_is_polling(curr))
22243+ smp_send_reschedule(cpu);
22244+}
22245+#endif
22246+
22247 void resched_cpu(int cpu)
22248 {
22249 struct rq *rq = cpu_rq(cpu);
22250@@ -523,11 +583,14 @@ void resched_cpu(int cpu)
22251 */
22252 int get_nohz_timer_target(void)
22253 {
22254- int i, cpu = smp_processor_id();
22255+ int i, cpu;
22256 struct sched_domain *sd;
22257
22258+ preempt_disable_rt();
22259+ cpu = smp_processor_id();
22260+
22261 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
22262- return cpu;
22263+ goto preempt_en_rt;
22264
22265 rcu_read_lock();
22266 for_each_domain(cpu, sd) {
22267@@ -546,6 +609,8 @@ int get_nohz_timer_target(void)
22268 cpu = housekeeping_any_cpu();
22269 unlock:
22270 rcu_read_unlock();
22271+preempt_en_rt:
22272+ preempt_enable_rt();
22273 return cpu;
22274 }
22275
22276@@ -912,10 +977,10 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
22277 */
22278 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
22279 {
22280- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
22281+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
22282 return false;
22283
22284- if (is_per_cpu_kthread(p))
22285+ if (is_per_cpu_kthread(p) || __migrate_disabled(p))
22286 return cpu_online(cpu);
22287
22288 return cpu_active(cpu);
22289@@ -1007,7 +1072,7 @@ static int migration_cpu_stop(void *data)
22290 local_irq_disable();
22291 /*
22292 * We need to explicitly wake pending tasks before running
22293- * __migrate_task() such that we will not miss enforcing cpus_allowed
22294+ * __migrate_task() such that we will not miss enforcing cpus_ptr
22295 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
22296 */
22297 sched_ttwu_pending();
22298@@ -1038,11 +1103,19 @@ static int migration_cpu_stop(void *data)
22299 */
22300 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
22301 {
22302- cpumask_copy(&p->cpus_allowed, new_mask);
22303+ cpumask_copy(&p->cpus_mask, new_mask);
22304 p->nr_cpus_allowed = cpumask_weight(new_mask);
22305 }
22306
22307-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22308+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
22309+int __migrate_disabled(struct task_struct *p)
22310+{
22311+ return p->migrate_disable;
22312+}
22313+#endif
22314+
22315+static void __do_set_cpus_allowed_tail(struct task_struct *p,
22316+ const struct cpumask *new_mask)
22317 {
22318 struct rq *rq = task_rq(p);
22319 bool queued, running;
22320@@ -1071,6 +1144,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22321 set_curr_task(rq, p);
22322 }
22323
22324+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22325+{
22326+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
22327+ if (__migrate_disabled(p)) {
22328+ lockdep_assert_held(&p->pi_lock);
22329+
22330+ cpumask_copy(&p->cpus_mask, new_mask);
22331+ p->migrate_disable_update = 1;
22332+ return;
22333+ }
22334+#endif
22335+ __do_set_cpus_allowed_tail(p, new_mask);
22336+}
22337+
22338 /*
22339 * Change a given task's CPU affinity. Migrate the thread to a
22340 * proper CPU and schedule it away if the CPU it's executing on
22341@@ -1108,7 +1195,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
22342 goto out;
22343 }
22344
22345- if (cpumask_equal(&p->cpus_allowed, new_mask))
22346+ if (cpumask_equal(p->cpus_ptr, new_mask))
22347 goto out;
22348
22349 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
22350@@ -1129,9 +1216,16 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
22351 }
22352
22353 /* Can the task run on the task's current CPU? If so, we're done */
22354- if (cpumask_test_cpu(task_cpu(p), new_mask))
22355+ if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
22356 goto out;
22357
22358+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
22359+ if (__migrate_disabled(p)) {
22360+ p->migrate_disable_update = 1;
22361+ goto out;
22362+ }
22363+#endif
22364+
22365 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
22366 if (task_running(rq, p) || p->state == TASK_WAKING) {
22367 struct migration_arg arg = { p, dest_cpu };
22368@@ -1269,10 +1363,10 @@ static int migrate_swap_stop(void *data)
22369 if (task_cpu(arg->src_task) != arg->src_cpu)
22370 goto unlock;
22371
22372- if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
22373+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
22374 goto unlock;
22375
22376- if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
22377+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
22378 goto unlock;
22379
22380 __migrate_swap_task(arg->src_task, arg->dst_cpu);
22381@@ -1313,10 +1407,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
22382 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
22383 goto out;
22384
22385- if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
22386+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
22387 goto out;
22388
22389- if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
22390+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
22391 goto out;
22392
22393 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
22394@@ -1326,6 +1420,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
22395 return ret;
22396 }
22397
22398+static bool check_task_state(struct task_struct *p, long match_state)
22399+{
22400+ bool match = false;
22401+
22402+ raw_spin_lock_irq(&p->pi_lock);
22403+ if (p->state == match_state || p->saved_state == match_state)
22404+ match = true;
22405+ raw_spin_unlock_irq(&p->pi_lock);
22406+
22407+ return match;
22408+}
22409+
22410 /*
22411 * wait_task_inactive - wait for a thread to unschedule.
22412 *
22413@@ -1370,7 +1476,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22414 * is actually now running somewhere else!
22415 */
22416 while (task_running(rq, p)) {
22417- if (match_state && unlikely(p->state != match_state))
22418+ if (match_state && !check_task_state(p, match_state))
22419 return 0;
22420 cpu_relax();
22421 }
22422@@ -1385,7 +1491,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22423 running = task_running(rq, p);
22424 queued = task_on_rq_queued(p);
22425 ncsw = 0;
22426- if (!match_state || p->state == match_state)
22427+ if (!match_state || p->state == match_state ||
22428+ p->saved_state == match_state)
22429 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
22430 task_rq_unlock(rq, p, &rf);
22431
22432@@ -1460,7 +1567,7 @@ void kick_process(struct task_struct *p)
22433 EXPORT_SYMBOL_GPL(kick_process);
22434
22435 /*
22436- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
22437+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
22438 *
22439 * A few notes on cpu_active vs cpu_online:
22440 *
22441@@ -1500,14 +1607,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
22442 for_each_cpu(dest_cpu, nodemask) {
22443 if (!cpu_active(dest_cpu))
22444 continue;
22445- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
22446+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
22447 return dest_cpu;
22448 }
22449 }
22450
22451 for (;;) {
22452 /* Any allowed, online CPU? */
22453- for_each_cpu(dest_cpu, &p->cpus_allowed) {
22454+ for_each_cpu(dest_cpu, p->cpus_ptr) {
22455 if (!is_cpu_allowed(p, dest_cpu))
22456 continue;
22457
22458@@ -1551,7 +1658,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
22459 }
22460
22461 /*
22462- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
22463+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
22464 */
22465 static inline
22466 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22467@@ -1561,11 +1668,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22468 if (p->nr_cpus_allowed > 1)
22469 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22470 else
22471- cpu = cpumask_any(&p->cpus_allowed);
22472+ cpu = cpumask_any(p->cpus_ptr);
22473
22474 /*
22475 * In order not to call set_task_cpu() on a blocking task we need
22476- * to rely on ttwu() to place the task on a valid ->cpus_allowed
22477+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
22478 * CPU.
22479 *
22480 * Since this is common to all placement strategies, this lives here.
22481@@ -1668,10 +1775,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
22482 {
22483 activate_task(rq, p, en_flags);
22484 p->on_rq = TASK_ON_RQ_QUEUED;
22485-
22486- /* If a worker is waking up, notify the workqueue: */
22487- if (p->flags & PF_WQ_WORKER)
22488- wq_worker_waking_up(p, cpu_of(rq));
22489 }
22490
22491 /*
22492@@ -1995,8 +2098,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
22493 */
22494 raw_spin_lock_irqsave(&p->pi_lock, flags);
22495 smp_mb__after_spinlock();
22496- if (!(p->state & state))
22497+ if (!(p->state & state)) {
22498+ /*
22499+ * The task might be running due to a spinlock sleeper
22500+ * wakeup. Check the saved state and set it to running
22501+ * if the wakeup condition is true.
22502+ */
22503+ if (!(wake_flags & WF_LOCK_SLEEPER)) {
22504+ if (p->saved_state & state) {
22505+ p->saved_state = TASK_RUNNING;
22506+ success = 1;
22507+ }
22508+ }
22509 goto out;
22510+ }
22511+
22512+ /*
22513+ * If this is a regular wakeup, then we can unconditionally
22514+ * clear the saved state of a "lock sleeper".
22515+ */
22516+ if (!(wake_flags & WF_LOCK_SLEEPER))
22517+ p->saved_state = TASK_RUNNING;
22518
22519 trace_sched_waking(p);
22520
22521@@ -2092,56 +2214,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
22522 return success;
22523 }
22524
22525-/**
22526- * try_to_wake_up_local - try to wake up a local task with rq lock held
22527- * @p: the thread to be awakened
22528- * @rf: request-queue flags for pinning
22529- *
22530- * Put @p on the run-queue if it's not already there. The caller must
22531- * ensure that this_rq() is locked, @p is bound to this_rq() and not
22532- * the current task.
22533- */
22534-static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
22535-{
22536- struct rq *rq = task_rq(p);
22537-
22538- if (WARN_ON_ONCE(rq != this_rq()) ||
22539- WARN_ON_ONCE(p == current))
22540- return;
22541-
22542- lockdep_assert_held(&rq->lock);
22543-
22544- if (!raw_spin_trylock(&p->pi_lock)) {
22545- /*
22546- * This is OK, because current is on_cpu, which avoids it being
22547- * picked for load-balance and preemption/IRQs are still
22548- * disabled avoiding further scheduler activity on it and we've
22549- * not yet picked a replacement task.
22550- */
22551- rq_unlock(rq, rf);
22552- raw_spin_lock(&p->pi_lock);
22553- rq_relock(rq, rf);
22554- }
22555-
22556- if (!(p->state & TASK_NORMAL))
22557- goto out;
22558-
22559- trace_sched_waking(p);
22560-
22561- if (!task_on_rq_queued(p)) {
22562- if (p->in_iowait) {
22563- delayacct_blkio_end(p);
22564- atomic_dec(&rq->nr_iowait);
22565- }
22566- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
22567- }
22568-
22569- ttwu_do_wakeup(rq, p, 0, rf);
22570- ttwu_stat(p, smp_processor_id(), 0);
22571-out:
22572- raw_spin_unlock(&p->pi_lock);
22573-}
22574-
22575 /**
22576 * wake_up_process - Wake up a specific process
22577 * @p: The process to be woken up.
22578@@ -2160,6 +2232,18 @@ int wake_up_process(struct task_struct *p)
22579 }
22580 EXPORT_SYMBOL(wake_up_process);
22581
22582+/**
22583+ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
22584+ * @p: The process to be woken up.
22585+ *
22586+ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
22587+ * the nature of the wakeup.
22588+ */
22589+int wake_up_lock_sleeper(struct task_struct *p)
22590+{
22591+ return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
22592+}
22593+
22594 int wake_up_state(struct task_struct *p, unsigned int state)
22595 {
22596 return try_to_wake_up(p, state, 0);
22597@@ -2420,6 +2504,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
22598 p->on_cpu = 0;
22599 #endif
22600 init_task_preempt_count(p);
22601+#ifdef CONFIG_HAVE_PREEMPT_LAZY
22602+ task_thread_info(p)->preempt_lazy_count = 0;
22603+#endif
22604 #ifdef CONFIG_SMP
22605 plist_node_init(&p->pushable_tasks, MAX_PRIO);
22606 RB_CLEAR_NODE(&p->pushable_dl_tasks);
22607@@ -2462,7 +2549,7 @@ void wake_up_new_task(struct task_struct *p)
22608 #ifdef CONFIG_SMP
22609 /*
22610 * Fork balancing, do it here and not earlier because:
22611- * - cpus_allowed can change in the fork path
22612+ * - cpus_ptr can change in the fork path
22613 * - any previously selected CPU might disappear through hotplug
22614 *
22615 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
22616@@ -2675,21 +2762,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
22617 finish_arch_post_lock_switch();
22618
22619 fire_sched_in_preempt_notifiers(current);
22620+ /*
22621+ * We use mmdrop_delayed() here so we don't have to do the
22622+ * full __mmdrop() when we are the last user.
22623+ */
22624 if (mm)
22625- mmdrop(mm);
22626+ mmdrop_delayed(mm);
22627 if (unlikely(prev_state == TASK_DEAD)) {
22628 if (prev->sched_class->task_dead)
22629 prev->sched_class->task_dead(prev);
22630
22631- /*
22632- * Remove function-return probe instances associated with this
22633- * task and put them back on the free list.
22634- */
22635- kprobe_flush_task(prev);
22636-
22637- /* Task is done with its stack. */
22638- put_task_stack(prev);
22639-
22640 put_task_struct(prev);
22641 }
22642
22643@@ -3336,25 +3418,13 @@ static void __sched notrace __schedule(bool preempt)
22644 atomic_inc(&rq->nr_iowait);
22645 delayacct_blkio_start();
22646 }
22647-
22648- /*
22649- * If a worker went to sleep, notify and ask workqueue
22650- * whether it wants to wake up a task to maintain
22651- * concurrency.
22652- */
22653- if (prev->flags & PF_WQ_WORKER) {
22654- struct task_struct *to_wakeup;
22655-
22656- to_wakeup = wq_worker_sleeping(prev);
22657- if (to_wakeup)
22658- try_to_wake_up_local(to_wakeup, &rf);
22659- }
22660 }
22661 switch_count = &prev->nvcsw;
22662 }
22663
22664 next = pick_next_task(rq, prev, &rf);
22665 clear_tsk_need_resched(prev);
22666+ clear_tsk_need_resched_lazy(prev);
22667 clear_preempt_need_resched();
22668
22669 if (likely(prev != next)) {
22670@@ -3407,8 +3477,24 @@ void __noreturn do_task_dead(void)
22671
22672 static inline void sched_submit_work(struct task_struct *tsk)
22673 {
22674- if (!tsk->state || tsk_is_pi_blocked(tsk))
22675+ if (!tsk->state)
22676 return;
22677+ /*
22678+ * If a worker went to sleep, notify and ask workqueue whether
22679+ * it wants to wake up a task to maintain concurrency.
22680+ * As this function is called from inside schedule(), we disable
22681+ * preemption so that a possible wakeup of a kworker does not end
22682+ * up calling schedule() again.
22683+ */
22684+ if (tsk->flags & PF_WQ_WORKER) {
22685+ preempt_disable();
22686+ wq_worker_sleeping(tsk);
22687+ preempt_enable_no_resched();
22688+ }
22689+
22690+ if (tsk_is_pi_blocked(tsk))
22691+ return;
22692+
22693 /*
22694 * If we are going to sleep and we have plugged IO queued,
22695 * make sure to submit it to avoid deadlocks.
22696@@ -3417,6 +3503,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
22697 blk_schedule_flush_plug(tsk);
22698 }
22699
22700+static void sched_update_worker(struct task_struct *tsk)
22701+{
22702+ if (tsk->flags & PF_WQ_WORKER)
22703+ wq_worker_running(tsk);
22704+}
22705+
22706 asmlinkage __visible void __sched schedule(void)
22707 {
22708 struct task_struct *tsk = current;
22709@@ -3427,6 +3519,7 @@ asmlinkage __visible void __sched schedule(void)
22710 __schedule(false);
22711 sched_preempt_enable_no_resched();
22712 } while (need_resched());
22713+ sched_update_worker(tsk);
22714 }
22715 EXPORT_SYMBOL(schedule);
22716
22717@@ -3515,6 +3608,30 @@ static void __sched notrace preempt_schedule_common(void)
22718 } while (need_resched());
22719 }
22720
22721+#ifdef CONFIG_PREEMPT_LAZY
22722+/*
22723+ * If TIF_NEED_RESCHED is set we allow being scheduled away, since that flag
22724+ * is set by an RT task. Otherwise we try to avoid being scheduled out as
22725+ * long as the preempt_lazy_count counter is > 0.
22726+ */
22727+static __always_inline int preemptible_lazy(void)
22728+{
22729+ if (test_thread_flag(TIF_NEED_RESCHED))
22730+ return 1;
22731+ if (current_thread_info()->preempt_lazy_count)
22732+ return 0;
22733+ return 1;
22734+}
22735+
22736+#else
22737+
22738+static inline int preemptible_lazy(void)
22739+{
22740+ return 1;
22741+}
22742+
22743+#endif
22744+
22745 #ifdef CONFIG_PREEMPT
22746 /*
22747 * this is the entry point to schedule() from in-kernel preemption
22748@@ -3529,7 +3646,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
22749 */
22750 if (likely(!preemptible()))
22751 return;
22752-
22753+ if (!preemptible_lazy())
22754+ return;
22755 preempt_schedule_common();
22756 }
22757 NOKPROBE_SYMBOL(preempt_schedule);
22758@@ -3556,6 +3674,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
22759 if (likely(!preemptible()))
22760 return;
22761
22762+ if (!preemptible_lazy())
22763+ return;
22764+
22765 do {
22766 /*
22767 * Because the function tracer can trace preempt_count_sub()
22768@@ -3578,7 +3699,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
22769 * an infinite recursion.
22770 */
22771 prev_ctx = exception_enter();
22772+ /*
22773+ * The add/subtract must not be traced by the function
22774+ * tracer, but we still want the preempt-off latency tracer
22775+ * to account for this section. Since the _notrace versions
22776+ * of add/subtract skip that accounting, we must force it
22777+ * manually.
22778+ */
22779+ start_critical_timings();
22780 __schedule(true);
22781+ stop_critical_timings();
22782 exception_exit(prev_ctx);
22783
22784 preempt_latency_stop(1);
22785@@ -4164,7 +4294,7 @@ static int __sched_setscheduler(struct task_struct *p,
22786 * the entire root_domain to become SCHED_DEADLINE. We
22787 * will also fail if there's no bandwidth available.
22788 */
22789- if (!cpumask_subset(span, &p->cpus_allowed) ||
22790+ if (!cpumask_subset(span, p->cpus_ptr) ||
22791 rq->rd->dl_bw.bw == 0) {
22792 task_rq_unlock(rq, p, &rf);
22793 return -EPERM;
22794@@ -4758,7 +4888,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
22795 goto out_unlock;
22796
22797 raw_spin_lock_irqsave(&p->pi_lock, flags);
22798- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
22799+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
22800 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
22801
22802 out_unlock:
22803@@ -4877,6 +5007,7 @@ int __cond_resched_lock(spinlock_t *lock)
22804 }
22805 EXPORT_SYMBOL(__cond_resched_lock);
22806
22807+#ifndef CONFIG_PREEMPT_RT_FULL
22808 int __sched __cond_resched_softirq(void)
22809 {
22810 BUG_ON(!in_softirq());
22811@@ -4890,6 +5021,7 @@ int __sched __cond_resched_softirq(void)
22812 return 0;
22813 }
22814 EXPORT_SYMBOL(__cond_resched_softirq);
22815+#endif
22816
22817 /**
22818 * yield - yield the current processor to other threads.
22819@@ -5284,7 +5416,9 @@ void init_idle(struct task_struct *idle, int cpu)
22820
22821 /* Set the preempt count _outside_ the spinlocks! */
22822 init_idle_preempt_count(idle, cpu);
22823-
22824+#ifdef CONFIG_HAVE_PREEMPT_LAZY
22825+ task_thread_info(idle)->preempt_lazy_count = 0;
22826+#endif
22827 /*
22828 * The idle tasks have their own, simple scheduling class:
22829 */
22830@@ -5323,7 +5457,7 @@ int task_can_attach(struct task_struct *p,
22831 * allowed nodes is unnecessary. Thus, cpusets are not
22832 * applicable for such threads. This prevents checking for
22833 * success of set_cpus_allowed_ptr() on all attached tasks
22834- * before cpus_allowed may be changed.
22835+ * before cpus_mask may be changed.
22836 */
22837 if (p->flags & PF_NO_SETAFFINITY) {
22838 ret = -EINVAL;
22839@@ -5350,7 +5484,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
22840 if (curr_cpu == target_cpu)
22841 return 0;
22842
22843- if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
22844+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
22845 return -EINVAL;
22846
22847 /* TODO: This is not properly updating schedstats */
22848@@ -5389,6 +5523,8 @@ void sched_setnuma(struct task_struct *p, int nid)
22849 #endif /* CONFIG_NUMA_BALANCING */
22850
22851 #ifdef CONFIG_HOTPLUG_CPU
22852+static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
22853+
22854 /*
22855 * Ensure that the idle task is using init_mm right before its CPU goes
22856 * offline.
22857@@ -5403,7 +5539,12 @@ void idle_task_exit(void)
22858 switch_mm(mm, &init_mm, current);
22859 finish_arch_post_lock_switch();
22860 }
22861- mmdrop(mm);
22862+ /*
22863+ * Defer the cleanup to a CPU that is still alive. On RT we can
22864+ * neither call mmdrop() nor mmdrop_delayed() from here.
22865+ */
22866+ per_cpu(idle_last_mm, smp_processor_id()) = mm;
22867+
22868 }
22869
22870 /*
22871@@ -5487,7 +5628,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
22872 put_prev_task(rq, next);
22873
22874 /*
22875- * Rules for changing task_struct::cpus_allowed are holding
22876+ * Rules for changing task_struct::cpus_mask are holding
22877 * both pi_lock and rq->lock, such that holding either
22878 * stabilizes the mask.
22879 *
22880@@ -5718,6 +5859,10 @@ int sched_cpu_dying(unsigned int cpu)
22881 update_max_interval();
22882 nohz_balance_exit_idle(cpu);
22883 hrtick_clear(rq);
22884+ if (per_cpu(idle_last_mm, cpu)) {
22885+ mmdrop_delayed(per_cpu(idle_last_mm, cpu));
22886+ per_cpu(idle_last_mm, cpu) = NULL;
22887+ }
22888 return 0;
22889 }
22890 #endif
22891@@ -5964,7 +6109,7 @@ void __init sched_init(void)
22892 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
22893 static inline int preempt_count_equals(int preempt_offset)
22894 {
22895- int nested = preempt_count() + rcu_preempt_depth();
22896+ int nested = preempt_count() + sched_rcu_preempt_depth();
22897
22898 return (nested == preempt_offset);
22899 }
22900@@ -6756,3 +6901,196 @@ const u32 sched_prio_to_wmult[40] = {
22901 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
22902 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
22903 };
22904+
22905+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
22906+
22907+static inline void
22908+update_nr_migratory(struct task_struct *p, long delta)
22909+{
22910+ if (unlikely((p->sched_class == &rt_sched_class ||
22911+ p->sched_class == &dl_sched_class) &&
22912+ p->nr_cpus_allowed > 1)) {
22913+ if (p->sched_class == &rt_sched_class)
22914+ task_rq(p)->rt.rt_nr_migratory += delta;
22915+ else
22916+ task_rq(p)->dl.dl_nr_migratory += delta;
22917+ }
22918+}
22919+
22920+static inline void
22921+migrate_disable_update_cpus_allowed(struct task_struct *p)
22922+{
22923+ struct rq *rq;
22924+ struct rq_flags rf;
22925+
22926+ p->cpus_ptr = cpumask_of(smp_processor_id());
22927+
22928+ rq = task_rq_lock(p, &rf);
22929+ update_nr_migratory(p, -1);
22930+ p->nr_cpus_allowed = 1;
22931+ task_rq_unlock(rq, p, &rf);
22932+}
22933+
22934+static inline void
22935+migrate_enable_update_cpus_allowed(struct task_struct *p)
22936+{
22937+ struct rq *rq;
22938+ struct rq_flags rf;
22939+
22940+ p->cpus_ptr = &p->cpus_mask;
22941+
22942+ rq = task_rq_lock(p, &rf);
22943+ p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
22944+ update_nr_migratory(p, 1);
22945+ task_rq_unlock(rq, p, &rf);
22946+}
22947+
22948+void migrate_disable(void)
22949+{
22950+ struct task_struct *p = current;
22951+
22952+ if (in_atomic() || irqs_disabled()) {
22953+#ifdef CONFIG_SCHED_DEBUG
22954+ p->migrate_disable_atomic++;
22955+#endif
22956+ return;
22957+ }
22958+#ifdef CONFIG_SCHED_DEBUG
22959+ if (unlikely(p->migrate_disable_atomic)) {
22960+ tracing_off();
22961+ WARN_ON_ONCE(1);
22962+ }
22963+#endif
22964+
22965+ if (p->migrate_disable) {
22966+ p->migrate_disable++;
22967+ return;
22968+ }
22969+
22970+ preempt_disable();
22971+ preempt_lazy_disable();
22972+ pin_current_cpu();
22973+
22974+ migrate_disable_update_cpus_allowed(p);
22975+ p->migrate_disable = 1;
22976+
22977+ preempt_enable();
22978+}
22979+EXPORT_SYMBOL(migrate_disable);
22980+
22981+void migrate_enable(void)
22982+{
22983+ struct task_struct *p = current;
22984+
22985+ if (in_atomic() || irqs_disabled()) {
22986+#ifdef CONFIG_SCHED_DEBUG
22987+ p->migrate_disable_atomic--;
22988+#endif
22989+ return;
22990+ }
22991+
22992+#ifdef CONFIG_SCHED_DEBUG
22993+ if (unlikely(p->migrate_disable_atomic)) {
22994+ tracing_off();
22995+ WARN_ON_ONCE(1);
22996+ }
22997+#endif
22998+
22999+ WARN_ON_ONCE(p->migrate_disable <= 0);
23000+ if (p->migrate_disable > 1) {
23001+ p->migrate_disable--;
23002+ return;
23003+ }
23004+
23005+ preempt_disable();
23006+
23007+ p->migrate_disable = 0;
23008+ migrate_enable_update_cpus_allowed(p);
23009+
23010+ if (p->migrate_disable_update) {
23011+ struct rq *rq;
23012+ struct rq_flags rf;
23013+
23014+ rq = task_rq_lock(p, &rf);
23015+ update_rq_clock(rq);
23016+
23017+ __do_set_cpus_allowed_tail(p, &p->cpus_mask);
23018+ task_rq_unlock(rq, p, &rf);
23019+
23020+ p->migrate_disable_update = 0;
23021+
23022+ WARN_ON(smp_processor_id() != task_cpu(p));
23023+ if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
23024+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
23025+ struct migration_arg arg;
23026+ unsigned int dest_cpu;
23027+
23028+ if (p->flags & PF_KTHREAD) {
23029+ /*
23030+ * Kernel threads are allowed on online && !active CPUs
23031+ */
23032+ cpu_valid_mask = cpu_online_mask;
23033+ }
23034+ dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_mask);
23035+ arg.task = p;
23036+ arg.dest_cpu = dest_cpu;
23037+
23038+ unpin_current_cpu();
23039+ preempt_lazy_enable();
23040+ preempt_enable();
23041+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
23042+ tlb_migrate_finish(p->mm);
23043+
23044+ return;
23045+ }
23046+ }
23047+ unpin_current_cpu();
23048+ preempt_lazy_enable();
23049+ preempt_enable();
23050+}
23051+EXPORT_SYMBOL(migrate_enable);
23052+
23053+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
23054+void migrate_disable(void)
23055+{
23056+#ifdef CONFIG_SCHED_DEBUG
23057+ struct task_struct *p = current;
23058+
23059+ if (in_atomic() || irqs_disabled()) {
23060+ p->migrate_disable_atomic++;
23061+ return;
23062+ }
23063+
23064+ if (unlikely(p->migrate_disable_atomic)) {
23065+ tracing_off();
23066+ WARN_ON_ONCE(1);
23067+ }
23068+
23069+ p->migrate_disable++;
23070+#endif
23071+ barrier();
23072+}
23073+EXPORT_SYMBOL(migrate_disable);
23074+
23075+void migrate_enable(void)
23076+{
23077+#ifdef CONFIG_SCHED_DEBUG
23078+ struct task_struct *p = current;
23079+
23080+ if (in_atomic() || irqs_disabled()) {
23081+ p->migrate_disable_atomic--;
23082+ return;
23083+ }
23084+
23085+ if (unlikely(p->migrate_disable_atomic)) {
23086+ tracing_off();
23087+ WARN_ON_ONCE(1);
23088+ }
23089+
23090+ WARN_ON_ONCE(p->migrate_disable <= 0);
23091+ p->migrate_disable--;
23092+#endif
23093+ barrier();
23094+}
23095+EXPORT_SYMBOL(migrate_enable);
23096+#endif
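The core.c changes introduce a per-task migrate_disable nesting counter and make the scheduler look at p->cpus_ptr, which the outermost migrate_disable() points at a one-CPU mask until the matching migrate_enable() restores it to &p->cpus_mask. A reduced model of that nesting and pointer swap is sketched below; struct toy_task and the integer masks are purely illustrative, and the deferred migrate_disable_update handling is left out.

#include <stdio.h>

struct toy_task {
	int migrate_disable;	/* nesting depth */
	int cpus_mask;		/* full affinity mask */
	int pinned_cpu_mask;	/* single-CPU mask used while pinned */
	const int *cpus_ptr;	/* what the scheduler actually consults */
};

static void toy_migrate_disable(struct toy_task *p, int this_cpu)
{
	if (p->migrate_disable++)
		return;				/* already pinned, just nest deeper */
	p->pinned_cpu_mask = 1 << this_cpu;
	p->cpus_ptr = &p->pinned_cpu_mask;	/* scheduler now sees one CPU only */
}

static void toy_migrate_enable(struct toy_task *p)
{
	if (--p->migrate_disable)
		return;				/* still nested */
	p->cpus_ptr = &p->cpus_mask;		/* restore the real affinity */
}

int main(void)
{
	struct toy_task t = { 0, 0xf, 0, &t.cpus_mask };

	toy_migrate_disable(&t, 2);
	toy_migrate_disable(&t, 2);		/* nested call only bumps the counter */
	printf("pinned:   mask=%#x depth=%d\n", (unsigned)*t.cpus_ptr, t.migrate_disable);
	toy_migrate_enable(&t);
	toy_migrate_enable(&t);
	printf("unpinned: mask=%#x depth=%d\n", (unsigned)*t.cpus_ptr, t.migrate_disable);
	return 0;
}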
23097diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
23098index 8d9562d890d3..91a0702fe3df 100644
23099--- a/kernel/sched/cpudeadline.c
23100+++ b/kernel/sched/cpudeadline.c
23101@@ -127,13 +127,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
23102 const struct sched_dl_entity *dl_se = &p->dl;
23103
23104 if (later_mask &&
23105- cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
23106+ cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
23107 return 1;
23108 } else {
23109 int best_cpu = cpudl_maximum(cp);
23110 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
23111
23112- if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
23113+ if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
23114 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
23115 if (later_mask)
23116 cpumask_set_cpu(best_cpu, later_mask);
23117diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
23118index 2511aba36b89..7b9bc1de0e6c 100644
23119--- a/kernel/sched/cpupri.c
23120+++ b/kernel/sched/cpupri.c
23121@@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
23122 if (skip)
23123 continue;
23124
23125- if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
23126+ if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
23127 continue;
23128
23129 if (lowest_mask) {
23130- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
23131+ cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
23132
23133 /*
23134 * We have to ensure that we have at least one bit
23135diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
23136index b2589c7e9439..28a75a9526ac 100644
23137--- a/kernel/sched/deadline.c
23138+++ b/kernel/sched/deadline.c
23139@@ -504,7 +504,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
23140 * If we cannot preempt any rq, fall back to pick any
23141 * online cpu.
23142 */
23143- cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
23144+ cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
23145 if (cpu >= nr_cpu_ids) {
23146 /*
23147 * Fail to find any suitable cpu.
23148@@ -1020,7 +1020,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
23149 {
23150 struct hrtimer *timer = &dl_se->dl_timer;
23151
23152- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23153+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
23154 timer->function = dl_task_timer;
23155 }
23156
23157@@ -1753,7 +1753,7 @@ static void set_curr_task_dl(struct rq *rq)
23158 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
23159 {
23160 if (!task_running(rq, p) &&
23161- cpumask_test_cpu(cpu, &p->cpus_allowed))
23162+ cpumask_test_cpu(cpu, p->cpus_ptr))
23163 return 1;
23164 return 0;
23165 }
23166@@ -1903,7 +1903,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
23167 /* Retry if something changed. */
23168 if (double_lock_balance(rq, later_rq)) {
23169 if (unlikely(task_rq(task) != rq ||
23170- !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
23171+ !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
23172 task_running(rq, task) ||
23173 !dl_task(task) ||
23174 !task_on_rq_queued(task))) {
23175diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
23176index 2f93e4a2d9f6..b5b43861c2b6 100644
23177--- a/kernel/sched/debug.c
23178+++ b/kernel/sched/debug.c
23179@@ -1017,6 +1017,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
23180 P(dl.runtime);
23181 P(dl.deadline);
23182 }
23183+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
23184+ P(migrate_disable);
23185+#endif
23186+ P(nr_cpus_allowed);
23187 #undef PN_SCHEDSTAT
23188 #undef PN
23189 #undef __PN
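With the debug.c hunk applied, /proc/<pid>/sched on an RT-patched kernel also reports the task's migrate_disable depth alongside nr_cpus_allowed. A small reader for the current process is sketched below; the exact field labels follow the P() macro output and may vary between kernel versions.

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/sched", "r");

	if (!f) {
		perror("fopen /proc/self/sched");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "nr_cpus_allowed") || strstr(line, "migrate_disable"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}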
23190diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
23191index b2d699f28304..20e7d867af7a 100644
23192--- a/kernel/sched/fair.c
23193+++ b/kernel/sched/fair.c
23194@@ -1598,7 +1598,7 @@ static void task_numa_compare(struct task_numa_env *env,
23195 */
23196 if (cur) {
23197 /* Skip this swap candidate if cannot move to the source cpu */
23198- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
23199+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
23200 goto unlock;
23201
23202 /*
23203@@ -1708,7 +1708,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
23204
23205 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
23206 /* Skip this CPU if the source task cannot migrate */
23207- if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
23208+ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
23209 continue;
23210
23211 env->dst_cpu = cpu;
23212@@ -3842,7 +3842,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
23213 ideal_runtime = sched_slice(cfs_rq, curr);
23214 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
23215 if (delta_exec > ideal_runtime) {
23216- resched_curr(rq_of(cfs_rq));
23217+ resched_curr_lazy(rq_of(cfs_rq));
23218 /*
23219 * The current task ran long enough, ensure it doesn't get
23220 * re-elected due to buddy favours.
23221@@ -3866,7 +3866,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
23222 return;
23223
23224 if (delta > ideal_runtime)
23225- resched_curr(rq_of(cfs_rq));
23226+ resched_curr_lazy(rq_of(cfs_rq));
23227 }
23228
23229 static void
23230@@ -4008,7 +4008,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
23231 * validating it and just reschedule.
23232 */
23233 if (queued) {
23234- resched_curr(rq_of(cfs_rq));
23235+ resched_curr_lazy(rq_of(cfs_rq));
23236 return;
23237 }
23238 /*
23239@@ -4190,7 +4190,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
23240 * hierarchy can be throttled
23241 */
23242 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
23243- resched_curr(rq_of(cfs_rq));
23244+ resched_curr_lazy(rq_of(cfs_rq));
23245 }
23246
23247 static __always_inline
23248@@ -4686,9 +4686,9 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
23249 cfs_b->period = ns_to_ktime(default_cfs_period());
23250
23251 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
23252- hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
23253+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
23254 cfs_b->period_timer.function = sched_cfs_period_timer;
23255- hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23256+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
23257 cfs_b->slack_timer.function = sched_cfs_slack_timer;
23258 }
23259
23260@@ -4839,7 +4839,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
23261
23262 if (delta < 0) {
23263 if (rq->curr == p)
23264- resched_curr(rq);
23265+ resched_curr_lazy(rq);
23266 return;
23267 }
23268 hrtick_start(rq, delta);
23269@@ -5477,7 +5477,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
23270
23271 /* Skip over this group if it has no CPUs allowed */
23272 if (!cpumask_intersects(sched_group_span(group),
23273- &p->cpus_allowed))
23274+ p->cpus_ptr))
23275 continue;
23276
23277 local_group = cpumask_test_cpu(this_cpu,
23278@@ -5597,7 +5597,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
23279 return cpumask_first(sched_group_span(group));
23280
23281 /* Traverse only the allowed CPUs */
23282- for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
23283+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
23284 if (idle_cpu(i)) {
23285 struct rq *rq = cpu_rq(i);
23286 struct cpuidle_state *idle = idle_get_state(rq);
23287@@ -5700,7 +5700,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
23288 if (!test_idle_cores(target, false))
23289 return -1;
23290
23291- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
23292+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
23293
23294 for_each_cpu_wrap(core, cpus, target) {
23295 bool idle = true;
23296@@ -5734,7 +5734,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
23297 return -1;
23298
23299 for_each_cpu(cpu, cpu_smt_mask(target)) {
23300- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
23301+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
23302 continue;
23303 if (idle_cpu(cpu))
23304 return cpu;
23305@@ -5797,7 +5797,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
23306 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
23307 if (!--nr)
23308 return -1;
23309- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
23310+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
23311 continue;
23312 if (idle_cpu(cpu))
23313 break;
23314@@ -5952,7 +5952,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
23315 if (sd_flag & SD_BALANCE_WAKE) {
23316 record_wakee(p);
23317 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
23318- && cpumask_test_cpu(cpu, &p->cpus_allowed);
23319+ && cpumask_test_cpu(cpu, p->cpus_ptr);
23320 }
23321
23322 rcu_read_lock();
23323@@ -6233,7 +6233,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
23324 return;
23325
23326 preempt:
23327- resched_curr(rq);
23328+ resched_curr_lazy(rq);
23329 /*
23330 * Only set the backward buddy when the current task is still
23331 * on the rq. This can happen when a wakeup gets interleaved
23332@@ -6701,14 +6701,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
23333 /*
23334 * We do not migrate tasks that are:
23335 * 1) throttled_lb_pair, or
23336- * 2) cannot be migrated to this CPU due to cpus_allowed, or
23337+ * 2) cannot be migrated to this CPU due to cpus_ptr, or
23338 * 3) running (obviously), or
23339 * 4) are cache-hot on their current CPU.
23340 */
23341 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
23342 return 0;
23343
23344- if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
23345+ if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
23346 int cpu;
23347
23348 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
23349@@ -6728,7 +6728,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
23350
23351 /* Prevent to re-select dst_cpu via env's cpus */
23352 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
23353- if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
23354+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
23355 env->flags |= LBF_DST_PINNED;
23356 env->new_dst_cpu = cpu;
23357 break;
23358@@ -7297,7 +7297,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
23359
23360 /*
23361 * Group imbalance indicates (and tries to solve) the problem where balancing
23362- * groups is inadequate due to ->cpus_allowed constraints.
23363+ * groups is inadequate due to ->cpus_ptr constraints.
23364 *
23365 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
23366 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
23367@@ -7873,7 +7873,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
23368 /*
23369 * If the busiest group is imbalanced the below checks don't
23370 * work because they assume all things are equal, which typically
23371- * isn't true due to cpus_allowed constraints and the like.
23372+ * isn't true due to cpus_ptr constraints and the like.
23373 */
23374 if (busiest->group_type == group_imbalanced)
23375 goto force_balance;
23376@@ -8265,7 +8265,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
23377 * if the curr task on busiest cpu can't be
23378 * moved to this_cpu
23379 */
23380- if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
23381+ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
23382 raw_spin_unlock_irqrestore(&busiest->lock,
23383 flags);
23384 env.flags |= LBF_ALL_PINNED;
23385@@ -9087,7 +9087,7 @@ static void task_fork_fair(struct task_struct *p)
23386 * 'current' within the tree based on its new key value.
23387 */
23388 swap(curr->vruntime, se->vruntime);
23389- resched_curr(rq);
23390+ resched_curr_lazy(rq);
23391 }
23392
23393 se->vruntime -= cfs_rq->min_vruntime;
23394@@ -9111,7 +9111,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
23395 */
23396 if (rq->curr == p) {
23397 if (p->prio > oldprio)
23398- resched_curr(rq);
23399+ resched_curr_lazy(rq);
23400 } else
23401 check_preempt_curr(rq, p, 0);
23402 }
23403diff --git a/kernel/sched/features.h b/kernel/sched/features.h
23404index 9552fd5854bf..fb069998b518 100644
23405--- a/kernel/sched/features.h
23406+++ b/kernel/sched/features.h
23407@@ -46,11 +46,19 @@ SCHED_FEAT(LB_BIAS, true)
23408 */
23409 SCHED_FEAT(NONTASK_CAPACITY, true)
23410
23411+#ifdef CONFIG_PREEMPT_RT_FULL
23412+SCHED_FEAT(TTWU_QUEUE, false)
23413+# ifdef CONFIG_PREEMPT_LAZY
23414+SCHED_FEAT(PREEMPT_LAZY, true)
23415+# endif
23416+#else
23417+
23418 /*
23419 * Queue remote wakeups on the target CPU and process them
23420 * using the scheduler IPI. Reduces rq->lock contention/bounces.
23421 */
23422 SCHED_FEAT(TTWU_QUEUE, true)
23423+#endif
23424
23425 /*
23426 * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
23427diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
23428index cb9a5b8532fa..6c72332dab3f 100644
23429--- a/kernel/sched/rt.c
23430+++ b/kernel/sched/rt.c
23431@@ -47,8 +47,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
23432
23433 raw_spin_lock_init(&rt_b->rt_runtime_lock);
23434
23435- hrtimer_init(&rt_b->rt_period_timer,
23436- CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23437+ hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
23438+ HRTIMER_MODE_REL_HARD);
23439 rt_b->rt_period_timer.function = sched_rt_period_timer;
23440 }
23441
23442@@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
23443 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
23444 {
23445 if (!task_running(rq, p) &&
23446- cpumask_test_cpu(cpu, &p->cpus_allowed))
23447+ cpumask_test_cpu(cpu, p->cpus_ptr))
23448 return 1;
23449 return 0;
23450 }
23451@@ -1731,7 +1731,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
23452 * Also make sure that it wasn't scheduled on its rq.
23453 */
23454 if (unlikely(task_rq(task) != rq ||
23455- !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
23456+ !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
23457 task_running(rq, task) ||
23458 !rt_task(task) ||
23459 !task_on_rq_queued(task))) {
23460diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
23461index b29376169f3f..96481980c8c7 100644
23462--- a/kernel/sched/sched.h
23463+++ b/kernel/sched/sched.h
23464@@ -1354,6 +1354,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
23465 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
23466 #define WF_FORK 0x02 /* child wakeup after fork */
23467 #define WF_MIGRATED 0x4 /* internal use, task got migrated */
23468+#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
23469
23470 /*
23471 * To aid in avoiding the subversion of "niceness" due to uneven distribution
23472@@ -1545,6 +1546,15 @@ extern void init_sched_fair_class(void);
23473 extern void resched_curr(struct rq *rq);
23474 extern void resched_cpu(int cpu);
23475
23476+#ifdef CONFIG_PREEMPT_LAZY
23477+extern void resched_curr_lazy(struct rq *rq);
23478+#else
23479+static inline void resched_curr_lazy(struct rq *rq)
23480+{
23481+ resched_curr(rq);
23482+}
23483+#endif
23484+
23485 extern struct rt_bandwidth def_rt_bandwidth;
23486 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
23487
23488diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
23489index 9ff1555341ed..b14638a05ec9 100644
23490--- a/kernel/sched/swait.c
23491+++ b/kernel/sched/swait.c
23492@@ -1,6 +1,7 @@
23493 // SPDX-License-Identifier: GPL-2.0
23494 #include <linux/sched/signal.h>
23495 #include <linux/swait.h>
23496+#include <linux/suspend.h>
23497
23498 void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
23499 struct lock_class_key *key)
23500@@ -30,6 +31,25 @@ void swake_up_locked(struct swait_queue_head *q)
23501 }
23502 EXPORT_SYMBOL(swake_up_locked);
23503
23504+void swake_up_all_locked(struct swait_queue_head *q)
23505+{
23506+ struct swait_queue *curr;
23507+ int wakes = 0;
23508+
23509+ while (!list_empty(&q->task_list)) {
23510+
23511+ curr = list_first_entry(&q->task_list, typeof(*curr),
23512+ task_list);
23513+ wake_up_process(curr->task);
23514+ list_del_init(&curr->task_list);
23515+ wakes++;
23516+ }
23517+ if (pm_in_action)
23518+ return;
23519+ WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
23520+}
23521+EXPORT_SYMBOL(swake_up_all_locked);
23522+
23523 void swake_up(struct swait_queue_head *q)
23524 {
23525 unsigned long flags;
23526@@ -49,6 +69,7 @@ void swake_up_all(struct swait_queue_head *q)
23527 struct swait_queue *curr;
23528 LIST_HEAD(tmp);
23529
23530+ WARN_ON(irqs_disabled());
23531 raw_spin_lock_irq(&q->lock);
23532 list_splice_init(&q->task_list, &tmp);
23533 while (!list_empty(&tmp)) {
23534diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
23535new file mode 100644
23536index 000000000000..1950f40ca725
23537--- /dev/null
23538+++ b/kernel/sched/swork.c
23539@@ -0,0 +1,173 @@
23540+/*
23541+ * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
23542+ *
23543+ * Provides a PREEMPT_RT_FULL-safe framework for enqueuing callbacks from
23544+ * irq context. The callbacks are executed in kthread context.
23545+ */
23546+
23547+#include <linux/swait.h>
23548+#include <linux/swork.h>
23549+#include <linux/kthread.h>
23550+#include <linux/slab.h>
23551+#include <linux/spinlock.h>
23552+#include <linux/export.h>
23553+
23554+#define SWORK_EVENT_PENDING (1 << 0)
23555+
23556+static DEFINE_MUTEX(worker_mutex);
23557+static struct sworker *glob_worker;
23558+
23559+struct sworker {
23560+ struct list_head events;
23561+ struct swait_queue_head wq;
23562+
23563+ raw_spinlock_t lock;
23564+
23565+ struct task_struct *task;
23566+ int refs;
23567+};
23568+
23569+static bool swork_readable(struct sworker *worker)
23570+{
23571+ bool r;
23572+
23573+ if (kthread_should_stop())
23574+ return true;
23575+
23576+ raw_spin_lock_irq(&worker->lock);
23577+ r = !list_empty(&worker->events);
23578+ raw_spin_unlock_irq(&worker->lock);
23579+
23580+ return r;
23581+}
23582+
23583+static int swork_kthread(void *arg)
23584+{
23585+ struct sworker *worker = arg;
23586+
23587+ for (;;) {
23588+ swait_event_interruptible(worker->wq,
23589+ swork_readable(worker));
23590+ if (kthread_should_stop())
23591+ break;
23592+
23593+ raw_spin_lock_irq(&worker->lock);
23594+ while (!list_empty(&worker->events)) {
23595+ struct swork_event *sev;
23596+
23597+ sev = list_first_entry(&worker->events,
23598+ struct swork_event, item);
23599+ list_del(&sev->item);
23600+ raw_spin_unlock_irq(&worker->lock);
23601+
23602+ WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
23603+ &sev->flags));
23604+ sev->func(sev);
23605+ raw_spin_lock_irq(&worker->lock);
23606+ }
23607+ raw_spin_unlock_irq(&worker->lock);
23608+ }
23609+ return 0;
23610+}
23611+
23612+static struct sworker *swork_create(void)
23613+{
23614+ struct sworker *worker;
23615+
23616+ worker = kzalloc(sizeof(*worker), GFP_KERNEL);
23617+ if (!worker)
23618+ return ERR_PTR(-ENOMEM);
23619+
23620+ INIT_LIST_HEAD(&worker->events);
23621+ raw_spin_lock_init(&worker->lock);
23622+ init_swait_queue_head(&worker->wq);
23623+
23624+ worker->task = kthread_run(swork_kthread, worker, "kswork");
23625+ if (IS_ERR(worker->task)) {
23626+ kfree(worker);
23627+ return ERR_PTR(-ENOMEM);
23628+ }
23629+
23630+ return worker;
23631+}
23632+
23633+static void swork_destroy(struct sworker *worker)
23634+{
23635+ kthread_stop(worker->task);
23636+
23637+ WARN_ON(!list_empty(&worker->events));
23638+ kfree(worker);
23639+}
23640+
23641+/**
23642+ * swork_queue - queue swork
23643+ *
23644+ * Returns %false if @work was already on a queue, %true otherwise.
23645+ *
23646+ * The work is queued and processed on a random CPU
23647+ */
23648+bool swork_queue(struct swork_event *sev)
23649+{
23650+ unsigned long flags;
23651+
23652+ if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
23653+ return false;
23654+
23655+ raw_spin_lock_irqsave(&glob_worker->lock, flags);
23656+ list_add_tail(&sev->item, &glob_worker->events);
23657+ raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
23658+
23659+ swake_up(&glob_worker->wq);
23660+ return true;
23661+}
23662+EXPORT_SYMBOL_GPL(swork_queue);
23663+
23664+/**
23665+ * swork_get - get an instance of the sworker
23666+ *
23667+ * Returns a negative error code if the initialization of the worker did not
23668+ * work, %0 otherwise.
23669+ *
23670+ */
23671+int swork_get(void)
23672+{
23673+ struct sworker *worker;
23674+
23675+ mutex_lock(&worker_mutex);
23676+ if (!glob_worker) {
23677+ worker = swork_create();
23678+ if (IS_ERR(worker)) {
23679+ mutex_unlock(&worker_mutex);
23680+ return -ENOMEM;
23681+ }
23682+
23683+ glob_worker = worker;
23684+ }
23685+
23686+ glob_worker->refs++;
23687+ mutex_unlock(&worker_mutex);
23688+
23689+ return 0;
23690+}
23691+EXPORT_SYMBOL_GPL(swork_get);
23692+
23693+/**
23694+ * swork_put - puts an instance of the sworker
23695+ *
23696+ * Will destroy the sworker thread. This function must not be called until all
23697+ * queued events have been completed.
23698+ */
23699+void swork_put(void)
23700+{
23701+ mutex_lock(&worker_mutex);
23702+
23703+ glob_worker->refs--;
23704+ if (glob_worker->refs > 0)
23705+ goto out;
23706+
23707+ swork_destroy(glob_worker);
23708+ glob_worker = NULL;
23709+out:
23710+ mutex_unlock(&worker_mutex);
23711+}
23712+EXPORT_SYMBOL_GPL(swork_put);
23713diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
23714index 659e075ef70b..bb22e3620a90 100644
23715--- a/kernel/sched/topology.c
23716+++ b/kernel/sched/topology.c
23717@@ -286,6 +286,7 @@ static int init_rootdomain(struct root_domain *rd)
23718 rd->rto_cpu = -1;
23719 raw_spin_lock_init(&rd->rto_lock);
23720 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
23721+ rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
23722 #endif
23723
23724 init_dl_bw(&rd->dl_bw);
23725diff --git a/kernel/signal.c b/kernel/signal.c
23726index 4439ba9dc5d9..d8f75a030292 100644
23727--- a/kernel/signal.c
23728+++ b/kernel/signal.c
23729@@ -19,6 +19,7 @@
23730 #include <linux/sched/task.h>
23731 #include <linux/sched/task_stack.h>
23732 #include <linux/sched/cputime.h>
23733+#include <linux/sched/rt.h>
23734 #include <linux/fs.h>
23735 #include <linux/tty.h>
23736 #include <linux/binfmts.h>
23737@@ -360,13 +361,30 @@ static bool task_participate_group_stop(struct task_struct *task)
23738 return false;
23739 }
23740
23741+static inline struct sigqueue *get_task_cache(struct task_struct *t)
23742+{
23743+ struct sigqueue *q = t->sigqueue_cache;
23744+
23745+ if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
23746+ return NULL;
23747+ return q;
23748+}
23749+
23750+static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
23751+{
23752+ if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
23753+ return 0;
23754+ return 1;
23755+}
23756+
23757 /*
23758 * allocate a new signal queue record
23759 * - this may be called without locks if and only if t == current, otherwise an
23760 * appropriate lock must be held to stop the target task from exiting
23761 */
23762 static struct sigqueue *
23763-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
23764+__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
23765+ int override_rlimit, int fromslab)
23766 {
23767 struct sigqueue *q = NULL;
23768 struct user_struct *user;
23769@@ -383,7 +401,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
23770 if (override_rlimit ||
23771 atomic_read(&user->sigpending) <=
23772 task_rlimit(t, RLIMIT_SIGPENDING)) {
23773- q = kmem_cache_alloc(sigqueue_cachep, flags);
23774+ if (!fromslab)
23775+ q = get_task_cache(t);
23776+ if (!q)
23777+ q = kmem_cache_alloc(sigqueue_cachep, flags);
23778 } else {
23779 print_dropped_signal(sig);
23780 }
23781@@ -400,6 +421,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
23782 return q;
23783 }
23784
23785+static struct sigqueue *
23786+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
23787+ int override_rlimit)
23788+{
23789+ return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
23790+}
23791+
23792 static void __sigqueue_free(struct sigqueue *q)
23793 {
23794 if (q->flags & SIGQUEUE_PREALLOC)
23795@@ -409,6 +437,21 @@ static void __sigqueue_free(struct sigqueue *q)
23796 kmem_cache_free(sigqueue_cachep, q);
23797 }
23798
23799+static void sigqueue_free_current(struct sigqueue *q)
23800+{
23801+ struct user_struct *up;
23802+
23803+ if (q->flags & SIGQUEUE_PREALLOC)
23804+ return;
23805+
23806+ up = q->user;
23807+ if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
23808+ atomic_dec(&up->sigpending);
23809+ free_uid(up);
23810+ } else
23811+ __sigqueue_free(q);
23812+}
23813+
23814 void flush_sigqueue(struct sigpending *queue)
23815 {
23816 struct sigqueue *q;
23817@@ -421,6 +464,21 @@ void flush_sigqueue(struct sigpending *queue)
23818 }
23819 }
23820
23821+/*
23822+ * Called from __exit_signal. Flush tsk->pending and
23823+ * tsk->sigqueue_cache
23824+ */
23825+void flush_task_sigqueue(struct task_struct *tsk)
23826+{
23827+ struct sigqueue *q;
23828+
23829+ flush_sigqueue(&tsk->pending);
23830+
23831+ q = get_task_cache(tsk);
23832+ if (q)
23833+ kmem_cache_free(sigqueue_cachep, q);
23834+}
23835+
23836 /*
23837 * Flush all pending signals for this kthread.
23838 */
23839@@ -542,7 +600,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
23840 (info->si_code == SI_TIMER) &&
23841 (info->si_sys_private);
23842
23843- __sigqueue_free(first);
23844+ sigqueue_free_current(first);
23845 } else {
23846 /*
23847 * Ok, it wasn't in the queue. This must be
23848@@ -578,6 +636,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
23849 bool resched_timer = false;
23850 int signr;
23851
23852+ WARN_ON_ONCE(tsk != current);
23853+
23854 /* We only dequeue private signals from ourselves, we don't let
23855 * signalfd steal them
23856 */
23857@@ -1177,8 +1237,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
23858 * We don't want to have recursive SIGSEGV's etc, for example,
23859 * that is why we also clear SIGNAL_UNKILLABLE.
23860 */
23861-int
23862-force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23863+static int
23864+do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23865 {
23866 unsigned long int flags;
23867 int ret, blocked, ignored;
23868@@ -1207,6 +1267,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23869 return ret;
23870 }
23871
23872+int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23873+{
23874+/*
23875+ * On some archs, PREEMPT_RT has to delay sending a signal from a trap
23876+ * since it cannot enable preemption, and the signal code's spin_locks
23877+ * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
23878+ * send the signal on exit of the trap.
23879+ */
23880+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
23881+ if (in_atomic()) {
23882+ if (WARN_ON_ONCE(t != current))
23883+ return 0;
23884+ if (WARN_ON_ONCE(t->forced_info.si_signo))
23885+ return 0;
23886+
23887+ if (is_si_special(info)) {
23888+ WARN_ON_ONCE(info != SEND_SIG_PRIV);
23889+ t->forced_info.si_signo = sig;
23890+ t->forced_info.si_errno = 0;
23891+ t->forced_info.si_code = SI_KERNEL;
23892+ t->forced_info.si_pid = 0;
23893+ t->forced_info.si_uid = 0;
23894+ } else {
23895+ t->forced_info = *info;
23896+ }
23897+
23898+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
23899+ return 0;
23900+ }
23901+#endif
23902+ return do_force_sig_info(sig, info, t);
23903+}
23904+
23905 /*
23906 * Nuke all other threads in the group.
23907 */
23908@@ -1241,12 +1334,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
23909 * Disable interrupts early to avoid deadlocks.
23910 * See rcu_read_unlock() comment header for details.
23911 */
23912- local_irq_save(*flags);
23913+ local_irq_save_nort(*flags);
23914 rcu_read_lock();
23915 sighand = rcu_dereference(tsk->sighand);
23916 if (unlikely(sighand == NULL)) {
23917 rcu_read_unlock();
23918- local_irq_restore(*flags);
23919+ local_irq_restore_nort(*flags);
23920 break;
23921 }
23922 /*
23923@@ -1267,7 +1360,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
23924 }
23925 spin_unlock(&sighand->siglock);
23926 rcu_read_unlock();
23927- local_irq_restore(*flags);
23928+ local_irq_restore_nort(*flags);
23929 }
23930
23931 return sighand;
23932@@ -1514,7 +1607,8 @@ EXPORT_SYMBOL(kill_pid);
23933 */
23934 struct sigqueue *sigqueue_alloc(void)
23935 {
23936- struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
23937+ /* Preallocated sigqueue objects always come from the slab cache! */
23938+ struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
23939
23940 if (q)
23941 q->flags |= SIGQUEUE_PREALLOC;
23942@@ -1888,15 +1982,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
23943 if (gstop_done && ptrace_reparented(current))
23944 do_notify_parent_cldstop(current, false, why);
23945
23946- /*
23947- * Don't want to allow preemption here, because
23948- * sys_ptrace() needs this task to be inactive.
23949- *
23950- * XXX: implement read_unlock_no_resched().
23951- */
23952- preempt_disable();
23953 read_unlock(&tasklist_lock);
23954- preempt_enable_no_resched();
23955 freezable_schedule();
23956 } else {
23957 /*
23958diff --git a/kernel/softirq.c b/kernel/softirq.c
23959index a4c87cf27f9d..583c9ecf04e3 100644
23960--- a/kernel/softirq.c
23961+++ b/kernel/softirq.c
23962@@ -21,11 +21,14 @@
23963 #include <linux/freezer.h>
23964 #include <linux/kthread.h>
23965 #include <linux/rcupdate.h>
23966+#include <linux/delay.h>
23967 #include <linux/ftrace.h>
23968 #include <linux/smp.h>
23969 #include <linux/smpboot.h>
23970 #include <linux/tick.h>
23971+#include <linux/locallock.h>
23972 #include <linux/irq.h>
23973+#include <linux/sched/types.h>
23974
23975 #define CREATE_TRACE_POINTS
23976 #include <trace/events/irq.h>
23977@@ -56,12 +59,108 @@ EXPORT_SYMBOL(irq_stat);
23978 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
23979
23980 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
23981+#ifdef CONFIG_PREEMPT_RT_FULL
23982+#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
23983+DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
23984+#endif
23985
23986 const char * const softirq_to_name[NR_SOFTIRQS] = {
23987 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
23988 "TASKLET", "SCHED", "HRTIMER", "RCU"
23989 };
23990
23991+#ifdef CONFIG_NO_HZ_COMMON
23992+# ifdef CONFIG_PREEMPT_RT_FULL
23993+
23994+struct softirq_runner {
23995+ struct task_struct *runner[NR_SOFTIRQS];
23996+};
23997+
23998+static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
23999+
24000+static inline void softirq_set_runner(unsigned int sirq)
24001+{
24002+ struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24003+
24004+ sr->runner[sirq] = current;
24005+}
24006+
24007+static inline void softirq_clr_runner(unsigned int sirq)
24008+{
24009+ struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24010+
24011+ sr->runner[sirq] = NULL;
24012+}
24013+
24014+/*
24015+ * On preempt-rt a softirq running context might be blocked on a
24016+ * lock. There might be no other runnable task on this CPU because the
24017+ * lock owner runs on some other CPU. So we have to go into idle with
24018+ * the pending bit set. Therefore we need to check this, otherwise we
24019+ * warn about false positives, which confuses users and defeats the
24020+ * whole purpose of this test.
24021+ *
24022+ * This code is called with interrupts disabled.
24023+ */
24024+void softirq_check_pending_idle(void)
24025+{
24026+ static int rate_limit;
24027+ struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24028+ u32 warnpending;
24029+ int i;
24030+
24031+ if (rate_limit >= 10)
24032+ return;
24033+
24034+ warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
24035+ for (i = 0; i < NR_SOFTIRQS; i++) {
24036+ struct task_struct *tsk = sr->runner[i];
24037+
24038+ /*
24039+ * The wakeup code in rtmutex.c wakes up the task
24040+ * _before_ it sets pi_blocked_on to NULL under
24041+ * tsk->pi_lock. So we need to check for both: state
24042+ * and pi_blocked_on.
24043+ */
24044+ if (tsk) {
24045+ raw_spin_lock(&tsk->pi_lock);
24046+ if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
24047+ /* Clear all bits pending in that task */
24048+ warnpending &= ~(tsk->softirqs_raised);
24049+ warnpending &= ~(1 << i);
24050+ }
24051+ raw_spin_unlock(&tsk->pi_lock);
24052+ }
24053+ }
24054+
24055+ if (warnpending) {
24056+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
24057+ warnpending);
24058+ rate_limit++;
24059+ }
24060+}
24061+# else
24062+/*
24063+ * On !PREEMPT_RT we just printk rate limited:
24064+ */
24065+void softirq_check_pending_idle(void)
24066+{
24067+ static int rate_limit;
24068+
24069+ if (rate_limit < 10 && !in_softirq() &&
24070+ (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
24071+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
24072+ local_softirq_pending());
24073+ rate_limit++;
24074+ }
24075+}
24076+# endif
24077+
24078+#else /* !CONFIG_NO_HZ_COMMON */
24079+static inline void softirq_set_runner(unsigned int sirq) { }
24080+static inline void softirq_clr_runner(unsigned int sirq) { }
24081+#endif
24082+
24083 /*
24084 * we cannot loop indefinitely here to avoid userspace starvation,
24085 * but we also don't want to introduce a worst case 1/HZ latency
24086@@ -77,6 +176,38 @@ static void wakeup_softirqd(void)
24087 wake_up_process(tsk);
24088 }
24089
24090+#ifdef CONFIG_PREEMPT_RT_FULL
24091+static void wakeup_timer_softirqd(void)
24092+{
24093+ /* Interrupts are disabled: no need to stop preemption */
24094+ struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
24095+
24096+ if (tsk && tsk->state != TASK_RUNNING)
24097+ wake_up_process(tsk);
24098+}
24099+#endif
24100+
24101+static void handle_softirq(unsigned int vec_nr)
24102+{
24103+ struct softirq_action *h = softirq_vec + vec_nr;
24104+ int prev_count;
24105+
24106+ prev_count = preempt_count();
24107+
24108+ kstat_incr_softirqs_this_cpu(vec_nr);
24109+
24110+ trace_softirq_entry(vec_nr);
24111+ h->action(h);
24112+ trace_softirq_exit(vec_nr);
24113+ if (unlikely(prev_count != preempt_count())) {
24114+ pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
24115+ vec_nr, softirq_to_name[vec_nr], h->action,
24116+ prev_count, preempt_count());
24117+ preempt_count_set(prev_count);
24118+ }
24119+}
24120+
24121+#ifndef CONFIG_PREEMPT_RT_FULL
24122 /*
24123 * If ksoftirqd is scheduled, we do not want to process pending softirqs
24124 * right now. Let ksoftirqd handle this at its own rate, to get fairness,
24125@@ -92,6 +223,47 @@ static bool ksoftirqd_running(unsigned long pending)
24126 return tsk && (tsk->state == TASK_RUNNING);
24127 }
24128
24129+static inline int ksoftirqd_softirq_pending(void)
24130+{
24131+ return local_softirq_pending();
24132+}
24133+
24134+static void handle_pending_softirqs(u32 pending)
24135+{
24136+ struct softirq_action *h = softirq_vec;
24137+ int softirq_bit;
24138+
24139+ local_irq_enable();
24140+
24141+ h = softirq_vec;
24142+
24143+ while ((softirq_bit = ffs(pending))) {
24144+ unsigned int vec_nr;
24145+
24146+ h += softirq_bit - 1;
24147+ vec_nr = h - softirq_vec;
24148+ handle_softirq(vec_nr);
24149+
24150+ h++;
24151+ pending >>= softirq_bit;
24152+ }
24153+
24154+ rcu_bh_qs();
24155+ local_irq_disable();
24156+}
24157+
24158+static void run_ksoftirqd(unsigned int cpu)
24159+{
24160+ local_irq_disable();
24161+ if (ksoftirqd_softirq_pending()) {
24162+ __do_softirq();
24163+ local_irq_enable();
24164+ cond_resched_rcu_qs();
24165+ return;
24166+ }
24167+ local_irq_enable();
24168+}
24169+
24170 /*
24171 * preempt_count and SOFTIRQ_OFFSET usage:
24172 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
24173@@ -247,10 +419,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
24174 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
24175 unsigned long old_flags = current->flags;
24176 int max_restart = MAX_SOFTIRQ_RESTART;
24177- struct softirq_action *h;
24178 bool in_hardirq;
24179 __u32 pending;
24180- int softirq_bit;
24181
24182 /*
24183 * Mask out PF_MEMALLOC s current task context is borrowed for the
24184@@ -269,36 +439,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
24185 /* Reset the pending bitmask before enabling irqs */
24186 set_softirq_pending(0);
24187
24188- local_irq_enable();
24189-
24190- h = softirq_vec;
24191-
24192- while ((softirq_bit = ffs(pending))) {
24193- unsigned int vec_nr;
24194- int prev_count;
24195-
24196- h += softirq_bit - 1;
24197-
24198- vec_nr = h - softirq_vec;
24199- prev_count = preempt_count();
24200-
24201- kstat_incr_softirqs_this_cpu(vec_nr);
24202-
24203- trace_softirq_entry(vec_nr);
24204- h->action(h);
24205- trace_softirq_exit(vec_nr);
24206- if (unlikely(prev_count != preempt_count())) {
24207- pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
24208- vec_nr, softirq_to_name[vec_nr], h->action,
24209- prev_count, preempt_count());
24210- preempt_count_set(prev_count);
24211- }
24212- h++;
24213- pending >>= softirq_bit;
24214- }
24215-
24216- rcu_bh_qs();
24217- local_irq_disable();
24218+ handle_pending_softirqs(pending);
24219
24220 pending = local_softirq_pending();
24221 if (pending) {
24222@@ -334,6 +475,309 @@ asmlinkage __visible void do_softirq(void)
24223 local_irq_restore(flags);
24224 }
24225
24226+/*
24227+ * This function must run with irqs disabled!
24228+ */
24229+void raise_softirq_irqoff(unsigned int nr)
24230+{
24231+ __raise_softirq_irqoff(nr);
24232+
24233+ /*
24234+ * If we're in an interrupt or softirq, we're done
24235+ * (this also catches softirq-disabled code). We will
24236+ * actually run the softirq once we return from
24237+ * the irq or softirq.
24238+ *
24239+ * Otherwise we wake up ksoftirqd to make sure we
24240+ * schedule the softirq soon.
24241+ */
24242+ if (!in_interrupt())
24243+ wakeup_softirqd();
24244+}
24245+
24246+void __raise_softirq_irqoff(unsigned int nr)
24247+{
24248+ trace_softirq_raise(nr);
24249+ or_softirq_pending(1UL << nr);
24250+}
24251+
24252+static inline void local_bh_disable_nort(void) { local_bh_disable(); }
24253+static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
24254+static void ksoftirqd_set_sched_params(unsigned int cpu) { }
24255+
24256+#else /* !PREEMPT_RT_FULL */
24257+
24258+/*
24259+ * On RT we serialize softirq execution with a cpu local lock per softirq
24260+ */
24261+static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
24262+
24263+void __init softirq_early_init(void)
24264+{
24265+ int i;
24266+
24267+ for (i = 0; i < NR_SOFTIRQS; i++)
24268+ local_irq_lock_init(local_softirq_locks[i]);
24269+}
24270+
24271+static void lock_softirq(int which)
24272+{
24273+ local_lock(local_softirq_locks[which]);
24274+}
24275+
24276+static void unlock_softirq(int which)
24277+{
24278+ local_unlock(local_softirq_locks[which]);
24279+}
24280+
24281+static void do_single_softirq(int which)
24282+{
24283+ unsigned long old_flags = current->flags;
24284+
24285+ current->flags &= ~PF_MEMALLOC;
24286+ vtime_account_irq_enter(current);
24287+ current->flags |= PF_IN_SOFTIRQ;
24288+ lockdep_softirq_enter();
24289+ local_irq_enable();
24290+ handle_softirq(which);
24291+ local_irq_disable();
24292+ lockdep_softirq_exit();
24293+ current->flags &= ~PF_IN_SOFTIRQ;
24294+ vtime_account_irq_enter(current);
24295+ current_restore_flags(old_flags, PF_MEMALLOC);
24296+}
24297+
24298+/*
24299+ * Called with interrupts disabled. Process softirqs which were raised
24300+ * in current context (or on behalf of ksoftirqd).
24301+ */
24302+static void do_current_softirqs(void)
24303+{
24304+ while (current->softirqs_raised) {
24305+ int i = __ffs(current->softirqs_raised);
24306+ unsigned int pending, mask = (1U << i);
24307+
24308+ current->softirqs_raised &= ~mask;
24309+ local_irq_enable();
24310+
24311+ /*
24312+ * If the lock is contended, we boost the owner to
24313+ * process the softirq or leave the critical section
24314+ * now.
24315+ */
24316+ lock_softirq(i);
24317+ local_irq_disable();
24318+ softirq_set_runner(i);
24319+ /*
24320+ * Check the local_softirq_pending() bits to see
24321+ * whether we still need to process this or if someone
24322+ * else already took care of it.
24323+ */
24324+ pending = local_softirq_pending();
24325+ if (pending & mask) {
24326+ set_softirq_pending(pending & ~mask);
24327+ do_single_softirq(i);
24328+ }
24329+ softirq_clr_runner(i);
24330+ WARN_ON(current->softirq_nestcnt != 1);
24331+ local_irq_enable();
24332+ unlock_softirq(i);
24333+ local_irq_disable();
24334+ }
24335+}
24336+
24337+void __local_bh_disable(void)
24338+{
24339+ if (++current->softirq_nestcnt == 1)
24340+ migrate_disable();
24341+}
24342+EXPORT_SYMBOL(__local_bh_disable);
24343+
24344+void __local_bh_enable(void)
24345+{
24346+ if (WARN_ON(current->softirq_nestcnt == 0))
24347+ return;
24348+
24349+ local_irq_disable();
24350+ if (current->softirq_nestcnt == 1 && current->softirqs_raised)
24351+ do_current_softirqs();
24352+ local_irq_enable();
24353+
24354+ if (--current->softirq_nestcnt == 0)
24355+ migrate_enable();
24356+}
24357+EXPORT_SYMBOL(__local_bh_enable);
24358+
24359+void _local_bh_enable(void)
24360+{
24361+ if (WARN_ON(current->softirq_nestcnt == 0))
24362+ return;
24363+ if (--current->softirq_nestcnt == 0)
24364+ migrate_enable();
24365+}
24366+EXPORT_SYMBOL(_local_bh_enable);
24367+
24368+int in_serving_softirq(void)
24369+{
24370+ return current->flags & PF_IN_SOFTIRQ;
24371+}
24372+EXPORT_SYMBOL(in_serving_softirq);
24373+
24374+/* Called with preemption disabled */
24375+static void run_ksoftirqd(unsigned int cpu)
24376+{
24377+ local_irq_disable();
24378+ current->softirq_nestcnt++;
24379+
24380+ do_current_softirqs();
24381+ current->softirq_nestcnt--;
24382+ local_irq_enable();
24383+ cond_resched_rcu_qs();
24384+}
24385+
24386+/*
24387+ * Called from netif_rx_ni(). Preemption enabled, but migration
24388+ * disabled. So the cpu can't go away under us.
24389+ */
24390+void thread_do_softirq(void)
24391+{
24392+ if (!in_serving_softirq() && current->softirqs_raised) {
24393+ current->softirq_nestcnt++;
24394+ do_current_softirqs();
24395+ current->softirq_nestcnt--;
24396+ }
24397+}
24398+
24399+static void do_raise_softirq_irqoff(unsigned int nr)
24400+{
24401+ unsigned int mask;
24402+
24403+ mask = 1UL << nr;
24404+
24405+ trace_softirq_raise(nr);
24406+ or_softirq_pending(mask);
24407+
24408+ /*
24409+ * If we are not in a hard interrupt but are inside a bh disabled
24410+ * region, we simply raise the flag on current. local_bh_enable()
24411+ * will make sure that the softirq is executed. Otherwise we
24412+ * delegate it to ksoftirqd.
24413+ */
24414+ if (!in_irq() && current->softirq_nestcnt)
24415+ current->softirqs_raised |= mask;
24416+ else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
24417+ return;
24418+
24419+ if (mask & TIMER_SOFTIRQS)
24420+ __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24421+ else
24422+ __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24423+}
24424+
24425+static void wakeup_proper_softirq(unsigned int nr)
24426+{
24427+ if ((1UL << nr) & TIMER_SOFTIRQS)
24428+ wakeup_timer_softirqd();
24429+ else
24430+ wakeup_softirqd();
24431+}
24432+
24433+void __raise_softirq_irqoff(unsigned int nr)
24434+{
24435+ do_raise_softirq_irqoff(nr);
24436+ if (!in_irq() && !current->softirq_nestcnt)
24437+ wakeup_proper_softirq(nr);
24438+}
24439+
24440+/*
24441+ * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
24442+ */
24443+void __raise_softirq_irqoff_ksoft(unsigned int nr)
24444+{
24445+ unsigned int mask;
24446+
24447+ if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
24448+ !__this_cpu_read(ktimer_softirqd)))
24449+ return;
24450+ mask = 1UL << nr;
24451+
24452+ trace_softirq_raise(nr);
24453+ or_softirq_pending(mask);
24454+ if (mask & TIMER_SOFTIRQS)
24455+ __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24456+ else
24457+ __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24458+ wakeup_proper_softirq(nr);
24459+}
24460+
24461+/*
24462+ * This function must run with irqs disabled!
24463+ */
24464+void raise_softirq_irqoff(unsigned int nr)
24465+{
24466+ do_raise_softirq_irqoff(nr);
24467+
24468+ /*
24469+ * If we're in a hard interrupt we let the irq return code deal
24470+ * with the wakeup of ksoftirqd.
24471+ */
24472+ if (in_irq())
24473+ return;
24474+ /*
24475+ * If we are in thread context but outside of a bh disabled
24476+ * region, we need to wake ksoftirqd as well.
24477+ *
24478+ * CHECKME: Some of the places which do that could be wrapped
24479+ * into local_bh_disable/enable pairs. Though it's unclear
24480+ * whether this is worth the effort. To find those places just
24481+ * raise a WARN() if the condition is met.
24482+ */
24483+ if (!current->softirq_nestcnt)
24484+ wakeup_proper_softirq(nr);
24485+}
24486+
24487+static inline int ksoftirqd_softirq_pending(void)
24488+{
24489+ return current->softirqs_raised;
24490+}
24491+
24492+static inline void local_bh_disable_nort(void) { }
24493+static inline void _local_bh_enable_nort(void) { }
24494+
24495+static inline void ksoftirqd_set_sched_params(unsigned int cpu)
24496+{
24497+ /* Take over all pending softirqs except the timer softirqs when starting */
24498+ local_irq_disable();
24499+ current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
24500+ local_irq_enable();
24501+}
24502+
24503+static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
24504+{
24505+ struct sched_param param = { .sched_priority = 1 };
24506+
24507+ sched_setscheduler(current, SCHED_FIFO, &param);
24508+
24509+ /* Take over timer pending softirqs when starting */
24510+ local_irq_disable();
24511+ current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
24512+ local_irq_enable();
24513+}
24514+
24515+static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
24516+ bool online)
24517+{
24518+ struct sched_param param = { .sched_priority = 0 };
24519+
24520+ sched_setscheduler(current, SCHED_NORMAL, &param);
24521+}
24522+
24523+static int ktimer_softirqd_should_run(unsigned int cpu)
24524+{
24525+ return current->softirqs_raised;
24526+}
24527+
24528+#endif /* PREEMPT_RT_FULL */
24529 /*
24530 * Enter an interrupt context.
24531 */
24532@@ -345,9 +789,9 @@ void irq_enter(void)
24533 * Prevent raise_softirq from needlessly waking up ksoftirqd
24534 * here, as softirq will be serviced on return from interrupt.
24535 */
24536- local_bh_disable();
24537+ local_bh_disable_nort();
24538 tick_irq_enter();
24539- _local_bh_enable();
24540+ _local_bh_enable_nort();
24541 }
24542
24543 __irq_enter();
24544@@ -355,6 +799,7 @@ void irq_enter(void)
24545
24546 static inline void invoke_softirq(void)
24547 {
24548+#ifndef CONFIG_PREEMPT_RT_FULL
24549 if (ksoftirqd_running(local_softirq_pending()))
24550 return;
24551
24552@@ -377,6 +822,18 @@ static inline void invoke_softirq(void)
24553 } else {
24554 wakeup_softirqd();
24555 }
24556+#else /* PREEMPT_RT_FULL */
24557+ unsigned long flags;
24558+
24559+ local_irq_save(flags);
24560+ if (__this_cpu_read(ksoftirqd) &&
24561+ __this_cpu_read(ksoftirqd)->softirqs_raised)
24562+ wakeup_softirqd();
24563+ if (__this_cpu_read(ktimer_softirqd) &&
24564+ __this_cpu_read(ktimer_softirqd)->softirqs_raised)
24565+ wakeup_timer_softirqd();
24566+ local_irq_restore(flags);
24567+#endif
24568 }
24569
24570 static inline void tick_irq_exit(void)
24571@@ -385,7 +842,8 @@ static inline void tick_irq_exit(void)
24572 int cpu = smp_processor_id();
24573
24574 /* Make sure that timer wheel updates are propagated */
24575- if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
24576+ if ((idle_cpu(cpu) || tick_nohz_full_cpu(cpu)) &&
24577+ !need_resched() && !local_softirq_pending()) {
24578 if (!in_irq())
24579 tick_nohz_irq_exit();
24580 }
24581@@ -413,26 +871,6 @@ void irq_exit(void)
24582 trace_hardirq_exit(); /* must be last! */
24583 }
24584
24585-/*
24586- * This function must run with irqs disabled!
24587- */
24588-inline void raise_softirq_irqoff(unsigned int nr)
24589-{
24590- __raise_softirq_irqoff(nr);
24591-
24592- /*
24593- * If we're in an interrupt or softirq, we're done
24594- * (this also catches softirq-disabled code). We will
24595- * actually run the softirq once we return from
24596- * the irq or softirq.
24597- *
24598- * Otherwise we wake up ksoftirqd to make sure we
24599- * schedule the softirq soon.
24600- */
24601- if (!in_interrupt())
24602- wakeup_softirqd();
24603-}
24604-
24605 void raise_softirq(unsigned int nr)
24606 {
24607 unsigned long flags;
24608@@ -442,12 +880,6 @@ void raise_softirq(unsigned int nr)
24609 local_irq_restore(flags);
24610 }
24611
24612-void __raise_softirq_irqoff(unsigned int nr)
24613-{
24614- trace_softirq_raise(nr);
24615- or_softirq_pending(1UL << nr);
24616-}
24617-
24618 void open_softirq(int nr, void (*action)(struct softirq_action *))
24619 {
24620 softirq_vec[nr].action = action;
24621@@ -464,15 +896,45 @@ struct tasklet_head {
24622 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
24623 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
24624
24625+static inline void
24626+__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
24627+{
24628+ if (tasklet_trylock(t)) {
24629+again:
24630+ /* We may have been preempted before tasklet_trylock
24631+ * and __tasklet_action may have already run.
24632+ * So double check the sched bit while the tasklet
24633+ * is locked before adding it to the list.
24634+ */
24635+ if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
24636+ t->next = NULL;
24637+ *head->tail = t;
24638+ head->tail = &(t->next);
24639+ raise_softirq_irqoff(nr);
24640+ tasklet_unlock(t);
24641+ } else {
24642+ /* This is subtle. If we hit the corner case above,
24643+ * it is possible that we get preempted right here,
24644+ * and another task has successfully called
24645+ * tasklet_schedule(), then this function, and
24646+ * failed on the trylock. Thus we must be sure
24647+ * before releasing the tasklet lock, that the
24648+ * SCHED_BIT is clear. Otherwise the tasklet
24649+ * may get its SCHED_BIT set, but not added to the
24650+ * list
24651+ */
24652+ if (!tasklet_tryunlock(t))
24653+ goto again;
24654+ }
24655+ }
24656+}
24657+
24658 void __tasklet_schedule(struct tasklet_struct *t)
24659 {
24660 unsigned long flags;
24661
24662 local_irq_save(flags);
24663- t->next = NULL;
24664- *__this_cpu_read(tasklet_vec.tail) = t;
24665- __this_cpu_write(tasklet_vec.tail, &(t->next));
24666- raise_softirq_irqoff(TASKLET_SOFTIRQ);
24667+ __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
24668 local_irq_restore(flags);
24669 }
24670 EXPORT_SYMBOL(__tasklet_schedule);
24671@@ -482,50 +944,108 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
24672 unsigned long flags;
24673
24674 local_irq_save(flags);
24675- t->next = NULL;
24676- *__this_cpu_read(tasklet_hi_vec.tail) = t;
24677- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24678- raise_softirq_irqoff(HI_SOFTIRQ);
24679+ __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
24680 local_irq_restore(flags);
24681 }
24682 EXPORT_SYMBOL(__tasklet_hi_schedule);
24683
24684-static __latent_entropy void tasklet_action(struct softirq_action *a)
24685+void tasklet_enable(struct tasklet_struct *t)
24686 {
24687- struct tasklet_struct *list;
24688+ if (!atomic_dec_and_test(&t->count))
24689+ return;
24690+ if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
24691+ tasklet_schedule(t);
24692+}
24693+EXPORT_SYMBOL(tasklet_enable);
24694
24695- local_irq_disable();
24696- list = __this_cpu_read(tasklet_vec.head);
24697- __this_cpu_write(tasklet_vec.head, NULL);
24698- __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24699- local_irq_enable();
24700+static void __tasklet_action(struct softirq_action *a,
24701+ struct tasklet_struct *list)
24702+{
24703+ int loops = 1000000;
24704
24705 while (list) {
24706 struct tasklet_struct *t = list;
24707
24708 list = list->next;
24709
24710- if (tasklet_trylock(t)) {
24711- if (!atomic_read(&t->count)) {
24712- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24713- &t->state))
24714- BUG();
24715- t->func(t->data);
24716- tasklet_unlock(t);
24717- continue;
24718- }
24719- tasklet_unlock(t);
24720+ /*
24721+ * Should always succeed - after a tasklet got on the
24722+ * list (after getting the SCHED bit set from 0 to 1),
24723+ * nothing but the tasklet softirq it got queued to can
24724+ * lock it:
24725+ */
24726+ if (!tasklet_trylock(t)) {
24727+ WARN_ON(1);
24728+ continue;
24729 }
24730
24731- local_irq_disable();
24732 t->next = NULL;
24733- *__this_cpu_read(tasklet_vec.tail) = t;
24734- __this_cpu_write(tasklet_vec.tail, &(t->next));
24735- __raise_softirq_irqoff(TASKLET_SOFTIRQ);
24736- local_irq_enable();
24737+
24738+ /*
24739+ * If we cannot handle the tasklet because it's disabled,
24740+ * mark it as pending. tasklet_enable() will later
24741+ * re-schedule the tasklet.
24742+ */
24743+ if (unlikely(atomic_read(&t->count))) {
24744+out_disabled:
24745+ /* implicit unlock: */
24746+ wmb();
24747+ t->state = TASKLET_STATEF_PENDING;
24748+ continue;
24749+ }
24750+
24751+ /*
24752+ * From this point on the tasklet might be rescheduled
24753+ * on another CPU, but it can only be added to another
24754+ * CPU's tasklet list if we unlock the tasklet (which we
24755+ * don't do yet).
24756+ */
24757+ if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24758+ WARN_ON(1);
24759+
24760+again:
24761+ t->func(t->data);
24762+
24763+ /*
24764+ * Try to unlock the tasklet. We must use cmpxchg, because
24765+ * another CPU might have scheduled or disabled the tasklet.
24766+ * We only allow the STATE_RUN -> 0 transition here.
24767+ */
24768+ while (!tasklet_tryunlock(t)) {
24769+ /*
24770+ * If it got disabled meanwhile, bail out:
24771+ */
24772+ if (atomic_read(&t->count))
24773+ goto out_disabled;
24774+ /*
24775+ * If it got scheduled meanwhile, re-execute
24776+ * the tasklet function:
24777+ */
24778+ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24779+ goto again;
24780+ if (!--loops) {
24781+ printk("hm, tasklet state: %08lx\n", t->state);
24782+ WARN_ON(1);
24783+ tasklet_unlock(t);
24784+ break;
24785+ }
24786+ }
24787 }
24788 }
24789
24790+static __latent_entropy void tasklet_action(struct softirq_action *a)
24791+{
24792+ struct tasklet_struct *list;
24793+
24794+ local_irq_disable();
24795+ list = __this_cpu_read(tasklet_vec.head);
24796+ __this_cpu_write(tasklet_vec.head, NULL);
24797+ __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24798+ local_irq_enable();
24799+
24800+ __tasklet_action(a, list);
24801+}
24802+
24803 static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
24804 {
24805 struct tasklet_struct *list;
24806@@ -536,30 +1056,7 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
24807 __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
24808 local_irq_enable();
24809
24810- while (list) {
24811- struct tasklet_struct *t = list;
24812-
24813- list = list->next;
24814-
24815- if (tasklet_trylock(t)) {
24816- if (!atomic_read(&t->count)) {
24817- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24818- &t->state))
24819- BUG();
24820- t->func(t->data);
24821- tasklet_unlock(t);
24822- continue;
24823- }
24824- tasklet_unlock(t);
24825- }
24826-
24827- local_irq_disable();
24828- t->next = NULL;
24829- *__this_cpu_read(tasklet_hi_vec.tail) = t;
24830- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24831- __raise_softirq_irqoff(HI_SOFTIRQ);
24832- local_irq_enable();
24833- }
24834+ __tasklet_action(a, list);
24835 }
24836
24837 void tasklet_init(struct tasklet_struct *t,
24838@@ -580,7 +1077,7 @@ void tasklet_kill(struct tasklet_struct *t)
24839
24840 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
24841 do {
24842- yield();
24843+ msleep(1);
24844 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
24845 }
24846 tasklet_unlock_wait(t);
24847@@ -588,57 +1085,6 @@ void tasklet_kill(struct tasklet_struct *t)
24848 }
24849 EXPORT_SYMBOL(tasklet_kill);
24850
24851-/*
24852- * tasklet_hrtimer
24853- */
24854-
24855-/*
24856- * The trampoline is called when the hrtimer expires. It schedules a tasklet
24857- * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
24858- * hrtimer callback, but from softirq context.
24859- */
24860-static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
24861-{
24862- struct tasklet_hrtimer *ttimer =
24863- container_of(timer, struct tasklet_hrtimer, timer);
24864-
24865- tasklet_hi_schedule(&ttimer->tasklet);
24866- return HRTIMER_NORESTART;
24867-}
24868-
24869-/*
24870- * Helper function which calls the hrtimer callback from
24871- * tasklet/softirq context
24872- */
24873-static void __tasklet_hrtimer_trampoline(unsigned long data)
24874-{
24875- struct tasklet_hrtimer *ttimer = (void *)data;
24876- enum hrtimer_restart restart;
24877-
24878- restart = ttimer->function(&ttimer->timer);
24879- if (restart != HRTIMER_NORESTART)
24880- hrtimer_restart(&ttimer->timer);
24881-}
24882-
24883-/**
24884- * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
24885- * @ttimer: tasklet_hrtimer which is initialized
24886- * @function: hrtimer callback function which gets called from softirq context
24887- * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
24888- * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
24889- */
24890-void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
24891- enum hrtimer_restart (*function)(struct hrtimer *),
24892- clockid_t which_clock, enum hrtimer_mode mode)
24893-{
24894- hrtimer_init(&ttimer->timer, which_clock, mode);
24895- ttimer->timer.function = __hrtimer_tasklet_trampoline;
24896- tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
24897- (unsigned long)ttimer);
24898- ttimer->function = function;
24899-}
24900-EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
24901-
24902 void __init softirq_init(void)
24903 {
24904 int cpu;
24905@@ -654,25 +1100,26 @@ void __init softirq_init(void)
24906 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
24907 }
24908
24909-static int ksoftirqd_should_run(unsigned int cpu)
24910-{
24911- return local_softirq_pending();
24912-}
24913-
24914-static void run_ksoftirqd(unsigned int cpu)
24915+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
24916+void tasklet_unlock_wait(struct tasklet_struct *t)
24917 {
24918- local_irq_disable();
24919- if (local_softirq_pending()) {
24920+ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
24921 /*
24922- * We can safely run softirq on inline stack, as we are not deep
24923- * in the task stack here.
24924+ * Hack for now to avoid this busy-loop:
24925 */
24926- __do_softirq();
24927- local_irq_enable();
24928- cond_resched_rcu_qs();
24929- return;
24930+#ifdef CONFIG_PREEMPT_RT_FULL
24931+ msleep(1);
24932+#else
24933+ barrier();
24934+#endif
24935 }
24936- local_irq_enable();
24937+}
24938+EXPORT_SYMBOL(tasklet_unlock_wait);
24939+#endif
24940+
24941+static int ksoftirqd_should_run(unsigned int cpu)
24942+{
24943+ return ksoftirqd_softirq_pending();
24944 }
24945
24946 #ifdef CONFIG_HOTPLUG_CPU
24947@@ -739,17 +1186,31 @@ static int takeover_tasklets(unsigned int cpu)
24948
24949 static struct smp_hotplug_thread softirq_threads = {
24950 .store = &ksoftirqd,
24951+ .setup = ksoftirqd_set_sched_params,
24952 .thread_should_run = ksoftirqd_should_run,
24953 .thread_fn = run_ksoftirqd,
24954 .thread_comm = "ksoftirqd/%u",
24955 };
24956
24957+#ifdef CONFIG_PREEMPT_RT_FULL
24958+static struct smp_hotplug_thread softirq_timer_threads = {
24959+ .store = &ktimer_softirqd,
24960+ .setup = ktimer_softirqd_set_sched_params,
24961+ .cleanup = ktimer_softirqd_clr_sched_params,
24962+ .thread_should_run = ktimer_softirqd_should_run,
24963+ .thread_fn = run_ksoftirqd,
24964+ .thread_comm = "ktimersoftd/%u",
24965+};
24966+#endif
24967+
24968 static __init int spawn_ksoftirqd(void)
24969 {
24970 cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
24971 takeover_tasklets);
24972 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
24973-
24974+#ifdef CONFIG_PREEMPT_RT_FULL
24975+ BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
24976+#endif
24977 return 0;
24978 }
24979 early_initcall(spawn_ksoftirqd);
24980diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
24981index 067cb83f37ea..56f2f2e01229 100644
24982--- a/kernel/stop_machine.c
24983+++ b/kernel/stop_machine.c
24984@@ -503,6 +503,8 @@ static void cpu_stopper_thread(unsigned int cpu)
24985 struct cpu_stop_done *done = work->done;
24986 int ret;
24987
24988+ /* XXX */
24989+
24990 /* cpu stop callbacks must not sleep, make in_atomic() == T */
24991 preempt_count_inc();
24992 ret = fn(arg);
24993diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
24994index fa5de5e8de61..6020ee66e517 100644
24995--- a/kernel/time/alarmtimer.c
24996+++ b/kernel/time/alarmtimer.c
24997@@ -436,7 +436,7 @@ int alarm_cancel(struct alarm *alarm)
24998 int ret = alarm_try_to_cancel(alarm);
24999 if (ret >= 0)
25000 return ret;
25001- cpu_relax();
25002+ hrtimer_wait_for_timer(&alarm->timer);
25003 }
25004 }
25005 EXPORT_SYMBOL_GPL(alarm_cancel);
25006diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
25007index d00e85ac10d6..b59e009087a9 100644
25008--- a/kernel/time/hrtimer.c
25009+++ b/kernel/time/hrtimer.c
25010@@ -59,6 +59,15 @@
25011
25012 #include "tick-internal.h"
25013
25014+/*
25015+ * Masks for selecting the soft and hard context timers from
25016+ * cpu_base->active
25017+ */
25018+#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT)
25019+#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1)
25020+#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
25021+#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
25022+
25023 /*
25024 * The timer bases:
25025 *
25026@@ -70,7 +79,6 @@
25027 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
25028 {
25029 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
25030- .seq = SEQCNT_ZERO(hrtimer_bases.seq),
25031 .clock_base =
25032 {
25033 {
25034@@ -93,6 +101,26 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
25035 .clockid = CLOCK_TAI,
25036 .get_time = &ktime_get_clocktai,
25037 },
25038+ {
25039+ .index = HRTIMER_BASE_MONOTONIC_SOFT,
25040+ .clockid = CLOCK_MONOTONIC,
25041+ .get_time = &ktime_get,
25042+ },
25043+ {
25044+ .index = HRTIMER_BASE_REALTIME_SOFT,
25045+ .clockid = CLOCK_REALTIME,
25046+ .get_time = &ktime_get_real,
25047+ },
25048+ {
25049+ .index = HRTIMER_BASE_BOOTTIME_SOFT,
25050+ .clockid = CLOCK_BOOTTIME,
25051+ .get_time = &ktime_get_boottime,
25052+ },
25053+ {
25054+ .index = HRTIMER_BASE_TAI_SOFT,
25055+ .clockid = CLOCK_TAI,
25056+ .get_time = &ktime_get_clocktai,
25057+ },
25058 }
25059 };
25060
25061@@ -118,7 +146,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
25062 * timer->base->cpu_base
25063 */
25064 static struct hrtimer_cpu_base migration_cpu_base = {
25065- .seq = SEQCNT_ZERO(migration_cpu_base),
25066 .clock_base = { { .cpu_base = &migration_cpu_base, }, },
25067 };
25068
25069@@ -156,45 +183,33 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
25070 }
25071
25072 /*
25073- * With HIGHRES=y we do not migrate the timer when it is expiring
25074- * before the next event on the target cpu because we cannot reprogram
25075- * the target cpu hardware and we would cause it to fire late.
25076+ * We do not migrate the timer when it is expiring before the next
25077+ * event on the target cpu. When high resolution is enabled, we cannot
25078+ * reprogram the target cpu hardware and we would cause it to fire
25079+ * late. To keep it simple, we handle the high resolution enabled and
25080+ * disabled cases similarly.
25081 *
25082 * Called with cpu_base->lock of target cpu held.
25083 */
25084 static int
25085 hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
25086 {
25087-#ifdef CONFIG_HIGH_RES_TIMERS
25088 ktime_t expires;
25089
25090- if (!new_base->cpu_base->hres_active)
25091- return 0;
25092-
25093 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
25094- return expires <= new_base->cpu_base->expires_next;
25095-#else
25096- return 0;
25097-#endif
25098+ return expires < new_base->cpu_base->expires_next;
25099 }
25100
25101-#ifdef CONFIG_NO_HZ_COMMON
25102-static inline
25103-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
25104- int pinned)
25105-{
25106- if (pinned || !base->migration_enabled)
25107- return base;
25108- return &per_cpu(hrtimer_bases, get_nohz_timer_target());
25109-}
25110-#else
25111 static inline
25112 struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
25113 int pinned)
25114 {
25115+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
25116+ if (static_branch_unlikely(&timers_migration_enabled) && !pinned)
25117+ return &per_cpu(hrtimer_bases, get_nohz_timer_target());
25118+#endif
25119 return base;
25120 }
25121-#endif
25122
25123 /*
25124 * We switch the timer base to a power-optimized selected CPU target,
25125@@ -396,7 +411,8 @@ static inline void debug_hrtimer_init(struct hrtimer *timer)
25126 debug_object_init(timer, &hrtimer_debug_descr);
25127 }
25128
25129-static inline void debug_hrtimer_activate(struct hrtimer *timer)
25130+static inline void debug_hrtimer_activate(struct hrtimer *timer,
25131+ enum hrtimer_mode mode)
25132 {
25133 debug_object_activate(timer, &hrtimer_debug_descr);
25134 }
25135@@ -429,8 +445,10 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer)
25136 EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
25137
25138 #else
25139+
25140 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
25141-static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
25142+static inline void debug_hrtimer_activate(struct hrtimer *timer,
25143+ enum hrtimer_mode mode) { }
25144 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
25145 #endif
25146
25147@@ -442,10 +460,11 @@ debug_init(struct hrtimer *timer, clockid_t clockid,
25148 trace_hrtimer_init(timer, clockid, mode);
25149 }
25150
25151-static inline void debug_activate(struct hrtimer *timer)
25152+static inline void debug_activate(struct hrtimer *timer,
25153+ enum hrtimer_mode mode)
25154 {
25155- debug_hrtimer_activate(timer);
25156- trace_hrtimer_start(timer);
25157+ debug_hrtimer_activate(timer, mode);
25158+ trace_hrtimer_start(timer, mode);
25159 }
25160
25161 static inline void debug_deactivate(struct hrtimer *timer)
25162@@ -454,35 +473,43 @@ static inline void debug_deactivate(struct hrtimer *timer)
25163 trace_hrtimer_cancel(timer);
25164 }
25165
25166-#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
25167-static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
25168- struct hrtimer *timer)
25169+static struct hrtimer_clock_base *
25170+__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
25171 {
25172-#ifdef CONFIG_HIGH_RES_TIMERS
25173- cpu_base->next_timer = timer;
25174-#endif
25175+ unsigned int idx;
25176+
25177+ if (!*active)
25178+ return NULL;
25179+
25180+ idx = __ffs(*active);
25181+ *active &= ~(1U << idx);
25182+
25183+ return &cpu_base->clock_base[idx];
25184 }
25185
25186-static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
25187+#define for_each_active_base(base, cpu_base, active) \
25188+ while ((base = __next_base((cpu_base), &(active))))
25189+
25190+static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
25191+ unsigned int active,
25192+ ktime_t expires_next)
25193 {
25194- struct hrtimer_clock_base *base = cpu_base->clock_base;
25195- unsigned int active = cpu_base->active_bases;
25196- ktime_t expires, expires_next = KTIME_MAX;
25197+ struct hrtimer_clock_base *base;
25198+ ktime_t expires;
25199
25200- hrtimer_update_next_timer(cpu_base, NULL);
25201- for (; active; base++, active >>= 1) {
25202+ for_each_active_base(base, cpu_base, active) {
25203 struct timerqueue_node *next;
25204 struct hrtimer *timer;
25205
25206- if (!(active & 0x01))
25207- continue;
25208-
25209 next = timerqueue_getnext(&base->active);
25210 timer = container_of(next, struct hrtimer, node);
25211 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
25212 if (expires < expires_next) {
25213 expires_next = expires;
25214- hrtimer_update_next_timer(cpu_base, timer);
25215+ if (timer->is_soft)
25216+ cpu_base->softirq_next_timer = timer;
25217+ else
25218+ cpu_base->next_timer = timer;
25219 }
25220 }
25221 /*
25222@@ -494,7 +521,47 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
25223 expires_next = 0;
25224 return expires_next;
25225 }
25226-#endif
25227+
25228+/*
25229+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
25230+ * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
25231+ *
25232+ * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases;
25233+ * those timers will get run whenever the softirq gets handled. At the end of
25234+ * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
25235+ *
25236+ * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
25237+ * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
25238+ * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
25239+ *
25240+ * @active_mask must be one of:
25241+ * - HRTIMER_ACTIVE_ALL,
25242+ * - HRTIMER_ACTIVE_SOFT, or
25243+ * - HRTIMER_ACTIVE_HARD.
25244+ */
25245+static ktime_t
25246+__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
25247+{
25248+ unsigned int active;
25249+ struct hrtimer *next_timer = NULL;
25250+ ktime_t expires_next = KTIME_MAX;
25251+
25252+ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
25253+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
25254+ cpu_base->softirq_next_timer = NULL;
25255+ expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
25256+
25257+ next_timer = cpu_base->softirq_next_timer;
25258+ }
25259+
25260+ if (active_mask & HRTIMER_ACTIVE_HARD) {
25261+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
25262+ cpu_base->next_timer = next_timer;
25263+ expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
25264+ }
25265+
25266+ return expires_next;
25267+}
25268
25269 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
25270 {
25271@@ -502,36 +569,14 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
25272 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
25273 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
25274
25275- return ktime_get_update_offsets_now(&base->clock_was_set_seq,
25276+ ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
25277 offs_real, offs_boot, offs_tai);
25278-}
25279-
25280-/* High resolution timer related functions */
25281-#ifdef CONFIG_HIGH_RES_TIMERS
25282-
25283-/*
25284- * High resolution timer enabled ?
25285- */
25286-static bool hrtimer_hres_enabled __read_mostly = true;
25287-unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
25288-EXPORT_SYMBOL_GPL(hrtimer_resolution);
25289-
25290-/*
25291- * Enable / Disable high resolution mode
25292- */
25293-static int __init setup_hrtimer_hres(char *str)
25294-{
25295- return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
25296-}
25297
25298-__setup("highres=", setup_hrtimer_hres);
25299+ base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
25300+ base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
25301+ base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
25302
25303-/*
25304- * hrtimer_high_res_enabled - query, if the highres mode is enabled
25305- */
25306-static inline int hrtimer_is_hres_enabled(void)
25307-{
25308- return hrtimer_hres_enabled;
25309+ return now;
25310 }
25311
25312 /*
25313@@ -539,7 +584,8 @@ static inline int hrtimer_is_hres_enabled(void)
25314 */
25315 static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
25316 {
25317- return cpu_base->hres_active;
25318+ return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
25319+ cpu_base->hres_active : 0;
25320 }
25321
25322 static inline int hrtimer_hres_active(void)
25323@@ -557,10 +603,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
25324 {
25325 ktime_t expires_next;
25326
25327- if (!cpu_base->hres_active)
25328- return;
25329+ /*
25330+ * Find the current next expiration time.
25331+ */
25332+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
25333
25334- expires_next = __hrtimer_get_next_event(cpu_base);
25335+ if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
25336+ /*
25337+ * When the softirq is activated, the timer hardware has to be
25338+ * programmed with the first hard hrtimer because the soft
25339+ * timer interrupt could occur too late.
25340+ */
25341+ if (cpu_base->softirq_activated)
25342+ expires_next = __hrtimer_get_next_event(cpu_base,
25343+ HRTIMER_ACTIVE_HARD);
25344+ else
25345+ cpu_base->softirq_expires_next = expires_next;
25346+ }
25347
25348 if (skip_equal && expires_next == cpu_base->expires_next)
25349 return;
25350@@ -568,6 +627,9 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
25351 cpu_base->expires_next = expires_next;
25352
25353 /*
25354+ * If hres is not active, hardware does not have to be
25355+ * reprogrammed yet.
25356+ *
25357 * If a hang was detected in the last timer interrupt then we
25358 * leave the hang delay active in the hardware. We want the
25359 * system to make progress. That also prevents the following
25360@@ -581,83 +643,38 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
25361 * set. So we'd effectivly block all timers until the T2 event
25362 * fires.
25363 */
25364- if (cpu_base->hang_detected)
25365+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
25366 return;
25367
25368 tick_program_event(cpu_base->expires_next, 1);
25369 }
25370
25371+/* High resolution timer related functions */
25372+#ifdef CONFIG_HIGH_RES_TIMERS
25373+
25374 /*
25375- * When a timer is enqueued and expires earlier than the already enqueued
25376- * timers, we have to check, whether it expires earlier than the timer for
25377- * which the clock event device was armed.
25378- *
25379- * Called with interrupts disabled and base->cpu_base.lock held
25380+ * High resolution timer enabled ?
25381 */
25382-static void hrtimer_reprogram(struct hrtimer *timer,
25383- struct hrtimer_clock_base *base)
25384-{
25385- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25386- ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
25387-
25388- WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
25389-
25390- /*
25391- * If the timer is not on the current cpu, we cannot reprogram
25392- * the other cpus clock event device.
25393- */
25394- if (base->cpu_base != cpu_base)
25395- return;
25396-
25397- /*
25398- * If the hrtimer interrupt is running, then it will
25399- * reevaluate the clock bases and reprogram the clock event
25400- * device. The callbacks are always executed in hard interrupt
25401- * context so we don't need an extra check for a running
25402- * callback.
25403- */
25404- if (cpu_base->in_hrtirq)
25405- return;
25406-
25407- /*
25408- * CLOCK_REALTIME timer might be requested with an absolute
25409- * expiry time which is less than base->offset. Set it to 0.
25410- */
25411- if (expires < 0)
25412- expires = 0;
25413-
25414- if (expires >= cpu_base->expires_next)
25415- return;
25416-
25417- /* Update the pointer to the next expiring timer */
25418- cpu_base->next_timer = timer;
25419-
25420- /*
25421- * If a hang was detected in the last timer interrupt then we
25422- * do not schedule a timer which is earlier than the expiry
25423- * which we enforced in the hang detection. We want the system
25424- * to make progress.
25425- */
25426- if (cpu_base->hang_detected)
25427- return;
25428+static bool hrtimer_hres_enabled __read_mostly = true;
25429+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
25430+EXPORT_SYMBOL_GPL(hrtimer_resolution);
25431
25432- /*
25433- * Program the timer hardware. We enforce the expiry for
25434- * events which are already in the past.
25435- */
25436- cpu_base->expires_next = expires;
25437- tick_program_event(expires, 1);
25438+/*
25439+ * Enable / Disable high resolution mode
25440+ */
25441+static int __init setup_hrtimer_hres(char *str)
25442+{
25443+ return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
25444 }
25445
25446+__setup("highres=", setup_hrtimer_hres);
25447+
25448 /*
25449- * Initialize the high resolution related parts of cpu_base
25450+ * hrtimer_high_res_enabled - query, if the highres mode is enabled
25451 */
25452-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
25453+static inline int hrtimer_is_hres_enabled(void)
25454 {
25455- base->expires_next = KTIME_MAX;
25456- base->hang_detected = 0;
25457- base->hres_active = 0;
25458- base->next_timer = NULL;
25459+ return hrtimer_hres_enabled;
25460 }
25461
25462 /*
25463@@ -669,7 +686,7 @@ static void retrigger_next_event(void *arg)
25464 {
25465 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
25466
25467- if (!base->hres_active)
25468+ if (!__hrtimer_hres_active(base))
25469 return;
25470
25471 raw_spin_lock(&base->lock);
25472@@ -698,6 +715,29 @@ static void hrtimer_switch_to_hres(void)
25473 retrigger_next_event(NULL);
25474 }
25475
25476+#ifdef CONFIG_PREEMPT_RT_FULL
25477+
25478+static struct swork_event clock_set_delay_work;
25479+
25480+static void run_clock_set_delay(struct swork_event *event)
25481+{
25482+ clock_was_set();
25483+}
25484+
25485+void clock_was_set_delayed(void)
25486+{
25487+ swork_queue(&clock_set_delay_work);
25488+}
25489+
25490+static __init int create_clock_set_delay_thread(void)
25491+{
25492+ WARN_ON(swork_get());
25493+ INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
25494+ return 0;
25495+}
25496+early_initcall(create_clock_set_delay_thread);
25497+#else /* PREEMPT_RT_FULL */
25498+
25499 static void clock_was_set_work(struct work_struct *work)
25500 {
25501 clock_was_set();
25502@@ -713,25 +753,105 @@ void clock_was_set_delayed(void)
25503 {
25504 schedule_work(&hrtimer_work);
25505 }
25506+#endif
25507
25508 #else
25509
25510-static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
25511-static inline int hrtimer_hres_active(void) { return 0; }
25512 static inline int hrtimer_is_hres_enabled(void) { return 0; }
25513 static inline void hrtimer_switch_to_hres(void) { }
25514-static inline void
25515-hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
25516-static inline int hrtimer_reprogram(struct hrtimer *timer,
25517- struct hrtimer_clock_base *base)
25518-{
25519- return 0;
25520-}
25521-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
25522 static inline void retrigger_next_event(void *arg) { }
25523
25524 #endif /* CONFIG_HIGH_RES_TIMERS */
25525
25526+/*
25527+ * When a timer is enqueued and expires earlier than the already enqueued
25528+ * timers, we have to check whether it expires earlier than the timer for
25529+ * which the clock event device was armed.
25530+ *
25531+ * Called with interrupts disabled and base->cpu_base.lock held
25532+ */
25533+static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
25534+{
25535+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25536+ struct hrtimer_clock_base *base = timer->base;
25537+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
25538+
25539+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
25540+
25541+ /*
25542+ * CLOCK_REALTIME timer might be requested with an absolute
25543+ * expiry time which is less than base->offset. Set it to 0.
25544+ */
25545+ if (expires < 0)
25546+ expires = 0;
25547+
25548+ if (timer->is_soft) {
25549+ /*
25550+ * soft hrtimer could be started on a remote CPU. In this
25551+ * case softirq_expires_next needs to be updated on the
25552+ * remote CPU. The soft hrtimer will not expire before the
25553+ * first hard hrtimer on the remote CPU -
25554+ * hrtimer_check_target() prevents this case.
25555+ */
25556+ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
25557+
25558+ if (timer_cpu_base->softirq_activated)
25559+ return;
25560+
25561+ if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
25562+ return;
25563+
25564+ timer_cpu_base->softirq_next_timer = timer;
25565+ timer_cpu_base->softirq_expires_next = expires;
25566+
25567+ if (!ktime_before(expires, timer_cpu_base->expires_next) ||
25568+ !reprogram)
25569+ return;
25570+ }
25571+
25572+ /*
25573+ * If the timer is not on the current cpu, we cannot reprogram
25574+ * the other cpus clock event device.
25575+ */
25576+ if (base->cpu_base != cpu_base)
25577+ return;
25578+
25579+ /*
25580+ * If the hrtimer interrupt is running, then it will
25581+ * reevaluate the clock bases and reprogram the clock event
25582+ * device. The callbacks are always executed in hard interrupt
25583+ * context so we don't need an extra check for a running
25584+ * callback.
25585+ */
25586+ if (cpu_base->in_hrtirq)
25587+ return;
25588+
25589+ if (expires >= cpu_base->expires_next)
25590+ return;
25591+
25592+ /* Update the pointer to the next expiring timer */
25593+ cpu_base->next_timer = timer;
25594+ cpu_base->expires_next = expires;
25595+
25596+ /*
25597+ * If hres is not active, hardware does not have to be
25598+ * programmed yet.
25599+ *
25600+ * If a hang was detected in the last timer interrupt then we
25601+ * do not schedule a timer which is earlier than the expiry
25602+ * which we enforced in the hang detection. We want the system
25603+ * to make progress.
25604+ */
25605+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
25606+ return;
25607+
25608+ /*
25609+ * Program the timer hardware. We enforce the expiry for
25610+ * events which are already in the past.
25611+ */
25612+ tick_program_event(expires, 1);
25613+}
25614+
25615 /*
25616 * Clock realtime was set
25617 *
25618@@ -830,6 +950,33 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
25619 }
25620 EXPORT_SYMBOL_GPL(hrtimer_forward);
25621
25622+#ifdef CONFIG_PREEMPT_RT_BASE
25623+# define wake_up_timer_waiters(b) wake_up(&(b)->wait)
25624+
25625+/**
25626+ * hrtimer_wait_for_timer - Wait for a running timer
25627+ *
25628+ * @timer: timer to wait for
25629+ *
25630+ * The function waits on the timer base's waitqueue in case the
25631+ * timer's callback function is currently executing. The
25632+ * waitqueue is woken up after the timer callback function has
25633+ * finished execution.
25634+ */
25635+void hrtimer_wait_for_timer(const struct hrtimer *timer)
25636+{
25637+ struct hrtimer_clock_base *base = timer->base;
25638+
25639+ if (base && base->cpu_base &&
25640+ base->index >= HRTIMER_BASE_MONOTONIC_SOFT)
25641+ wait_event(base->cpu_base->wait,
25642+ !(hrtimer_callback_running(timer)));
25643+}
25644+
25645+#else
25646+# define wake_up_timer_waiters(b) do { } while (0)
25647+#endif
25648+
25649 /*
25650 * enqueue_hrtimer - internal function to (re)start a timer
25651 *
25652@@ -839,9 +986,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
25653 * Returns 1 when the new timer is the leftmost timer in the tree.
25654 */
25655 static int enqueue_hrtimer(struct hrtimer *timer,
25656- struct hrtimer_clock_base *base)
25657+ struct hrtimer_clock_base *base,
25658+ enum hrtimer_mode mode)
25659 {
25660- debug_activate(timer);
25661+ debug_activate(timer, mode);
25662
25663 base->cpu_base->active_bases |= 1 << base->index;
25664
25665@@ -874,7 +1022,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
25666 if (!timerqueue_del(&base->active, &timer->node))
25667 cpu_base->active_bases &= ~(1 << base->index);
25668
25669-#ifdef CONFIG_HIGH_RES_TIMERS
25670 /*
25671 * Note: If reprogram is false we do not update
25672 * cpu_base->next_timer. This happens when we remove the first
25673@@ -885,7 +1032,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
25674 */
25675 if (reprogram && timer == cpu_base->next_timer)
25676 hrtimer_force_reprogram(cpu_base, 1);
25677-#endif
25678 }
25679
25680 /*
25681@@ -934,22 +1080,36 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
25682 return tim;
25683 }
25684
25685-/**
25686- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
25687- * @timer: the timer to be added
25688- * @tim: expiry time
25689- * @delta_ns: "slack" range for the timer
25690- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
25691- * relative (HRTIMER_MODE_REL)
25692- */
25693-void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25694- u64 delta_ns, const enum hrtimer_mode mode)
25695+static void
25696+hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
25697 {
25698- struct hrtimer_clock_base *base, *new_base;
25699- unsigned long flags;
25700- int leftmost;
25701+ ktime_t expires;
25702
25703- base = lock_hrtimer_base(timer, &flags);
25704+ /*
25705+ * Find the next SOFT expiration.
25706+ */
25707+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
25708+
25709+ /*
25710+ * Reprogramming needs to be triggered, even if the next soft
25711+ * hrtimer expires at the same time as the next hard
25712+ * hrtimer. cpu_base->softirq_expires_next needs to be updated!
25713+ */
25714+ if (expires == KTIME_MAX)
25715+ return;
25716+
25717+ /*
25718+ * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
25719+ * cpu_base->*expires_next is only set by hrtimer_reprogram()
25720+ */
25721+ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
25722+}
25723+
25724+static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25725+ u64 delta_ns, const enum hrtimer_mode mode,
25726+ struct hrtimer_clock_base *base)
25727+{
25728+ struct hrtimer_clock_base *new_base;
25729
25730 /* Remove an active timer from the queue: */
25731 remove_hrtimer(timer, base, true);
25732@@ -964,21 +1124,37 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25733 /* Switch the timer base, if necessary: */
25734 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
25735
25736- leftmost = enqueue_hrtimer(timer, new_base);
25737- if (!leftmost)
25738- goto unlock;
25739+ return enqueue_hrtimer(timer, new_base, mode);
25740+}
25741+
25742+/**
25743+ * hrtimer_start_range_ns - (re)start an hrtimer
25744+ * @timer: the timer to be added
25745+ * @tim: expiry time
25746+ * @delta_ns: "slack" range for the timer
25747+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
25748+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
25749+ * softirq based mode is considered for debug purpose only!
25750+ */
25751+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25752+ u64 delta_ns, const enum hrtimer_mode mode)
25753+{
25754+ struct hrtimer_clock_base *base;
25755+ unsigned long flags;
25756+
25757+ /*
25758+ * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
25759+ * match.
25760+ */
25761+#ifndef CONFIG_PREEMPT_RT_BASE
25762+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
25763+#endif
25764+
25765+ base = lock_hrtimer_base(timer, &flags);
25766+
25767+ if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
25768+ hrtimer_reprogram(timer, true);
25769
25770- if (!hrtimer_is_hres_active(timer)) {
25771- /*
25772- * Kick to reschedule the next tick to handle the new timer
25773- * on dynticks target.
25774- */
25775- if (new_base->cpu_base->nohz_active)
25776- wake_up_nohz_cpu(new_base->cpu_base->cpu);
25777- } else {
25778- hrtimer_reprogram(timer, new_base);
25779- }
25780-unlock:
25781 unlock_hrtimer_base(timer, &flags);
25782 }
25783 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
25784@@ -1035,7 +1211,7 @@ int hrtimer_cancel(struct hrtimer *timer)
25785
25786 if (ret >= 0)
25787 return ret;
25788- cpu_relax();
25789+ hrtimer_wait_for_timer(timer);
25790 }
25791 }
25792 EXPORT_SYMBOL_GPL(hrtimer_cancel);
25793@@ -1076,7 +1252,7 @@ u64 hrtimer_get_next_event(void)
25794 raw_spin_lock_irqsave(&cpu_base->lock, flags);
25795
25796 if (!__hrtimer_hres_active(cpu_base))
25797- expires = __hrtimer_get_next_event(cpu_base);
25798+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
25799
25800 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25801
25802@@ -1099,8 +1275,16 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
25803 static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25804 enum hrtimer_mode mode)
25805 {
25806- struct hrtimer_cpu_base *cpu_base;
25807+ bool softtimer;
25808 int base;
25809+ struct hrtimer_cpu_base *cpu_base;
25810+
25811+ softtimer = !!(mode & HRTIMER_MODE_SOFT);
25812+#ifdef CONFIG_PREEMPT_RT_FULL
25813+ if (!softtimer && !(mode & HRTIMER_MODE_HARD))
25814+ softtimer = true;
25815+#endif
25816+ base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
25817
25818 memset(timer, 0, sizeof(struct hrtimer));
25819
25820@@ -1114,7 +1298,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25821 if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
25822 clock_id = CLOCK_MONOTONIC;
25823
25824- base = hrtimer_clockid_to_base(clock_id);
25825+ base += hrtimer_clockid_to_base(clock_id);
25826+ timer->is_soft = softtimer;
25827 timer->base = &cpu_base->clock_base[base];
25828 timerqueue_init(&timer->node);
25829 }
25830@@ -1123,7 +1308,13 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25831 * hrtimer_init - initialize a timer to the given clock
25832 * @timer: the timer to be initialized
25833 * @clock_id: the clock to be used
25834- * @mode: timer mode abs/rel
25835+ * @mode: The modes which are relevant for initialization:
25836+ * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
25837+ * HRTIMER_MODE_REL_SOFT
25838+ *
25839+ * The PINNED variants of the above can be handed in,
25840+ * but the PINNED bit is ignored as pinning happens
25841+ * when the hrtimer is started
25842 */
25843 void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25844 enum hrtimer_mode mode)
25845@@ -1142,19 +1333,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
25846 */
25847 bool hrtimer_active(const struct hrtimer *timer)
25848 {
25849- struct hrtimer_cpu_base *cpu_base;
25850+ struct hrtimer_clock_base *base;
25851 unsigned int seq;
25852
25853 do {
25854- cpu_base = READ_ONCE(timer->base->cpu_base);
25855- seq = raw_read_seqcount_begin(&cpu_base->seq);
25856+ base = READ_ONCE(timer->base);
25857+ seq = raw_read_seqcount_begin(&base->seq);
25858
25859 if (timer->state != HRTIMER_STATE_INACTIVE ||
25860- cpu_base->running == timer)
25861+ base->running == timer)
25862 return true;
25863
25864- } while (read_seqcount_retry(&cpu_base->seq, seq) ||
25865- cpu_base != READ_ONCE(timer->base->cpu_base));
25866+ } while (read_seqcount_retry(&base->seq, seq) ||
25867+ base != READ_ONCE(timer->base));
25868
25869 return false;
25870 }
25871@@ -1180,7 +1371,8 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
25872
25873 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25874 struct hrtimer_clock_base *base,
25875- struct hrtimer *timer, ktime_t *now)
25876+ struct hrtimer *timer, ktime_t *now,
25877+ unsigned long flags)
25878 {
25879 enum hrtimer_restart (*fn)(struct hrtimer *);
25880 int restart;
25881@@ -1188,16 +1380,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25882 lockdep_assert_held(&cpu_base->lock);
25883
25884 debug_deactivate(timer);
25885- cpu_base->running = timer;
25886+ base->running = timer;
25887
25888 /*
25889 * Separate the ->running assignment from the ->state assignment.
25890 *
25891 * As with a regular write barrier, this ensures the read side in
25892- * hrtimer_active() cannot observe cpu_base->running == NULL &&
25893+ * hrtimer_active() cannot observe base->running == NULL &&
25894 * timer->state == INACTIVE.
25895 */
25896- raw_write_seqcount_barrier(&cpu_base->seq);
25897+ raw_write_seqcount_barrier(&base->seq);
25898
25899 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
25900 fn = timer->function;
25901@@ -1211,15 +1403,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25902 timer->is_rel = false;
25903
25904 /*
25905- * Because we run timers from hardirq context, there is no chance
25906- * they get migrated to another cpu, therefore its safe to unlock
25907- * the timer base.
25908+ * The timer is marked as running in the cpu base, so it is
25909+ * protected against migration to a different CPU even if the lock
25910+ * is dropped.
25911 */
25912- raw_spin_unlock(&cpu_base->lock);
25913+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25914 trace_hrtimer_expire_entry(timer, now);
25915 restart = fn(timer);
25916 trace_hrtimer_expire_exit(timer);
25917- raw_spin_lock(&cpu_base->lock);
25918+ raw_spin_lock_irq(&cpu_base->lock);
25919
25920 /*
25921 * Note: We clear the running state after enqueue_hrtimer and
25922@@ -1232,33 +1424,31 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25923 */
25924 if (restart != HRTIMER_NORESTART &&
25925 !(timer->state & HRTIMER_STATE_ENQUEUED))
25926- enqueue_hrtimer(timer, base);
25927+ enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
25928
25929 /*
25930 * Separate the ->running assignment from the ->state assignment.
25931 *
25932 * As with a regular write barrier, this ensures the read side in
25933- * hrtimer_active() cannot observe cpu_base->running == NULL &&
25934+ * hrtimer_active() cannot observe base->running.timer == NULL &&
25935 * timer->state == INACTIVE.
25936 */
25937- raw_write_seqcount_barrier(&cpu_base->seq);
25938+ raw_write_seqcount_barrier(&base->seq);
25939
25940- WARN_ON_ONCE(cpu_base->running != timer);
25941- cpu_base->running = NULL;
25942+ WARN_ON_ONCE(base->running != timer);
25943+ base->running = NULL;
25944 }
25945
25946-static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25947+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
25948+ unsigned long flags, unsigned int active_mask)
25949 {
25950- struct hrtimer_clock_base *base = cpu_base->clock_base;
25951- unsigned int active = cpu_base->active_bases;
25952+ struct hrtimer_clock_base *base;
25953+ unsigned int active = cpu_base->active_bases & active_mask;
25954
25955- for (; active; base++, active >>= 1) {
25956+ for_each_active_base(base, cpu_base, active) {
25957 struct timerqueue_node *node;
25958 ktime_t basenow;
25959
25960- if (!(active & 0x01))
25961- continue;
25962-
25963 basenow = ktime_add(now, base->offset);
25964
25965 while ((node = timerqueue_getnext(&base->active))) {
25966@@ -1281,11 +1471,29 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25967 if (basenow < hrtimer_get_softexpires_tv64(timer))
25968 break;
25969
25970- __run_hrtimer(cpu_base, base, timer, &basenow);
25971+ __run_hrtimer(cpu_base, base, timer, &basenow, flags);
25972 }
25973 }
25974 }
25975
25976+static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
25977+{
25978+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25979+ unsigned long flags;
25980+ ktime_t now;
25981+
25982+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
25983+
25984+ now = hrtimer_update_base(cpu_base);
25985+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
25986+
25987+ cpu_base->softirq_activated = 0;
25988+ hrtimer_update_softirq_timer(cpu_base, true);
25989+
25990+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25991+ wake_up_timer_waiters(cpu_base);
25992+}
25993+
25994 #ifdef CONFIG_HIGH_RES_TIMERS
25995
25996 /*
25997@@ -1296,13 +1504,14 @@ void hrtimer_interrupt(struct clock_event_device *dev)
25998 {
25999 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
26000 ktime_t expires_next, now, entry_time, delta;
26001+ unsigned long flags;
26002 int retries = 0;
26003
26004 BUG_ON(!cpu_base->hres_active);
26005 cpu_base->nr_events++;
26006 dev->next_event = KTIME_MAX;
26007
26008- raw_spin_lock(&cpu_base->lock);
26009+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
26010 entry_time = now = hrtimer_update_base(cpu_base);
26011 retry:
26012 cpu_base->in_hrtirq = 1;
26013@@ -1315,17 +1524,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
26014 */
26015 cpu_base->expires_next = KTIME_MAX;
26016
26017- __hrtimer_run_queues(cpu_base, now);
26018+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
26019+ cpu_base->softirq_expires_next = KTIME_MAX;
26020+ cpu_base->softirq_activated = 1;
26021+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
26022+ }
26023+
26024+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
26025
26026 /* Reevaluate the clock bases for the next expiry */
26027- expires_next = __hrtimer_get_next_event(cpu_base);
26028+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
26029 /*
26030 * Store the new expiry value so the migration code can verify
26031 * against it.
26032 */
26033 cpu_base->expires_next = expires_next;
26034 cpu_base->in_hrtirq = 0;
26035- raw_spin_unlock(&cpu_base->lock);
26036+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
26037
26038 /* Reprogramming necessary ? */
26039 if (!tick_program_event(expires_next, 0)) {
26040@@ -1346,7 +1561,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
26041 * Acquire base lock for updating the offsets and retrieving
26042 * the current time.
26043 */
26044- raw_spin_lock(&cpu_base->lock);
26045+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
26046 now = hrtimer_update_base(cpu_base);
26047 cpu_base->nr_retries++;
26048 if (++retries < 3)
26049@@ -1359,7 +1574,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
26050 */
26051 cpu_base->nr_hangs++;
26052 cpu_base->hang_detected = 1;
26053- raw_spin_unlock(&cpu_base->lock);
26054+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
26055+
26056 delta = ktime_sub(now, entry_time);
26057 if ((unsigned int)delta > cpu_base->max_hang_time)
26058 cpu_base->max_hang_time = (unsigned int) delta;
26059@@ -1401,6 +1617,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
26060 void hrtimer_run_queues(void)
26061 {
26062 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
26063+ unsigned long flags;
26064 ktime_t now;
26065
26066 if (__hrtimer_hres_active(cpu_base))
26067@@ -1418,10 +1635,17 @@ void hrtimer_run_queues(void)
26068 return;
26069 }
26070
26071- raw_spin_lock(&cpu_base->lock);
26072+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
26073 now = hrtimer_update_base(cpu_base);
26074- __hrtimer_run_queues(cpu_base, now);
26075- raw_spin_unlock(&cpu_base->lock);
26076+
26077+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
26078+ cpu_base->softirq_expires_next = KTIME_MAX;
26079+ cpu_base->softirq_activated = 1;
26080+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
26081+ }
26082+
26083+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
26084+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
26085 }
26086
26087 /*
26088@@ -1440,13 +1664,65 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
26089 return HRTIMER_NORESTART;
26090 }
26091
26092-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
26093+#ifdef CONFIG_PREEMPT_RT_FULL
26094+static bool task_is_realtime(struct task_struct *tsk)
26095 {
26096+ int policy = tsk->policy;
26097+
26098+ if (policy == SCHED_FIFO || policy == SCHED_RR)
26099+ return true;
26100+ if (policy == SCHED_DEADLINE)
26101+ return true;
26102+ return false;
26103+}
26104+#endif
26105+
26106+static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
26107+ clockid_t clock_id,
26108+ enum hrtimer_mode mode,
26109+ struct task_struct *task)
26110+{
26111+#ifdef CONFIG_PREEMPT_RT_FULL
26112+ if (!(mode & (HRTIMER_MODE_SOFT | HRTIMER_MODE_HARD))) {
26113+ if (task_is_realtime(current) || system_state != SYSTEM_RUNNING)
26114+ mode |= HRTIMER_MODE_HARD;
26115+ else
26116+ mode |= HRTIMER_MODE_SOFT;
26117+ }
26118+#endif
26119+ __hrtimer_init(&sl->timer, clock_id, mode);
26120 sl->timer.function = hrtimer_wakeup;
26121 sl->task = task;
26122 }
26123+
26124+/**
26125+ * hrtimer_init_sleeper - initialize sleeper to the given clock
26126+ * @sl: sleeper to be initialized
26127+ * @clock_id: the clock to be used
26128+ * @mode: timer mode abs/rel
26129+ * @task: the task to wake up
26130+ */
26131+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
26132+ enum hrtimer_mode mode, struct task_struct *task)
26133+{
26134+ debug_init(&sl->timer, clock_id, mode);
26135+ __hrtimer_init_sleeper(sl, clock_id, mode, task);
26136+
26137+}
26138 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
26139
26140+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
26141+void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
26142+ clockid_t clock_id,
26143+ enum hrtimer_mode mode,
26144+ struct task_struct *task)
26145+{
26146+ debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
26147+ __hrtimer_init_sleeper(sl, clock_id, mode, task);
26148+}
26149+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
26150+#endif
26151+
26152 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
26153 {
26154 switch(restart->nanosleep.type) {
26155@@ -1470,8 +1746,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
26156 {
26157 struct restart_block *restart;
26158
26159- hrtimer_init_sleeper(t, current);
26160-
26161 do {
26162 set_current_state(TASK_INTERRUPTIBLE);
26163 hrtimer_start_expires(&t->timer, mode);
26164@@ -1508,10 +1782,9 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
26165 struct hrtimer_sleeper t;
26166 int ret;
26167
26168- hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
26169- HRTIMER_MODE_ABS);
26170+ hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
26171+ HRTIMER_MODE_ABS, current);
26172 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
26173-
26174 ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
26175 destroy_hrtimer_on_stack(&t.timer);
26176 return ret;
26177@@ -1529,7 +1802,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,
26178 if (dl_task(current) || rt_task(current))
26179 slack = 0;
26180
26181- hrtimer_init_on_stack(&t.timer, clockid, mode);
26182+ hrtimer_init_sleeper_on_stack(&t, clockid, mode, current);
26183 hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
26184 ret = do_nanosleep(&t, mode);
26185 if (ret != -ERESTART_RESTARTBLOCK)
26186@@ -1585,6 +1858,27 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
26187 }
26188 #endif
26189
26190+#ifdef CONFIG_PREEMPT_RT_FULL
26191+/*
26192+ * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
26193+ */
26194+void cpu_chill(void)
26195+{
26196+ ktime_t chill_time;
26197+ unsigned int freeze_flag = current->flags & PF_NOFREEZE;
26198+
26199+ chill_time = ktime_set(0, NSEC_PER_MSEC);
26200+ set_current_state(TASK_UNINTERRUPTIBLE);
26201+ current->flags |= PF_NOFREEZE;
26202+ sleeping_lock_inc();
26203+ schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD);
26204+ sleeping_lock_dec();
26205+ if (!freeze_flag)
26206+ current->flags &= ~PF_NOFREEZE;
26207+}
26208+EXPORT_SYMBOL(cpu_chill);
26209+#endif
26210+
26211 /*
26212 * Functions related to boot-time initialization:
26213 */
26214@@ -1598,9 +1892,17 @@ int hrtimers_prepare_cpu(unsigned int cpu)
26215 timerqueue_init_head(&cpu_base->clock_base[i].active);
26216 }
26217
26218- cpu_base->active_bases = 0;
26219 cpu_base->cpu = cpu;
26220- hrtimer_init_hres(cpu_base);
26221+ cpu_base->active_bases = 0;
26222+ cpu_base->hres_active = 0;
26223+ cpu_base->hang_detected = 0;
26224+ cpu_base->next_timer = NULL;
26225+ cpu_base->softirq_next_timer = NULL;
26226+ cpu_base->expires_next = KTIME_MAX;
26227+ cpu_base->softirq_expires_next = KTIME_MAX;
26228+#ifdef CONFIG_PREEMPT_RT_BASE
26229+ init_waitqueue_head(&cpu_base->wait);
26230+#endif
26231 return 0;
26232 }
26233
26234@@ -1632,7 +1934,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
26235 * sort out already expired timers and reprogram the
26236 * event device.
26237 */
26238- enqueue_hrtimer(timer, new_base);
26239+ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
26240 }
26241 }
26242
26243@@ -1644,6 +1946,12 @@ int hrtimers_dead_cpu(unsigned int scpu)
26244 BUG_ON(cpu_online(scpu));
26245 tick_cancel_sched_timer(scpu);
26246
26247+ /*
26248+ * This BH disable ensures that raise_softirq_irqoff() does
26249+ * not wake up ksoftirqd (and acquire the pi-lock) while
26250+ * holding the cpu_base lock.
26251+ */
26252+ local_bh_disable();
26253 local_irq_disable();
26254 old_base = &per_cpu(hrtimer_bases, scpu);
26255 new_base = this_cpu_ptr(&hrtimer_bases);
26256@@ -1659,12 +1967,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
26257 &new_base->clock_base[i]);
26258 }
26259
26260+ /*
26261+ * The migration might have changed the first expiring softirq
26262+ * timer on this CPU. Update it.
26263+ */
26264+ hrtimer_update_softirq_timer(new_base, false);
26265+
26266 raw_spin_unlock(&old_base->lock);
26267 raw_spin_unlock(&new_base->lock);
26268
26269 /* Check, if we got expired work to do */
26270 __hrtimer_peek_ahead_timers();
26271 local_irq_enable();
26272+ local_bh_enable();
26273 return 0;
26274 }
26275
26276@@ -1673,18 +1988,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
26277 void __init hrtimers_init(void)
26278 {
26279 hrtimers_prepare_cpu(smp_processor_id());
26280+ open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
26281 }
26282
26283 /**
26284 * schedule_hrtimeout_range_clock - sleep until timeout
26285 * @expires: timeout value (ktime_t)
26286 * @delta: slack in expires timeout (ktime_t)
26287- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
26288- * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
26289+ * @mode: timer mode
26290+ * @clock_id: timer clock to be used
26291 */
26292 int __sched
26293 schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
26294- const enum hrtimer_mode mode, int clock)
26295+ const enum hrtimer_mode mode, clockid_t clock_id)
26296 {
26297 struct hrtimer_sleeper t;
26298
26299@@ -1705,11 +2021,9 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
26300 return -EINTR;
26301 }
26302
26303- hrtimer_init_on_stack(&t.timer, clock, mode);
26304+ hrtimer_init_sleeper_on_stack(&t, clock_id, mode, current);
26305 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
26306
26307- hrtimer_init_sleeper(&t, current);
26308-
26309 hrtimer_start_expires(&t.timer, mode);
26310
26311 if (likely(t.task))
26312@@ -1727,7 +2041,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
26313 * schedule_hrtimeout_range - sleep until timeout
26314 * @expires: timeout value (ktime_t)
26315 * @delta: slack in expires timeout (ktime_t)
26316- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
26317+ * @mode: timer mode
26318 *
26319 * Make the current task sleep until the given expiry time has
26320 * elapsed. The routine will return immediately unless
26321@@ -1766,7 +2080,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
26322 /**
26323 * schedule_hrtimeout - sleep until timeout
26324 * @expires: timeout value (ktime_t)
26325- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
26326+ * @mode: timer mode
26327 *
26328 * Make the current task sleep until the given expiry time has
26329 * elapsed. The routine will return immediately unless
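With the soft/hard base split introduced above, the expiry context of an hrtimer is fixed at init time: without HRTIMER_MODE_HARD, __hrtimer_init() moves the timer to a softirq-expiring base on PREEMPT_RT_FULL, while the _HARD modes keep expiry in hard interrupt context (as the tick-broadcast and cpu_chill() hunks do). A hedged usage sketch, with hypothetical demo_* names, not taken from the patch:

/* Sketch only: arm a timer that expires in softirq context on RT
 * (and in hardirq context on a non-RT kernel). */
static struct hrtimer demo_timer;

static enum hrtimer_restart demo_expire(struct hrtimer *t)
{
	/* non-sleeping work only; return HRTIMER_RESTART to re-arm */
	return HRTIMER_NORESTART;
}

static void demo_arm(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_expire;
	hrtimer_start(&demo_timer, ms_to_ktime(5), HRTIMER_MODE_REL);
}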
26330diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
26331index f26acef5d7b4..760f38528365 100644
26332--- a/kernel/time/itimer.c
26333+++ b/kernel/time/itimer.c
26334@@ -214,6 +214,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
26335 /* We are sharing ->siglock with it_real_fn() */
26336 if (hrtimer_try_to_cancel(timer) < 0) {
26337 spin_unlock_irq(&tsk->sighand->siglock);
26338+ hrtimer_wait_for_timer(&tsk->signal->real_timer);
26339 goto again;
26340 }
26341 expires = timeval_to_ktime(value->it_value);
26342diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
26343index 497719127bf9..62acb8914c9e 100644
26344--- a/kernel/time/jiffies.c
26345+++ b/kernel/time/jiffies.c
26346@@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
26347 .max_cycles = 10,
26348 };
26349
26350-__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
26351+__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
26352+__cacheline_aligned_in_smp seqcount_t jiffies_seq;
26353
26354 #if (BITS_PER_LONG < 64)
26355 u64 get_jiffies_64(void)
26356@@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
26357 u64 ret;
26358
26359 do {
26360- seq = read_seqbegin(&jiffies_lock);
26361+ seq = read_seqcount_begin(&jiffies_seq);
26362 ret = jiffies_64;
26363- } while (read_seqretry(&jiffies_lock, seq));
26364+ } while (read_seqcount_retry(&jiffies_seq, seq));
26365 return ret;
26366 }
26367 EXPORT_SYMBOL(get_jiffies_64);
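Here the seqlock is split into a raw_spinlock_t for writer serialization plus a bare seqcount_t for the lockless readers; the usual RT rationale is that a seqlock_t's embedded spinlock becomes a sleeping lock, which the hardirq-context jiffies update cannot take. The tick-common.c hunks below show the write side; a generic sketch of the pairing follows (demo_* names are illustrative, not from the patch):

/* Sketch: writers take the raw lock and bump the sequence,
 * readers retry until they observe a stable value. */
static DEFINE_RAW_SPINLOCK(demo_lock);
static seqcount_t demo_seq = SEQCNT_ZERO(demo_seq);
static u64 demo_value;

static void demo_update(u64 v)
{
	raw_spin_lock(&demo_lock);
	write_seqcount_begin(&demo_seq);
	demo_value = v;
	write_seqcount_end(&demo_seq);
	raw_spin_unlock(&demo_lock);
}

static u64 demo_get(void)
{
	unsigned int seq;
	u64 v;

	do {
		seq = read_seqcount_begin(&demo_seq);
		v = demo_value;
	} while (read_seqcount_retry(&demo_seq, seq));

	return v;
}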
26368diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
26369index 2da660d53a4b..c7b7d047d12e 100644
26370--- a/kernel/time/posix-cpu-timers.c
26371+++ b/kernel/time/posix-cpu-timers.c
26372@@ -3,8 +3,10 @@
26373 * Implement CPU time clocks for the POSIX clock interface.
26374 */
26375
26376+#include <uapi/linux/sched/types.h>
26377 #include <linux/sched/signal.h>
26378 #include <linux/sched/cputime.h>
26379+#include <linux/sched/rt.h>
26380 #include <linux/posix-timers.h>
26381 #include <linux/errno.h>
26382 #include <linux/math64.h>
26383@@ -14,6 +16,7 @@
26384 #include <linux/tick.h>
26385 #include <linux/workqueue.h>
26386 #include <linux/compat.h>
26387+#include <linux/smpboot.h>
26388
26389 #include "posix-timers.h"
26390
26391@@ -603,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
26392 /*
26393 * Disarm any old timer after extracting its expiry time.
26394 */
26395- WARN_ON_ONCE(!irqs_disabled());
26396+ WARN_ON_ONCE_NONRT(!irqs_disabled());
26397
26398 ret = 0;
26399 old_incr = timer->it.cpu.incr;
26400@@ -1034,7 +1037,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
26401 /*
26402 * Now re-arm for the new expiry time.
26403 */
26404- WARN_ON_ONCE(!irqs_disabled());
26405+ WARN_ON_ONCE_NONRT(!irqs_disabled());
26406 arm_timer(timer);
26407 unlock:
26408 unlock_task_sighand(p, &flags);
26409@@ -1119,13 +1122,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
26410 * already updated our counts. We need to check if any timers fire now.
26411 * Interrupts are disabled.
26412 */
26413-void run_posix_cpu_timers(struct task_struct *tsk)
26414+static void __run_posix_cpu_timers(struct task_struct *tsk)
26415 {
26416 LIST_HEAD(firing);
26417 struct k_itimer *timer, *next;
26418 unsigned long flags;
26419
26420- WARN_ON_ONCE(!irqs_disabled());
26421+ WARN_ON_ONCE_NONRT(!irqs_disabled());
26422
26423 /*
26424 * The fast path checks that there are no expired thread or thread
26425@@ -1179,6 +1182,152 @@ void run_posix_cpu_timers(struct task_struct *tsk)
26426 }
26427 }
26428
26429+#ifdef CONFIG_PREEMPT_RT_BASE
26430+#include <linux/kthread.h>
26431+#include <linux/cpu.h>
26432+DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
26433+DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
26434+DEFINE_PER_CPU(bool, posix_timer_th_active);
26435+
26436+static void posix_cpu_kthread_fn(unsigned int cpu)
26437+{
26438+ struct task_struct *tsk = NULL;
26439+ struct task_struct *next = NULL;
26440+
26441+ BUG_ON(per_cpu(posix_timer_task, cpu) != current);
26442+
26443+ /* grab task list */
26444+ raw_local_irq_disable();
26445+ tsk = per_cpu(posix_timer_tasklist, cpu);
26446+ per_cpu(posix_timer_tasklist, cpu) = NULL;
26447+ raw_local_irq_enable();
26448+
26449+ /* it's possible the list is empty, just return */
26450+ if (!tsk)
26451+ return;
26452+
26453+ /* Process task list */
26454+ while (1) {
26455+ /* save next */
26456+ next = tsk->posix_timer_list;
26457+
26458+ /* run the task timers, clear its ptr and
26459+ * unreference it
26460+ */
26461+ __run_posix_cpu_timers(tsk);
26462+ tsk->posix_timer_list = NULL;
26463+ put_task_struct(tsk);
26464+
26465+ /* check if this is the last on the list */
26466+ if (next == tsk)
26467+ break;
26468+ tsk = next;
26469+ }
26470+}
26471+
26472+static inline int __fastpath_timer_check(struct task_struct *tsk)
26473+{
26474+ /* tsk == current, ensure it is safe to use ->signal/sighand */
26475+ if (unlikely(tsk->exit_state))
26476+ return 0;
26477+
26478+ if (!task_cputime_zero(&tsk->cputime_expires))
26479+ return 1;
26480+
26481+ if (!task_cputime_zero(&tsk->signal->cputime_expires))
26482+ return 1;
26483+
26484+ return 0;
26485+}
26486+
26487+void run_posix_cpu_timers(struct task_struct *tsk)
26488+{
26489+ unsigned int cpu = smp_processor_id();
26490+ struct task_struct *tasklist;
26491+
26492+ BUG_ON(!irqs_disabled());
26493+
26494+ if (per_cpu(posix_timer_th_active, cpu) != true)
26495+ return;
26496+
26497+ /* get per-cpu references */
26498+ tasklist = per_cpu(posix_timer_tasklist, cpu);
26499+
26500+ /* check to see if we're already queued */
26501+ if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
26502+ get_task_struct(tsk);
26503+ if (tasklist) {
26504+ tsk->posix_timer_list = tasklist;
26505+ } else {
26506+ /*
26507+ * The list is terminated by a self-pointing
26508+ * task_struct
26509+ */
26510+ tsk->posix_timer_list = tsk;
26511+ }
26512+ per_cpu(posix_timer_tasklist, cpu) = tsk;
26513+
26514+ wake_up_process(per_cpu(posix_timer_task, cpu));
26515+ }
26516+}
26517+
26518+static int posix_cpu_kthread_should_run(unsigned int cpu)
26519+{
26520+ return __this_cpu_read(posix_timer_tasklist) != NULL;
26521+}
26522+
26523+static void posix_cpu_kthread_park(unsigned int cpu)
26524+{
26525+ this_cpu_write(posix_timer_th_active, false);
26526+}
26527+
26528+static void posix_cpu_kthread_unpark(unsigned int cpu)
26529+{
26530+ this_cpu_write(posix_timer_th_active, true);
26531+}
26532+
26533+static void posix_cpu_kthread_setup(unsigned int cpu)
26534+{
26535+ struct sched_param sp;
26536+
26537+ sp.sched_priority = MAX_RT_PRIO - 1;
26538+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
26539+ posix_cpu_kthread_unpark(cpu);
26540+}
26541+
26542+static struct smp_hotplug_thread posix_cpu_thread = {
26543+ .store = &posix_timer_task,
26544+ .thread_should_run = posix_cpu_kthread_should_run,
26545+ .thread_fn = posix_cpu_kthread_fn,
26546+ .thread_comm = "posixcputmr/%u",
26547+ .setup = posix_cpu_kthread_setup,
26548+ .park = posix_cpu_kthread_park,
26549+ .unpark = posix_cpu_kthread_unpark,
26550+};
26551+
26552+static int __init posix_cpu_thread_init(void)
26553+{
26554+ /* Start one for boot CPU. */
26555+ unsigned long cpu;
26556+ int ret;
26557+
26558+ /* init the per-cpu posix_timer_tasklets */
26559+ for_each_possible_cpu(cpu)
26560+ per_cpu(posix_timer_tasklist, cpu) = NULL;
26561+
26562+ ret = smpboot_register_percpu_thread(&posix_cpu_thread);
26563+ WARN_ON(ret);
26564+
26565+ return 0;
26566+}
26567+early_initcall(posix_cpu_thread_init);
26568+#else /* CONFIG_PREEMPT_RT_BASE */
26569+void run_posix_cpu_timers(struct task_struct *tsk)
26570+{
26571+ __run_posix_cpu_timers(tsk);
26572+}
26573+#endif /* CONFIG_PREEMPT_RT_BASE */
26574+
26575 /*
26576 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
26577 * The tsk->sighand->siglock must be held by the caller.
26578diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
26579index 55d45fe2cc17..5a59538f3d16 100644
26580--- a/kernel/time/posix-timers.c
26581+++ b/kernel/time/posix-timers.c
26582@@ -443,6 +443,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
26583 static struct pid *good_sigevent(sigevent_t * event)
26584 {
26585 struct task_struct *rtn = current->group_leader;
26586+ int sig = event->sigev_signo;
26587
26588 switch (event->sigev_notify) {
26589 case SIGEV_SIGNAL | SIGEV_THREAD_ID:
26590@@ -452,7 +453,8 @@ static struct pid *good_sigevent(sigevent_t * event)
26591 /* FALLTHRU */
26592 case SIGEV_SIGNAL:
26593 case SIGEV_THREAD:
26594- if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
26595+ if (sig <= 0 || sig > SIGRTMAX ||
26596+ sig_kernel_only(sig) || sig_kernel_coredump(sig))
26597 return NULL;
26598 /* FALLTHRU */
26599 case SIGEV_NONE:
26600@@ -478,7 +480,7 @@ static struct k_itimer * alloc_posix_timer(void)
26601
26602 static void k_itimer_rcu_free(struct rcu_head *head)
26603 {
26604- struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
26605+ struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
26606
26607 kmem_cache_free(posix_timers_cache, tmr);
26608 }
26609@@ -495,7 +497,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
26610 }
26611 put_pid(tmr->it_pid);
26612 sigqueue_free(tmr->sigq);
26613- call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
26614+ call_rcu(&tmr->rcu, k_itimer_rcu_free);
26615 }
26616
26617 static int common_timer_create(struct k_itimer *new_timer)
26618@@ -834,6 +836,22 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
26619 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
26620 }
26621
26622+/*
26623+ * Protected by RCU!
26624+ */
26625+static void timer_wait_for_callback(const struct k_clock *kc, struct k_itimer *timr)
26626+{
26627+#ifdef CONFIG_PREEMPT_RT_FULL
26628+ if (kc->timer_arm == common_hrtimer_arm)
26629+ hrtimer_wait_for_timer(&timr->it.real.timer);
26630+ else if (kc == &alarm_clock)
26631+ hrtimer_wait_for_timer(&timr->it.alarm.alarmtimer.timer);
26632+ else
26633+ /* FIXME: Whacky hack for posix-cpu-timers */
26634+ schedule_timeout(1);
26635+#endif
26636+}
26637+
26638 static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
26639 {
26640 return hrtimer_try_to_cancel(&timr->it.real.timer);
26641@@ -898,6 +916,7 @@ static int do_timer_settime(timer_t timer_id, int flags,
26642 if (!timr)
26643 return -EINVAL;
26644
26645+ rcu_read_lock();
26646 kc = timr->kclock;
26647 if (WARN_ON_ONCE(!kc || !kc->timer_set))
26648 error = -EINVAL;
26649@@ -906,9 +925,12 @@ static int do_timer_settime(timer_t timer_id, int flags,
26650
26651 unlock_timer(timr, flag);
26652 if (error == TIMER_RETRY) {
26653+ timer_wait_for_callback(kc, timr);
26654 old_spec64 = NULL; // We already got the old time...
26655+ rcu_read_unlock();
26656 goto retry;
26657 }
26658+ rcu_read_unlock();
26659
26660 return error;
26661 }
26662@@ -990,10 +1012,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
26663 if (!timer)
26664 return -EINVAL;
26665
26666+ rcu_read_lock();
26667 if (timer_delete_hook(timer) == TIMER_RETRY) {
26668 unlock_timer(timer, flags);
26669+ timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
26670+ timer);
26671+ rcu_read_unlock();
26672 goto retry_delete;
26673 }
26674+ rcu_read_unlock();
26675
26676 spin_lock(&current->sighand->siglock);
26677 list_del(&timer->list);
26678@@ -1019,8 +1046,18 @@ static void itimer_delete(struct k_itimer *timer)
26679 retry_delete:
26680 spin_lock_irqsave(&timer->it_lock, flags);
26681
26682+ /* On RT we can race with a deletion */
26683+ if (!timer->it_signal) {
26684+ unlock_timer(timer, flags);
26685+ return;
26686+ }
26687+
26688 if (timer_delete_hook(timer) == TIMER_RETRY) {
26689+ rcu_read_lock();
26690 unlock_timer(timer, flags);
26691+ timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
26692+ timer);
26693+ rcu_read_unlock();
26694 goto retry_delete;
26695 }
26696 list_del(&timer->list);
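
The posix-timers.c hunks above are the RT-side fix for cancelling a timer whose callback is still running: on TIMER_RETRY, do_timer_settime(), timer_delete() and itimer_delete() now take rcu_read_lock(), drop the timer lock, sleep in timer_wait_for_callback() until the callback has finished, and only then retry, instead of spinning against a callback that may itself be preempted. The sketch below is a minimal userspace model of that wait-then-retry shape using pthreads; every name in it is illustrative and none of it is kernel API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct toy_timer {
        pthread_mutex_t lock;
        pthread_cond_t  done;           /* signalled when the callback returns */
        bool            running;        /* callback currently executing        */
        bool            armed;          /* timer still queued                  */
};

/* Models the timer callback firing from a (preemptible) timer context. */
static void *toy_fire(void *arg)
{
        struct toy_timer *t = arg;

        pthread_mutex_lock(&t->lock);
        t->running = true;
        pthread_mutex_unlock(&t->lock);

        usleep(1000);                   /* the callback body */

        pthread_mutex_lock(&t->lock);
        t->running = false;
        t->armed = false;
        pthread_cond_broadcast(&t->done);
        pthread_mutex_unlock(&t->lock);
        return NULL;
}

/* Models the TIMER_RETRY path: wait for the callback instead of spinning. */
static void toy_cancel_sync(struct toy_timer *t)
{
        pthread_mutex_lock(&t->lock);
        while (t->running)              /* "TIMER_RETRY": callback in flight   */
                pthread_cond_wait(&t->done, &t->lock);
        t->armed = false;               /* now safe to tear the timer down     */
        pthread_mutex_unlock(&t->lock);
}

int main(void)
{
        struct toy_timer t = {
                .lock  = PTHREAD_MUTEX_INITIALIZER,
                .done  = PTHREAD_COND_INITIALIZER,
                .armed = true,
        };
        pthread_t cb;

        pthread_create(&cb, NULL, toy_fire, &t);
        usleep(100);                    /* let the callback start (racy, demo only) */
        toy_cancel_sync(&t);
        pthread_join(cb, NULL);
        printf("cancelled, armed=%d\n", t.armed);
        return 0;
}
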
26697diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
26698index 58045eb976c3..f0a34afbc252 100644
26699--- a/kernel/time/tick-broadcast-hrtimer.c
26700+++ b/kernel/time/tick-broadcast-hrtimer.c
26701@@ -106,7 +106,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
26702
26703 void tick_setup_hrtimer_broadcast(void)
26704 {
26705- hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26706+ hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26707 bctimer.function = bc_handler;
26708 clockevents_register_device(&ce_broadcast_hrtimer);
26709 }
26710diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
26711index 49edc1c4f3e6..7a87a4488a5e 100644
26712--- a/kernel/time/tick-common.c
26713+++ b/kernel/time/tick-common.c
26714@@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
26715 static void tick_periodic(int cpu)
26716 {
26717 if (tick_do_timer_cpu == cpu) {
26718- write_seqlock(&jiffies_lock);
26719+ raw_spin_lock(&jiffies_lock);
26720+ write_seqcount_begin(&jiffies_seq);
26721
26722 /* Keep track of the next tick event */
26723 tick_next_period = ktime_add(tick_next_period, tick_period);
26724
26725 do_timer(1);
26726- write_sequnlock(&jiffies_lock);
26727+ write_seqcount_end(&jiffies_seq);
26728+ raw_spin_unlock(&jiffies_lock);
26729 update_wall_time();
26730 }
26731
26732@@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
26733 ktime_t next;
26734
26735 do {
26736- seq = read_seqbegin(&jiffies_lock);
26737+ seq = read_seqcount_begin(&jiffies_seq);
26738 next = tick_next_period;
26739- } while (read_seqretry(&jiffies_lock, seq));
26740+ } while (read_seqcount_retry(&jiffies_seq, seq));
26741
26742 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
26743
26744@@ -490,6 +492,7 @@ void tick_freeze(void)
26745 if (tick_freeze_depth == num_online_cpus()) {
26746 trace_suspend_resume(TPS("timekeeping_freeze"),
26747 smp_processor_id(), true);
26748+ system_state = SYSTEM_SUSPEND;
26749 timekeeping_suspend();
26750 } else {
26751 tick_suspend_local();
26752@@ -513,6 +516,7 @@ void tick_unfreeze(void)
26753
26754 if (tick_freeze_depth == num_online_cpus()) {
26755 timekeeping_resume();
26756+ system_state = SYSTEM_RUNNING;
26757 trace_suspend_resume(TPS("timekeeping_freeze"),
26758 smp_processor_id(), false);
26759 } else {
26760diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
26761index f8e1845aa464..e277284c2831 100644
26762--- a/kernel/time/tick-internal.h
26763+++ b/kernel/time/tick-internal.h
26764@@ -150,16 +150,15 @@ static inline void tick_nohz_init(void) { }
26765
26766 #ifdef CONFIG_NO_HZ_COMMON
26767 extern unsigned long tick_nohz_active;
26768-#else
26769+extern void timers_update_nohz(void);
26770+# ifdef CONFIG_SMP
26771+extern struct static_key_false timers_migration_enabled;
26772+# endif
26773+#else /* CONFIG_NO_HZ_COMMON */
26774+static inline void timers_update_nohz(void) { }
26775 #define tick_nohz_active (0)
26776 #endif
26777
26778-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26779-extern void timers_update_migration(bool update_nohz);
26780-#else
26781-static inline void timers_update_migration(bool update_nohz) { }
26782-#endif
26783-
26784 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26785
26786 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
26787diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
26788index a8fa0a896b78..643b36a0b8e1 100644
26789--- a/kernel/time/tick-sched.c
26790+++ b/kernel/time/tick-sched.c
26791@@ -66,7 +66,8 @@ static void tick_do_update_jiffies64(ktime_t now)
26792 return;
26793
26794 /* Reevaluate with jiffies_lock held */
26795- write_seqlock(&jiffies_lock);
26796+ raw_spin_lock(&jiffies_lock);
26797+ write_seqcount_begin(&jiffies_seq);
26798
26799 delta = ktime_sub(now, last_jiffies_update);
26800 if (delta >= tick_period) {
26801@@ -89,10 +90,12 @@ static void tick_do_update_jiffies64(ktime_t now)
26802 /* Keep the tick_next_period variable up to date */
26803 tick_next_period = ktime_add(last_jiffies_update, tick_period);
26804 } else {
26805- write_sequnlock(&jiffies_lock);
26806+ write_seqcount_end(&jiffies_seq);
26807+ raw_spin_unlock(&jiffies_lock);
26808 return;
26809 }
26810- write_sequnlock(&jiffies_lock);
26811+ write_seqcount_end(&jiffies_seq);
26812+ raw_spin_unlock(&jiffies_lock);
26813 update_wall_time();
26814 }
26815
26816@@ -103,12 +106,14 @@ static ktime_t tick_init_jiffy_update(void)
26817 {
26818 ktime_t period;
26819
26820- write_seqlock(&jiffies_lock);
26821+ raw_spin_lock(&jiffies_lock);
26822+ write_seqcount_begin(&jiffies_seq);
26823 /* Did we start the jiffies update yet ? */
26824 if (last_jiffies_update == 0)
26825 last_jiffies_update = tick_next_period;
26826 period = last_jiffies_update;
26827- write_sequnlock(&jiffies_lock);
26828+ write_seqcount_end(&jiffies_seq);
26829+ raw_spin_unlock(&jiffies_lock);
26830 return period;
26831 }
26832
26833@@ -225,6 +230,7 @@ static void nohz_full_kick_func(struct irq_work *work)
26834
26835 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
26836 .func = nohz_full_kick_func,
26837+ .flags = IRQ_WORK_HARD_IRQ,
26838 };
26839
26840 /*
26841@@ -689,10 +695,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
26842
26843 /* Read jiffies and the time when jiffies were updated last */
26844 do {
26845- seq = read_seqbegin(&jiffies_lock);
26846+ seq = read_seqcount_begin(&jiffies_seq);
26847 basemono = last_jiffies_update;
26848 basejiff = jiffies;
26849- } while (read_seqretry(&jiffies_lock, seq));
26850+ } while (read_seqcount_retry(&jiffies_seq, seq));
26851 ts->last_jiffies = basejiff;
26852
26853 /*
26854@@ -906,14 +912,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
26855 return false;
26856
26857 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
26858- static int ratelimit;
26859-
26860- if (ratelimit < 10 && !in_softirq() &&
26861- (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
26862- pr_warn("NOHZ: local_softirq_pending %02x\n",
26863- (unsigned int) local_softirq_pending());
26864- ratelimit++;
26865- }
26866+ softirq_check_pending_idle();
26867 return false;
26868 }
26869
26870@@ -1132,7 +1131,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
26871 ts->nohz_mode = mode;
26872 /* One update is enough */
26873 if (!test_and_set_bit(0, &tick_nohz_active))
26874- timers_update_migration(true);
26875+ timers_update_nohz();
26876 }
26877
26878 /**
26879@@ -1250,7 +1249,7 @@ void tick_setup_sched_timer(void)
26880 /*
26881 * Emulate tick processing via per-CPU hrtimers:
26882 */
26883- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26884+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26885 ts->sched_timer.function = tick_sched_timer;
26886
26887 /* Get the next period (per-CPU) */
26888diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
26889index 2cafb49aa65e..2720f2c29a6d 100644
26890--- a/kernel/time/timekeeping.c
26891+++ b/kernel/time/timekeeping.c
26892@@ -2326,8 +2326,10 @@ EXPORT_SYMBOL(hardpps);
26893 */
26894 void xtime_update(unsigned long ticks)
26895 {
26896- write_seqlock(&jiffies_lock);
26897+ raw_spin_lock(&jiffies_lock);
26898+ write_seqcount_begin(&jiffies_seq);
26899 do_timer(ticks);
26900- write_sequnlock(&jiffies_lock);
26901+ write_seqcount_end(&jiffies_seq);
26902+ raw_spin_unlock(&jiffies_lock);
26903 update_wall_time();
26904 }
26905diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
26906index c9f9af339914..0c0f52bf1927 100644
26907--- a/kernel/time/timekeeping.h
26908+++ b/kernel/time/timekeeping.h
26909@@ -18,7 +18,8 @@ extern void timekeeping_resume(void);
26910 extern void do_timer(unsigned long ticks);
26911 extern void update_wall_time(void);
26912
26913-extern seqlock_t jiffies_lock;
26914+extern raw_spinlock_t jiffies_lock;
26915+extern seqcount_t jiffies_seq;
26916
26917 #define CS_NAME_LEN 32
26918
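
The tick and timekeeping hunks above split the old seqlock_t jiffies_lock into two pieces: a raw_spinlock_t that serializes the writers (the tick paths, which must keep working in hard-IRQ context on RT, where a seqlock's embedded spinlock would turn into a sleeping lock) and a bare seqcount_t, jiffies_seq, that readers poll without taking any lock. Below is a rough userspace model of that reader/writer split in C11; it uses default sequentially consistent atomics to keep the toy obviously correct, whereas the kernel's seqcount primitives use finer-grained barriers, and the names only mirror the kernel ones.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t  jiffies_lock = PTHREAD_MUTEX_INITIALIZER; /* models raw_spinlock_t */
static atomic_uint      jiffies_seq;                              /* models seqcount_t     */
static _Atomic uint64_t jiffies64;                                /* the protected value   */

/* Writer side: models tick_periodic()/xtime_update() after the patch. */
static void do_timer_ticks(uint64_t ticks)
{
        pthread_mutex_lock(&jiffies_lock);      /* raw_spin_lock(&jiffies_lock)      */
        atomic_fetch_add(&jiffies_seq, 1);      /* write_seqcount_begin(): count odd */
        atomic_fetch_add(&jiffies64, ticks);
        atomic_fetch_add(&jiffies_seq, 1);      /* write_seqcount_end(): count even  */
        pthread_mutex_unlock(&jiffies_lock);    /* raw_spin_unlock(&jiffies_lock)    */
}

/* Reader side: lockless, retries while a writer is (or was) in progress. */
static uint64_t get_jiffies64(void)
{
        unsigned int seq;
        uint64_t val;

        do {
                seq = atomic_load(&jiffies_seq);        /* read_seqcount_begin() */
                if (seq & 1)
                        continue;                       /* writer in progress    */
                val = atomic_load(&jiffies64);
        } while (atomic_load(&jiffies_seq) != seq);     /* read_seqcount_retry() */

        return val;
}

int main(void)
{
        do_timer_ticks(3);
        printf("jiffies64 = %llu\n", (unsigned long long)get_jiffies64());
        return 0;
}
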
26919diff --git a/kernel/time/timer.c b/kernel/time/timer.c
26920index f17c76a1a05f..5fadd754ce20 100644
26921--- a/kernel/time/timer.c
26922+++ b/kernel/time/timer.c
26923@@ -44,6 +44,7 @@
26924 #include <linux/sched/debug.h>
26925 #include <linux/slab.h>
26926 #include <linux/compat.h>
26927+#include <linux/swait.h>
26928
26929 #include <linux/uaccess.h>
26930 #include <asm/unistd.h>
26931@@ -197,11 +198,12 @@ EXPORT_SYMBOL(jiffies_64);
26932 struct timer_base {
26933 raw_spinlock_t lock;
26934 struct timer_list *running_timer;
26935+#ifdef CONFIG_PREEMPT_RT_FULL
26936+ struct swait_queue_head wait_for_running_timer;
26937+#endif
26938 unsigned long clk;
26939 unsigned long next_expiry;
26940 unsigned int cpu;
26941- bool migration_enabled;
26942- bool nohz_active;
26943 bool is_idle;
26944 bool must_forward_clk;
26945 DECLARE_BITMAP(pending_map, WHEEL_SIZE);
26946@@ -210,45 +212,73 @@ struct timer_base {
26947
26948 static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
26949
26950-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26951+#ifdef CONFIG_NO_HZ_COMMON
26952+
26953+static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
26954+static DEFINE_MUTEX(timer_keys_mutex);
26955+
26956+static struct swork_event timer_update_swork;
26957+
26958+#ifdef CONFIG_SMP
26959 unsigned int sysctl_timer_migration = 1;
26960
26961-void timers_update_migration(bool update_nohz)
26962+DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
26963+
26964+static void timers_update_migration(void)
26965 {
26966 bool on = sysctl_timer_migration && tick_nohz_active;
26967- unsigned int cpu;
26968
26969- /* Avoid the loop, if nothing to update */
26970- if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
26971- return;
26972+ if (on)
26973+ static_branch_enable(&timers_migration_enabled);
26974+ else
26975+ static_branch_disable(&timers_migration_enabled);
26976+}
26977+#else
26978+static inline void timers_update_migration(void) { }
26979+#endif /* !CONFIG_SMP */
26980
26981- for_each_possible_cpu(cpu) {
26982- per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
26983- per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
26984- per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
26985- if (!update_nohz)
26986- continue;
26987- per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
26988- per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
26989- per_cpu(hrtimer_bases.nohz_active, cpu) = true;
26990- }
26991+static void timer_update_keys(struct swork_event *event)
26992+{
26993+ mutex_lock(&timer_keys_mutex);
26994+ timers_update_migration();
26995+ static_branch_enable(&timers_nohz_active);
26996+ mutex_unlock(&timer_keys_mutex);
26997+}
26998+
26999+void timers_update_nohz(void)
27000+{
27001+ swork_queue(&timer_update_swork);
27002+}
27003+
27004+static __init int hrtimer_init_thread(void)
27005+{
27006+ WARN_ON(swork_get());
27007+ INIT_SWORK(&timer_update_swork, timer_update_keys);
27008+ return 0;
27009 }
27010+early_initcall(hrtimer_init_thread);
27011
27012 int timer_migration_handler(struct ctl_table *table, int write,
27013 void __user *buffer, size_t *lenp,
27014 loff_t *ppos)
27015 {
27016- static DEFINE_MUTEX(mutex);
27017 int ret;
27018
27019- mutex_lock(&mutex);
27020+ mutex_lock(&timer_keys_mutex);
27021 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
27022 if (!ret && write)
27023- timers_update_migration(false);
27024- mutex_unlock(&mutex);
27025+ timers_update_migration();
27026+ mutex_unlock(&timer_keys_mutex);
27027 return ret;
27028 }
27029-#endif
27030+
27031+static inline bool is_timers_nohz_active(void)
27032+{
27033+ return static_branch_unlikely(&timers_nohz_active);
27034+}
27035+#else
27036+static inline bool is_timers_nohz_active(void) { return false; }
27037+#endif /* NO_HZ_COMMON */
27038
27039 static unsigned long round_jiffies_common(unsigned long j, int cpu,
27040 bool force_up)
27041@@ -534,7 +564,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer)
27042 static void
27043 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
27044 {
27045- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
27046+ if (!is_timers_nohz_active())
27047 return;
27048
27049 /*
27050@@ -840,21 +870,20 @@ static inline struct timer_base *get_timer_base(u32 tflags)
27051 return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
27052 }
27053
27054-#ifdef CONFIG_NO_HZ_COMMON
27055 static inline struct timer_base *
27056 get_target_base(struct timer_base *base, unsigned tflags)
27057 {
27058-#ifdef CONFIG_SMP
27059- if ((tflags & TIMER_PINNED) || !base->migration_enabled)
27060- return get_timer_this_cpu_base(tflags);
27061- return get_timer_cpu_base(tflags, get_nohz_timer_target());
27062-#else
27063- return get_timer_this_cpu_base(tflags);
27064+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
27065+ if (static_branch_unlikely(&timers_migration_enabled) &&
27066+ !(tflags & TIMER_PINNED))
27067+ return get_timer_cpu_base(tflags, get_nohz_timer_target());
27068 #endif
27069+ return get_timer_this_cpu_base(tflags);
27070 }
27071
27072 static inline void forward_timer_base(struct timer_base *base)
27073 {
27074+#ifdef CONFIG_NO_HZ_COMMON
27075 unsigned long jnow;
27076
27077 /*
27078@@ -878,16 +907,8 @@ static inline void forward_timer_base(struct timer_base *base)
27079 base->clk = jnow;
27080 else
27081 base->clk = base->next_expiry;
27082-}
27083-#else
27084-static inline struct timer_base *
27085-get_target_base(struct timer_base *base, unsigned tflags)
27086-{
27087- return get_timer_this_cpu_base(tflags);
27088-}
27089-
27090-static inline void forward_timer_base(struct timer_base *base) { }
27091 #endif
27092+}
27093
27094
27095 /*
27096@@ -1130,6 +1151,33 @@ void add_timer_on(struct timer_list *timer, int cpu)
27097 }
27098 EXPORT_SYMBOL_GPL(add_timer_on);
27099
27100+#ifdef CONFIG_PREEMPT_RT_FULL
27101+/*
27102+ * Wait for a running timer
27103+ */
27104+static void wait_for_running_timer(struct timer_list *timer)
27105+{
27106+ struct timer_base *base;
27107+ u32 tf = timer->flags;
27108+
27109+ if (tf & TIMER_MIGRATING)
27110+ return;
27111+
27112+ base = get_timer_base(tf);
27113+ swait_event(base->wait_for_running_timer,
27114+ base->running_timer != timer);
27115+}
27116+
27117+# define wakeup_timer_waiters(b) swake_up_all(&(b)->wait_for_running_timer)
27118+#else
27119+static inline void wait_for_running_timer(struct timer_list *timer)
27120+{
27121+ cpu_relax();
27122+}
27123+
27124+# define wakeup_timer_waiters(b) do { } while (0)
27125+#endif
27126+
27127 /**
27128 * del_timer - deactivate a timer.
27129 * @timer: the timer to be deactivated
27130@@ -1185,7 +1233,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
27131 }
27132 EXPORT_SYMBOL(try_to_del_timer_sync);
27133
27134-#ifdef CONFIG_SMP
27135+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
27136 /**
27137 * del_timer_sync - deactivate a timer and wait for the handler to finish.
27138 * @timer: the timer to be deactivated
27139@@ -1245,7 +1293,7 @@ int del_timer_sync(struct timer_list *timer)
27140 int ret = try_to_del_timer_sync(timer);
27141 if (ret >= 0)
27142 return ret;
27143- cpu_relax();
27144+ wait_for_running_timer(timer);
27145 }
27146 }
27147 EXPORT_SYMBOL(del_timer_sync);
27148@@ -1309,13 +1357,16 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
27149 fn = timer->function;
27150 data = timer->data;
27151
27152- if (timer->flags & TIMER_IRQSAFE) {
27153+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
27154+ timer->flags & TIMER_IRQSAFE) {
27155 raw_spin_unlock(&base->lock);
27156 call_timer_fn(timer, fn, data);
27157+ base->running_timer = NULL;
27158 raw_spin_lock(&base->lock);
27159 } else {
27160 raw_spin_unlock_irq(&base->lock);
27161 call_timer_fn(timer, fn, data);
27162+ base->running_timer = NULL;
27163 raw_spin_lock_irq(&base->lock);
27164 }
27165 }
27166@@ -1586,7 +1637,7 @@ void update_process_times(int user_tick)
27167 account_process_tick(p, user_tick);
27168 run_local_timers();
27169 rcu_check_callbacks(user_tick);
27170-#ifdef CONFIG_IRQ_WORK
27171+#if defined(CONFIG_IRQ_WORK)
27172 if (in_irq())
27173 irq_work_tick();
27174 #endif
27175@@ -1633,8 +1684,8 @@ static inline void __run_timers(struct timer_base *base)
27176 while (levels--)
27177 expire_timers(base, heads + levels);
27178 }
27179- base->running_timer = NULL;
27180 raw_spin_unlock_irq(&base->lock);
27181+ wakeup_timer_waiters(base);
27182 }
27183
27184 /*
27185@@ -1644,6 +1695,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
27186 {
27187 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
27188
27189+ irq_work_tick_soft();
27190 __run_timers(base);
27191 if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
27192 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
27193@@ -1867,6 +1919,9 @@ static void __init init_timer_cpu(int cpu)
27194 base->cpu = cpu;
27195 raw_spin_lock_init(&base->lock);
27196 base->clk = jiffies;
27197+#ifdef CONFIG_PREEMPT_RT_FULL
27198+ init_swait_queue_head(&base->wait_for_running_timer);
27199+#endif
27200 }
27201 }
27202
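
Two RT-relevant changes sit in the timer.c hunks above: del_timer_sync() now sleeps on a per-base swait queue (wait_for_running_timer()) instead of busy-looping with cpu_relax() while a callback runs, and the per-base migration_enabled/nohz_active booleans become global static keys whose flipping is deferred to timer_update_keys() running from swork (process) context, because enabling a static key may block while timers_update_nohz() can be reached from non-sleeping context; the hot path trigger_dyntick_cpu() then only checks is_timers_nohz_active(). The userspace sketch below models just the deferred-flip part: a cheap read-mostly flag on the hot path, flipped by a worker thread. All names and the polling worker are illustrative simplifications.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool timers_nohz_active;  /* models the static key        */
static atomic_bool update_pending;      /* models the queued swork item */

/* Models the swork kthread running timer_update_keys(). */
static void *timer_update_worker(void *unused)
{
        (void)unused;
        while (!atomic_load(&update_pending))
                usleep(100);            /* toy stand-in for waiting on the queue */
        usleep(1000);                   /* the part that may sleep in the kernel */
        atomic_store(&timers_nohz_active, true);
        return NULL;
}

/* Models timers_update_nohz(): never blocks, just queues the flip. */
static void timers_update_nohz_toy(void)
{
        atomic_store(&update_pending, true);
}

/* Models trigger_dyntick_cpu(): hot path, one cheap flag test. */
static void trigger_dyntick(int cpu)
{
        if (!atomic_load_explicit(&timers_nohz_active, memory_order_relaxed))
                return;                 /* nohz not active yet, nothing to do */
        printf("kick CPU %d out of nohz idle\n", cpu);
}

int main(void)
{
        pthread_t worker;

        pthread_create(&worker, NULL, timer_update_worker, NULL);
        trigger_dyntick(0);             /* no-op: flag still off     */
        timers_update_nohz_toy();       /* defer the flip            */
        pthread_join(worker, NULL);
        trigger_dyntick(0);             /* now takes the active path */
        return 0;
}
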
27203diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
27204index 4ad6f6ca18c1..55d39a3fbdf7 100644
27205--- a/kernel/trace/Kconfig
27206+++ b/kernel/trace/Kconfig
27207@@ -585,7 +585,10 @@ config HIST_TRIGGERS
27208 event activity as an initial guide for further investigation
27209 using more advanced tools.
27210
27211- See Documentation/trace/events.txt.
27212+ Inter-event tracing of quantities such as latencies is also
27213+ supported using hist triggers under this option.
27214+
27215+ See Documentation/trace/histogram.txt.
27216 If in doubt, say N.
27217
27218 config MMIOTRACE_TEST
27219diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
27220index a1d5e0949dcf..e8ca1e01facd 100644
27221--- a/kernel/trace/ring_buffer.c
27222+++ b/kernel/trace/ring_buffer.c
27223@@ -41,6 +41,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
27224 RINGBUF_TYPE_PADDING);
27225 trace_seq_printf(s, "\ttime_extend : type == %d\n",
27226 RINGBUF_TYPE_TIME_EXTEND);
27227+ trace_seq_printf(s, "\ttime_stamp : type == %d\n",
27228+ RINGBUF_TYPE_TIME_STAMP);
27229 trace_seq_printf(s, "\tdata max type_len == %d\n",
27230 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
27231
27232@@ -140,12 +142,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
27233
27234 enum {
27235 RB_LEN_TIME_EXTEND = 8,
27236- RB_LEN_TIME_STAMP = 16,
27237+ RB_LEN_TIME_STAMP = 8,
27238 };
27239
27240 #define skip_time_extend(event) \
27241 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
27242
27243+#define extended_time(event) \
27244+ (event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
27245+
27246 static inline int rb_null_event(struct ring_buffer_event *event)
27247 {
27248 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
27249@@ -209,7 +214,7 @@ rb_event_ts_length(struct ring_buffer_event *event)
27250 {
27251 unsigned len = 0;
27252
27253- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
27254+ if (extended_time(event)) {
27255 /* time extends include the data event after it */
27256 len = RB_LEN_TIME_EXTEND;
27257 event = skip_time_extend(event);
27258@@ -231,7 +236,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
27259 {
27260 unsigned length;
27261
27262- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
27263+ if (extended_time(event))
27264 event = skip_time_extend(event);
27265
27266 length = rb_event_length(event);
27267@@ -248,7 +253,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
27268 static __always_inline void *
27269 rb_event_data(struct ring_buffer_event *event)
27270 {
27271- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
27272+ if (extended_time(event))
27273 event = skip_time_extend(event);
27274 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
27275 /* If length is in len field, then array[0] has the data */
27276@@ -275,6 +280,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
27277 #define TS_MASK ((1ULL << TS_SHIFT) - 1)
27278 #define TS_DELTA_TEST (~TS_MASK)
27279
27280+/**
27281+ * ring_buffer_event_time_stamp - return the event's extended timestamp
27282+ * @event: the event to get the timestamp of
27283+ *
27284+ * Returns the extended timestamp associated with a data event.
27285+ * An extended time_stamp is a 64-bit timestamp represented
27286+ * internally in a special way that makes the best use of space
27287+ * contained within a ring buffer event. This function decodes
27288+ * it and maps it to a straight u64 value.
27289+ */
27290+u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
27291+{
27292+ u64 ts;
27293+
27294+ ts = event->array[0];
27295+ ts <<= TS_SHIFT;
27296+ ts += event->time_delta;
27297+
27298+ return ts;
27299+}
27300+
27301 /* Flag when events were overwritten */
27302 #define RB_MISSED_EVENTS (1 << 31)
27303 /* Missed count stored at end */
27304@@ -451,6 +477,7 @@ struct ring_buffer_per_cpu {
27305 struct buffer_page *reader_page;
27306 unsigned long lost_events;
27307 unsigned long last_overrun;
27308+ unsigned long nest;
27309 local_t entries_bytes;
27310 local_t entries;
27311 local_t overrun;
27312@@ -488,6 +515,7 @@ struct ring_buffer {
27313 u64 (*clock)(void);
27314
27315 struct rb_irq_work irq_work;
27316+ bool time_stamp_abs;
27317 };
27318
27319 struct ring_buffer_iter {
27320@@ -1387,6 +1415,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
27321 buffer->clock = clock;
27322 }
27323
27324+void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
27325+{
27326+ buffer->time_stamp_abs = abs;
27327+}
27328+
27329+bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
27330+{
27331+ return buffer->time_stamp_abs;
27332+}
27333+
27334 static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
27335
27336 static inline unsigned long rb_page_entries(struct buffer_page *bpage)
27337@@ -2219,12 +2257,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
27338
27339 /* Slow path, do not inline */
27340 static noinline struct ring_buffer_event *
27341-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
27342+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
27343 {
27344- event->type_len = RINGBUF_TYPE_TIME_EXTEND;
27345+ if (abs)
27346+ event->type_len = RINGBUF_TYPE_TIME_STAMP;
27347+ else
27348+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
27349
27350- /* Not the first event on the page? */
27351- if (rb_event_index(event)) {
27352+ /* Not the first event on the page, or not delta? */
27353+ if (abs || rb_event_index(event)) {
27354 event->time_delta = delta & TS_MASK;
27355 event->array[0] = delta >> TS_SHIFT;
27356 } else {
27357@@ -2267,7 +2308,9 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
27358 * add it to the start of the resevered space.
27359 */
27360 if (unlikely(info->add_timestamp)) {
27361- event = rb_add_time_stamp(event, delta);
27362+ bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
27363+
27364+ event = rb_add_time_stamp(event, info->delta, abs);
27365 length -= RB_LEN_TIME_EXTEND;
27366 delta = 0;
27367 }
27368@@ -2455,7 +2498,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer
27369
27370 static inline void rb_event_discard(struct ring_buffer_event *event)
27371 {
27372- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
27373+ if (extended_time(event))
27374 event = skip_time_extend(event);
27375
27376 /* array[0] holds the actual length for the discarded event */
27377@@ -2499,10 +2542,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
27378 cpu_buffer->write_stamp =
27379 cpu_buffer->commit_page->page->time_stamp;
27380 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
27381- delta = event->array[0];
27382- delta <<= TS_SHIFT;
27383- delta += event->time_delta;
27384+ delta = ring_buffer_event_time_stamp(event);
27385 cpu_buffer->write_stamp += delta;
27386+ } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
27387+ delta = ring_buffer_event_time_stamp(event);
27388+ cpu_buffer->write_stamp = delta;
27389 } else
27390 cpu_buffer->write_stamp += event->time_delta;
27391 }
27392@@ -2585,22 +2629,19 @@ static __always_inline int
27393 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
27394 {
27395 unsigned int val = cpu_buffer->current_context;
27396+ unsigned long pc = preempt_count();
27397 int bit;
27398
27399- if (in_interrupt()) {
27400- if (in_nmi())
27401- bit = RB_CTX_NMI;
27402- else if (in_irq())
27403- bit = RB_CTX_IRQ;
27404- else
27405- bit = RB_CTX_SOFTIRQ;
27406- } else
27407+ if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
27408 bit = RB_CTX_NORMAL;
27409+ else
27410+ bit = pc & NMI_MASK ? RB_CTX_NMI :
27411+ pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
27412
27413- if (unlikely(val & (1 << bit)))
27414+ if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
27415 return 1;
27416
27417- val |= (1 << bit);
27418+ val |= (1 << (bit + cpu_buffer->nest));
27419 cpu_buffer->current_context = val;
27420
27421 return 0;
27422@@ -2609,7 +2650,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
27423 static __always_inline void
27424 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
27425 {
27426- cpu_buffer->current_context &= cpu_buffer->current_context - 1;
27427+ cpu_buffer->current_context &=
27428+ cpu_buffer->current_context - (1 << cpu_buffer->nest);
27429+}
27430+
27431+/* The recursive locking above uses 4 bits */
27432+#define NESTED_BITS 4
27433+
27434+/**
27435+ * ring_buffer_nest_start - Allow to trace while nested
27436+ * @buffer: The ring buffer to modify
27437+ *
27438+ * The ring buffer has a safety mechanism to prevent recursion.
27439+ * But there may be a case where a trace needs to be done while
27440+ * tracing something else. In this case, calling this function
27441+ * will allow this function to nest within a currently active
27442+ * ring_buffer_lock_reserve().
27443+ *
27444+ * Call this function before calling another ring_buffer_lock_reserve() and
27445+ * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
27446+ */
27447+void ring_buffer_nest_start(struct ring_buffer *buffer)
27448+{
27449+ struct ring_buffer_per_cpu *cpu_buffer;
27450+ int cpu;
27451+
27452+ /* Enabled by ring_buffer_nest_end() */
27453+ preempt_disable_notrace();
27454+ cpu = raw_smp_processor_id();
27455+ cpu_buffer = buffer->buffers[cpu];
27456+ /* This is the shift value for the above recursive locking */
27457+ cpu_buffer->nest += NESTED_BITS;
27458+}
27459+
27460+/**
27461+ * ring_buffer_nest_end - Allow to trace while nested
27462+ * @buffer: The ring buffer to modify
27463+ *
27464+ * Must be called after ring_buffer_nest_start() and after the
27465+ * ring_buffer_unlock_commit().
27466+ */
27467+void ring_buffer_nest_end(struct ring_buffer *buffer)
27468+{
27469+ struct ring_buffer_per_cpu *cpu_buffer;
27470+ int cpu;
27471+
27472+ /* disabled by ring_buffer_nest_start() */
27473+ cpu = raw_smp_processor_id();
27474+ cpu_buffer = buffer->buffers[cpu];
27475+ /* This is the shift value for the above recursive locking */
27476+ cpu_buffer->nest -= NESTED_BITS;
27477+ preempt_enable_notrace();
27478 }
27479
27480 /**
27481@@ -2685,7 +2776,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
27482 * If this is the first commit on the page, then it has the same
27483 * timestamp as the page itself.
27484 */
27485- if (!tail)
27486+ if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
27487 info->delta = 0;
27488
27489 /* See if we shot pass the end of this buffer page */
27490@@ -2762,8 +2853,11 @@ rb_reserve_next_event(struct ring_buffer *buffer,
27491 /* make sure this diff is calculated here */
27492 barrier();
27493
27494- /* Did the write stamp get updated already? */
27495- if (likely(info.ts >= cpu_buffer->write_stamp)) {
27496+ if (ring_buffer_time_stamp_abs(buffer)) {
27497+ info.delta = info.ts;
27498+ rb_handle_timestamp(cpu_buffer, &info);
27499+ } else /* Did the write stamp get updated already? */
27500+ if (likely(info.ts >= cpu_buffer->write_stamp)) {
27501 info.delta = diff;
27502 if (unlikely(test_time_stamp(info.delta)))
27503 rb_handle_timestamp(cpu_buffer, &info);
27504@@ -3461,14 +3555,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
27505 return;
27506
27507 case RINGBUF_TYPE_TIME_EXTEND:
27508- delta = event->array[0];
27509- delta <<= TS_SHIFT;
27510- delta += event->time_delta;
27511+ delta = ring_buffer_event_time_stamp(event);
27512 cpu_buffer->read_stamp += delta;
27513 return;
27514
27515 case RINGBUF_TYPE_TIME_STAMP:
27516- /* FIXME: not implemented */
27517+ delta = ring_buffer_event_time_stamp(event);
27518+ cpu_buffer->read_stamp = delta;
27519 return;
27520
27521 case RINGBUF_TYPE_DATA:
27522@@ -3492,14 +3585,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
27523 return;
27524
27525 case RINGBUF_TYPE_TIME_EXTEND:
27526- delta = event->array[0];
27527- delta <<= TS_SHIFT;
27528- delta += event->time_delta;
27529+ delta = ring_buffer_event_time_stamp(event);
27530 iter->read_stamp += delta;
27531 return;
27532
27533 case RINGBUF_TYPE_TIME_STAMP:
27534- /* FIXME: not implemented */
27535+ delta = ring_buffer_event_time_stamp(event);
27536+ iter->read_stamp = delta;
27537 return;
27538
27539 case RINGBUF_TYPE_DATA:
27540@@ -3723,6 +3815,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
27541 struct buffer_page *reader;
27542 int nr_loops = 0;
27543
27544+ if (ts)
27545+ *ts = 0;
27546 again:
27547 /*
27548 * We repeat when a time extend is encountered.
27549@@ -3759,12 +3853,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
27550 goto again;
27551
27552 case RINGBUF_TYPE_TIME_STAMP:
27553- /* FIXME: not implemented */
27554+ if (ts) {
27555+ *ts = ring_buffer_event_time_stamp(event);
27556+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
27557+ cpu_buffer->cpu, ts);
27558+ }
27559+ /* Internal data, OK to advance */
27560 rb_advance_reader(cpu_buffer);
27561 goto again;
27562
27563 case RINGBUF_TYPE_DATA:
27564- if (ts) {
27565+ if (ts && !(*ts)) {
27566 *ts = cpu_buffer->read_stamp + event->time_delta;
27567 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
27568 cpu_buffer->cpu, ts);
27569@@ -3789,6 +3888,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
27570 struct ring_buffer_event *event;
27571 int nr_loops = 0;
27572
27573+ if (ts)
27574+ *ts = 0;
27575+
27576 cpu_buffer = iter->cpu_buffer;
27577 buffer = cpu_buffer->buffer;
27578
27579@@ -3841,12 +3943,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
27580 goto again;
27581
27582 case RINGBUF_TYPE_TIME_STAMP:
27583- /* FIXME: not implemented */
27584+ if (ts) {
27585+ *ts = ring_buffer_event_time_stamp(event);
27586+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
27587+ cpu_buffer->cpu, ts);
27588+ }
27589+ /* Internal data, OK to advance */
27590 rb_advance_iter(iter);
27591 goto again;
27592
27593 case RINGBUF_TYPE_DATA:
27594- if (ts) {
27595+ if (ts && !(*ts)) {
27596 *ts = iter->read_stamp + event->time_delta;
27597 ring_buffer_normalize_time_stamp(buffer,
27598 cpu_buffer->cpu, ts);
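
The ring_buffer.c hunks above teach the ring buffer to emit and consume RINGBUF_TYPE_TIME_STAMP events carrying absolute timestamps (RB_LEN_TIME_STAMP shrinks to 8 bytes, the same layout as a time-extend), with ring_buffer_event_time_stamp() recombining the value from the event's 27-bit time_delta field and array[0]. The standalone program below just demonstrates that packing arithmetic; the struct and names are illustrative stand-ins, not the real struct ring_buffer_event, and TS_SHIFT is assumed to be 27 as in the mainline ring buffer.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TS_SHIFT 27
#define TS_MASK  ((1ULL << TS_SHIFT) - 1)

struct toy_rb_event {
        uint32_t type_len   : 5;        /* would hold RINGBUF_TYPE_TIME_STAMP */
        uint32_t time_delta : 27;       /* low TS_SHIFT bits of the timestamp */
        uint32_t array[1];              /* high bits of the timestamp         */
};

/* Mirrors the absolute-timestamp branch of rb_add_time_stamp(). */
static void toy_encode_time_stamp(struct toy_rb_event *event, uint64_t ts)
{
        event->time_delta = ts & TS_MASK;
        event->array[0]   = ts >> TS_SHIFT;
}

/* Mirrors ring_buffer_event_time_stamp(): recombine the two halves. */
static uint64_t toy_decode_time_stamp(const struct toy_rb_event *event)
{
        uint64_t ts = event->array[0];

        ts <<= TS_SHIFT;
        ts += event->time_delta;
        return ts;
}

int main(void)
{
        struct toy_rb_event ev = { 0 };
        uint64_t now = 123456789012345ULL;      /* some nanosecond timestamp */

        toy_encode_time_stamp(&ev, now);
        assert(toy_decode_time_stamp(&ev) == now);
        printf("low 27 bits = %u, high bits = %u, decoded = %llu\n",
               ev.time_delta, ev.array[0],
               (unsigned long long)toy_decode_time_stamp(&ev));
        return 0;
}
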
27599diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
27600index e9cbb96cd99e..4fc60e5ec4b9 100644
27601--- a/kernel/trace/trace.c
27602+++ b/kernel/trace/trace.c
27603@@ -1170,6 +1170,14 @@ static struct {
27604 ARCH_TRACE_CLOCKS
27605 };
27606
27607+bool trace_clock_in_ns(struct trace_array *tr)
27608+{
27609+ if (trace_clocks[tr->clock_id].in_ns)
27610+ return true;
27611+
27612+ return false;
27613+}
27614+
27615 /*
27616 * trace_parser_get_init - gets the buffer for trace parser
27617 */
27618@@ -2127,6 +2135,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
27619 struct task_struct *tsk = current;
27620
27621 entry->preempt_count = pc & 0xff;
27622+ entry->preempt_lazy_count = preempt_lazy_count();
27623 entry->pid = (tsk) ? tsk->pid : 0;
27624 entry->flags =
27625 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
27626@@ -2137,8 +2146,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
27627 ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
27628 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
27629 ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
27630- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
27631+ (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
27632+ (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
27633 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
27634+
27635+ entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
27636 }
27637 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
27638
27639@@ -2275,7 +2287,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
27640
27641 *current_rb = trace_file->tr->trace_buffer.buffer;
27642
27643- if ((trace_file->flags &
27644+ if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
27645 (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
27646 (entry = this_cpu_read(trace_buffered_event))) {
27647 /* Try to use the per cpu buffer first */
27648@@ -3342,14 +3354,17 @@ get_total_entries(struct trace_buffer *buf,
27649
27650 static void print_lat_help_header(struct seq_file *m)
27651 {
27652- seq_puts(m, "# _------=> CPU# \n"
27653- "# / _-----=> irqs-off \n"
27654- "# | / _----=> need-resched \n"
27655- "# || / _---=> hardirq/softirq \n"
27656- "# ||| / _--=> preempt-depth \n"
27657- "# |||| / delay \n"
27658- "# cmd pid ||||| time | caller \n"
27659- "# \\ / ||||| \\ | / \n");
27660+ seq_puts(m, "# _--------=> CPU# \n"
27661+ "# / _-------=> irqs-off \n"
27662+ "# | / _------=> need-resched \n"
27663+ "# || / _-----=> need-resched_lazy \n"
27664+ "# ||| / _----=> hardirq/softirq \n"
27665+ "# |||| / _---=> preempt-depth \n"
27666+ "# ||||| / _--=> preempt-lazy-depth\n"
27667+ "# |||||| / _-=> migrate-disable \n"
27668+ "# ||||||| / delay \n"
27669+ "# cmd pid |||||||| time | caller \n"
27670+ "# \\ / |||||||| \\ | / \n");
27671 }
27672
27673 static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
27674@@ -3385,15 +3400,17 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
27675 tgid ? tgid_space : space);
27676 seq_printf(m, "# %s / _----=> need-resched\n",
27677 tgid ? tgid_space : space);
27678- seq_printf(m, "# %s| / _---=> hardirq/softirq\n",
27679+ seq_printf(m, "# %s| / _----=> need-resched_lazy\n",
27680+ tgid ? tgid_space : space);
27681+ seq_printf(m, "# %s|| / _---=> hardirq/softirq\n",
27682 tgid ? tgid_space : space);
27683- seq_printf(m, "# %s|| / _--=> preempt-depth\n",
27684+ seq_printf(m, "# %s||| / _--=> preempt-depth\n",
27685 tgid ? tgid_space : space);
27686- seq_printf(m, "# %s||| / delay\n",
27687+ seq_printf(m, "# %s|||| / delay\n",
27688 tgid ? tgid_space : space);
27689- seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n",
27690+ seq_printf(m, "# TASK-PID %sCPU# ||||| TIMESTAMP FUNCTION\n",
27691 tgid ? " TGID " : space);
27692- seq_printf(m, "# | | %s | |||| | |\n",
27693+ seq_printf(m, "# | | %s | ||||| | |\n",
27694 tgid ? " | " : space);
27695 }
27696
27697@@ -4531,6 +4548,9 @@ static const char readme_msg[] =
27698 #ifdef CONFIG_X86_64
27699 " x86-tsc: TSC cycle counter\n"
27700 #endif
27701+ "\n timestamp_mode\t-view the mode used to timestamp events\n"
27702+ " delta: Delta difference against a buffer-wide timestamp\n"
27703+ " absolute: Absolute (standalone) timestamp\n"
27704 "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
27705 "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
27706 " tracing_cpumask\t- Limit which CPUs to trace\n"
27707@@ -4707,8 +4727,9 @@ static const char readme_msg[] =
27708 "\t .sym display an address as a symbol\n"
27709 "\t .sym-offset display an address as a symbol and offset\n"
27710 "\t .execname display a common_pid as a program name\n"
27711- "\t .syscall display a syscall id as a syscall name\n\n"
27712- "\t .log2 display log2 value rather than raw number\n\n"
27713+ "\t .syscall display a syscall id as a syscall name\n"
27714+ "\t .log2 display log2 value rather than raw number\n"
27715+ "\t .usecs display a common_timestamp in microseconds\n\n"
27716 "\t The 'pause' parameter can be used to pause an existing hist\n"
27717 "\t trigger or to start a hist trigger but not log any events\n"
27718 "\t until told to do so. 'continue' can be used to start or\n"
27719@@ -6218,7 +6239,7 @@ static int tracing_clock_show(struct seq_file *m, void *v)
27720 return 0;
27721 }
27722
27723-static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27724+int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27725 {
27726 int i;
27727
27728@@ -6298,6 +6319,71 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
27729 return ret;
27730 }
27731
27732+static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
27733+{
27734+ struct trace_array *tr = m->private;
27735+
27736+ mutex_lock(&trace_types_lock);
27737+
27738+ if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
27739+ seq_puts(m, "delta [absolute]\n");
27740+ else
27741+ seq_puts(m, "[delta] absolute\n");
27742+
27743+ mutex_unlock(&trace_types_lock);
27744+
27745+ return 0;
27746+}
27747+
27748+static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
27749+{
27750+ struct trace_array *tr = inode->i_private;
27751+ int ret;
27752+
27753+ if (tracing_disabled)
27754+ return -ENODEV;
27755+
27756+ if (trace_array_get(tr))
27757+ return -ENODEV;
27758+
27759+ ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
27760+ if (ret < 0)
27761+ trace_array_put(tr);
27762+
27763+ return ret;
27764+}
27765+
27766+int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
27767+{
27768+ int ret = 0;
27769+
27770+ mutex_lock(&trace_types_lock);
27771+
27772+ if (abs && tr->time_stamp_abs_ref++)
27773+ goto out;
27774+
27775+ if (!abs) {
27776+ if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
27777+ ret = -EINVAL;
27778+ goto out;
27779+ }
27780+
27781+ if (--tr->time_stamp_abs_ref)
27782+ goto out;
27783+ }
27784+
27785+ ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
27786+
27787+#ifdef CONFIG_TRACER_MAX_TRACE
27788+ if (tr->max_buffer.buffer)
27789+ ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
27790+#endif
27791+ out:
27792+ mutex_unlock(&trace_types_lock);
27793+
27794+ return ret;
27795+}
27796+
27797 struct ftrace_buffer_info {
27798 struct trace_iterator iter;
27799 void *spare;
27800@@ -6545,6 +6631,13 @@ static const struct file_operations trace_clock_fops = {
27801 .write = tracing_clock_write,
27802 };
27803
27804+static const struct file_operations trace_time_stamp_mode_fops = {
27805+ .open = tracing_time_stamp_mode_open,
27806+ .read = seq_read,
27807+ .llseek = seq_lseek,
27808+ .release = tracing_single_release_tr,
27809+};
27810+
27811 #ifdef CONFIG_TRACER_SNAPSHOT
27812 static const struct file_operations snapshot_fops = {
27813 .open = tracing_snapshot_open,
27814@@ -7684,6 +7777,7 @@ static int instance_mkdir(const char *name)
27815 struct trace_array *tr;
27816 int ret;
27817
27818+ mutex_lock(&event_mutex);
27819 mutex_lock(&trace_types_lock);
27820
27821 ret = -EEXIST;
27822@@ -7716,6 +7810,7 @@ static int instance_mkdir(const char *name)
27823
27824 INIT_LIST_HEAD(&tr->systems);
27825 INIT_LIST_HEAD(&tr->events);
27826+ INIT_LIST_HEAD(&tr->hist_vars);
27827
27828 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
27829 goto out_free_tr;
27830@@ -7739,6 +7834,7 @@ static int instance_mkdir(const char *name)
27831 list_add(&tr->list, &ftrace_trace_arrays);
27832
27833 mutex_unlock(&trace_types_lock);
27834+ mutex_unlock(&event_mutex);
27835
27836 return 0;
27837
27838@@ -7750,6 +7846,7 @@ static int instance_mkdir(const char *name)
27839
27840 out_unlock:
27841 mutex_unlock(&trace_types_lock);
27842+ mutex_unlock(&event_mutex);
27843
27844 return ret;
27845
27846@@ -7762,6 +7859,7 @@ static int instance_rmdir(const char *name)
27847 int ret;
27848 int i;
27849
27850+ mutex_lock(&event_mutex);
27851 mutex_lock(&trace_types_lock);
27852
27853 ret = -ENODEV;
27854@@ -7807,6 +7905,7 @@ static int instance_rmdir(const char *name)
27855
27856 out_unlock:
27857 mutex_unlock(&trace_types_lock);
27858+ mutex_unlock(&event_mutex);
27859
27860 return ret;
27861 }
27862@@ -7864,6 +7963,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
27863 trace_create_file("tracing_on", 0644, d_tracer,
27864 tr, &rb_simple_fops);
27865
27866+ trace_create_file("timestamp_mode", 0444, d_tracer, tr,
27867+ &trace_time_stamp_mode_fops);
27868+
27869 create_trace_options_dir(tr);
27870
27871 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
27872@@ -8275,6 +8377,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
27873 }
27874 EXPORT_SYMBOL_GPL(ftrace_dump);
27875
27876+int trace_run_command(const char *buf, int (*createfn)(int, char **))
27877+{
27878+ char **argv;
27879+ int argc, ret;
27880+
27881+ argc = 0;
27882+ ret = 0;
27883+ argv = argv_split(GFP_KERNEL, buf, &argc);
27884+ if (!argv)
27885+ return -ENOMEM;
27886+
27887+ if (argc)
27888+ ret = createfn(argc, argv);
27889+
27890+ argv_free(argv);
27891+
27892+ return ret;
27893+}
27894+
27895+#define WRITE_BUFSIZE 4096
27896+
27897+ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
27898+ size_t count, loff_t *ppos,
27899+ int (*createfn)(int, char **))
27900+{
27901+ char *kbuf, *buf, *tmp;
27902+ int ret = 0;
27903+ size_t done = 0;
27904+ size_t size;
27905+
27906+ kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
27907+ if (!kbuf)
27908+ return -ENOMEM;
27909+
27910+ while (done < count) {
27911+ size = count - done;
27912+
27913+ if (size >= WRITE_BUFSIZE)
27914+ size = WRITE_BUFSIZE - 1;
27915+
27916+ if (copy_from_user(kbuf, buffer + done, size)) {
27917+ ret = -EFAULT;
27918+ goto out;
27919+ }
27920+ kbuf[size] = '\0';
27921+ buf = kbuf;
27922+ do {
27923+ tmp = strchr(buf, '\n');
27924+ if (tmp) {
27925+ *tmp = '\0';
27926+ size = tmp - buf + 1;
27927+ } else {
27928+ size = strlen(buf);
27929+ if (done + size < count) {
27930+ if (buf != kbuf)
27931+ break;
27932+ /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
27933+ pr_warn("Line length is too long: Should be less than %d\n",
27934+ WRITE_BUFSIZE - 2);
27935+ ret = -EINVAL;
27936+ goto out;
27937+ }
27938+ }
27939+ done += size;
27940+
27941+ /* Remove comments */
27942+ tmp = strchr(buf, '#');
27943+
27944+ if (tmp)
27945+ *tmp = '\0';
27946+
27947+ ret = trace_run_command(buf, createfn);
27948+ if (ret)
27949+ goto out;
27950+ buf += size;
27951+
27952+ } while (done < count);
27953+ }
27954+ ret = done;
27955+
27956+out:
27957+ kfree(kbuf);
27958+
27959+ return ret;
27960+}
27961+
27962 __init static int tracer_alloc_buffers(void)
27963 {
27964 int ring_buf_size;
27965@@ -8375,6 +8563,7 @@ __init static int tracer_alloc_buffers(void)
27966
27967 INIT_LIST_HEAD(&global_trace.systems);
27968 INIT_LIST_HEAD(&global_trace.events);
27969+ INIT_LIST_HEAD(&global_trace.hist_vars);
27970 list_add(&global_trace.list, &ftrace_trace_arrays);
27971
27972 apply_trace_boot_options();
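
The trace.c hunks above add trace_run_command() and trace_parse_run_command(), generic helpers (used by the hist/synthetic-event code later in this patch) that turn text written to a tracefs file into commands: the buffer is consumed in newline-delimited chunks, '#' comments are stripped, each line is split into an argv with argv_split() and handed to a creation callback. The sketch below reproduces that parsing loop in plain userspace C; the helper names, the MAX_ARGS limit and the sample input line are illustrative, and the real code additionally handles partial writes that span the 4 KiB copy buffer.

#define _POSIX_C_SOURCE 200809L         /* for strtok_r() */
#include <stdio.h>
#include <string.h>

#define MAX_ARGS 16

/* Strip a '#' comment, tokenize the line and hand it to the callback. */
static int run_line(char *line, int (*createfn)(int, char **))
{
        char *argv[MAX_ARGS];
        char *save, *tok, *hash;
        int argc = 0;

        hash = strchr(line, '#');
        if (hash)
                *hash = '\0';

        for (tok = strtok_r(line, " \t", &save);
             tok && argc < MAX_ARGS;
             tok = strtok_r(NULL, " \t", &save))
                argv[argc++] = tok;

        return argc ? createfn(argc, argv) : 0; /* empty lines are skipped */
}

/* Split the written buffer into lines and run each one. */
static int parse_run_commands(char *buf, int (*createfn)(int, char **))
{
        char *save, *line;
        int ret = 0;

        for (line = strtok_r(buf, "\n", &save);
             line && !ret;
             line = strtok_r(NULL, "\n", &save))
                ret = run_line(line, createfn);

        return ret;
}

static int toy_createfn(int argc, char **argv)
{
        for (int i = 0; i < argc; i++)
                printf("arg[%d] = %s\n", i, argv[i]);
        return 0;
}

int main(void)
{
        char buf[] = "wakeup_latency u64 lat; pid_t pid   # a synthetic event\n"
                     "# a full-line comment\n";

        return parse_run_commands(buf, toy_createfn);
}
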
27973diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
27974index 851cd1605085..18bf383f46e8 100644
27975--- a/kernel/trace/trace.h
27976+++ b/kernel/trace/trace.h
27977@@ -127,6 +127,7 @@ struct kretprobe_trace_entry_head {
27978 * NEED_RESCHED - reschedule is requested
27979 * HARDIRQ - inside an interrupt handler
27980 * SOFTIRQ - inside a softirq handler
27981+ * NEED_RESCHED_LAZY - lazy reschedule is requested
27982 */
27983 enum trace_flag_type {
27984 TRACE_FLAG_IRQS_OFF = 0x01,
27985@@ -136,6 +137,7 @@ enum trace_flag_type {
27986 TRACE_FLAG_SOFTIRQ = 0x10,
27987 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
27988 TRACE_FLAG_NMI = 0x40,
27989+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x80,
27990 };
27991
27992 #define TRACE_BUF_SIZE 1024
27993@@ -273,6 +275,8 @@ struct trace_array {
27994 /* function tracing enabled */
27995 int function_enabled;
27996 #endif
27997+ int time_stamp_abs_ref;
27998+ struct list_head hist_vars;
27999 };
28000
28001 enum {
28002@@ -286,6 +290,11 @@ extern struct mutex trace_types_lock;
28003 extern int trace_array_get(struct trace_array *tr);
28004 extern void trace_array_put(struct trace_array *tr);
28005
28006+extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
28007+extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
28008+
28009+extern bool trace_clock_in_ns(struct trace_array *tr);
28010+
28011 /*
28012 * The global tracer (top) should be the first trace array added,
28013 * but we check the flag anyway.
28014@@ -1293,7 +1302,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
28015 unsigned long eflags = file->flags;
28016
28017 if (eflags & EVENT_FILE_FL_TRIGGER_COND)
28018- *tt = event_triggers_call(file, entry);
28019+ *tt = event_triggers_call(file, entry, event);
28020
28021 if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
28022 (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
28023@@ -1330,7 +1339,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
28024 trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
28025
28026 if (tt)
28027- event_triggers_post_call(file, tt, entry);
28028+ event_triggers_post_call(file, tt, entry, event);
28029 }
28030
28031 /**
28032@@ -1363,7 +1372,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
28033 irq_flags, pc, regs);
28034
28035 if (tt)
28036- event_triggers_post_call(file, tt, entry);
28037+ event_triggers_post_call(file, tt, entry, event);
28038 }
28039
28040 #define FILTER_PRED_INVALID ((unsigned short)-1)
28041@@ -1545,6 +1554,8 @@ extern void pause_named_trigger(struct event_trigger_data *data);
28042 extern void unpause_named_trigger(struct event_trigger_data *data);
28043 extern void set_named_trigger_data(struct event_trigger_data *data,
28044 struct event_trigger_data *named_data);
28045+extern struct event_trigger_data *
28046+get_named_trigger_data(struct event_trigger_data *data);
28047 extern int register_event_command(struct event_command *cmd);
28048 extern int unregister_event_command(struct event_command *cmd);
28049 extern int register_trigger_hist_enable_disable_cmds(void);
28050@@ -1588,7 +1599,8 @@ extern int register_trigger_hist_enable_disable_cmds(void);
28051 */
28052 struct event_trigger_ops {
28053 void (*func)(struct event_trigger_data *data,
28054- void *rec);
28055+ void *rec,
28056+ struct ring_buffer_event *rbe);
28057 int (*init)(struct event_trigger_ops *ops,
28058 struct event_trigger_data *data);
28059 void (*free)(struct event_trigger_ops *ops,
28060@@ -1755,6 +1767,13 @@ void trace_printk_start_comm(void);
28061 int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
28062 int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
28063
28064+#define MAX_EVENT_NAME_LEN 64
28065+
28066+extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
28067+extern ssize_t trace_parse_run_command(struct file *file,
28068+ const char __user *buffer, size_t count, loff_t *ppos,
28069+ int (*createfn)(int, char**));
28070+
28071 /*
28072 * Normal trace_printk() and friends allocates special buffers
28073 * to do the manipulation, as well as saves the print formats
28074diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
28075index d53268a4e167..9ba230a4052f 100644
28076--- a/kernel/trace/trace_events.c
28077+++ b/kernel/trace/trace_events.c
28078@@ -187,6 +187,8 @@ static int trace_define_common_fields(void)
28079 __common_field(unsigned char, flags);
28080 __common_field(unsigned char, preempt_count);
28081 __common_field(int, pid);
28082+ __common_field(unsigned short, migrate_disable);
28083+ __common_field(unsigned short, padding);
28084
28085 return ret;
28086 }
28087@@ -1406,8 +1408,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
28088 return -ENODEV;
28089
28090 /* Make sure the system still exists */
28091- mutex_lock(&trace_types_lock);
28092 mutex_lock(&event_mutex);
28093+ mutex_lock(&trace_types_lock);
28094 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
28095 list_for_each_entry(dir, &tr->systems, list) {
28096 if (dir == inode->i_private) {
28097@@ -1421,8 +1423,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
28098 }
28099 }
28100 exit_loop:
28101- mutex_unlock(&event_mutex);
28102 mutex_unlock(&trace_types_lock);
28103+ mutex_unlock(&event_mutex);
28104
28105 if (!system)
28106 return -ENODEV;
28107@@ -2308,15 +2310,15 @@ static void __add_event_to_tracers(struct trace_event_call *call);
28108 int trace_add_event_call(struct trace_event_call *call)
28109 {
28110 int ret;
28111- mutex_lock(&trace_types_lock);
28112 mutex_lock(&event_mutex);
28113+ mutex_lock(&trace_types_lock);
28114
28115 ret = __register_event(call, NULL);
28116 if (ret >= 0)
28117 __add_event_to_tracers(call);
28118
28119- mutex_unlock(&event_mutex);
28120 mutex_unlock(&trace_types_lock);
28121+ mutex_unlock(&event_mutex);
28122 return ret;
28123 }
28124
28125@@ -2370,13 +2372,13 @@ int trace_remove_event_call(struct trace_event_call *call)
28126 {
28127 int ret;
28128
28129- mutex_lock(&trace_types_lock);
28130 mutex_lock(&event_mutex);
28131+ mutex_lock(&trace_types_lock);
28132 down_write(&trace_event_sem);
28133 ret = probe_remove_event_call(call);
28134 up_write(&trace_event_sem);
28135- mutex_unlock(&event_mutex);
28136 mutex_unlock(&trace_types_lock);
28137+ mutex_unlock(&event_mutex);
28138
28139 return ret;
28140 }
28141@@ -2438,8 +2440,8 @@ static int trace_module_notify(struct notifier_block *self,
28142 {
28143 struct module *mod = data;
28144
28145- mutex_lock(&trace_types_lock);
28146 mutex_lock(&event_mutex);
28147+ mutex_lock(&trace_types_lock);
28148 switch (val) {
28149 case MODULE_STATE_COMING:
28150 trace_module_add_events(mod);
28151@@ -2448,8 +2450,8 @@ static int trace_module_notify(struct notifier_block *self,
28152 trace_module_remove_events(mod);
28153 break;
28154 }
28155- mutex_unlock(&event_mutex);
28156 mutex_unlock(&trace_types_lock);
28157+ mutex_unlock(&event_mutex);
28158
28159 return 0;
28160 }
28161@@ -2964,24 +2966,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
28162 * creates the event hierachry in the @parent/events directory.
28163 *
28164 * Returns 0 on success.
28165+ *
28166+ * Must be called with event_mutex held.
28167 */
28168 int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
28169 {
28170 int ret;
28171
28172- mutex_lock(&event_mutex);
28173+ lockdep_assert_held(&event_mutex);
28174
28175 ret = create_event_toplevel_files(parent, tr);
28176 if (ret)
28177- goto out_unlock;
28178+ goto out;
28179
28180 down_write(&trace_event_sem);
28181 __trace_add_event_dirs(tr);
28182 up_write(&trace_event_sem);
28183
28184- out_unlock:
28185- mutex_unlock(&event_mutex);
28186-
28187+ out:
28188 return ret;
28189 }
28190
28191@@ -3010,9 +3012,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
28192 return ret;
28193 }
28194
28195+/* Must be called with event_mutex held */
28196 int event_trace_del_tracer(struct trace_array *tr)
28197 {
28198- mutex_lock(&event_mutex);
28199+ lockdep_assert_held(&event_mutex);
28200
28201 /* Disable any event triggers and associated soft-disabled events */
28202 clear_event_triggers(tr);
28203@@ -3033,8 +3036,6 @@ int event_trace_del_tracer(struct trace_array *tr)
28204
28205 tr->event_dir = NULL;
28206
28207- mutex_unlock(&event_mutex);
28208-
28209 return 0;
28210 }
28211
28212diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
28213index 7eb975a2d0e1..24bc0769fdd6 100644
28214--- a/kernel/trace/trace_events_hist.c
28215+++ b/kernel/trace/trace_events_hist.c
28216@@ -20,13 +20,39 @@
28217 #include <linux/slab.h>
28218 #include <linux/stacktrace.h>
28219 #include <linux/rculist.h>
28220+#include <linux/tracefs.h>
28221
28222 #include "tracing_map.h"
28223 #include "trace.h"
28224
28225+#define SYNTH_SYSTEM "synthetic"
28226+#define SYNTH_FIELDS_MAX 16
28227+
28228+#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
28229+
28230 struct hist_field;
28231
28232-typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
28233+typedef u64 (*hist_field_fn_t) (struct hist_field *field,
28234+ struct tracing_map_elt *elt,
28235+ struct ring_buffer_event *rbe,
28236+ void *event);
28237+
28238+#define HIST_FIELD_OPERANDS_MAX 2
28239+#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
28240+#define HIST_ACTIONS_MAX 8
28241+
28242+enum field_op_id {
28243+ FIELD_OP_NONE,
28244+ FIELD_OP_PLUS,
28245+ FIELD_OP_MINUS,
28246+ FIELD_OP_UNARY_MINUS,
28247+};
28248+
28249+struct hist_var {
28250+ char *name;
28251+ struct hist_trigger_data *hist_data;
28252+ unsigned int idx;
28253+};
28254
28255 struct hist_field {
28256 struct ftrace_event_field *field;
28257@@ -34,26 +60,50 @@ struct hist_field {
28258 hist_field_fn_t fn;
28259 unsigned int size;
28260 unsigned int offset;
28261+ unsigned int is_signed;
28262+ const char *type;
28263+ struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
28264+ struct hist_trigger_data *hist_data;
28265+ struct hist_var var;
28266+ enum field_op_id operator;
28267+ char *system;
28268+ char *event_name;
28269+ char *name;
28270+ unsigned int var_idx;
28271+ unsigned int var_ref_idx;
28272+ bool read_once;
28273 };
28274
28275-static u64 hist_field_none(struct hist_field *field, void *event)
28276+static u64 hist_field_none(struct hist_field *field,
28277+ struct tracing_map_elt *elt,
28278+ struct ring_buffer_event *rbe,
28279+ void *event)
28280 {
28281 return 0;
28282 }
28283
28284-static u64 hist_field_counter(struct hist_field *field, void *event)
28285+static u64 hist_field_counter(struct hist_field *field,
28286+ struct tracing_map_elt *elt,
28287+ struct ring_buffer_event *rbe,
28288+ void *event)
28289 {
28290 return 1;
28291 }
28292
28293-static u64 hist_field_string(struct hist_field *hist_field, void *event)
28294+static u64 hist_field_string(struct hist_field *hist_field,
28295+ struct tracing_map_elt *elt,
28296+ struct ring_buffer_event *rbe,
28297+ void *event)
28298 {
28299 char *addr = (char *)(event + hist_field->field->offset);
28300
28301 return (u64)(unsigned long)addr;
28302 }
28303
28304-static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
28305+static u64 hist_field_dynstring(struct hist_field *hist_field,
28306+ struct tracing_map_elt *elt,
28307+ struct ring_buffer_event *rbe,
28308+ void *event)
28309 {
28310 u32 str_item = *(u32 *)(event + hist_field->field->offset);
28311 int str_loc = str_item & 0xffff;
28312@@ -62,22 +112,74 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
28313 return (u64)(unsigned long)addr;
28314 }
28315
28316-static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
28317+static u64 hist_field_pstring(struct hist_field *hist_field,
28318+ struct tracing_map_elt *elt,
28319+ struct ring_buffer_event *rbe,
28320+ void *event)
28321 {
28322 char **addr = (char **)(event + hist_field->field->offset);
28323
28324 return (u64)(unsigned long)*addr;
28325 }
28326
28327-static u64 hist_field_log2(struct hist_field *hist_field, void *event)
28328+static u64 hist_field_log2(struct hist_field *hist_field,
28329+ struct tracing_map_elt *elt,
28330+ struct ring_buffer_event *rbe,
28331+ void *event)
28332 {
28333- u64 val = *(u64 *)(event + hist_field->field->offset);
28334+ struct hist_field *operand = hist_field->operands[0];
28335+
28336+ u64 val = operand->fn(operand, elt, rbe, event);
28337
28338 return (u64) ilog2(roundup_pow_of_two(val));
28339 }
28340
28341+static u64 hist_field_plus(struct hist_field *hist_field,
28342+ struct tracing_map_elt *elt,
28343+ struct ring_buffer_event *rbe,
28344+ void *event)
28345+{
28346+ struct hist_field *operand1 = hist_field->operands[0];
28347+ struct hist_field *operand2 = hist_field->operands[1];
28348+
28349+ u64 val1 = operand1->fn(operand1, elt, rbe, event);
28350+ u64 val2 = operand2->fn(operand2, elt, rbe, event);
28351+
28352+ return val1 + val2;
28353+}
28354+
28355+static u64 hist_field_minus(struct hist_field *hist_field,
28356+ struct tracing_map_elt *elt,
28357+ struct ring_buffer_event *rbe,
28358+ void *event)
28359+{
28360+ struct hist_field *operand1 = hist_field->operands[0];
28361+ struct hist_field *operand2 = hist_field->operands[1];
28362+
28363+ u64 val1 = operand1->fn(operand1, elt, rbe, event);
28364+ u64 val2 = operand2->fn(operand2, elt, rbe, event);
28365+
28366+ return val1 - val2;
28367+}
28368+
28369+static u64 hist_field_unary_minus(struct hist_field *hist_field,
28370+ struct tracing_map_elt *elt,
28371+ struct ring_buffer_event *rbe,
28372+ void *event)
28373+{
28374+ struct hist_field *operand = hist_field->operands[0];
28375+
28376+ s64 sval = (s64)operand->fn(operand, elt, rbe, event);
28377+ u64 val = (u64)-sval;
28378+
28379+ return val;
28380+}
28381+
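/*
 * A minimal userspace sketch of the dispatch pattern used by
 * hist_field_plus()/hist_field_minus()/hist_field_unary_minus() above:
 * an expression tree is evaluated by calling ->fn on each entry of
 * ->operands[]. The tracing_map_elt/ring_buffer_event arguments are
 * dropped and all names are illustrative, not taken from this patch.
 */
#include <stdio.h>

struct node;
typedef unsigned long long (*eval_fn_t)(struct node *node);

struct node {
	eval_fn_t fn;
	unsigned long long val;		/* used by leaves */
	struct node *operands[2];	/* used by operators */
};

static unsigned long long eval_const(struct node *node)
{
	return node->val;
}

static unsigned long long eval_minus(struct node *node)
{
	/* same shape as hist_field_minus(): evaluate operands, then combine */
	return node->operands[0]->fn(node->operands[0]) -
	       node->operands[1]->fn(node->operands[1]);
}

int main(void)
{
	struct node a = { .fn = eval_const, .val = 42 };
	struct node b = { .fn = eval_const, .val = 40 };
	struct node expr = { .fn = eval_minus, .operands = { &a, &b } };

	printf("%llu\n", expr.fn(&expr));	/* prints 2 */
	return 0;
}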
28382 #define DEFINE_HIST_FIELD_FN(type) \
28383-static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
28384+ static u64 hist_field_##type(struct hist_field *hist_field, \
28385+ struct tracing_map_elt *elt, \
28386+ struct ring_buffer_event *rbe, \
28387+ void *event) \
28388 { \
28389 type *addr = (type *)(event + hist_field->field->offset); \
28390 \
28391@@ -110,16 +212,29 @@ DEFINE_HIST_FIELD_FN(u8);
28392 #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
28393
28394 enum hist_field_flags {
28395- HIST_FIELD_FL_HITCOUNT = 1,
28396- HIST_FIELD_FL_KEY = 2,
28397- HIST_FIELD_FL_STRING = 4,
28398- HIST_FIELD_FL_HEX = 8,
28399- HIST_FIELD_FL_SYM = 16,
28400- HIST_FIELD_FL_SYM_OFFSET = 32,
28401- HIST_FIELD_FL_EXECNAME = 64,
28402- HIST_FIELD_FL_SYSCALL = 128,
28403- HIST_FIELD_FL_STACKTRACE = 256,
28404- HIST_FIELD_FL_LOG2 = 512,
28405+ HIST_FIELD_FL_HITCOUNT = 1 << 0,
28406+ HIST_FIELD_FL_KEY = 1 << 1,
28407+ HIST_FIELD_FL_STRING = 1 << 2,
28408+ HIST_FIELD_FL_HEX = 1 << 3,
28409+ HIST_FIELD_FL_SYM = 1 << 4,
28410+ HIST_FIELD_FL_SYM_OFFSET = 1 << 5,
28411+ HIST_FIELD_FL_EXECNAME = 1 << 6,
28412+ HIST_FIELD_FL_SYSCALL = 1 << 7,
28413+ HIST_FIELD_FL_STACKTRACE = 1 << 8,
28414+ HIST_FIELD_FL_LOG2 = 1 << 9,
28415+ HIST_FIELD_FL_TIMESTAMP = 1 << 10,
28416+ HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11,
28417+ HIST_FIELD_FL_VAR = 1 << 12,
28418+ HIST_FIELD_FL_EXPR = 1 << 13,
28419+ HIST_FIELD_FL_VAR_REF = 1 << 14,
28420+ HIST_FIELD_FL_CPU = 1 << 15,
28421+ HIST_FIELD_FL_ALIAS = 1 << 16,
28422+};
28423+
28424+struct var_defs {
28425+ unsigned int n_vars;
28426+ char *name[TRACING_MAP_VARS_MAX];
28427+ char *expr[TRACING_MAP_VARS_MAX];
28428 };
28429
28430 struct hist_trigger_attrs {
28431@@ -127,298 +242,3585 @@ struct hist_trigger_attrs {
28432 char *vals_str;
28433 char *sort_key_str;
28434 char *name;
28435+ char *clock;
28436 bool pause;
28437 bool cont;
28438 bool clear;
28439+ bool ts_in_usecs;
28440 unsigned int map_bits;
28441+
28442+ char *assignment_str[TRACING_MAP_VARS_MAX];
28443+ unsigned int n_assignments;
28444+
28445+ char *action_str[HIST_ACTIONS_MAX];
28446+ unsigned int n_actions;
28447+
28448+ struct var_defs var_defs;
28449+};
28450+
28451+struct field_var {
28452+ struct hist_field *var;
28453+ struct hist_field *val;
28454+};
28455+
28456+struct field_var_hist {
28457+ struct hist_trigger_data *hist_data;
28458+ char *cmd;
28459 };
28460
28461 struct hist_trigger_data {
28462- struct hist_field *fields[TRACING_MAP_FIELDS_MAX];
28463+ struct hist_field *fields[HIST_FIELDS_MAX];
28464 unsigned int n_vals;
28465 unsigned int n_keys;
28466 unsigned int n_fields;
28467+ unsigned int n_vars;
28468 unsigned int key_size;
28469 struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
28470 unsigned int n_sort_keys;
28471 struct trace_event_file *event_file;
28472 struct hist_trigger_attrs *attrs;
28473 struct tracing_map *map;
28474+ bool enable_timestamps;
28475+ bool remove;
28476+ struct hist_field *var_refs[TRACING_MAP_VARS_MAX];
28477+ unsigned int n_var_refs;
28478+
28479+ struct action_data *actions[HIST_ACTIONS_MAX];
28480+ unsigned int n_actions;
28481+
28482+ struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX];
28483+ unsigned int n_synth_var_refs;
28484+ struct field_var *field_vars[SYNTH_FIELDS_MAX];
28485+ unsigned int n_field_vars;
28486+ unsigned int n_field_var_str;
28487+ struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX];
28488+ unsigned int n_field_var_hists;
28489+
28490+ struct field_var *max_vars[SYNTH_FIELDS_MAX];
28491+ unsigned int n_max_vars;
28492+ unsigned int n_max_var_str;
28493 };
28494
28495-static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
28496-{
28497- hist_field_fn_t fn = NULL;
28498+struct synth_field {
28499+ char *type;
28500+ char *name;
28501+ size_t size;
28502+ bool is_signed;
28503+ bool is_string;
28504+};
28505
28506- switch (field_size) {
28507- case 8:
28508- if (field_is_signed)
28509- fn = hist_field_s64;
28510- else
28511- fn = hist_field_u64;
28512- break;
28513- case 4:
28514- if (field_is_signed)
28515- fn = hist_field_s32;
28516- else
28517- fn = hist_field_u32;
28518- break;
28519- case 2:
28520- if (field_is_signed)
28521- fn = hist_field_s16;
28522- else
28523- fn = hist_field_u16;
28524- break;
28525- case 1:
28526- if (field_is_signed)
28527- fn = hist_field_s8;
28528- else
28529- fn = hist_field_u8;
28530- break;
28531- }
28532+struct synth_event {
28533+ struct list_head list;
28534+ int ref;
28535+ char *name;
28536+ struct synth_field **fields;
28537+ unsigned int n_fields;
28538+ unsigned int n_u64;
28539+ struct trace_event_class class;
28540+ struct trace_event_call call;
28541+ struct tracepoint *tp;
28542+};
28543
28544- return fn;
28545+struct action_data;
28546+
28547+typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
28548+ struct tracing_map_elt *elt, void *rec,
28549+ struct ring_buffer_event *rbe,
28550+ struct action_data *data, u64 *var_ref_vals);
28551+
28552+struct action_data {
28553+ action_fn_t fn;
28554+ unsigned int n_params;
28555+ char *params[SYNTH_FIELDS_MAX];
28556+
28557+ union {
28558+ struct {
28559+ unsigned int var_ref_idx;
28560+ char *match_event;
28561+ char *match_event_system;
28562+ char *synth_event_name;
28563+ struct synth_event *synth_event;
28564+ } onmatch;
28565+
28566+ struct {
28567+ char *var_str;
28568+ char *fn_name;
28569+ unsigned int max_var_ref_idx;
28570+ struct hist_field *max_var;
28571+ struct hist_field *var;
28572+ } onmax;
28573+ };
28574+};
28575+
28576+
28577+static char last_hist_cmd[MAX_FILTER_STR_VAL];
28578+static char hist_err_str[MAX_FILTER_STR_VAL];
28579+
28580+static void last_cmd_set(char *str)
28581+{
28582+ if (!str)
28583+ return;
28584+
28585+ strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
28586 }
28587
28588-static int parse_map_size(char *str)
28589+static void hist_err(char *str, char *var)
28590 {
28591- unsigned long size, map_bits;
28592- int ret;
28593+ int maxlen = MAX_FILTER_STR_VAL - 1;
28594
28595- strsep(&str, "=");
28596- if (!str) {
28597- ret = -EINVAL;
28598- goto out;
28599- }
28600+ if (!str)
28601+ return;
28602
28603- ret = kstrtoul(str, 0, &size);
28604- if (ret)
28605- goto out;
28606+ if (strlen(hist_err_str))
28607+ return;
28608
28609- map_bits = ilog2(roundup_pow_of_two(size));
28610- if (map_bits < TRACING_MAP_BITS_MIN ||
28611- map_bits > TRACING_MAP_BITS_MAX)
28612- ret = -EINVAL;
28613- else
28614- ret = map_bits;
28615- out:
28616- return ret;
28617+ if (!var)
28618+ var = "";
28619+
28620+ if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen)
28621+ return;
28622+
28623+ strcat(hist_err_str, str);
28624+ strcat(hist_err_str, var);
28625 }
28626
28627-static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
28628+static void hist_err_event(char *str, char *system, char *event, char *var)
28629 {
28630- if (!attrs)
28631- return;
28632+ char err[MAX_FILTER_STR_VAL];
28633
28634- kfree(attrs->name);
28635- kfree(attrs->sort_key_str);
28636- kfree(attrs->keys_str);
28637- kfree(attrs->vals_str);
28638- kfree(attrs);
28639+ if (system && var)
28640+ snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
28641+ else if (system)
28642+ snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
28643+ else
28644+ strncpy(err, var, MAX_FILTER_STR_VAL);
28645+
28646+ hist_err(str, err);
28647 }
28648
28649-static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
28650+static void hist_err_clear(void)
28651 {
28652- struct hist_trigger_attrs *attrs;
28653- int ret = 0;
28654+ hist_err_str[0] = '\0';
28655+}
28656
28657- attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
28658- if (!attrs)
28659- return ERR_PTR(-ENOMEM);
28660+static bool have_hist_err(void)
28661+{
28662+ if (strlen(hist_err_str))
28663+ return true;
28664
28665- while (trigger_str) {
28666- char *str = strsep(&trigger_str, ":");
28667+ return false;
28668+}
28669
28670- if ((strncmp(str, "key=", strlen("key=")) == 0) ||
28671- (strncmp(str, "keys=", strlen("keys=")) == 0))
28672- attrs->keys_str = kstrdup(str, GFP_KERNEL);
28673- else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
28674- (strncmp(str, "vals=", strlen("vals=")) == 0) ||
28675- (strncmp(str, "values=", strlen("values=")) == 0))
28676- attrs->vals_str = kstrdup(str, GFP_KERNEL);
28677- else if (strncmp(str, "sort=", strlen("sort=")) == 0)
28678- attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
28679- else if (strncmp(str, "name=", strlen("name=")) == 0)
28680- attrs->name = kstrdup(str, GFP_KERNEL);
28681- else if (strcmp(str, "pause") == 0)
28682- attrs->pause = true;
28683- else if ((strcmp(str, "cont") == 0) ||
28684- (strcmp(str, "continue") == 0))
28685- attrs->cont = true;
28686- else if (strcmp(str, "clear") == 0)
28687- attrs->clear = true;
28688- else if (strncmp(str, "size=", strlen("size=")) == 0) {
28689- int map_bits = parse_map_size(str);
28690+static LIST_HEAD(synth_event_list);
28691+static DEFINE_MUTEX(synth_event_mutex);
28692
28693- if (map_bits < 0) {
28694- ret = map_bits;
28695- goto free;
28696- }
28697- attrs->map_bits = map_bits;
28698+struct synth_trace_event {
28699+ struct trace_entry ent;
28700+ u64 fields[];
28701+};
28702+
28703+static int synth_event_define_fields(struct trace_event_call *call)
28704+{
28705+ struct synth_trace_event trace;
28706+ int offset = offsetof(typeof(trace), fields);
28707+ struct synth_event *event = call->data;
28708+ unsigned int i, size, n_u64;
28709+ char *name, *type;
28710+ bool is_signed;
28711+ int ret = 0;
28712+
28713+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
28714+ size = event->fields[i]->size;
28715+ is_signed = event->fields[i]->is_signed;
28716+ type = event->fields[i]->type;
28717+ name = event->fields[i]->name;
28718+ ret = trace_define_field(call, type, name, offset, size,
28719+ is_signed, FILTER_OTHER);
28720+ if (ret)
28721+ break;
28722+
28723+ if (event->fields[i]->is_string) {
28724+ offset += STR_VAR_LEN_MAX;
28725+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
28726 } else {
28727- ret = -EINVAL;
28728- goto free;
28729+ offset += sizeof(u64);
28730+ n_u64++;
28731 }
28732 }
28733
28734- if (!attrs->keys_str) {
28735- ret = -EINVAL;
28736- goto free;
28737- }
28738+ event->n_u64 = n_u64;
28739
28740- return attrs;
28741- free:
28742- destroy_hist_trigger_attrs(attrs);
28743+ return ret;
28744+}
28745
28746- return ERR_PTR(ret);
28747+static bool synth_field_signed(char *type)
28748+{
28749+ if (strncmp(type, "u", 1) == 0)
28750+ return false;
28751+
28752+ return true;
28753 }
28754
28755-static inline void save_comm(char *comm, struct task_struct *task)
28756+static int synth_field_is_string(char *type)
28757 {
28758- if (!task->pid) {
28759- strcpy(comm, "<idle>");
28760- return;
28761- }
28762+ if (strstr(type, "char[") != NULL)
28763+ return true;
28764
28765- if (WARN_ON_ONCE(task->pid < 0)) {
28766- strcpy(comm, "<XXX>");
28767- return;
28768- }
28769+ return false;
28770+}
28771
28772- memcpy(comm, task->comm, TASK_COMM_LEN);
28773+static int synth_field_string_size(char *type)
28774+{
28775+ char buf[4], *end, *start;
28776+ unsigned int len;
28777+ int size, err;
28778+
28779+ start = strstr(type, "char[");
28780+ if (start == NULL)
28781+ return -EINVAL;
28782+ start += strlen("char[");
28783+
28784+ end = strchr(type, ']');
28785+ if (!end || end < start)
28786+ return -EINVAL;
28787+
28788+ len = end - start;
28789+ if (len > 3)
28790+ return -EINVAL;
28791+
28792+ strncpy(buf, start, len);
28793+ buf[len] = '\0';
28794+
28795+ err = kstrtouint(buf, 0, &size);
28796+ if (err)
28797+ return err;
28798+
28799+ if (size > STR_VAR_LEN_MAX)
28800+ return -EINVAL;
28801+
28802+ return size;
28803 }
28804
28805-static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
28806+static int synth_field_size(char *type)
28807 {
28808- kfree((char *)elt->private_data);
28809+ int size = 0;
28810+
28811+ if (strcmp(type, "s64") == 0)
28812+ size = sizeof(s64);
28813+ else if (strcmp(type, "u64") == 0)
28814+ size = sizeof(u64);
28815+ else if (strcmp(type, "s32") == 0)
28816+ size = sizeof(s32);
28817+ else if (strcmp(type, "u32") == 0)
28818+ size = sizeof(u32);
28819+ else if (strcmp(type, "s16") == 0)
28820+ size = sizeof(s16);
28821+ else if (strcmp(type, "u16") == 0)
28822+ size = sizeof(u16);
28823+ else if (strcmp(type, "s8") == 0)
28824+ size = sizeof(s8);
28825+ else if (strcmp(type, "u8") == 0)
28826+ size = sizeof(u8);
28827+ else if (strcmp(type, "char") == 0)
28828+ size = sizeof(char);
28829+ else if (strcmp(type, "unsigned char") == 0)
28830+ size = sizeof(unsigned char);
28831+ else if (strcmp(type, "int") == 0)
28832+ size = sizeof(int);
28833+ else if (strcmp(type, "unsigned int") == 0)
28834+ size = sizeof(unsigned int);
28835+ else if (strcmp(type, "long") == 0)
28836+ size = sizeof(long);
28837+ else if (strcmp(type, "unsigned long") == 0)
28838+ size = sizeof(unsigned long);
28839+ else if (strcmp(type, "pid_t") == 0)
28840+ size = sizeof(pid_t);
28841+ else if (synth_field_is_string(type))
28842+ size = synth_field_string_size(type);
28843+
28844+ return size;
28845 }
28846
28847-static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
28848+static const char *synth_field_fmt(char *type)
28849 {
28850- struct hist_trigger_data *hist_data = elt->map->private_data;
28851- struct hist_field *key_field;
28852- unsigned int i;
28853+ const char *fmt = "%llu";
28854+
28855+ if (strcmp(type, "s64") == 0)
28856+ fmt = "%lld";
28857+ else if (strcmp(type, "u64") == 0)
28858+ fmt = "%llu";
28859+ else if (strcmp(type, "s32") == 0)
28860+ fmt = "%d";
28861+ else if (strcmp(type, "u32") == 0)
28862+ fmt = "%u";
28863+ else if (strcmp(type, "s16") == 0)
28864+ fmt = "%d";
28865+ else if (strcmp(type, "u16") == 0)
28866+ fmt = "%u";
28867+ else if (strcmp(type, "s8") == 0)
28868+ fmt = "%d";
28869+ else if (strcmp(type, "u8") == 0)
28870+ fmt = "%u";
28871+ else if (strcmp(type, "char") == 0)
28872+ fmt = "%d";
28873+ else if (strcmp(type, "unsigned char") == 0)
28874+ fmt = "%u";
28875+ else if (strcmp(type, "int") == 0)
28876+ fmt = "%d";
28877+ else if (strcmp(type, "unsigned int") == 0)
28878+ fmt = "%u";
28879+ else if (strcmp(type, "long") == 0)
28880+ fmt = "%ld";
28881+ else if (strcmp(type, "unsigned long") == 0)
28882+ fmt = "%lu";
28883+ else if (strcmp(type, "pid_t") == 0)
28884+ fmt = "%d";
28885+ else if (synth_field_is_string(type))
28886+ fmt = "%s";
28887+
28888+ return fmt;
28889+}
28890
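/*
 * Worked examples for the two helpers above, assuming the usual Linux C
 * ABI (pid_t is an int); sizes per synth_field_size(), formats per
 * synth_field_fmt():
 *
 *   "u64"      -> size 8,  fmt "%llu"
 *   "pid_t"    -> size 4,  fmt "%d"
 *   "char[16]" -> size 16, fmt "%s"	(16 <= STR_VAR_LEN_MAX of 32)
 *
 * A small userspace check of the size column (sketch only):
 */
#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>

int main(void)
{
	printf("u64      -> %zu\n", sizeof(uint64_t));	/* 8 */
	printf("pid_t    -> %zu\n", sizeof(pid_t));	/* 4 on typical Linux ABIs */
	printf("char[16] -> %d\n", 16);			/* string payload size */
	return 0;
}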
28891- for_each_hist_key_field(i, hist_data) {
28892- key_field = hist_data->fields[i];
28893+static enum print_line_t print_synth_event(struct trace_iterator *iter,
28894+ int flags,
28895+ struct trace_event *event)
28896+{
28897+ struct trace_array *tr = iter->tr;
28898+ struct trace_seq *s = &iter->seq;
28899+ struct synth_trace_event *entry;
28900+ struct synth_event *se;
28901+ unsigned int i, n_u64;
28902+ char print_fmt[32];
28903+ const char *fmt;
28904
28905- if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
28906- unsigned int size = TASK_COMM_LEN + 1;
28907+ entry = (struct synth_trace_event *)iter->ent;
28908+ se = container_of(event, struct synth_event, call.event);
28909
28910- elt->private_data = kzalloc(size, GFP_KERNEL);
28911- if (!elt->private_data)
28912- return -ENOMEM;
28913- break;
28914+ trace_seq_printf(s, "%s: ", se->name);
28915+
28916+ for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
28917+ if (trace_seq_has_overflowed(s))
28918+ goto end;
28919+
28920+ fmt = synth_field_fmt(se->fields[i]->type);
28921+
28922+ /* parameter types */
28923+ if (tr->trace_flags & TRACE_ITER_VERBOSE)
28924+ trace_seq_printf(s, "%s ", fmt);
28925+
28926+ snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
28927+
28928+ /* parameter values */
28929+ if (se->fields[i]->is_string) {
28930+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
28931+ (char *)&entry->fields[n_u64],
28932+ i == se->n_fields - 1 ? "" : " ");
28933+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
28934+ } else {
28935+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
28936+ entry->fields[n_u64],
28937+ i == se->n_fields - 1 ? "" : " ");
28938+ n_u64++;
28939 }
28940 }
28941+end:
28942+ trace_seq_putc(s, '\n');
28943
28944- return 0;
28945+ return trace_handle_return(s);
28946 }
28947
28948-static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
28949- struct tracing_map_elt *from)
28950+static struct trace_event_functions synth_event_funcs = {
28951+ .trace = print_synth_event
28952+};
28953+
28954+static notrace void trace_event_raw_event_synth(void *__data,
28955+ u64 *var_ref_vals,
28956+ unsigned int var_ref_idx)
28957 {
28958- char *comm_from = from->private_data;
28959- char *comm_to = to->private_data;
28960+ struct trace_event_file *trace_file = __data;
28961+ struct synth_trace_event *entry;
28962+ struct trace_event_buffer fbuffer;
28963+ struct ring_buffer *buffer;
28964+ struct synth_event *event;
28965+ unsigned int i, n_u64;
28966+ int fields_size = 0;
28967+
28968+ event = trace_file->event_call->data;
28969+
28970+ if (trace_trigger_soft_disabled(trace_file))
28971+ return;
28972+
28973+ fields_size = event->n_u64 * sizeof(u64);
28974+
28975+ /*
28976+ * Avoid ring buffer recursion detection, as this event
28977+ * is being performed within another event.
28978+ */
28979+ buffer = trace_file->tr->trace_buffer.buffer;
28980+ ring_buffer_nest_start(buffer);
28981+
28982+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
28983+ sizeof(*entry) + fields_size);
28984+ if (!entry)
28985+ goto out;
28986+
28987+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
28988+ if (event->fields[i]->is_string) {
28989+ char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
28990+ char *str_field = (char *)&entry->fields[n_u64];
28991+
28992+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
28993+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
28994+ } else {
28995+ entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
28996+ n_u64++;
28997+ }
28998+ }
28999+
29000+ trace_event_buffer_commit(&fbuffer);
29001+out:
29002+ ring_buffer_nest_end(buffer);
29003+}
29004
29005- if (comm_from)
29006- memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
29007+static void free_synth_event_print_fmt(struct trace_event_call *call)
29008+{
29009+ if (call) {
29010+ kfree(call->print_fmt);
29011+ call->print_fmt = NULL;
29012+ }
29013 }
29014
29015-static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
29016+static int __set_synth_event_print_fmt(struct synth_event *event,
29017+ char *buf, int len)
29018 {
29019- char *comm = elt->private_data;
29020+ const char *fmt;
29021+ int pos = 0;
29022+ int i;
29023+
29024+ /* When len=0, we just calculate the needed length */
29025+#define LEN_OR_ZERO (len ? len - pos : 0)
29026+
29027+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
29028+ for (i = 0; i < event->n_fields; i++) {
29029+ fmt = synth_field_fmt(event->fields[i]->type);
29030+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
29031+ event->fields[i]->name, fmt,
29032+ i == event->n_fields - 1 ? "" : ", ");
29033+ }
29034+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
29035+
29036+ for (i = 0; i < event->n_fields; i++) {
29037+ pos += snprintf(buf + pos, LEN_OR_ZERO,
29038+ ", REC->%s", event->fields[i]->name);
29039+ }
29040+
29041+#undef LEN_OR_ZERO
29042
29043- if (comm)
29044- save_comm(comm, current);
29045+ /* return the length of print_fmt */
29046+ return pos;
29047 }
29048
29049-static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
29050- .elt_alloc = hist_trigger_elt_comm_alloc,
29051- .elt_copy = hist_trigger_elt_comm_copy,
29052- .elt_free = hist_trigger_elt_comm_free,
29053- .elt_init = hist_trigger_elt_comm_init,
29054-};
29055+static int set_synth_event_print_fmt(struct trace_event_call *call)
29056+{
29057+ struct synth_event *event = call->data;
29058+ char *print_fmt;
29059+ int len;
29060+
29061+ /* First: called with 0 length to calculate the needed length */
29062+ len = __set_synth_event_print_fmt(event, NULL, 0);
29063+
29064+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
29065+ if (!print_fmt)
29066+ return -ENOMEM;
29067+
29068+ /* Second: actually write the @print_fmt */
29069+ __set_synth_event_print_fmt(event, print_fmt, len + 1);
29070+ call->print_fmt = print_fmt;
29071
29072-static void destroy_hist_field(struct hist_field *hist_field)
29073+ return 0;
29074+}
29075+
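/*
 * The two functions above use the common two-pass snprintf idiom: a first
 * call with len == 0 only measures, then the buffer is allocated and the
 * same function fills it. A standalone userspace sketch of that pattern,
 * with a made-up build_fmt() helper (illustrative only):
 */
#include <stdio.h>
#include <stdlib.h>

static int build_fmt(char *buf, int len, const char *name)
{
	int pos = 0;

	/* When len == 0 only the required length is computed. */
#define LEN_OR_ZERO (len ? len - pos : 0)
#define BUF_OR_NULL (len ? buf + pos : NULL)
	pos += snprintf(BUF_OR_NULL, LEN_OR_ZERO, "\"%s=%%llu\"", name);
	pos += snprintf(BUF_OR_NULL, LEN_OR_ZERO, ", REC->%s", name);
#undef BUF_OR_NULL
#undef LEN_OR_ZERO

	return pos;
}

int main(void)
{
	int len = build_fmt(NULL, 0, "lat");	/* first pass: length only */
	char *fmt = malloc(len + 1);

	if (!fmt)
		return 1;
	build_fmt(fmt, len + 1, "lat");		/* second pass: fill buffer */
	printf("%s\n", fmt);			/* "lat=%llu", REC->lat */
	free(fmt);
	return 0;
}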
29076+static void free_synth_field(struct synth_field *field)
29077 {
29078- kfree(hist_field);
29079+ kfree(field->type);
29080+ kfree(field->name);
29081+ kfree(field);
29082 }
29083
29084-static struct hist_field *create_hist_field(struct ftrace_event_field *field,
29085- unsigned long flags)
29086+static struct synth_field *parse_synth_field(char *field_type,
29087+ char *field_name)
29088 {
29089- struct hist_field *hist_field;
29090+ struct synth_field *field;
29091+ int len, ret = 0;
29092+ char *array;
29093
29094- if (field && is_function_field(field))
29095- return NULL;
29096+ if (field_type[0] == ';')
29097+ field_type++;
29098
29099- hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
29100- if (!hist_field)
29101- return NULL;
29102+ len = strlen(field_name);
29103+ if (field_name[len - 1] == ';')
29104+ field_name[len - 1] = '\0';
29105
29106- if (flags & HIST_FIELD_FL_HITCOUNT) {
29107- hist_field->fn = hist_field_counter;
29108- goto out;
29109+ field = kzalloc(sizeof(*field), GFP_KERNEL);
29110+ if (!field)
29111+ return ERR_PTR(-ENOMEM);
29112+
29113+ len = strlen(field_type) + 1;
29114+ array = strchr(field_name, '[');
29115+ if (array)
29116+ len += strlen(array);
29117+ field->type = kzalloc(len, GFP_KERNEL);
29118+ if (!field->type) {
29119+ ret = -ENOMEM;
29120+ goto free;
29121+ }
29122+ strcat(field->type, field_type);
29123+ if (array) {
29124+ strcat(field->type, array);
29125+ *array = '\0';
29126 }
29127
29128- if (flags & HIST_FIELD_FL_STACKTRACE) {
29129- hist_field->fn = hist_field_none;
29130- goto out;
29131+ field->size = synth_field_size(field->type);
29132+ if (!field->size) {
29133+ ret = -EINVAL;
29134+ goto free;
29135 }
29136
29137- if (flags & HIST_FIELD_FL_LOG2) {
29138- hist_field->fn = hist_field_log2;
29139- goto out;
29140+ if (synth_field_is_string(field->type))
29141+ field->is_string = true;
29142+
29143+ field->is_signed = synth_field_signed(field->type);
29144+
29145+ field->name = kstrdup(field_name, GFP_KERNEL);
29146+ if (!field->name) {
29147+ ret = -ENOMEM;
29148+ goto free;
29149+ }
29150+ out:
29151+ return field;
29152+ free:
29153+ free_synth_field(field);
29154+ field = ERR_PTR(ret);
29155+ goto out;
29156+}
29157+
29158+static void free_synth_tracepoint(struct tracepoint *tp)
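/*
 * Worked example for parse_synth_field() above: a trailing "[N]" on the
 * field name is moved onto the type, so the pair ("char", "comm[16];")
 * ends up as type "char[16]", name "comm". A userspace sketch of just
 * that suffix handling (illustrative only, simplified buffers):
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char type[32] = "char";
	char name[32] = "comm[16];";
	char *array, *semi;

	semi = strchr(name, ';');	/* strip the trailing ';' */
	if (semi)
		*semi = '\0';
	array = strchr(name, '[');	/* move "[16]" from name to type */
	if (array) {
		strcat(type, array);
		*array = '\0';
	}
	printf("type=%s name=%s\n", type, name);	/* type=char[16] name=comm */
	return 0;
}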
29159+{
29160+ if (!tp)
29161+ return;
29162+
29163+ kfree(tp->name);
29164+ kfree(tp);
29165+}
29166+
29167+static struct tracepoint *alloc_synth_tracepoint(char *name)
29168+{
29169+ struct tracepoint *tp;
29170+
29171+ tp = kzalloc(sizeof(*tp), GFP_KERNEL);
29172+ if (!tp)
29173+ return ERR_PTR(-ENOMEM);
29174+
29175+ tp->name = kstrdup(name, GFP_KERNEL);
29176+ if (!tp->name) {
29177+ kfree(tp);
29178+ return ERR_PTR(-ENOMEM);
29179+ }
29180+
29181+ return tp;
29182+}
29183+
29184+typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
29185+ unsigned int var_ref_idx);
29186+
29187+static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
29188+ unsigned int var_ref_idx)
29189+{
29190+ struct tracepoint *tp = event->tp;
29191+
29192+ if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
29193+ struct tracepoint_func *probe_func_ptr;
29194+ synth_probe_func_t probe_func;
29195+ void *__data;
29196+
29197+ if (!(cpu_online(raw_smp_processor_id())))
29198+ return;
29199+
29200+ probe_func_ptr = rcu_dereference_sched((tp)->funcs);
29201+ if (probe_func_ptr) {
29202+ do {
29203+ probe_func = probe_func_ptr->func;
29204+ __data = probe_func_ptr->data;
29205+ probe_func(__data, var_ref_vals, var_ref_idx);
29206+ } while ((++probe_func_ptr)->func);
29207+ }
29208+ }
29209+}
29210+
29211+static struct synth_event *find_synth_event(const char *name)
29212+{
29213+ struct synth_event *event;
29214+
29215+ list_for_each_entry(event, &synth_event_list, list) {
29216+ if (strcmp(event->name, name) == 0)
29217+ return event;
29218+ }
29219+
29220+ return NULL;
29221+}
29222+
29223+static int register_synth_event(struct synth_event *event)
29224+{
29225+ struct trace_event_call *call = &event->call;
29226+ int ret = 0;
29227+
29228+ event->call.class = &event->class;
29229+ event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
29230+ if (!event->class.system) {
29231+ ret = -ENOMEM;
29232+ goto out;
29233+ }
29234+
29235+ event->tp = alloc_synth_tracepoint(event->name);
29236+ if (IS_ERR(event->tp)) {
29237+ ret = PTR_ERR(event->tp);
29238+ event->tp = NULL;
29239+ goto out;
29240+ }
29241+
29242+ INIT_LIST_HEAD(&call->class->fields);
29243+ call->event.funcs = &synth_event_funcs;
29244+ call->class->define_fields = synth_event_define_fields;
29245+
29246+ ret = register_trace_event(&call->event);
29247+ if (!ret) {
29248+ ret = -ENODEV;
29249+ goto out;
29250+ }
29251+ call->flags = TRACE_EVENT_FL_TRACEPOINT;
29252+ call->class->reg = trace_event_reg;
29253+ call->class->probe = trace_event_raw_event_synth;
29254+ call->data = event;
29255+ call->tp = event->tp;
29256+
29257+ ret = trace_add_event_call(call);
29258+ if (ret) {
29259+ pr_warn("Failed to register synthetic event: %s\n",
29260+ trace_event_name(call));
29261+ goto err;
29262+ }
29263+
29264+ ret = set_synth_event_print_fmt(call);
29265+ if (ret < 0) {
29266+ trace_remove_event_call(call);
29267+ goto err;
29268+ }
29269+ out:
29270+ return ret;
29271+ err:
29272+ unregister_trace_event(&call->event);
29273+ goto out;
29274+}
29275+
29276+static int unregister_synth_event(struct synth_event *event)
29277+{
29278+ struct trace_event_call *call = &event->call;
29279+ int ret;
29280+
29281+ ret = trace_remove_event_call(call);
29282+
29283+ return ret;
29284+}
29285+
29286+static void free_synth_event(struct synth_event *event)
29287+{
29288+ unsigned int i;
29289+
29290+ if (!event)
29291+ return;
29292+
29293+ for (i = 0; i < event->n_fields; i++)
29294+ free_synth_field(event->fields[i]);
29295+
29296+ kfree(event->fields);
29297+ kfree(event->name);
29298+ kfree(event->class.system);
29299+ free_synth_tracepoint(event->tp);
29300+ free_synth_event_print_fmt(&event->call);
29301+ kfree(event);
29302+}
29303+
29304+static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
29305+ struct synth_field **fields)
29306+{
29307+ struct synth_event *event;
29308+ unsigned int i;
29309+
29310+ event = kzalloc(sizeof(*event), GFP_KERNEL);
29311+ if (!event) {
29312+ event = ERR_PTR(-ENOMEM);
29313+ goto out;
29314+ }
29315+
29316+ event->name = kstrdup(event_name, GFP_KERNEL);
29317+ if (!event->name) {
29318+ kfree(event);
29319+ event = ERR_PTR(-ENOMEM);
29320+ goto out;
29321+ }
29322+
29323+ event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
29324+ if (!event->fields) {
29325+ free_synth_event(event);
29326+ event = ERR_PTR(-ENOMEM);
29327+ goto out;
29328+ }
29329+
29330+ for (i = 0; i < n_fields; i++)
29331+ event->fields[i] = fields[i];
29332+
29333+ event->n_fields = n_fields;
29334+ out:
29335+ return event;
29336+}
29337+
29338+static void action_trace(struct hist_trigger_data *hist_data,
29339+ struct tracing_map_elt *elt, void *rec,
29340+ struct ring_buffer_event *rbe,
29341+ struct action_data *data, u64 *var_ref_vals)
29342+{
29343+ struct synth_event *event = data->onmatch.synth_event;
29344+
29345+ trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
29346+}
29347+
29348+struct hist_var_data {
29349+ struct list_head list;
29350+ struct hist_trigger_data *hist_data;
29351+};
29352+
29353+static void add_or_delete_synth_event(struct synth_event *event, int delete)
29354+{
29355+ if (delete)
29356+ free_synth_event(event);
29357+ else {
29358+ mutex_lock(&synth_event_mutex);
29359+ if (!find_synth_event(event->name))
29360+ list_add(&event->list, &synth_event_list);
29361+ else
29362+ free_synth_event(event);
29363+ mutex_unlock(&synth_event_mutex);
29364+ }
29365+}
29366+
29367+static int create_synth_event(int argc, char **argv)
29368+{
29369+ struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
29370+ struct synth_event *event = NULL;
29371+ bool delete_event = false;
29372+ int i, n_fields = 0, ret = 0;
29373+ char *name;
29374+
29375+ mutex_lock(&synth_event_mutex);
29376+
29377+ /*
29378+ * Argument syntax:
29379+ * - Add synthetic event: <event_name> field[;field] ...
29380+ * - Remove synthetic event: !<event_name> field[;field] ...
29381+ * where 'field' = type field_name
29382+ */
29383+ if (argc < 1) {
29384+ ret = -EINVAL;
29385+ goto out;
29386+ }
29387+
29388+ name = argv[0];
29389+ if (name[0] == '!') {
29390+ delete_event = true;
29391+ name++;
29392+ }
29393+
29394+ event = find_synth_event(name);
29395+ if (event) {
29396+ if (delete_event) {
29397+ if (event->ref) {
29398+ event = NULL;
29399+ ret = -EBUSY;
29400+ goto out;
29401+ }
29402+ list_del(&event->list);
29403+ goto out;
29404+ }
29405+ event = NULL;
29406+ ret = -EEXIST;
29407+ goto out;
29408+ } else if (delete_event)
29409+ goto out;
29410+
29411+ if (argc < 2) {
29412+ ret = -EINVAL;
29413+ goto out;
29414+ }
29415+
29416+ for (i = 1; i < argc - 1; i++) {
29417+ if (strcmp(argv[i], ";") == 0)
29418+ continue;
29419+ if (n_fields == SYNTH_FIELDS_MAX) {
29420+ ret = -EINVAL;
29421+ goto err;
29422+ }
29423+
29424+ field = parse_synth_field(argv[i], argv[i + 1]);
29425+ if (IS_ERR(field)) {
29426+ ret = PTR_ERR(field);
29427+ goto err;
29428+ }
29429+ fields[n_fields] = field;
29430+ i++; n_fields++;
29431+ }
29432+
29433+ if (i < argc) {
29434+ ret = -EINVAL;
29435+ goto err;
29436+ }
29437+
29438+ event = alloc_synth_event(name, n_fields, fields);
29439+ if (IS_ERR(event)) {
29440+ ret = PTR_ERR(event);
29441+ event = NULL;
29442+ goto err;
29443+ }
29444+ out:
29445+ mutex_unlock(&synth_event_mutex);
29446+
29447+ if (event) {
29448+ if (delete_event) {
29449+ ret = unregister_synth_event(event);
29450+ add_or_delete_synth_event(event, !ret);
29451+ } else {
29452+ ret = register_synth_event(event);
29453+ add_or_delete_synth_event(event, ret);
29454+ }
29455+ }
29456+
29457+ return ret;
29458+ err:
29459+ mutex_unlock(&synth_event_mutex);
29460+
29461+ for (i = 0; i < n_fields; i++)
29462+ free_synth_field(fields[i]);
29463+ free_synth_event(event);
29464+
29465+ return ret;
29466+}
29467+
29468+static int release_all_synth_events(void)
29469+{
29470+ struct list_head release_events;
29471+ struct synth_event *event, *e;
29472+ int ret = 0;
29473+
29474+ INIT_LIST_HEAD(&release_events);
29475+
29476+ mutex_lock(&synth_event_mutex);
29477+
29478+ list_for_each_entry(event, &synth_event_list, list) {
29479+ if (event->ref) {
29480+ mutex_unlock(&synth_event_mutex);
29481+ return -EBUSY;
29482+ }
29483+ }
29484+
29485+ list_splice_init(&event->list, &release_events);
29486+
29487+ mutex_unlock(&synth_event_mutex);
29488+
29489+ list_for_each_entry_safe(event, e, &release_events, list) {
29490+ list_del(&event->list);
29491+
29492+ ret = unregister_synth_event(event);
29493+ add_or_delete_synth_event(event, !ret);
29494+ }
29495+
29496+ return ret;
29497+}
29498+
29499+
29500+static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
29501+{
29502+ mutex_lock(&synth_event_mutex);
29503+
29504+ return seq_list_start(&synth_event_list, *pos);
29505+}
29506+
29507+static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
29508+{
29509+ return seq_list_next(v, &synth_event_list, pos);
29510+}
29511+
29512+static void synth_events_seq_stop(struct seq_file *m, void *v)
29513+{
29514+ mutex_unlock(&synth_event_mutex);
29515+}
29516+
29517+static int synth_events_seq_show(struct seq_file *m, void *v)
29518+{
29519+ struct synth_field *field;
29520+ struct synth_event *event = v;
29521+ unsigned int i;
29522+
29523+ seq_printf(m, "%s\t", event->name);
29524+
29525+ for (i = 0; i < event->n_fields; i++) {
29526+ field = event->fields[i];
29527+
29528+ /* parameter values */
29529+ seq_printf(m, "%s %s%s", field->type, field->name,
29530+ i == event->n_fields - 1 ? "" : "; ");
29531+ }
29532+
29533+ seq_putc(m, '\n');
29534+
29535+ return 0;
29536+}
29537+
29538+static const struct seq_operations synth_events_seq_op = {
29539+ .start = synth_events_seq_start,
29540+ .next = synth_events_seq_next,
29541+ .stop = synth_events_seq_stop,
29542+ .show = synth_events_seq_show
29543+};
29544+
29545+static int synth_events_open(struct inode *inode, struct file *file)
29546+{
29547+ int ret;
29548+
29549+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
29550+ ret = release_all_synth_events();
29551+ if (ret < 0)
29552+ return ret;
29553+ }
29554+
29555+ return seq_open(file, &synth_events_seq_op);
29556+}
29557+
29558+static ssize_t synth_events_write(struct file *file,
29559+ const char __user *buffer,
29560+ size_t count, loff_t *ppos)
29561+{
29562+ return trace_parse_run_command(file, buffer, count, ppos,
29563+ create_synth_event);
29564+}
29565+
29566+static const struct file_operations synth_events_fops = {
29567+ .open = synth_events_open,
29568+ .write = synth_events_write,
29569+ .read = seq_read,
29570+ .llseek = seq_lseek,
29571+ .release = seq_release,
29572+};
29573+
29574+static u64 hist_field_timestamp(struct hist_field *hist_field,
29575+ struct tracing_map_elt *elt,
29576+ struct ring_buffer_event *rbe,
29577+ void *event)
29578+{
29579+ struct hist_trigger_data *hist_data = hist_field->hist_data;
29580+ struct trace_array *tr = hist_data->event_file->tr;
29581+
29582+ u64 ts = ring_buffer_event_time_stamp(rbe);
29583+
29584+ if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
29585+ ts = ns2usecs(ts);
29586+
29587+ return ts;
29588+}
29589+
29590+static u64 hist_field_cpu(struct hist_field *hist_field,
29591+ struct tracing_map_elt *elt,
29592+ struct ring_buffer_event *rbe,
29593+ void *event)
29594+{
29595+ int cpu = smp_processor_id();
29596+
29597+ return cpu;
29598+}
29599+
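/*
 * hist_field_timestamp() above reports the ring buffer timestamp in
 * nanoseconds unless the usecs modifier is set and the trace clock counts
 * in nanoseconds, in which case it is scaled with ns2usecs(). A trivial
 * userspace equivalent of that scaling (sketch only):
 */
#include <stdio.h>

static unsigned long long ns_to_usecs(unsigned long long ns)
{
	return ns / 1000ULL;
}

int main(void)
{
	printf("%llu\n", ns_to_usecs(1500000ULL));	/* 1500 usecs */
	return 0;
}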
29600+static struct hist_field *
29601+check_field_for_var_ref(struct hist_field *hist_field,
29602+ struct hist_trigger_data *var_data,
29603+ unsigned int var_idx)
29604+{
29605+ struct hist_field *found = NULL;
29606+
29607+ if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
29608+ if (hist_field->var.idx == var_idx &&
29609+ hist_field->var.hist_data == var_data) {
29610+ found = hist_field;
29611+ }
29612+ }
29613+
29614+ return found;
29615+}
29616+
29617+static struct hist_field *
29618+check_field_for_var_refs(struct hist_trigger_data *hist_data,
29619+ struct hist_field *hist_field,
29620+ struct hist_trigger_data *var_data,
29621+ unsigned int var_idx,
29622+ unsigned int level)
29623+{
29624+ struct hist_field *found = NULL;
29625+ unsigned int i;
29626+
29627+ if (level > 3)
29628+ return found;
29629+
29630+ if (!hist_field)
29631+ return found;
29632+
29633+ found = check_field_for_var_ref(hist_field, var_data, var_idx);
29634+ if (found)
29635+ return found;
29636+
29637+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
29638+ struct hist_field *operand;
29639+
29640+ operand = hist_field->operands[i];
29641+ found = check_field_for_var_refs(hist_data, operand, var_data,
29642+ var_idx, level + 1);
29643+ if (found)
29644+ return found;
29645+ }
29646+
29647+ return found;
29648+}
29649+
29650+static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
29651+ struct hist_trigger_data *var_data,
29652+ unsigned int var_idx)
29653+{
29654+ struct hist_field *hist_field, *found = NULL;
29655+ unsigned int i;
29656+
29657+ for_each_hist_field(i, hist_data) {
29658+ hist_field = hist_data->fields[i];
29659+ found = check_field_for_var_refs(hist_data, hist_field,
29660+ var_data, var_idx, 0);
29661+ if (found)
29662+ return found;
29663+ }
29664+
29665+ for (i = 0; i < hist_data->n_synth_var_refs; i++) {
29666+ hist_field = hist_data->synth_var_refs[i];
29667+ found = check_field_for_var_refs(hist_data, hist_field,
29668+ var_data, var_idx, 0);
29669+ if (found)
29670+ return found;
29671+ }
29672+
29673+ return found;
29674+}
29675+
29676+static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
29677+ unsigned int var_idx)
29678+{
29679+ struct trace_array *tr = hist_data->event_file->tr;
29680+ struct hist_field *found = NULL;
29681+ struct hist_var_data *var_data;
29682+
29683+ list_for_each_entry(var_data, &tr->hist_vars, list) {
29684+ if (var_data->hist_data == hist_data)
29685+ continue;
29686+ found = find_var_ref(var_data->hist_data, hist_data, var_idx);
29687+ if (found)
29688+ break;
29689+ }
29690+
29691+ return found;
29692+}
29693+
29694+static bool check_var_refs(struct hist_trigger_data *hist_data)
29695+{
29696+ struct hist_field *field;
29697+ bool found = false;
29698+ int i;
29699+
29700+ for_each_hist_field(i, hist_data) {
29701+ field = hist_data->fields[i];
29702+ if (field && field->flags & HIST_FIELD_FL_VAR) {
29703+ if (find_any_var_ref(hist_data, field->var.idx)) {
29704+ found = true;
29705+ break;
29706+ }
29707+ }
29708+ }
29709+
29710+ return found;
29711+}
29712+
29713+static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data)
29714+{
29715+ struct trace_array *tr = hist_data->event_file->tr;
29716+ struct hist_var_data *var_data, *found = NULL;
29717+
29718+ list_for_each_entry(var_data, &tr->hist_vars, list) {
29719+ if (var_data->hist_data == hist_data) {
29720+ found = var_data;
29721+ break;
29722+ }
29723+ }
29724+
29725+ return found;
29726+}
29727+
29728+static bool field_has_hist_vars(struct hist_field *hist_field,
29729+ unsigned int level)
29730+{
29731+ int i;
29732+
29733+ if (level > 3)
29734+ return false;
29735+
29736+ if (!hist_field)
29737+ return false;
29738+
29739+ if (hist_field->flags & HIST_FIELD_FL_VAR ||
29740+ hist_field->flags & HIST_FIELD_FL_VAR_REF)
29741+ return true;
29742+
29743+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
29744+ struct hist_field *operand;
29745+
29746+ operand = hist_field->operands[i];
29747+ if (field_has_hist_vars(operand, level + 1))
29748+ return true;
29749+ }
29750+
29751+ return false;
29752+}
29753+
29754+static bool has_hist_vars(struct hist_trigger_data *hist_data)
29755+{
29756+ struct hist_field *hist_field;
29757+ int i;
29758+
29759+ for_each_hist_field(i, hist_data) {
29760+ hist_field = hist_data->fields[i];
29761+ if (field_has_hist_vars(hist_field, 0))
29762+ return true;
29763+ }
29764+
29765+ return false;
29766+}
29767+
29768+static int save_hist_vars(struct hist_trigger_data *hist_data)
29769+{
29770+ struct trace_array *tr = hist_data->event_file->tr;
29771+ struct hist_var_data *var_data;
29772+
29773+ var_data = find_hist_vars(hist_data);
29774+ if (var_data)
29775+ return 0;
29776+
29777+ if (trace_array_get(tr) < 0)
29778+ return -ENODEV;
29779+
29780+ var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
29781+ if (!var_data) {
29782+ trace_array_put(tr);
29783+ return -ENOMEM;
29784+ }
29785+
29786+ var_data->hist_data = hist_data;
29787+ list_add(&var_data->list, &tr->hist_vars);
29788+
29789+ return 0;
29790+}
29791+
29792+static void remove_hist_vars(struct hist_trigger_data *hist_data)
29793+{
29794+ struct trace_array *tr = hist_data->event_file->tr;
29795+ struct hist_var_data *var_data;
29796+
29797+ var_data = find_hist_vars(hist_data);
29798+ if (!var_data)
29799+ return;
29800+
29801+ if (WARN_ON(check_var_refs(hist_data)))
29802+ return;
29803+
29804+ list_del(&var_data->list);
29805+
29806+ kfree(var_data);
29807+
29808+ trace_array_put(tr);
29809+}
29810+
29811+static struct hist_field *find_var_field(struct hist_trigger_data *hist_data,
29812+ const char *var_name)
29813+{
29814+ struct hist_field *hist_field, *found = NULL;
29815+ int i;
29816+
29817+ for_each_hist_field(i, hist_data) {
29818+ hist_field = hist_data->fields[i];
29819+ if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR &&
29820+ strcmp(hist_field->var.name, var_name) == 0) {
29821+ found = hist_field;
29822+ break;
29823+ }
29824+ }
29825+
29826+ return found;
29827+}
29828+
29829+static struct hist_field *find_var(struct hist_trigger_data *hist_data,
29830+ struct trace_event_file *file,
29831+ const char *var_name)
29832+{
29833+ struct hist_trigger_data *test_data;
29834+ struct event_trigger_data *test;
29835+ struct hist_field *hist_field;
29836+
29837+ hist_field = find_var_field(hist_data, var_name);
29838+ if (hist_field)
29839+ return hist_field;
29840+
29841+ list_for_each_entry_rcu(test, &file->triggers, list) {
29842+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
29843+ test_data = test->private_data;
29844+ hist_field = find_var_field(test_data, var_name);
29845+ if (hist_field)
29846+ return hist_field;
29847+ }
29848+ }
29849+
29850+ return NULL;
29851+}
29852+
29853+static struct trace_event_file *find_var_file(struct trace_array *tr,
29854+ char *system,
29855+ char *event_name,
29856+ char *var_name)
29857+{
29858+ struct hist_trigger_data *var_hist_data;
29859+ struct hist_var_data *var_data;
29860+ struct trace_event_file *file, *found = NULL;
29861+
29862+ if (system)
29863+ return find_event_file(tr, system, event_name);
29864+
29865+ list_for_each_entry(var_data, &tr->hist_vars, list) {
29866+ var_hist_data = var_data->hist_data;
29867+ file = var_hist_data->event_file;
29868+ if (file == found)
29869+ continue;
29870+
29871+ if (find_var_field(var_hist_data, var_name)) {
29872+ if (found) {
29873+ hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
29874+ return NULL;
29875+ }
29876+
29877+ found = file;
29878+ }
29879+ }
29880+
29881+ return found;
29882+}
29883+
29884+static struct hist_field *find_file_var(struct trace_event_file *file,
29885+ const char *var_name)
29886+{
29887+ struct hist_trigger_data *test_data;
29888+ struct event_trigger_data *test;
29889+ struct hist_field *hist_field;
29890+
29891+ list_for_each_entry_rcu(test, &file->triggers, list) {
29892+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
29893+ test_data = test->private_data;
29894+ hist_field = find_var_field(test_data, var_name);
29895+ if (hist_field)
29896+ return hist_field;
29897+ }
29898+ }
29899+
29900+ return NULL;
29901+}
29902+
29903+static struct hist_field *
29904+find_match_var(struct hist_trigger_data *hist_data, char *var_name)
29905+{
29906+ struct trace_array *tr = hist_data->event_file->tr;
29907+ struct hist_field *hist_field, *found = NULL;
29908+ struct trace_event_file *file;
29909+ unsigned int i;
29910+
29911+ for (i = 0; i < hist_data->n_actions; i++) {
29912+ struct action_data *data = hist_data->actions[i];
29913+
29914+ if (data->fn == action_trace) {
29915+ char *system = data->onmatch.match_event_system;
29916+ char *event_name = data->onmatch.match_event;
29917+
29918+ file = find_var_file(tr, system, event_name, var_name);
29919+ if (!file)
29920+ continue;
29921+ hist_field = find_file_var(file, var_name);
29922+ if (hist_field) {
29923+ if (found) {
29924+ hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
29925+ return ERR_PTR(-EINVAL);
29926+ }
29927+
29928+ found = hist_field;
29929+ }
29930+ }
29931+ }
29932+ return found;
29933+}
29934+
29935+static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
29936+ char *system,
29937+ char *event_name,
29938+ char *var_name)
29939+{
29940+ struct trace_array *tr = hist_data->event_file->tr;
29941+ struct hist_field *hist_field = NULL;
29942+ struct trace_event_file *file;
29943+
29944+ if (!system || !event_name) {
29945+ hist_field = find_match_var(hist_data, var_name);
29946+ if (IS_ERR(hist_field))
29947+ return NULL;
29948+ if (hist_field)
29949+ return hist_field;
29950+ }
29951+
29952+ file = find_var_file(tr, system, event_name, var_name);
29953+ if (!file)
29954+ return NULL;
29955+
29956+ hist_field = find_file_var(file, var_name);
29957+
29958+ return hist_field;
29959+}
29960+
29961+struct hist_elt_data {
29962+ char *comm;
29963+ u64 *var_ref_vals;
29964+ char *field_var_str[SYNTH_FIELDS_MAX];
29965+};
29966+
29967+static u64 hist_field_var_ref(struct hist_field *hist_field,
29968+ struct tracing_map_elt *elt,
29969+ struct ring_buffer_event *rbe,
29970+ void *event)
29971+{
29972+ struct hist_elt_data *elt_data;
29973+ u64 var_val = 0;
29974+
29975+ elt_data = elt->private_data;
29976+ var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
29977+
29978+ return var_val;
29979+}
29980+
29981+static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key,
29982+ u64 *var_ref_vals, bool self)
29983+{
29984+ struct hist_trigger_data *var_data;
29985+ struct tracing_map_elt *var_elt;
29986+ struct hist_field *hist_field;
29987+ unsigned int i, var_idx;
29988+ bool resolved = true;
29989+ u64 var_val = 0;
29990+
29991+ for (i = 0; i < hist_data->n_var_refs; i++) {
29992+ hist_field = hist_data->var_refs[i];
29993+ var_idx = hist_field->var.idx;
29994+ var_data = hist_field->var.hist_data;
29995+
29996+ if (var_data == NULL) {
29997+ resolved = false;
29998+ break;
29999+ }
30000+
30001+ if ((self && var_data != hist_data) ||
30002+ (!self && var_data == hist_data))
30003+ continue;
30004+
30005+ var_elt = tracing_map_lookup(var_data->map, key);
30006+ if (!var_elt) {
30007+ resolved = false;
30008+ break;
30009+ }
30010+
30011+ if (!tracing_map_var_set(var_elt, var_idx)) {
30012+ resolved = false;
30013+ break;
30014+ }
30015+
30016+ if (self || !hist_field->read_once)
30017+ var_val = tracing_map_read_var(var_elt, var_idx);
30018+ else
30019+ var_val = tracing_map_read_var_once(var_elt, var_idx);
30020+
30021+ var_ref_vals[i] = var_val;
30022+ }
30023+
30024+ return resolved;
30025+}
30026+
30027+static const char *hist_field_name(struct hist_field *field,
30028+ unsigned int level)
30029+{
30030+ const char *field_name = "";
30031+
30032+ if (level > 1)
30033+ return field_name;
30034+
30035+ if (field->field)
30036+ field_name = field->field->name;
30037+ else if (field->flags & HIST_FIELD_FL_LOG2 ||
30038+ field->flags & HIST_FIELD_FL_ALIAS)
30039+ field_name = hist_field_name(field->operands[0], ++level);
30040+ else if (field->flags & HIST_FIELD_FL_CPU)
30041+ field_name = "cpu";
30042+ else if (field->flags & HIST_FIELD_FL_EXPR ||
30043+ field->flags & HIST_FIELD_FL_VAR_REF) {
30044+ if (field->system) {
30045+ static char full_name[MAX_FILTER_STR_VAL];
30046+
30047+ strcat(full_name, field->system);
30048+ strcat(full_name, ".");
30049+ strcat(full_name, field->event_name);
30050+ strcat(full_name, ".");
30051+ strcat(full_name, field->name);
30052+ field_name = full_name;
30053+ } else
30054+ field_name = field->name;
30055+ } else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
30056+ field_name = "common_timestamp";
30057+
30058+ if (field_name == NULL)
30059+ field_name = "";
30060+
30061+ return field_name;
30062+}
30063+
30064+static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
30065+{
30066+ hist_field_fn_t fn = NULL;
30067+
30068+ switch (field_size) {
30069+ case 8:
30070+ if (field_is_signed)
30071+ fn = hist_field_s64;
30072+ else
30073+ fn = hist_field_u64;
30074+ break;
30075+ case 4:
30076+ if (field_is_signed)
30077+ fn = hist_field_s32;
30078+ else
30079+ fn = hist_field_u32;
30080+ break;
30081+ case 2:
30082+ if (field_is_signed)
30083+ fn = hist_field_s16;
30084+ else
30085+ fn = hist_field_u16;
30086+ break;
30087+ case 1:
30088+ if (field_is_signed)
30089+ fn = hist_field_s8;
30090+ else
30091+ fn = hist_field_u8;
30092+ break;
30093+ }
30094+
30095+ return fn;
30096+}
30097+
30098+static int parse_map_size(char *str)
30099+{
30100+ unsigned long size, map_bits;
30101+ int ret;
30102+
30103+ strsep(&str, "=");
30104+ if (!str) {
30105+ ret = -EINVAL;
30106+ goto out;
30107+ }
30108+
30109+ ret = kstrtoul(str, 0, &size);
30110+ if (ret)
30111+ goto out;
30112+
30113+ map_bits = ilog2(roundup_pow_of_two(size));
30114+ if (map_bits < TRACING_MAP_BITS_MIN ||
30115+ map_bits > TRACING_MAP_BITS_MAX)
30116+ ret = -EINVAL;
30117+ else
30118+ ret = map_bits;
30119+ out:
30120+ return ret;
30121+}
30122+
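/*
 * parse_map_size() above turns "size=N" into a table size in bits: N is
 * rounded up to a power of two and its log2 must fall between
 * TRACING_MAP_BITS_MIN and TRACING_MAP_BITS_MAX (defined elsewhere). A
 * userspace sketch of the rounding step (illustrative only):
 */
#include <stdio.h>

static unsigned int size_to_bits(unsigned long n)
{
	unsigned int bits = 0;

	while ((1UL << bits) < n)	/* roundup_pow_of_two() + ilog2() */
		bits++;
	return bits;
}

int main(void)
{
	printf("size=2048 -> %u bits\n", size_to_bits(2048));	/* 11 */
	printf("size=3000 -> %u bits\n", size_to_bits(3000));	/* 12 */
	return 0;
}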
30123+static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
30124+{
30125+ unsigned int i;
30126+
30127+ if (!attrs)
30128+ return;
30129+
30130+ for (i = 0; i < attrs->n_assignments; i++)
30131+ kfree(attrs->assignment_str[i]);
30132+
30133+ for (i = 0; i < attrs->n_actions; i++)
30134+ kfree(attrs->action_str[i]);
30135+
30136+ kfree(attrs->name);
30137+ kfree(attrs->sort_key_str);
30138+ kfree(attrs->keys_str);
30139+ kfree(attrs->vals_str);
30140+ kfree(attrs->clock);
30141+ kfree(attrs);
30142+}
30143+
30144+static int parse_action(char *str, struct hist_trigger_attrs *attrs)
30145+{
30146+ int ret = -EINVAL;
30147+
30148+ if (attrs->n_actions >= HIST_ACTIONS_MAX)
30149+ return ret;
30150+
30151+ if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
30152+ (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
30153+ attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
30154+ if (!attrs->action_str[attrs->n_actions]) {
30155+ ret = -ENOMEM;
30156+ return ret;
30157+ }
30158+ attrs->n_actions++;
30159+ ret = 0;
30160+ }
30161+
30162+ return ret;
30163+}
30164+
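/*
 * parse_action() above accepts only tokens beginning with "onmatch(" or
 * "onmax("; every other token in the trigger string goes through the
 * assignment parser that follows. A userspace copy of that prefix test,
 * with illustrative action strings:
 */
#include <stdio.h>
#include <string.h>

static int is_action_str(const char *str)
{
	return strncmp(str, "onmatch(", strlen("onmatch(")) == 0 ||
	       strncmp(str, "onmax(", strlen("onmax(")) == 0;
}

int main(void)
{
	printf("%d\n", is_action_str("onmax($lat).save(pid)"));	/* 1: action */
	printf("%d\n", is_action_str("keys=pid"));			/* 0: assignment */
	return 0;
}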
30165+static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
30166+{
30167+ int ret = 0;
30168+
30169+ if ((strncmp(str, "key=", strlen("key=")) == 0) ||
30170+ (strncmp(str, "keys=", strlen("keys=")) == 0)) {
30171+ attrs->keys_str = kstrdup(str, GFP_KERNEL);
30172+ if (!attrs->keys_str) {
30173+ ret = -ENOMEM;
30174+ goto out;
30175+ }
30176+ } else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
30177+ (strncmp(str, "vals=", strlen("vals=")) == 0) ||
30178+ (strncmp(str, "values=", strlen("values=")) == 0)) {
30179+ attrs->vals_str = kstrdup(str, GFP_KERNEL);
30180+ if (!attrs->vals_str) {
30181+ ret = -ENOMEM;
30182+ goto out;
30183+ }
30184+ } else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
30185+ attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
30186+ if (!attrs->sort_key_str) {
30187+ ret = -ENOMEM;
30188+ goto out;
30189+ }
30190+ } else if (strncmp(str, "name=", strlen("name=")) == 0) {
30191+ attrs->name = kstrdup(str, GFP_KERNEL);
30192+ if (!attrs->name) {
30193+ ret = -ENOMEM;
30194+ goto out;
30195+ }
30196+ } else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
30197+ strsep(&str, "=");
30198+ if (!str) {
30199+ ret = -EINVAL;
30200+ goto out;
30201+ }
30202+
30203+ str = strstrip(str);
30204+ attrs->clock = kstrdup(str, GFP_KERNEL);
30205+ if (!attrs->clock) {
30206+ ret = -ENOMEM;
30207+ goto out;
30208+ }
30209+ } else if (strncmp(str, "size=", strlen("size=")) == 0) {
30210+ int map_bits = parse_map_size(str);
30211+
30212+ if (map_bits < 0) {
30213+ ret = map_bits;
30214+ goto out;
30215+ }
30216+ attrs->map_bits = map_bits;
30217+ } else {
30218+ char *assignment;
30219+
30220+ if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
30221+ hist_err("Too many variables defined: ", str);
30222+ ret = -EINVAL;
30223+ goto out;
30224+ }
30225+
30226+ assignment = kstrdup(str, GFP_KERNEL);
30227+ if (!assignment) {
30228+ ret = -ENOMEM;
30229+ goto out;
30230+ }
30231+
30232+ attrs->assignment_str[attrs->n_assignments++] = assignment;
30233+ }
30234+ out:
30235+ return ret;
30236+}
30237+
30238+static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
30239+{
30240+ struct hist_trigger_attrs *attrs;
30241+ int ret = 0;
30242+
30243+ attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
30244+ if (!attrs)
30245+ return ERR_PTR(-ENOMEM);
30246+
30247+ while (trigger_str) {
30248+ char *str = strsep(&trigger_str, ":");
30249+
30250+ if (strchr(str, '=')) {
30251+ ret = parse_assignment(str, attrs);
30252+ if (ret)
30253+ goto free;
30254+ } else if (strcmp(str, "pause") == 0)
30255+ attrs->pause = true;
30256+ else if ((strcmp(str, "cont") == 0) ||
30257+ (strcmp(str, "continue") == 0))
30258+ attrs->cont = true;
30259+ else if (strcmp(str, "clear") == 0)
30260+ attrs->clear = true;
30261+ else {
30262+ ret = parse_action(str, attrs);
30263+ if (ret)
30264+ goto free;
30265+ }
30266+ }
30267+
30268+ if (!attrs->keys_str) {
30269+ ret = -EINVAL;
30270+ goto free;
30271+ }
30272+
30273+ if (!attrs->clock) {
30274+ attrs->clock = kstrdup("global", GFP_KERNEL);
30275+ if (!attrs->clock) {
30276+ ret = -ENOMEM;
30277+ goto free;
30278+ }
30279+ }
30280+
30281+ return attrs;
30282+ free:
30283+ destroy_hist_trigger_attrs(attrs);
30284+
30285+ return ERR_PTR(ret);
30286+}
30287+
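/*
 * parse_hist_trigger_attrs() above splits the trigger string on ':' and
 * routes each token to parse_assignment(), a keyword, or parse_action().
 * A userspace sketch that installs one such trigger using the attribute
 * keywords parsed above (keys=, vals=, sort=, size=, clock=); the event
 * and the tracefs path are illustrative assumptions, not taken from this
 * patch:
 */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/tracing/events/sched/sched_switch/trigger";
	FILE *f = fopen(path, "a");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f,
		"hist:keys=next_pid:vals=hitcount:sort=hitcount:size=2048:clock=global\n");
	return fclose(f) ? 1 : 0;
}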
30288+static inline void save_comm(char *comm, struct task_struct *task)
30289+{
30290+ if (!task->pid) {
30291+ strcpy(comm, "<idle>");
30292+ return;
30293+ }
30294+
30295+ if (WARN_ON_ONCE(task->pid < 0)) {
30296+ strcpy(comm, "<XXX>");
30297+ return;
30298+ }
30299+
30300+ memcpy(comm, task->comm, TASK_COMM_LEN);
30301+}
30302+
30303+static void hist_elt_data_free(struct hist_elt_data *elt_data)
30304+{
30305+ unsigned int i;
30306+
30307+ for (i = 0; i < SYNTH_FIELDS_MAX; i++)
30308+ kfree(elt_data->field_var_str[i]);
30309+
30310+ kfree(elt_data->comm);
30311+ kfree(elt_data);
30312+}
30313+
30314+static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
30315+{
30316+ struct hist_elt_data *elt_data = elt->private_data;
30317+
30318+ hist_elt_data_free(elt_data);
30319+}
30320+
30321+static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
30322+{
30323+ struct hist_trigger_data *hist_data = elt->map->private_data;
30324+ unsigned int size = TASK_COMM_LEN;
30325+ struct hist_elt_data *elt_data;
30326+ struct hist_field *key_field;
30327+ unsigned int i, n_str;
30328+
30329+ elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
30330+ if (!elt_data)
30331+ return -ENOMEM;
30332+
30333+ for_each_hist_key_field(i, hist_data) {
30334+ key_field = hist_data->fields[i];
30335+
30336+ if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
30337+ elt_data->comm = kzalloc(size, GFP_KERNEL);
30338+ if (!elt_data->comm) {
30339+ kfree(elt_data);
30340+ return -ENOMEM;
30341+ }
30342+ break;
30343+ }
30344+ }
30345+
30346+ n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
30347+
30348+ size = STR_VAR_LEN_MAX;
30349+
30350+ for (i = 0; i < n_str; i++) {
30351+ elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
30352+ if (!elt_data->field_var_str[i]) {
30353+ hist_elt_data_free(elt_data);
30354+ return -ENOMEM;
30355+ }
30356+ }
30357+
30358+ elt->private_data = elt_data;
30359+
30360+ return 0;
30361+}
30362+
30363+static void hist_trigger_elt_data_init(struct tracing_map_elt *elt)
30364+{
30365+ struct hist_elt_data *elt_data = elt->private_data;
30366+
30367+ if (elt_data->comm)
30368+ save_comm(elt_data->comm, current);
30369+}
30370+
30371+static const struct tracing_map_ops hist_trigger_elt_data_ops = {
30372+ .elt_alloc = hist_trigger_elt_data_alloc,
30373+ .elt_free = hist_trigger_elt_data_free,
30374+ .elt_init = hist_trigger_elt_data_init,
30375+};
30376+
30377+static const char *get_hist_field_flags(struct hist_field *hist_field)
30378+{
30379+ const char *flags_str = NULL;
30380+
30381+ if (hist_field->flags & HIST_FIELD_FL_HEX)
30382+ flags_str = "hex";
30383+ else if (hist_field->flags & HIST_FIELD_FL_SYM)
30384+ flags_str = "sym";
30385+ else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
30386+ flags_str = "sym-offset";
30387+ else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
30388+ flags_str = "execname";
30389+ else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
30390+ flags_str = "syscall";
30391+ else if (hist_field->flags & HIST_FIELD_FL_LOG2)
30392+ flags_str = "log2";
30393+ else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
30394+ flags_str = "usecs";
30395+
30396+ return flags_str;
30397+}
30398+
30399+static void expr_field_str(struct hist_field *field, char *expr)
30400+{
30401+ if (field->flags & HIST_FIELD_FL_VAR_REF)
30402+ strcat(expr, "$");
30403+
30404+ strcat(expr, hist_field_name(field, 0));
30405+
30406+ if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
30407+ const char *flags_str = get_hist_field_flags(field);
30408+
30409+ if (flags_str) {
30410+ strcat(expr, ".");
30411+ strcat(expr, flags_str);
30412+ }
30413+ }
30414+}
30415+
30416+static char *expr_str(struct hist_field *field, unsigned int level)
30417+{
30418+ char *expr;
30419+
30420+ if (level > 1)
30421+ return NULL;
30422+
30423+ expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
30424+ if (!expr)
30425+ return NULL;
30426+
30427+ if (!field->operands[0]) {
30428+ expr_field_str(field, expr);
30429+ return expr;
30430+ }
30431+
30432+ if (field->operator == FIELD_OP_UNARY_MINUS) {
30433+ char *subexpr;
30434+
30435+ strcat(expr, "-(");
30436+ subexpr = expr_str(field->operands[0], ++level);
30437+ if (!subexpr) {
30438+ kfree(expr);
30439+ return NULL;
30440+ }
30441+ strcat(expr, subexpr);
30442+ strcat(expr, ")");
30443+
30444+ kfree(subexpr);
30445+
30446+ return expr;
30447+ }
30448+
30449+ expr_field_str(field->operands[0], expr);
30450+
30451+ switch (field->operator) {
30452+ case FIELD_OP_MINUS:
30453+ strcat(expr, "-");
30454+ break;
30455+ case FIELD_OP_PLUS:
30456+ strcat(expr, "+");
30457+ break;
30458+ default:
30459+ kfree(expr);
30460+ return NULL;
30461+ }
30462+
30463+ expr_field_str(field->operands[1], expr);
30464+
30465+ return expr;
30466+}
30467+
30468+static int contains_operator(char *str)
30469+{
30470+ enum field_op_id field_op = FIELD_OP_NONE;
30471+ char *op;
30472+
30473+ op = strpbrk(str, "+-");
30474+ if (!op)
30475+ return FIELD_OP_NONE;
30476+
30477+ switch (*op) {
30478+ case '-':
30479+ if (*str == '-')
30480+ field_op = FIELD_OP_UNARY_MINUS;
30481+ else
30482+ field_op = FIELD_OP_MINUS;
30483+ break;
30484+ case '+':
30485+ field_op = FIELD_OP_PLUS;
30486+ break;
30487+ default:
30488+ break;
30489+ }
30490+
30491+ return field_op;
30492+}
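+
+/*
+ * Rough sketch of how contains_operator() classifies an expression
+ * string (the field and variable names below are only examples):
+ *
+ *   contains_operator("bytes_req+bytes_alloc") -> FIELD_OP_PLUS
+ *   contains_operator("-$lat")                 -> FIELD_OP_UNARY_MINUS
+ *   contains_operator("common_pid")            -> FIELD_OP_NONE
+ *
+ * Only the first '+' or '-' found by strpbrk() is considered, and a
+ * leading '-' is treated as unary minus.
+ */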
30493+
30494+static void destroy_hist_field(struct hist_field *hist_field,
30495+ unsigned int level)
30496+{
30497+ unsigned int i;
30498+
30499+ if (level > 3)
30500+ return;
30501+
30502+ if (!hist_field)
30503+ return;
30504+
30505+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
30506+ destroy_hist_field(hist_field->operands[i], level + 1);
30507+
30508+ kfree(hist_field->var.name);
30509+ kfree(hist_field->name);
30510+ kfree(hist_field->type);
30511+
30512+ kfree(hist_field);
30513+}
30514+
30515+static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
30516+ struct ftrace_event_field *field,
30517+ unsigned long flags,
30518+ char *var_name)
30519+{
30520+ struct hist_field *hist_field;
30521+
30522+ if (field && is_function_field(field))
30523+ return NULL;
30524+
30525+ hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
30526+ if (!hist_field)
30527+ return NULL;
30528+
30529+ hist_field->hist_data = hist_data;
30530+
30531+ if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
30532+ goto out; /* caller will populate */
30533+
30534+ if (flags & HIST_FIELD_FL_VAR_REF) {
30535+ hist_field->fn = hist_field_var_ref;
30536+ goto out;
30537+ }
30538+
30539+ if (flags & HIST_FIELD_FL_HITCOUNT) {
30540+ hist_field->fn = hist_field_counter;
30541+ hist_field->size = sizeof(u64);
30542+ hist_field->type = kstrdup("u64", GFP_KERNEL);
30543+ if (!hist_field->type)
30544+ goto free;
30545+ goto out;
30546+ }
30547+
30548+ if (flags & HIST_FIELD_FL_STACKTRACE) {
30549+ hist_field->fn = hist_field_none;
30550+ goto out;
30551+ }
30552+
30553+ if (flags & HIST_FIELD_FL_LOG2) {
30554+ unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
30555+ hist_field->fn = hist_field_log2;
30556+ hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
30557+ hist_field->size = hist_field->operands[0]->size;
30558+ hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
30559+ if (!hist_field->type)
30560+ goto free;
30561+ goto out;
30562+ }
30563+
30564+ if (flags & HIST_FIELD_FL_TIMESTAMP) {
30565+ hist_field->fn = hist_field_timestamp;
30566+ hist_field->size = sizeof(u64);
30567+ hist_field->type = kstrdup("u64", GFP_KERNEL);
30568+ if (!hist_field->type)
30569+ goto free;
30570+ goto out;
30571+ }
30572+
30573+ if (flags & HIST_FIELD_FL_CPU) {
30574+ hist_field->fn = hist_field_cpu;
30575+ hist_field->size = sizeof(int);
30576+ hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
30577+ if (!hist_field->type)
30578+ goto free;
30579+ goto out;
30580+ }
30581+
30582+ if (WARN_ON_ONCE(!field))
30583+ goto out;
30584+
30585+ if (is_string_field(field)) {
30586+ flags |= HIST_FIELD_FL_STRING;
30587+
30588+ hist_field->size = MAX_FILTER_STR_VAL;
30589+ hist_field->type = kstrdup(field->type, GFP_KERNEL);
30590+ if (!hist_field->type)
30591+ goto free;
30592+
30593+ if (field->filter_type == FILTER_STATIC_STRING)
30594+ hist_field->fn = hist_field_string;
30595+ else if (field->filter_type == FILTER_DYN_STRING)
30596+ hist_field->fn = hist_field_dynstring;
30597+ else
30598+ hist_field->fn = hist_field_pstring;
30599+ } else {
30600+ hist_field->size = field->size;
30601+ hist_field->is_signed = field->is_signed;
30602+ hist_field->type = kstrdup(field->type, GFP_KERNEL);
30603+ if (!hist_field->type)
30604+ goto free;
30605+
30606+ hist_field->fn = select_value_fn(field->size,
30607+ field->is_signed);
30608+ if (!hist_field->fn) {
30609+ destroy_hist_field(hist_field, 0);
30610+ return NULL;
30611+ }
30612+ }
30613+ out:
30614+ hist_field->field = field;
30615+ hist_field->flags = flags;
30616+
30617+ if (var_name) {
30618+ hist_field->var.name = kstrdup(var_name, GFP_KERNEL);
30619+ if (!hist_field->var.name)
30620+ goto free;
30621+ }
30622+
30623+ return hist_field;
30624+ free:
30625+ destroy_hist_field(hist_field, 0);
30626+ return NULL;
30627+}
30628+
30629+static void destroy_hist_fields(struct hist_trigger_data *hist_data)
30630+{
30631+ unsigned int i;
30632+
30633+ for (i = 0; i < HIST_FIELDS_MAX; i++) {
30634+ if (hist_data->fields[i]) {
30635+ destroy_hist_field(hist_data->fields[i], 0);
30636+ hist_data->fields[i] = NULL;
30637+ }
30638+ }
30639+}
30640+
30641+static int init_var_ref(struct hist_field *ref_field,
30642+ struct hist_field *var_field,
30643+ char *system, char *event_name)
30644+{
30645+ int err = 0;
30646+
30647+ ref_field->var.idx = var_field->var.idx;
30648+ ref_field->var.hist_data = var_field->hist_data;
30649+ ref_field->size = var_field->size;
30650+ ref_field->is_signed = var_field->is_signed;
30651+ ref_field->flags |= var_field->flags &
30652+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
30653+
30654+ if (system) {
30655+ ref_field->system = kstrdup(system, GFP_KERNEL);
30656+ if (!ref_field->system)
30657+ return -ENOMEM;
30658+ }
30659+
30660+ if (event_name) {
30661+ ref_field->event_name = kstrdup(event_name, GFP_KERNEL);
30662+ if (!ref_field->event_name) {
30663+ err = -ENOMEM;
30664+ goto free;
30665+ }
30666+ }
30667+
30668+ if (var_field->var.name) {
30669+ ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL);
30670+ if (!ref_field->name) {
30671+ err = -ENOMEM;
30672+ goto free;
30673+ }
30674+ } else if (var_field->name) {
30675+ ref_field->name = kstrdup(var_field->name, GFP_KERNEL);
30676+ if (!ref_field->name) {
30677+ err = -ENOMEM;
30678+ goto free;
30679+ }
30680+ }
30681+
30682+ ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
30683+ if (!ref_field->type) {
30684+ err = -ENOMEM;
30685+ goto free;
30686+ }
30687+ out:
30688+ return err;
30689+ free:
30690+ kfree(ref_field->system);
30691+ kfree(ref_field->event_name);
30692+ kfree(ref_field->name);
30693+
30694+ goto out;
30695+}
30696+
30697+static struct hist_field *create_var_ref(struct hist_field *var_field,
30698+ char *system, char *event_name)
30699+{
30700+ unsigned long flags = HIST_FIELD_FL_VAR_REF;
30701+ struct hist_field *ref_field;
30702+
30703+ ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
30704+ if (ref_field) {
30705+ if (init_var_ref(ref_field, var_field, system, event_name)) {
30706+ destroy_hist_field(ref_field, 0);
30707+ return NULL;
30708+ }
30709+ }
30710+
30711+ return ref_field;
30712+}
30713+
30714+static bool is_var_ref(char *var_name)
30715+{
30716+ if (!var_name || strlen(var_name) < 2 || var_name[0] != '$')
30717+ return false;
30718+
30719+ return true;
30720+}
30721+
30722+static char *field_name_from_var(struct hist_trigger_data *hist_data,
30723+ char *var_name)
30724+{
30725+ char *name, *field;
30726+ unsigned int i;
30727+
30728+ for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
30729+ name = hist_data->attrs->var_defs.name[i];
30730+
30731+ if (strcmp(var_name, name) == 0) {
30732+ field = hist_data->attrs->var_defs.expr[i];
30733+ if (contains_operator(field) || is_var_ref(field))
30734+ continue;
30735+ return field;
30736+ }
30737+ }
30738+
30739+ return NULL;
30740+}
30741+
30742+static char *local_field_var_ref(struct hist_trigger_data *hist_data,
30743+ char *system, char *event_name,
30744+ char *var_name)
30745+{
30746+ struct trace_event_call *call;
30747+
30748+ if (system && event_name) {
30749+ call = hist_data->event_file->event_call;
30750+
30751+ if (strcmp(system, call->class->system) != 0)
30752+ return NULL;
30753+
30754+ if (strcmp(event_name, trace_event_name(call)) != 0)
30755+ return NULL;
30756+ }
30757+
30758+ if (!!system != !!event_name)
30759+ return NULL;
30760+
30761+ if (!is_var_ref(var_name))
30762+ return NULL;
30763+
30764+ var_name++;
30765+
30766+ return field_name_from_var(hist_data, var_name);
30767+}
30768+
30769+static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
30770+ char *system, char *event_name,
30771+ char *var_name)
30772+{
30773+ struct hist_field *var_field = NULL, *ref_field = NULL;
30774+
30775+ if (!is_var_ref(var_name))
30776+ return NULL;
30777+
30778+ var_name++;
30779+
30780+ var_field = find_event_var(hist_data, system, event_name, var_name);
30781+ if (var_field)
30782+ ref_field = create_var_ref(var_field, system, event_name);
30783+
30784+ if (!ref_field)
30785+ hist_err_event("Couldn't find variable: $",
30786+ system, event_name, var_name);
30787+
30788+ return ref_field;
30789+}
30790+
30791+static struct ftrace_event_field *
30792+parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
30793+ char *field_str, unsigned long *flags)
30794+{
30795+ struct ftrace_event_field *field = NULL;
30796+ char *field_name, *modifier, *str;
30797+
30798+ modifier = str = kstrdup(field_str, GFP_KERNEL);
30799+ if (!modifier)
30800+ return ERR_PTR(-ENOMEM);
30801+
30802+ field_name = strsep(&modifier, ".");
30803+ if (modifier) {
30804+ if (strcmp(modifier, "hex") == 0)
30805+ *flags |= HIST_FIELD_FL_HEX;
30806+ else if (strcmp(modifier, "sym") == 0)
30807+ *flags |= HIST_FIELD_FL_SYM;
30808+ else if (strcmp(modifier, "sym-offset") == 0)
30809+ *flags |= HIST_FIELD_FL_SYM_OFFSET;
30810+ else if ((strcmp(modifier, "execname") == 0) &&
30811+ (strcmp(field_name, "common_pid") == 0))
30812+ *flags |= HIST_FIELD_FL_EXECNAME;
30813+ else if (strcmp(modifier, "syscall") == 0)
30814+ *flags |= HIST_FIELD_FL_SYSCALL;
30815+ else if (strcmp(modifier, "log2") == 0)
30816+ *flags |= HIST_FIELD_FL_LOG2;
30817+ else if (strcmp(modifier, "usecs") == 0)
30818+ *flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
30819+ else {
30820+ hist_err("Invalid field modifier: ", modifier);
30821+ field = ERR_PTR(-EINVAL);
30822+ goto out;
30823+ }
30824+ }
30825+
30826+ if (strcmp(field_name, "common_timestamp") == 0) {
30827+ *flags |= HIST_FIELD_FL_TIMESTAMP;
30828+ hist_data->enable_timestamps = true;
30829+ if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
30830+ hist_data->attrs->ts_in_usecs = true;
30831+ } else if (strcmp(field_name, "cpu") == 0)
30832+ *flags |= HIST_FIELD_FL_CPU;
30833+ else {
30834+ field = trace_find_event_field(file->event_call, field_name);
30835+ if (!field || !field->size) {
30836+ hist_err("Couldn't find field: ", field_name);
30837+ field = ERR_PTR(-EINVAL);
30838+ goto out;
30839+ }
30840+ }
30841+ out:
30842+ kfree(str);
30843+
30844+ return field;
30845+}
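+
+/*
+ * Sketch of the modifier handling above, using field names that are
+ * only illustrative:
+ *
+ *   "common_pid.execname"    sets HIST_FIELD_FL_EXECNAME (only accepted
+ *                            together with common_pid)
+ *   "common_timestamp.usecs" sets HIST_FIELD_FL_TIMESTAMP and
+ *                            HIST_FIELD_FL_TIMESTAMP_USECS and enables
+ *                            timestamps for the histogram
+ *   "call_site.sym-offset"   sets HIST_FIELD_FL_SYM_OFFSET, assuming the
+ *                            event actually has a call_site field
+ */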
30846+
30847+static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
30848+ struct hist_field *var_ref,
30849+ char *var_name)
30850+{
30851+ struct hist_field *alias = NULL;
30852+ unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR;
30853+
30854+ alias = create_hist_field(hist_data, NULL, flags, var_name);
30855+ if (!alias)
30856+ return NULL;
30857+
30858+ alias->fn = var_ref->fn;
30859+ alias->operands[0] = var_ref;
30860+
30861+ if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
30862+ destroy_hist_field(alias, 0);
30863+ return NULL;
30864+ }
30865+
30866+ return alias;
30867+}
30868+
30869+static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
30870+ struct trace_event_file *file, char *str,
30871+ unsigned long *flags, char *var_name)
30872+{
30873+ char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
30874+ struct ftrace_event_field *field = NULL;
30875+ struct hist_field *hist_field = NULL;
30876+ int ret = 0;
30877+
30878+ s = strchr(str, '.');
30879+ if (s) {
30880+ s = strchr(++s, '.');
30881+ if (s) {
30882+ ref_system = strsep(&str, ".");
30883+ if (!str) {
30884+ ret = -EINVAL;
30885+ goto out;
30886+ }
30887+ ref_event = strsep(&str, ".");
30888+ if (!str) {
30889+ ret = -EINVAL;
30890+ goto out;
30891+ }
30892+ ref_var = str;
30893+ }
30894+ }
30895+
30896+ s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
30897+ if (!s) {
30898+ hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
30899+ if (hist_field) {
30900+ hist_data->var_refs[hist_data->n_var_refs] = hist_field;
30901+ hist_field->var_ref_idx = hist_data->n_var_refs++;
30902+ if (var_name) {
30903+ hist_field = create_alias(hist_data, hist_field, var_name);
30904+ if (!hist_field) {
30905+ ret = -ENOMEM;
30906+ goto out;
30907+ }
30908+ }
30909+ return hist_field;
30910+ }
30911+ } else
30912+ str = s;
30913+
30914+ field = parse_field(hist_data, file, str, flags);
30915+ if (IS_ERR(field)) {
30916+ ret = PTR_ERR(field);
30917+ goto out;
30918+ }
30919+
30920+ hist_field = create_hist_field(hist_data, field, *flags, var_name);
30921+ if (!hist_field) {
30922+ ret = -ENOMEM;
30923+ goto out;
30924+ }
30925+
30926+ return hist_field;
30927+ out:
30928+ return ERR_PTR(ret);
30929+}
30930+
30931+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
30932+ struct trace_event_file *file,
30933+ char *str, unsigned long flags,
30934+ char *var_name, unsigned int level);
30935+
30936+static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
30937+ struct trace_event_file *file,
30938+ char *str, unsigned long flags,
30939+ char *var_name, unsigned int level)
30940+{
30941+ struct hist_field *operand1, *expr = NULL;
30942+ unsigned long operand_flags;
30943+ int ret = 0;
30944+ char *s;
30945+
30946+ /* we support only -(xxx) i.e. explicit parens required */
30947+
30948+ if (level > 3) {
30949+ hist_err("Too many subexpressions (3 max): ", str);
30950+ ret = -EINVAL;
30951+ goto free;
30952+ }
30953+
30954+ str++; /* skip leading '-' */
30955+
30956+ s = strchr(str, '(');
30957+ if (s)
30958+ str++;
30959+ else {
30960+ ret = -EINVAL;
30961+ goto free;
30962+ }
30963+
30964+ s = strrchr(str, ')');
30965+ if (s)
30966+ *s = '\0';
30967+ else {
30968+  ret = -EINVAL; /* no closing ')' */
30969+ goto free;
30970+ }
30971+
30972+ flags |= HIST_FIELD_FL_EXPR;
30973+ expr = create_hist_field(hist_data, NULL, flags, var_name);
30974+ if (!expr) {
30975+ ret = -ENOMEM;
30976+ goto free;
30977+ }
30978+
30979+ operand_flags = 0;
30980+ operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
30981+ if (IS_ERR(operand1)) {
30982+ ret = PTR_ERR(operand1);
30983+ goto free;
30984+ }
30985+
30986+ expr->flags |= operand1->flags &
30987+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
30988+ expr->fn = hist_field_unary_minus;
30989+ expr->operands[0] = operand1;
30990+ expr->operator = FIELD_OP_UNARY_MINUS;
30991+ expr->name = expr_str(expr, 0);
30992+ expr->type = kstrdup(operand1->type, GFP_KERNEL);
30993+ if (!expr->type) {
30994+ ret = -ENOMEM;
30995+ goto free;
30996+ }
30997+
30998+ return expr;
30999+ free:
31000+ destroy_hist_field(expr, 0);
31001+ return ERR_PTR(ret);
31002+}
31003+
31004+static int check_expr_operands(struct hist_field *operand1,
31005+ struct hist_field *operand2)
31006+{
31007+ unsigned long operand1_flags = operand1->flags;
31008+ unsigned long operand2_flags = operand2->flags;
31009+
31010+ if ((operand1_flags & HIST_FIELD_FL_VAR_REF) ||
31011+ (operand1_flags & HIST_FIELD_FL_ALIAS)) {
31012+ struct hist_field *var;
31013+
31014+ var = find_var_field(operand1->var.hist_data, operand1->name);
31015+ if (!var)
31016+ return -EINVAL;
31017+ operand1_flags = var->flags;
31018+ }
31019+
31020+ if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
31021+ (operand2_flags & HIST_FIELD_FL_ALIAS)) {
31022+ struct hist_field *var;
31023+
31024+ var = find_var_field(operand2->var.hist_data, operand2->name);
31025+ if (!var)
31026+ return -EINVAL;
31027+ operand2_flags = var->flags;
31028+ }
31029+
31030+ if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
31031+ (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
31032+ hist_err("Timestamp units in expression don't match", NULL);
31033+ return -EINVAL;
31034+ }
31035+
31036+ return 0;
31037+}
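+
+/*
+ * For example (variable names purely illustrative): an expression such
+ * as "common_timestamp.usecs-$ts0" is rejected here with "Timestamp
+ * units in expression don't match" if $ts0 was recorded from plain
+ * common_timestamp, i.e. without the .usecs modifier.
+ */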
31038+
31039+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
31040+ struct trace_event_file *file,
31041+ char *str, unsigned long flags,
31042+ char *var_name, unsigned int level)
31043+{
31044+ struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
31045+ unsigned long operand_flags;
31046+ int field_op, ret = -EINVAL;
31047+ char *sep, *operand1_str;
31048+
31049+ if (level > 3) {
31050+ hist_err("Too many subexpressions (3 max): ", str);
31051+ return ERR_PTR(-EINVAL);
31052+ }
31053+
31054+ field_op = contains_operator(str);
31055+
31056+ if (field_op == FIELD_OP_NONE)
31057+ return parse_atom(hist_data, file, str, &flags, var_name);
31058+
31059+ if (field_op == FIELD_OP_UNARY_MINUS)
31060+ return parse_unary(hist_data, file, str, flags, var_name, ++level);
31061+
31062+ switch (field_op) {
31063+ case FIELD_OP_MINUS:
31064+ sep = "-";
31065+ break;
31066+ case FIELD_OP_PLUS:
31067+ sep = "+";
31068+ break;
31069+ default:
31070+ goto free;
31071+ }
31072+
31073+ operand1_str = strsep(&str, sep);
31074+ if (!operand1_str || !str)
31075+ goto free;
31076+
31077+ operand_flags = 0;
31078+ operand1 = parse_atom(hist_data, file, operand1_str,
31079+ &operand_flags, NULL);
31080+ if (IS_ERR(operand1)) {
31081+ ret = PTR_ERR(operand1);
31082+ operand1 = NULL;
31083+ goto free;
31084+ }
31085+
31086+ /* rest of string could be another expression e.g. b+c in a+b+c */
31087+ operand_flags = 0;
31088+ operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
31089+ if (IS_ERR(operand2)) {
31090+ ret = PTR_ERR(operand2);
31091+ operand2 = NULL;
31092+ goto free;
31093+ }
31094+
31095+ ret = check_expr_operands(operand1, operand2);
31096+ if (ret)
31097+ goto free;
31098+
31099+ flags |= HIST_FIELD_FL_EXPR;
31100+
31101+ flags |= operand1->flags &
31102+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
31103+
31104+ expr = create_hist_field(hist_data, NULL, flags, var_name);
31105+ if (!expr) {
31106+ ret = -ENOMEM;
31107+ goto free;
31108+ }
31109+
31110+ operand1->read_once = true;
31111+ operand2->read_once = true;
31112+
31113+ expr->operands[0] = operand1;
31114+ expr->operands[1] = operand2;
31115+ expr->operator = field_op;
31116+ expr->name = expr_str(expr, 0);
31117+ expr->type = kstrdup(operand1->type, GFP_KERNEL);
31118+ if (!expr->type) {
31119+ ret = -ENOMEM;
31120+ goto free;
31121+ }
31122+
31123+ switch (field_op) {
31124+ case FIELD_OP_MINUS:
31125+ expr->fn = hist_field_minus;
31126+ break;
31127+ case FIELD_OP_PLUS:
31128+ expr->fn = hist_field_plus;
31129+ break;
31130+ default:
31131+ ret = -EINVAL;
31132+ goto free;
31133+ }
31134+
31135+ return expr;
31136+ free:
31137+ destroy_hist_field(operand1, 0);
31138+ destroy_hist_field(operand2, 0);
31139+ destroy_hist_field(expr, 0);
31140+
31141+ return ERR_PTR(ret);
31142+}
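+
+/*
+ * Rough walk-through of parse_expr() on "common_timestamp.usecs-$ts0"
+ * ($ts0 being some previously defined variable):
+ *
+ *   contains_operator() reports FIELD_OP_MINUS, operand1 is parsed as
+ *   the atom "common_timestamp.usecs", and the remainder "$ts0" is fed
+ *   back into parse_expr(), where it resolves to a variable reference.
+ *   Longer chains like "a+b+c" recurse the same way, with nesting
+ *   depth limited (3 max, per the error message above).
+ */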
31143+
31144+static char *find_trigger_filter(struct hist_trigger_data *hist_data,
31145+ struct trace_event_file *file)
31146+{
31147+ struct event_trigger_data *test;
31148+
31149+ list_for_each_entry_rcu(test, &file->triggers, list) {
31150+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
31151+ if (test->private_data == hist_data)
31152+ return test->filter_str;
31153+ }
31154+ }
31155+
31156+ return NULL;
31157+}
31158+
31159+static struct event_command trigger_hist_cmd;
31160+static int event_hist_trigger_func(struct event_command *cmd_ops,
31161+ struct trace_event_file *file,
31162+ char *glob, char *cmd, char *param);
31163+
31164+static bool compatible_keys(struct hist_trigger_data *target_hist_data,
31165+ struct hist_trigger_data *hist_data,
31166+ unsigned int n_keys)
31167+{
31168+ struct hist_field *target_hist_field, *hist_field;
31169+ unsigned int n, i, j;
31170+
31171+ if (hist_data->n_fields - hist_data->n_vals != n_keys)
31172+ return false;
31173+
31174+ i = hist_data->n_vals;
31175+ j = target_hist_data->n_vals;
31176+
31177+ for (n = 0; n < n_keys; n++) {
31178+ hist_field = hist_data->fields[i + n];
31179+ target_hist_field = target_hist_data->fields[j + n];
31180+
31181+ if (strcmp(hist_field->type, target_hist_field->type) != 0)
31182+ return false;
31183+ if (hist_field->size != target_hist_field->size)
31184+ return false;
31185+ if (hist_field->is_signed != target_hist_field->is_signed)
31186+ return false;
31187+ }
31188+
31189+ return true;
31190+}
31191+
31192+static struct hist_trigger_data *
31193+find_compatible_hist(struct hist_trigger_data *target_hist_data,
31194+ struct trace_event_file *file)
31195+{
31196+ struct hist_trigger_data *hist_data;
31197+ struct event_trigger_data *test;
31198+ unsigned int n_keys;
31199+
31200+ n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
31201+
31202+ list_for_each_entry_rcu(test, &file->triggers, list) {
31203+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
31204+ hist_data = test->private_data;
31205+
31206+ if (compatible_keys(target_hist_data, hist_data, n_keys))
31207+ return hist_data;
31208+ }
31209+ }
31210+
31211+ return NULL;
31212+}
31213+
31214+static struct trace_event_file *event_file(struct trace_array *tr,
31215+ char *system, char *event_name)
31216+{
31217+ struct trace_event_file *file;
31218+
31219+ file = find_event_file(tr, system, event_name);
31220+ if (!file)
31221+ return ERR_PTR(-EINVAL);
31222+
31223+ return file;
31224+}
31225+
31226+static struct hist_field *
31227+find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
31228+ char *system, char *event_name, char *field_name)
31229+{
31230+ struct hist_field *event_var;
31231+ char *synthetic_name;
31232+
31233+ synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
31234+ if (!synthetic_name)
31235+ return ERR_PTR(-ENOMEM);
31236+
31237+ strcpy(synthetic_name, "synthetic_");
31238+ strcat(synthetic_name, field_name);
31239+
31240+ event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
31241+
31242+ kfree(synthetic_name);
31243+
31244+ return event_var;
31245+}
31246+
31247+/**
31248+ * create_field_var_hist - Automatically create a histogram and var for a field
31249+ * @target_hist_data: The target hist trigger
31250+ * @subsys_name: Optional subsystem name
31251+ * @event_name: Optional event name
31252+ * @field_name: The name of the field (and the resulting variable)
31253+ *
31254+ * Hist trigger actions fetch data from variables, not directly from
31255+ * events. However, for convenience, users are allowed to directly
31256+ * specify an event field in an action, which will be automatically
31257+ * converted into a variable on their behalf.
31258+
31259+ * If a user specifies a field on an event that isn't the event the
31260+ * histogram currently being defined (the target event histogram), the
31261+ * only way that can be accomplished is if a new hist trigger is
31262+ * created and the field variable defined on that.
31263+ *
31264+ * This function creates a new histogram compatible with the target
31265+ * event (meaning a histogram with the same key as the target
31266+ * histogram), and creates a variable for the specified field, but
31267+ * with 'synthetic_' prepended to the variable name in order to avoid
31268+ * collision with normal field variables.
31269+ *
31270+ * Return: The variable created for the field.
31271+ */
31272+static struct hist_field *
31273+create_field_var_hist(struct hist_trigger_data *target_hist_data,
31274+ char *subsys_name, char *event_name, char *field_name)
31275+{
31276+ struct trace_array *tr = target_hist_data->event_file->tr;
31277+ struct hist_field *event_var = ERR_PTR(-EINVAL);
31278+ struct hist_trigger_data *hist_data;
31279+ unsigned int i, n, first = true;
31280+ struct field_var_hist *var_hist;
31281+ struct trace_event_file *file;
31282+ struct hist_field *key_field;
31283+ char *saved_filter;
31284+ char *cmd;
31285+ int ret;
31286+
31287+ if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
31288+ hist_err_event("onmatch: Too many field variables defined: ",
31289+ subsys_name, event_name, field_name);
31290+ return ERR_PTR(-EINVAL);
31291+ }
31292+
31293+ file = event_file(tr, subsys_name, event_name);
31294+
31295+ if (IS_ERR(file)) {
31296+ hist_err_event("onmatch: Event file not found: ",
31297+ subsys_name, event_name, field_name);
31298+ ret = PTR_ERR(file);
31299+ return ERR_PTR(ret);
31300+ }
31301+
31302+ /*
31303+ * Look for a histogram compatible with target. We'll use the
31304+ * found histogram specification to create a new matching
31305+ * histogram with our variable on it. target_hist_data is not
31306+ * yet a registered histogram so we can't use that.
31307+ */
31308+ hist_data = find_compatible_hist(target_hist_data, file);
31309+ if (!hist_data) {
31310+ hist_err_event("onmatch: Matching event histogram not found: ",
31311+ subsys_name, event_name, field_name);
31312+ return ERR_PTR(-EINVAL);
31313+ }
31314+
31315+ /* See if a synthetic field variable has already been created */
31316+ event_var = find_synthetic_field_var(target_hist_data, subsys_name,
31317+ event_name, field_name);
31318+ if (!IS_ERR_OR_NULL(event_var))
31319+ return event_var;
31320+
31321+ var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL);
31322+ if (!var_hist)
31323+ return ERR_PTR(-ENOMEM);
31324+
31325+ cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
31326+ if (!cmd) {
31327+ kfree(var_hist);
31328+ return ERR_PTR(-ENOMEM);
31329+ }
31330+
31331+ /* Use the same keys as the compatible histogram */
31332+ strcat(cmd, "keys=");
31333+
31334+ for_each_hist_key_field(i, hist_data) {
31335+ key_field = hist_data->fields[i];
31336+ if (!first)
31337+ strcat(cmd, ",");
31338+ strcat(cmd, key_field->field->name);
31339+ first = false;
31340+ }
31341+
31342+ /* Create the synthetic field variable specification */
31343+ strcat(cmd, ":synthetic_");
31344+ strcat(cmd, field_name);
31345+ strcat(cmd, "=");
31346+ strcat(cmd, field_name);
31347+
31348+ /* Use the same filter as the compatible histogram */
31349+ saved_filter = find_trigger_filter(hist_data, file);
31350+ if (saved_filter) {
31351+ strcat(cmd, " if ");
31352+ strcat(cmd, saved_filter);
31353+ }
31354+
31355+ var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
31356+ if (!var_hist->cmd) {
31357+ kfree(cmd);
31358+ kfree(var_hist);
31359+ return ERR_PTR(-ENOMEM);
31360+ }
31361+
31362+ /* Save the compatible histogram information */
31363+ var_hist->hist_data = hist_data;
31364+
31365+ /* Create the new histogram with our variable */
31366+ ret = event_hist_trigger_func(&trigger_hist_cmd, file,
31367+ "", "hist", cmd);
31368+ if (ret) {
31369+ kfree(cmd);
31370+ kfree(var_hist->cmd);
31371+ kfree(var_hist);
31372+ hist_err_event("onmatch: Couldn't create histogram for field: ",
31373+ subsys_name, event_name, field_name);
31374+ return ERR_PTR(ret);
31375+ }
31376+
31377+ kfree(cmd);
31378+
31379+ /* If we can't find the variable, something went wrong */
31380+ event_var = find_synthetic_field_var(target_hist_data, subsys_name,
31381+ event_name, field_name);
31382+ if (IS_ERR_OR_NULL(event_var)) {
31383+ kfree(var_hist->cmd);
31384+ kfree(var_hist);
31385+ hist_err_event("onmatch: Couldn't find synthetic variable: ",
31386+ subsys_name, event_name, field_name);
31387+ return ERR_PTR(-EINVAL);
31388+ }
31389+
31390+ n = target_hist_data->n_field_var_hists;
31391+ target_hist_data->field_var_hists[n] = var_hist;
31392+ target_hist_data->n_field_var_hists++;
31393+
31394+ return event_var;
31395+}
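+
+/*
+ * As a concrete (purely illustrative) sketch: if the compatible
+ * histogram on the other event is keyed on "pid" and the requested
+ * field is "prio", the command assembled above looks like
+ *
+ *   keys=pid:synthetic_prio=prio if <saved filter>
+ *
+ * and is registered as an additional "hist" trigger on that event via
+ * event_hist_trigger_func().
+ */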
31396+
31397+static struct hist_field *
31398+find_target_event_var(struct hist_trigger_data *hist_data,
31399+ char *subsys_name, char *event_name, char *var_name)
31400+{
31401+ struct trace_event_file *file = hist_data->event_file;
31402+ struct hist_field *hist_field = NULL;
31403+
31404+ if (subsys_name) {
31405+ struct trace_event_call *call;
31406+
31407+ if (!event_name)
31408+ return NULL;
31409+
31410+ call = file->event_call;
31411+
31412+ if (strcmp(subsys_name, call->class->system) != 0)
31413+ return NULL;
31414+
31415+ if (strcmp(event_name, trace_event_name(call)) != 0)
31416+ return NULL;
31417+ }
31418+
31419+ hist_field = find_var_field(hist_data, var_name);
31420+
31421+ return hist_field;
31422+}
31423+
31424+static inline void __update_field_vars(struct tracing_map_elt *elt,
31425+ struct ring_buffer_event *rbe,
31426+ void *rec,
31427+ struct field_var **field_vars,
31428+ unsigned int n_field_vars,
31429+ unsigned int field_var_str_start)
31430+{
31431+ struct hist_elt_data *elt_data = elt->private_data;
31432+ unsigned int i, j, var_idx;
31433+ u64 var_val;
31434+
31435+ for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
31436+ struct field_var *field_var = field_vars[i];
31437+ struct hist_field *var = field_var->var;
31438+ struct hist_field *val = field_var->val;
31439+
31440+ var_val = val->fn(val, elt, rbe, rec);
31441+ var_idx = var->var.idx;
31442+
31443+ if (val->flags & HIST_FIELD_FL_STRING) {
31444+ char *str = elt_data->field_var_str[j++];
31445+ char *val_str = (char *)(uintptr_t)var_val;
31446+
31447+ strscpy(str, val_str, STR_VAR_LEN_MAX);
31448+ var_val = (u64)(uintptr_t)str;
31449+ }
31450+ tracing_map_set_var(elt, var_idx, var_val);
31451+ }
31452+}
31453+
31454+static void update_field_vars(struct hist_trigger_data *hist_data,
31455+ struct tracing_map_elt *elt,
31456+ struct ring_buffer_event *rbe,
31457+ void *rec)
31458+{
31459+ __update_field_vars(elt, rbe, rec, hist_data->field_vars,
31460+ hist_data->n_field_vars, 0);
31461+}
31462+
31463+static void update_max_vars(struct hist_trigger_data *hist_data,
31464+ struct tracing_map_elt *elt,
31465+ struct ring_buffer_event *rbe,
31466+ void *rec)
31467+{
31468+ __update_field_vars(elt, rbe, rec, hist_data->max_vars,
31469+ hist_data->n_max_vars, hist_data->n_field_var_str);
31470+}
31471+
31472+static struct hist_field *create_var(struct hist_trigger_data *hist_data,
31473+ struct trace_event_file *file,
31474+ char *name, int size, const char *type)
31475+{
31476+ struct hist_field *var;
31477+ int idx;
31478+
31479+ if (find_var(hist_data, file, name) && !hist_data->remove) {
31480+ var = ERR_PTR(-EINVAL);
31481+ goto out;
31482+ }
31483+
31484+ var = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
31485+ if (!var) {
31486+ var = ERR_PTR(-ENOMEM);
31487+ goto out;
31488+ }
31489+
31490+ idx = tracing_map_add_var(hist_data->map);
31491+ if (idx < 0) {
31492+ kfree(var);
31493+ var = ERR_PTR(-EINVAL);
31494+ goto out;
31495+ }
31496+
31497+ var->flags = HIST_FIELD_FL_VAR;
31498+ var->var.idx = idx;
31499+ var->var.hist_data = var->hist_data = hist_data;
31500+ var->size = size;
31501+ var->var.name = kstrdup(name, GFP_KERNEL);
31502+ var->type = kstrdup(type, GFP_KERNEL);
31503+ if (!var->var.name || !var->type) {
31504+ kfree(var->var.name);
31505+ kfree(var->type);
31506+ kfree(var);
31507+ var = ERR_PTR(-ENOMEM);
31508+ }
31509+ out:
31510+ return var;
31511+}
31512+
31513+static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
31514+ struct trace_event_file *file,
31515+ char *field_name)
31516+{
31517+ struct hist_field *val = NULL, *var = NULL;
31518+ unsigned long flags = HIST_FIELD_FL_VAR;
31519+ struct field_var *field_var;
31520+ int ret = 0;
31521+
31522+ if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
31523+ hist_err("Too many field variables defined: ", field_name);
31524+ ret = -EINVAL;
31525+ goto err;
31526+ }
31527+
31528+ val = parse_atom(hist_data, file, field_name, &flags, NULL);
31529+ if (IS_ERR(val)) {
31530+ hist_err("Couldn't parse field variable: ", field_name);
31531+ ret = PTR_ERR(val);
31532+ goto err;
31533+ }
31534+
31535+ var = create_var(hist_data, file, field_name, val->size, val->type);
31536+ if (IS_ERR(var)) {
31537+ hist_err("Couldn't create or find variable: ", field_name);
31538+ kfree(val);
31539+ ret = PTR_ERR(var);
31540+ goto err;
31541+ }
31542+
31543+ field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
31544+ if (!field_var) {
31545+ kfree(val);
31546+ kfree(var);
31547+ ret = -ENOMEM;
31548+ goto err;
31549+ }
31550+
31551+ field_var->var = var;
31552+ field_var->val = val;
31553+ out:
31554+ return field_var;
31555+ err:
31556+ field_var = ERR_PTR(ret);
31557+ goto out;
31558+}
31559+
31560+/**
31561+ * create_target_field_var - Automatically create a variable for a field
31562+ * @target_hist_data: The target hist trigger
31563+ * @subsys_name: Optional subsystem name
31564+ * @event_name: Optional event name
31565+ * @var_name: The name of the field (and the resulting variable)
31566+ *
31567+ * Hist trigger actions fetch data from variables, not directly from
31568+ * events. However, for convenience, users are allowed to directly
31569+ * specify an event field in an action, which will be automatically
31570+ * converted into a variable on their behalf.
31571+
31572+ * This function creates a field variable with the name var_name on
31573+ * the hist trigger currently being defined on the target event. If
31574+ * subsys_name and event_name are specified, this function simply
31575+ * verifies that they do in fact match the target event subsystem and
31576+ * event name.
31577+ *
31578+ * Return: The variable created for the field.
31579+ */
31580+static struct field_var *
31581+create_target_field_var(struct hist_trigger_data *target_hist_data,
31582+ char *subsys_name, char *event_name, char *var_name)
31583+{
31584+ struct trace_event_file *file = target_hist_data->event_file;
31585+
31586+ if (subsys_name) {
31587+ struct trace_event_call *call;
31588+
31589+ if (!event_name)
31590+ return NULL;
31591+
31592+ call = file->event_call;
31593+
31594+ if (strcmp(subsys_name, call->class->system) != 0)
31595+ return NULL;
31596+
31597+ if (strcmp(event_name, trace_event_name(call)) != 0)
31598+ return NULL;
31599+ }
31600+
31601+ return create_field_var(target_hist_data, file, var_name);
31602+}
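+
+/*
+ * For instance (names illustrative), an action parameter such as "prio"
+ * in onmax($lat).save(prio) arrives here as var_name and produces a
+ * field variable named "prio" on the histogram being defined, provided
+ * any system.event qualifier matches the target event.
+ */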
31603+
31604+static void onmax_print(struct seq_file *m,
31605+ struct hist_trigger_data *hist_data,
31606+ struct tracing_map_elt *elt,
31607+ struct action_data *data)
31608+{
31609+ unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;
31610+
31611+ seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));
31612+
31613+ for (i = 0; i < hist_data->n_max_vars; i++) {
31614+ struct hist_field *save_val = hist_data->max_vars[i]->val;
31615+ struct hist_field *save_var = hist_data->max_vars[i]->var;
31616+ u64 val;
31617+
31618+ save_var_idx = save_var->var.idx;
31619+
31620+ val = tracing_map_read_var(elt, save_var_idx);
31621+
31622+ if (save_val->flags & HIST_FIELD_FL_STRING) {
31623+ seq_printf(m, " %s: %-32s", save_var->var.name,
31624+ (char *)(uintptr_t)(val));
31625+ } else
31626+ seq_printf(m, " %s: %10llu", save_var->var.name, val);
31627+ }
31628+}
31629+
31630+static void onmax_save(struct hist_trigger_data *hist_data,
31631+ struct tracing_map_elt *elt, void *rec,
31632+ struct ring_buffer_event *rbe,
31633+ struct action_data *data, u64 *var_ref_vals)
31634+{
31635+ unsigned int max_idx = data->onmax.max_var->var.idx;
31636+ unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
31637+
31638+ u64 var_val, max_val;
31639+
31640+ var_val = var_ref_vals[max_var_ref_idx];
31641+ max_val = tracing_map_read_var(elt, max_idx);
31642+
31643+ if (var_val <= max_val)
31644+ return;
31645+
31646+ tracing_map_set_var(elt, max_idx, var_val);
31647+
31648+ update_max_vars(hist_data, elt, rbe, rec);
31649+}
31650+
31651+static void onmax_destroy(struct action_data *data)
31652+{
31653+ unsigned int i;
31654+
31655+ destroy_hist_field(data->onmax.max_var, 0);
31656+ destroy_hist_field(data->onmax.var, 0);
31657+
31658+ kfree(data->onmax.var_str);
31659+ kfree(data->onmax.fn_name);
31660+
31661+ for (i = 0; i < data->n_params; i++)
31662+ kfree(data->params[i]);
31663+
31664+ kfree(data);
31665+}
31666+
31667+static int onmax_create(struct hist_trigger_data *hist_data,
31668+ struct action_data *data)
31669+{
31670+ struct trace_event_file *file = hist_data->event_file;
31671+ struct hist_field *var_field, *ref_field, *max_var;
31672+ unsigned int var_ref_idx = hist_data->n_var_refs;
31673+ struct field_var *field_var;
31674+ char *onmax_var_str, *param;
31675+ unsigned long flags;
31676+ unsigned int i;
31677+ int ret = 0;
31678+
31679+ onmax_var_str = data->onmax.var_str;
31680+ if (onmax_var_str[0] != '$') {
31681+ hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
31682+ return -EINVAL;
31683+ }
31684+ onmax_var_str++;
31685+
31686+ var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
31687+ if (!var_field) {
31688+ hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
31689+ return -EINVAL;
31690+ }
31691+
31692+ flags = HIST_FIELD_FL_VAR_REF;
31693+ ref_field = create_hist_field(hist_data, NULL, flags, NULL);
31694+ if (!ref_field)
31695+ return -ENOMEM;
31696+
31697+ if (init_var_ref(ref_field, var_field, NULL, NULL)) {
31698+ destroy_hist_field(ref_field, 0);
31699+ ret = -ENOMEM;
31700+ goto out;
31701+ }
31702+ hist_data->var_refs[hist_data->n_var_refs] = ref_field;
31703+ ref_field->var_ref_idx = hist_data->n_var_refs++;
31704+ data->onmax.var = ref_field;
31705+
31706+ data->fn = onmax_save;
31707+ data->onmax.max_var_ref_idx = var_ref_idx;
31708+ max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
31709+ if (IS_ERR(max_var)) {
31710+ hist_err("onmax: Couldn't create onmax variable: ", "max");
31711+ ret = PTR_ERR(max_var);
31712+ goto out;
31713+ }
31714+ data->onmax.max_var = max_var;
31715+
31716+ for (i = 0; i < data->n_params; i++) {
31717+ param = kstrdup(data->params[i], GFP_KERNEL);
31718+ if (!param) {
31719+ ret = -ENOMEM;
31720+ goto out;
31721+ }
31722+
31723+ field_var = create_target_field_var(hist_data, NULL, NULL, param);
31724+ if (IS_ERR(field_var)) {
31725+ hist_err("onmax: Couldn't create field variable: ", param);
31726+ ret = PTR_ERR(field_var);
31727+ kfree(param);
31728+ goto out;
31729+ }
31730+
31731+ hist_data->max_vars[hist_data->n_max_vars++] = field_var;
31732+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
31733+ hist_data->n_max_var_str++;
31734+
31735+ kfree(param);
31736+ }
31737+ out:
31738+ return ret;
31739+}
31740+
31741+static int parse_action_params(char *params, struct action_data *data)
31742+{
31743+ char *param, *saved_param;
31744+ int ret = 0;
31745+
31746+ while (params) {
31747+ if (data->n_params >= SYNTH_FIELDS_MAX)
31748+ goto out;
31749+
31750+ param = strsep(&params, ",");
31751+ if (!param) {
31752+ ret = -EINVAL;
31753+ goto out;
31754+ }
31755+
31756+ param = strstrip(param);
31757+ if (strlen(param) < 2) {
31758+ hist_err("Invalid action param: ", param);
31759+ ret = -EINVAL;
31760+ goto out;
31761+ }
31762+
31763+ saved_param = kstrdup(param, GFP_KERNEL);
31764+ if (!saved_param) {
31765+ ret = -ENOMEM;
31766+ goto out;
31767+ }
31768+
31769+ data->params[data->n_params++] = saved_param;
31770+ }
31771+ out:
31772+ return ret;
31773+}
31774+
31775+static struct action_data *onmax_parse(char *str)
31776+{
31777+ char *onmax_fn_name, *onmax_var_str;
31778+ struct action_data *data;
31779+ int ret = -EINVAL;
31780+
31781+ data = kzalloc(sizeof(*data), GFP_KERNEL);
31782+ if (!data)
31783+ return ERR_PTR(-ENOMEM);
31784+
31785+ onmax_var_str = strsep(&str, ")");
31786+ if (!onmax_var_str || !str) {
31787+ ret = -EINVAL;
31788+ goto free;
31789+ }
31790+
31791+ data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
31792+ if (!data->onmax.var_str) {
31793+ ret = -ENOMEM;
31794+ goto free;
31795+ }
31796+
31797+ strsep(&str, ".");
31798+ if (!str)
31799+ goto free;
31800+
31801+ onmax_fn_name = strsep(&str, "(");
31802+ if (!onmax_fn_name || !str)
31803+ goto free;
31804+
31805+ if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
31806+ char *params = strsep(&str, ")");
31807+
31808+ if (!params) {
31809+ ret = -EINVAL;
31810+ goto free;
31811+ }
31812+
31813+ ret = parse_action_params(params, data);
31814+ if (ret)
31815+ goto free;
31816+ } else
31817+ goto free;
31818+
31819+ data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
31820+ if (!data->onmax.fn_name) {
31821+ ret = -ENOMEM;
31822+ goto free;
31823+ }
31824+ out:
31825+ return data;
31826+ free:
31827+ onmax_destroy(data);
31828+ data = ERR_PTR(ret);
31829+ goto out;
31830+}
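+
+/*
+ * Example of what onmax_parse() expects (assuming the caller has already
+ * stripped the leading "onmax(", with names only illustrative): for
+ *
+ *   onmax($wakeup_lat).save(next_comm,prev_pid)
+ *
+ * the string parsed here is "$wakeup_lat).save(next_comm,prev_pid)",
+ * yielding var_str "$wakeup_lat", fn_name "save" and the params
+ * "next_comm" and "prev_pid".
+ */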
31831+
31832+static void onmatch_destroy(struct action_data *data)
31833+{
31834+ unsigned int i;
31835+
31836+ mutex_lock(&synth_event_mutex);
31837+
31838+ kfree(data->onmatch.match_event);
31839+ kfree(data->onmatch.match_event_system);
31840+ kfree(data->onmatch.synth_event_name);
31841+
31842+ for (i = 0; i < data->n_params; i++)
31843+ kfree(data->params[i]);
31844+
31845+ if (data->onmatch.synth_event)
31846+ data->onmatch.synth_event->ref--;
31847+
31848+ kfree(data);
31849+
31850+ mutex_unlock(&synth_event_mutex);
31851+}
31852+
31853+static void destroy_field_var(struct field_var *field_var)
31854+{
31855+ if (!field_var)
31856+ return;
31857+
31858+ destroy_hist_field(field_var->var, 0);
31859+ destroy_hist_field(field_var->val, 0);
31860+
31861+ kfree(field_var);
31862+}
31863+
31864+static void destroy_field_vars(struct hist_trigger_data *hist_data)
31865+{
31866+ unsigned int i;
31867+
31868+ for (i = 0; i < hist_data->n_field_vars; i++)
31869+ destroy_field_var(hist_data->field_vars[i]);
31870+}
31871+
31872+static void save_field_var(struct hist_trigger_data *hist_data,
31873+ struct field_var *field_var)
31874+{
31875+ hist_data->field_vars[hist_data->n_field_vars++] = field_var;
31876+
31877+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
31878+ hist_data->n_field_var_str++;
31879+}
31880+
31882+static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
31883+{
31884+ unsigned int i;
31885+
31886+ for (i = 0; i < hist_data->n_synth_var_refs; i++)
31887+ destroy_hist_field(hist_data->synth_var_refs[i], 0);
31888+}
31889+
31890+static void save_synth_var_ref(struct hist_trigger_data *hist_data,
31891+ struct hist_field *var_ref)
31892+{
31893+ hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
31894+
31895+ hist_data->var_refs[hist_data->n_var_refs] = var_ref;
31896+ var_ref->var_ref_idx = hist_data->n_var_refs++;
31897+}
31898+
31899+static int check_synth_field(struct synth_event *event,
31900+ struct hist_field *hist_field,
31901+ unsigned int field_pos)
31902+{
31903+ struct synth_field *field;
31904+
31905+ if (field_pos >= event->n_fields)
31906+ return -EINVAL;
31907+
31908+ field = event->fields[field_pos];
31909+
31910+ if (strcmp(field->type, hist_field->type) != 0)
31911+ return -EINVAL;
31912+
31913+ return 0;
31914+}
31915+
31916+static struct hist_field *
31917+onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
31918+ char *system, char *event, char *var)
31919+{
31920+ struct hist_field *hist_field;
31921+
31922+ var++; /* skip '$' */
31923+
31924+ hist_field = find_target_event_var(hist_data, system, event, var);
31925+ if (!hist_field) {
31926+ if (!system) {
31927+ system = data->onmatch.match_event_system;
31928+ event = data->onmatch.match_event;
31929+ }
31930+
31931+ hist_field = find_event_var(hist_data, system, event, var);
31932+ }
31933+
31934+ if (!hist_field)
31935+ hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);
31936+
31937+ return hist_field;
31938+}
31939+
31940+static struct hist_field *
31941+onmatch_create_field_var(struct hist_trigger_data *hist_data,
31942+ struct action_data *data, char *system,
31943+ char *event, char *var)
31944+{
31945+ struct hist_field *hist_field = NULL;
31946+ struct field_var *field_var;
31947+
31948+ /*
31949+ * First try to create a field var on the target event (the event
31950+ * currently being defined). This will create a variable for
31951+ * unqualified fields on the target event, or if qualified,
31952+ * target fields that have qualified names matching the target.
31953+ */
31954+ field_var = create_target_field_var(hist_data, system, event, var);
31955+
31956+ if (field_var && !IS_ERR(field_var)) {
31957+ save_field_var(hist_data, field_var);
31958+ hist_field = field_var->var;
31959+ } else {
31960+ field_var = NULL;
31961+ /*
31962+ * If no explicit system.event is specified, default to
31963+ * looking for fields on the onmatch(system.event.xxx)
31964+ * event.
31965+ */
31966+ if (!system) {
31967+ system = data->onmatch.match_event_system;
31968+ event = data->onmatch.match_event;
31969+ }
31970+
31971+ /*
31972+ * At this point, we're looking at a field on another
31973+ * event. Because we can't modify a hist trigger on
31974+ * another event to add a variable for a field, we need
31975+ * to create a new trigger on that event and create the
31976+ * variable at the same time.
31977+ */
31978+ hist_field = create_field_var_hist(hist_data, system, event, var);
31979+ if (IS_ERR(hist_field))
31980+ goto free;
31981+ }
31982+ out:
31983+ return hist_field;
31984+ free:
31985+ destroy_field_var(field_var);
31986+ hist_field = NULL;
31987+ goto out;
31988+}
31989+
31990+static int onmatch_create(struct hist_trigger_data *hist_data,
31991+ struct trace_event_file *file,
31992+ struct action_data *data)
31993+{
31994+ char *event_name, *param, *system = NULL;
31995+ struct hist_field *hist_field, *var_ref;
31996+ unsigned int i, var_ref_idx;
31997+ unsigned int field_pos = 0;
31998+ struct synth_event *event;
31999+ int ret = 0;
32000+
32001+ mutex_lock(&synth_event_mutex);
32002+ event = find_synth_event(data->onmatch.synth_event_name);
32003+ if (!event) {
32004+ hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
32005+ mutex_unlock(&synth_event_mutex);
32006+ return -EINVAL;
32007+ }
32008+ event->ref++;
32009+ mutex_unlock(&synth_event_mutex);
32010+
32011+ var_ref_idx = hist_data->n_var_refs;
32012+
32013+ for (i = 0; i < data->n_params; i++) {
32014+ char *p;
32015+
32016+ p = param = kstrdup(data->params[i], GFP_KERNEL);
32017+ if (!param) {
32018+ ret = -ENOMEM;
32019+ goto err;
32020+ }
32021+
32022+ system = strsep(&param, ".");
32023+ if (!param) {
32024+ param = (char *)system;
32025+ system = event_name = NULL;
32026+ } else {
32027+ event_name = strsep(&param, ".");
32028+ if (!param) {
32029+ kfree(p);
32030+ ret = -EINVAL;
32031+ goto err;
32032+ }
32033+ }
32034+
32035+ if (param[0] == '$')
32036+ hist_field = onmatch_find_var(hist_data, data, system,
32037+ event_name, param);
32038+ else
32039+ hist_field = onmatch_create_field_var(hist_data, data,
32040+ system,
32041+ event_name,
32042+ param);
32043+
32044+ if (!hist_field) {
32045+ kfree(p);
32046+ ret = -EINVAL;
32047+ goto err;
32048+ }
32049+
32050+ if (check_synth_field(event, hist_field, field_pos) == 0) {
32051+ var_ref = create_var_ref(hist_field, system, event_name);
32052+ if (!var_ref) {
32053+ kfree(p);
32054+ ret = -ENOMEM;
32055+ goto err;
32056+ }
32057+
32058+ save_synth_var_ref(hist_data, var_ref);
32059+ field_pos++;
32060+ kfree(p);
32061+ continue;
32062+ }
32063+
32064+ hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
32065+ system, event_name, param);
32066+ kfree(p);
32067+ ret = -EINVAL;
32068+ goto err;
32069+ }
32070+
32071+ if (field_pos != event->n_fields) {
32072+ hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
32073+ ret = -EINVAL;
32074+ goto err;
32075+ }
32076+
32077+ data->fn = action_trace;
32078+ data->onmatch.synth_event = event;
32079+ data->onmatch.var_ref_idx = var_ref_idx;
32080+ out:
32081+ return ret;
32082+ err:
32083+ mutex_lock(&synth_event_mutex);
32084+ event->ref--;
32085+ mutex_unlock(&synth_event_mutex);
32086+
32087+ goto out;
32088+}
32089+
32090+static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
32091+{
32092+ char *match_event, *match_event_system;
32093+ char *synth_event_name, *params;
32094+ struct action_data *data;
32095+ int ret = -EINVAL;
32096+
32097+ data = kzalloc(sizeof(*data), GFP_KERNEL);
32098+ if (!data)
32099+ return ERR_PTR(-ENOMEM);
32100+
32101+ match_event = strsep(&str, ")");
32102+ if (!match_event || !str) {
32103+ hist_err("onmatch: Missing closing paren: ", match_event);
32104+ goto free;
32105+ }
32106+
32107+ match_event_system = strsep(&match_event, ".");
32108+ if (!match_event) {
32109+ hist_err("onmatch: Missing subsystem for match event: ", match_event_system);
32110+ goto free;
32111+ }
32112+
32113+ if (IS_ERR(event_file(tr, match_event_system, match_event))) {
32114+ hist_err_event("onmatch: Invalid subsystem or event name: ",
32115+ match_event_system, match_event, NULL);
32116+ goto free;
32117 }
32118
32119- if (WARN_ON_ONCE(!field))
32120- goto out;
32121+ data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
32122+ if (!data->onmatch.match_event) {
32123+ ret = -ENOMEM;
32124+ goto free;
32125+ }
32126
32127- if (is_string_field(field)) {
32128- flags |= HIST_FIELD_FL_STRING;
32129+ data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
32130+ if (!data->onmatch.match_event_system) {
32131+ ret = -ENOMEM;
32132+ goto free;
32133+ }
32134
32135- if (field->filter_type == FILTER_STATIC_STRING)
32136- hist_field->fn = hist_field_string;
32137- else if (field->filter_type == FILTER_DYN_STRING)
32138- hist_field->fn = hist_field_dynstring;
32139- else
32140- hist_field->fn = hist_field_pstring;
32141- } else {
32142- hist_field->fn = select_value_fn(field->size,
32143- field->is_signed);
32144- if (!hist_field->fn) {
32145- destroy_hist_field(hist_field);
32146- return NULL;
32147- }
32148+ strsep(&str, ".");
32149+ if (!str) {
32150+ hist_err("onmatch: Missing . after onmatch(): ", str);
32151+ goto free;
32152 }
32153- out:
32154- hist_field->field = field;
32155- hist_field->flags = flags;
32156
32157- return hist_field;
32158-}
32159+ synth_event_name = strsep(&str, "(");
32160+ if (!synth_event_name || !str) {
32161+ hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
32162+ goto free;
32163+ }
32164
32165-static void destroy_hist_fields(struct hist_trigger_data *hist_data)
32166-{
32167- unsigned int i;
32168+ data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
32169+ if (!data->onmatch.synth_event_name) {
32170+ ret = -ENOMEM;
32171+ goto free;
32172+ }
32173
32174- for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
32175- if (hist_data->fields[i]) {
32176- destroy_hist_field(hist_data->fields[i]);
32177- hist_data->fields[i] = NULL;
32178- }
32179+ params = strsep(&str, ")");
32180+ if (!params || !str || (str && strlen(str))) {
32181+ hist_err("onmatch: Missing closing paramlist paren: ", params);
32182+ goto free;
32183 }
32184+
32185+ ret = parse_action_params(params, data);
32186+ if (ret)
32187+ goto free;
32188+ out:
32189+ return data;
32190+ free:
32191+ onmatch_destroy(data);
32192+ data = ERR_PTR(ret);
32193+ goto out;
32194 }
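+
+/*
+ * Example input to onmatch_parse() (assuming the caller has already
+ * stripped the leading "onmatch(", with event and synthetic-event names
+ * only illustrative): for
+ *
+ *   onmatch(sched.sched_switch).wakeup_latency($wakeup_lat,next_pid)
+ *
+ * the code above extracts match_event_system "sched", match_event
+ * "sched_switch", synth_event_name "wakeup_latency" and the params
+ * "$wakeup_lat" and "next_pid".
+ */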
32195
32196 static int create_hitcount_val(struct hist_trigger_data *hist_data)
32197 {
32198 hist_data->fields[HITCOUNT_IDX] =
32199- create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
32200+ create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL);
32201 if (!hist_data->fields[HITCOUNT_IDX])
32202 return -ENOMEM;
32203
32204 hist_data->n_vals++;
32205+ hist_data->n_fields++;
32206
32207 if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
32208 return -EINVAL;
32209@@ -426,54 +3828,71 @@ static int create_hitcount_val(struct hist_trigger_data *hist_data)
32210 return 0;
32211 }
32212
32213+static int __create_val_field(struct hist_trigger_data *hist_data,
32214+ unsigned int val_idx,
32215+ struct trace_event_file *file,
32216+ char *var_name, char *field_str,
32217+ unsigned long flags)
32218+{
32219+ struct hist_field *hist_field;
32220+ int ret = 0;
32221+
32222+ hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
32223+ if (IS_ERR(hist_field)) {
32224+ ret = PTR_ERR(hist_field);
32225+ goto out;
32226+ }
32227+
32228+ hist_data->fields[val_idx] = hist_field;
32229+
32230+ ++hist_data->n_vals;
32231+ ++hist_data->n_fields;
32232+
32233+ if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
32234+ ret = -EINVAL;
32235+ out:
32236+ return ret;
32237+}
32238+
32239 static int create_val_field(struct hist_trigger_data *hist_data,
32240 unsigned int val_idx,
32241 struct trace_event_file *file,
32242 char *field_str)
32243 {
32244- struct ftrace_event_field *field = NULL;
32245- unsigned long flags = 0;
32246- char *field_name;
32247- int ret = 0;
32248-
32249 if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
32250 return -EINVAL;
32251
32252- field_name = strsep(&field_str, ".");
32253- if (field_str) {
32254- if (strcmp(field_str, "hex") == 0)
32255- flags |= HIST_FIELD_FL_HEX;
32256- else {
32257- ret = -EINVAL;
32258- goto out;
32259- }
32260- }
32261+ return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
32262+}
32263
32264- field = trace_find_event_field(file->event_call, field_name);
32265- if (!field || !field->size) {
32266- ret = -EINVAL;
32267- goto out;
32268- }
32269+static int create_var_field(struct hist_trigger_data *hist_data,
32270+ unsigned int val_idx,
32271+ struct trace_event_file *file,
32272+ char *var_name, char *expr_str)
32273+{
32274+ unsigned long flags = 0;
32275
32276- hist_data->fields[val_idx] = create_hist_field(field, flags);
32277- if (!hist_data->fields[val_idx]) {
32278- ret = -ENOMEM;
32279- goto out;
32280+ if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
32281+ return -EINVAL;
32282+
32283+ if (find_var(hist_data, file, var_name) && !hist_data->remove) {
32284+ hist_err("Variable already defined: ", var_name);
32285+ return -EINVAL;
32286 }
32287
32288- ++hist_data->n_vals;
32289+ flags |= HIST_FIELD_FL_VAR;
32290+ hist_data->n_vars++;
32291+ if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
32292+ return -EINVAL;
32293
32294- if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
32295- ret = -EINVAL;
32296- out:
32297- return ret;
32298+ return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
32299 }
32300
32301 static int create_val_fields(struct hist_trigger_data *hist_data,
32302 struct trace_event_file *file)
32303 {
32304 char *fields_str, *field_str;
32305- unsigned int i, j;
32306+ unsigned int i, j = 1;
32307 int ret;
32308
32309 ret = create_hitcount_val(hist_data);
32310@@ -493,12 +3912,15 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
32311 field_str = strsep(&fields_str, ",");
32312 if (!field_str)
32313 break;
32314+
32315 if (strcmp(field_str, "hitcount") == 0)
32316 continue;
32317+
32318 ret = create_val_field(hist_data, j++, file, field_str);
32319 if (ret)
32320 goto out;
32321 }
32322+
32323 if (fields_str && (strcmp(fields_str, "hitcount") != 0))
32324 ret = -EINVAL;
32325 out:
32326@@ -511,12 +3933,13 @@ static int create_key_field(struct hist_trigger_data *hist_data,
32327 struct trace_event_file *file,
32328 char *field_str)
32329 {
32330- struct ftrace_event_field *field = NULL;
32331+ struct hist_field *hist_field = NULL;
32332+
32333 unsigned long flags = 0;
32334 unsigned int key_size;
32335 int ret = 0;
32336
32337- if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
32338+ if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
32339 return -EINVAL;
32340
32341 flags |= HIST_FIELD_FL_KEY;
32342@@ -524,57 +3947,40 @@ static int create_key_field(struct hist_trigger_data *hist_data,
32343 if (strcmp(field_str, "stacktrace") == 0) {
32344 flags |= HIST_FIELD_FL_STACKTRACE;
32345 key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
32346+ hist_field = create_hist_field(hist_data, NULL, flags, NULL);
32347 } else {
32348- char *field_name = strsep(&field_str, ".");
32349-
32350- if (field_str) {
32351- if (strcmp(field_str, "hex") == 0)
32352- flags |= HIST_FIELD_FL_HEX;
32353- else if (strcmp(field_str, "sym") == 0)
32354- flags |= HIST_FIELD_FL_SYM;
32355- else if (strcmp(field_str, "sym-offset") == 0)
32356- flags |= HIST_FIELD_FL_SYM_OFFSET;
32357- else if ((strcmp(field_str, "execname") == 0) &&
32358- (strcmp(field_name, "common_pid") == 0))
32359- flags |= HIST_FIELD_FL_EXECNAME;
32360- else if (strcmp(field_str, "syscall") == 0)
32361- flags |= HIST_FIELD_FL_SYSCALL;
32362- else if (strcmp(field_str, "log2") == 0)
32363- flags |= HIST_FIELD_FL_LOG2;
32364- else {
32365- ret = -EINVAL;
32366- goto out;
32367- }
32368+ hist_field = parse_expr(hist_data, file, field_str, flags,
32369+ NULL, 0);
32370+ if (IS_ERR(hist_field)) {
32371+ ret = PTR_ERR(hist_field);
32372+ goto out;
32373 }
32374
32375- field = trace_find_event_field(file->event_call, field_name);
32376- if (!field || !field->size) {
32377+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
32378+ hist_err("Using variable references as keys not supported: ", field_str);
32379+ destroy_hist_field(hist_field, 0);
32380 ret = -EINVAL;
32381 goto out;
32382 }
32383
32384- if (is_string_field(field))
32385- key_size = MAX_FILTER_STR_VAL;
32386- else
32387- key_size = field->size;
32388+ key_size = hist_field->size;
32389 }
32390
32391- hist_data->fields[key_idx] = create_hist_field(field, flags);
32392- if (!hist_data->fields[key_idx]) {
32393- ret = -ENOMEM;
32394- goto out;
32395- }
32396+ hist_data->fields[key_idx] = hist_field;
32397
32398 key_size = ALIGN(key_size, sizeof(u64));
32399 hist_data->fields[key_idx]->size = key_size;
32400 hist_data->fields[key_idx]->offset = key_offset;
32401+
32402 hist_data->key_size += key_size;
32403+
32404 if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
32405 ret = -EINVAL;
32406 goto out;
32407 }
32408
32409 hist_data->n_keys++;
32410+ hist_data->n_fields++;
32411
32412 if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
32413 return -EINVAL;
32414@@ -618,21 +4024,113 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
32415 return ret;
32416 }
32417
32418+static int create_var_fields(struct hist_trigger_data *hist_data,
32419+ struct trace_event_file *file)
32420+{
32421+ unsigned int i, j = hist_data->n_vals;
32422+ int ret = 0;
32423+
32424+ unsigned int n_vars = hist_data->attrs->var_defs.n_vars;
32425+
32426+ for (i = 0; i < n_vars; i++) {
32427+ char *var_name = hist_data->attrs->var_defs.name[i];
32428+ char *expr = hist_data->attrs->var_defs.expr[i];
32429+
32430+ ret = create_var_field(hist_data, j++, file, var_name, expr);
32431+ if (ret)
32432+ goto out;
32433+ }
32434+ out:
32435+ return ret;
32436+}
32437+
32438+static void free_var_defs(struct hist_trigger_data *hist_data)
32439+{
32440+ unsigned int i;
32441+
32442+ for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
32443+ kfree(hist_data->attrs->var_defs.name[i]);
32444+ kfree(hist_data->attrs->var_defs.expr[i]);
32445+ }
32446+
32447+ hist_data->attrs->var_defs.n_vars = 0;
32448+}
32449+
32450+static int parse_var_defs(struct hist_trigger_data *hist_data)
32451+{
32452+ char *s, *str, *var_name, *field_str;
32453+ unsigned int i, j, n_vars = 0;
32454+ int ret = 0;
32455+
32456+ for (i = 0; i < hist_data->attrs->n_assignments; i++) {
32457+ str = hist_data->attrs->assignment_str[i];
32458+ for (j = 0; j < TRACING_MAP_VARS_MAX; j++) {
32459+ field_str = strsep(&str, ",");
32460+ if (!field_str)
32461+ break;
32462+
32463+ var_name = strsep(&field_str, "=");
32464+ if (!var_name || !field_str) {
32465+ hist_err("Malformed assignment: ", var_name);
32466+ ret = -EINVAL;
32467+ goto free;
32468+ }
32469+
32470+ if (n_vars == TRACING_MAP_VARS_MAX) {
32471+ hist_err("Too many variables defined: ", var_name);
32472+ ret = -EINVAL;
32473+ goto free;
32474+ }
32475+
32476+ s = kstrdup(var_name, GFP_KERNEL);
32477+ if (!s) {
32478+ ret = -ENOMEM;
32479+ goto free;
32480+ }
32481+ hist_data->attrs->var_defs.name[n_vars] = s;
32482+
32483+ s = kstrdup(field_str, GFP_KERNEL);
32484+ if (!s) {
32485+ kfree(hist_data->attrs->var_defs.name[n_vars]);
32486+ ret = -ENOMEM;
32487+ goto free;
32488+ }
32489+ hist_data->attrs->var_defs.expr[n_vars++] = s;
32490+
32491+ hist_data->attrs->var_defs.n_vars = n_vars;
32492+ }
32493+ }
32494+
32495+ return ret;
32496+ free:
32497+ free_var_defs(hist_data);
32498+
32499+ return ret;
32500+}
32501+
32502 static int create_hist_fields(struct hist_trigger_data *hist_data,
32503 struct trace_event_file *file)
32504 {
32505 int ret;
32506
32507+ ret = parse_var_defs(hist_data);
32508+ if (ret)
32509+ goto out;
32510+
32511 ret = create_val_fields(hist_data, file);
32512 if (ret)
32513 goto out;
32514
32515- ret = create_key_fields(hist_data, file);
32516+ ret = create_var_fields(hist_data, file);
32517 if (ret)
32518 goto out;
32519
32520- hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
32521+ ret = create_key_fields(hist_data, file);
32522+ if (ret)
32523+ goto out;
32524 out:
32525+ free_var_defs(hist_data);
32526+
32527 return ret;
32528 }
32529
32530@@ -653,10 +4151,9 @@ static int is_descending(const char *str)
32531 static int create_sort_keys(struct hist_trigger_data *hist_data)
32532 {
32533 char *fields_str = hist_data->attrs->sort_key_str;
32534- struct ftrace_event_field *field = NULL;
32535 struct tracing_map_sort_key *sort_key;
32536 int descending, ret = 0;
32537- unsigned int i, j;
32538+ unsigned int i, j, k;
32539
32540 hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
32541
32542@@ -670,7 +4167,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
32543 }
32544
32545 for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
32546+ struct hist_field *hist_field;
32547 char *field_str, *field_name;
32548+ const char *test_name;
32549
32550 sort_key = &hist_data->sort_keys[i];
32551
32552@@ -702,10 +4201,19 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
32553 continue;
32554 }
32555
32556- for (j = 1; j < hist_data->n_fields; j++) {
32557- field = hist_data->fields[j]->field;
32558- if (field && (strcmp(field_name, field->name) == 0)) {
32559- sort_key->field_idx = j;
32560+ for (j = 1, k = 1; j < hist_data->n_fields; j++) {
32561+ unsigned int idx;
32562+
32563+ hist_field = hist_data->fields[j];
32564+ if (hist_field->flags & HIST_FIELD_FL_VAR)
32565+ continue;
32566+
32567+ idx = k++;
32568+
32569+ test_name = hist_field_name(hist_field, 0);
32570+
32571+ if (strcmp(field_name, test_name) == 0) {
32572+ sort_key->field_idx = idx;
32573 descending = is_descending(field_str);
32574 if (descending < 0) {
32575 ret = descending;
32576@@ -720,16 +4228,230 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
32577 break;
32578 }
32579 }
32580- hist_data->n_sort_keys = i;
32581- out:
32582- return ret;
32583+
32584+ hist_data->n_sort_keys = i;
32585+ out:
32586+ return ret;
32587+}
32588+
32589+static void destroy_actions(struct hist_trigger_data *hist_data)
32590+{
32591+ unsigned int i;
32592+
32593+ for (i = 0; i < hist_data->n_actions; i++) {
32594+ struct action_data *data = hist_data->actions[i];
32595+
32596+ if (data->fn == action_trace)
32597+ onmatch_destroy(data);
32598+ else if (data->fn == onmax_save)
32599+ onmax_destroy(data);
32600+ else
32601+ kfree(data);
32602+ }
32603+}
32604+
32605+static int parse_actions(struct hist_trigger_data *hist_data)
32606+{
32607+ struct trace_array *tr = hist_data->event_file->tr;
32608+ struct action_data *data;
32609+ unsigned int i;
32610+ int ret = 0;
32611+ char *str;
32612+
32613+ for (i = 0; i < hist_data->attrs->n_actions; i++) {
32614+ str = hist_data->attrs->action_str[i];
32615+
32616+ if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
32617+ char *action_str = str + strlen("onmatch(");
32618+
32619+ data = onmatch_parse(tr, action_str);
32620+ if (IS_ERR(data)) {
32621+ ret = PTR_ERR(data);
32622+ break;
32623+ }
32624+ data->fn = action_trace;
32625+ } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
32626+ char *action_str = str + strlen("onmax(");
32627+
32628+ data = onmax_parse(action_str);
32629+ if (IS_ERR(data)) {
32630+ ret = PTR_ERR(data);
32631+ break;
32632+ }
32633+ data->fn = onmax_save;
32634+ } else {
32635+ ret = -EINVAL;
32636+ break;
32637+ }
32638+
32639+ hist_data->actions[hist_data->n_actions++] = data;
32640+ }
32641+
32642+ return ret;
32643+}
32644+
32645+static int create_actions(struct hist_trigger_data *hist_data,
32646+ struct trace_event_file *file)
32647+{
32648+ struct action_data *data;
32649+ unsigned int i;
32650+ int ret = 0;
32651+
32652+ for (i = 0; i < hist_data->attrs->n_actions; i++) {
32653+ data = hist_data->actions[i];
32654+
32655+ if (data->fn == action_trace) {
32656+ ret = onmatch_create(hist_data, file, data);
32657+ if (ret)
32658+ return ret;
32659+ } else if (data->fn == onmax_save) {
32660+ ret = onmax_create(hist_data, data);
32661+ if (ret)
32662+ return ret;
32663+ }
32664+ }
32665+
32666+ return ret;
32667+}
32668+
32669+static void print_actions(struct seq_file *m,
32670+ struct hist_trigger_data *hist_data,
32671+ struct tracing_map_elt *elt)
32672+{
32673+ unsigned int i;
32674+
32675+ for (i = 0; i < hist_data->n_actions; i++) {
32676+ struct action_data *data = hist_data->actions[i];
32677+
32678+ if (data->fn == onmax_save)
32679+ onmax_print(m, hist_data, elt, data);
32680+ }
32681+}
32682+
32683+static void print_onmax_spec(struct seq_file *m,
32684+ struct hist_trigger_data *hist_data,
32685+ struct action_data *data)
32686+{
32687+ unsigned int i;
32688+
32689+ seq_puts(m, ":onmax(");
32690+ seq_printf(m, "%s", data->onmax.var_str);
32691+ seq_printf(m, ").%s(", data->onmax.fn_name);
32692+
32693+ for (i = 0; i < hist_data->n_max_vars; i++) {
32694+ seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
32695+ if (i < hist_data->n_max_vars - 1)
32696+ seq_puts(m, ",");
32697+ }
32698+ seq_puts(m, ")");
32699+}
32700+
32701+static void print_onmatch_spec(struct seq_file *m,
32702+ struct hist_trigger_data *hist_data,
32703+ struct action_data *data)
32704+{
32705+ unsigned int i;
32706+
32707+ seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
32708+ data->onmatch.match_event);
32709+
32710+ seq_printf(m, "%s(", data->onmatch.synth_event->name);
32711+
32712+ for (i = 0; i < data->n_params; i++) {
32713+ if (i)
32714+ seq_puts(m, ",");
32715+ seq_printf(m, "%s", data->params[i]);
32716+ }
32717+
32718+ seq_puts(m, ")");
32719+}
32720+
32721+static bool actions_match(struct hist_trigger_data *hist_data,
32722+ struct hist_trigger_data *hist_data_test)
32723+{
32724+ unsigned int i, j;
32725+
32726+ if (hist_data->n_actions != hist_data_test->n_actions)
32727+ return false;
32728+
32729+ for (i = 0; i < hist_data->n_actions; i++) {
32730+ struct action_data *data = hist_data->actions[i];
32731+ struct action_data *data_test = hist_data_test->actions[i];
32732+
32733+ if (data->fn != data_test->fn)
32734+ return false;
32735+
32736+ if (data->n_params != data_test->n_params)
32737+ return false;
32738+
32739+ for (j = 0; j < data->n_params; j++) {
32740+ if (strcmp(data->params[j], data_test->params[j]) != 0)
32741+ return false;
32742+ }
32743+
32744+ if (data->fn == action_trace) {
32745+ if (strcmp(data->onmatch.synth_event_name,
32746+ data_test->onmatch.synth_event_name) != 0)
32747+ return false;
32748+ if (strcmp(data->onmatch.match_event_system,
32749+ data_test->onmatch.match_event_system) != 0)
32750+ return false;
32751+ if (strcmp(data->onmatch.match_event,
32752+ data_test->onmatch.match_event) != 0)
32753+ return false;
32754+ } else if (data->fn == onmax_save) {
32755+ if (strcmp(data->onmax.var_str,
32756+ data_test->onmax.var_str) != 0)
32757+ return false;
32758+ if (strcmp(data->onmax.fn_name,
32759+ data_test->onmax.fn_name) != 0)
32760+ return false;
32761+ }
32762+ }
32763+
32764+ return true;
32765+}
32766+
32767+
32768+static void print_actions_spec(struct seq_file *m,
32769+ struct hist_trigger_data *hist_data)
32770+{
32771+ unsigned int i;
32772+
32773+ for (i = 0; i < hist_data->n_actions; i++) {
32774+ struct action_data *data = hist_data->actions[i];
32775+
32776+ if (data->fn == action_trace)
32777+ print_onmatch_spec(m, hist_data, data);
32778+ else if (data->fn == onmax_save)
32779+ print_onmax_spec(m, hist_data, data);
32780+ }
32781+}
32782+
32783+static void destroy_field_var_hists(struct hist_trigger_data *hist_data)
32784+{
32785+ unsigned int i;
32786+
32787+ for (i = 0; i < hist_data->n_field_var_hists; i++) {
32788+ kfree(hist_data->field_var_hists[i]->cmd);
32789+ kfree(hist_data->field_var_hists[i]);
32790+ }
32791 }
32792
32793 static void destroy_hist_data(struct hist_trigger_data *hist_data)
32794 {
32795+ if (!hist_data)
32796+ return;
32797+
32798 destroy_hist_trigger_attrs(hist_data->attrs);
32799 destroy_hist_fields(hist_data);
32800 tracing_map_destroy(hist_data->map);
32801+
32802+ destroy_actions(hist_data);
32803+ destroy_field_vars(hist_data);
32804+ destroy_field_var_hists(hist_data);
32805+ destroy_synth_var_refs(hist_data);
32806+
32807 kfree(hist_data);
32808 }
32809
32810@@ -738,7 +4460,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
32811 struct tracing_map *map = hist_data->map;
32812 struct ftrace_event_field *field;
32813 struct hist_field *hist_field;
32814- int i, idx;
32815+ int i, idx = 0;
32816
32817 for_each_hist_field(i, hist_data) {
32818 hist_field = hist_data->fields[i];
32819@@ -749,6 +4471,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
32820
32821 if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
32822 cmp_fn = tracing_map_cmp_none;
32823+ else if (!field)
32824+ cmp_fn = tracing_map_cmp_num(hist_field->size,
32825+ hist_field->is_signed);
32826 else if (is_string_field(field))
32827 cmp_fn = tracing_map_cmp_string;
32828 else
32829@@ -757,36 +4482,29 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
32830 idx = tracing_map_add_key_field(map,
32831 hist_field->offset,
32832 cmp_fn);
32833-
32834- } else
32835+ } else if (!(hist_field->flags & HIST_FIELD_FL_VAR))
32836 idx = tracing_map_add_sum_field(map);
32837
32838 if (idx < 0)
32839 return idx;
32840- }
32841-
32842- return 0;
32843-}
32844-
32845-static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
32846-{
32847- struct hist_field *key_field;
32848- unsigned int i;
32849-
32850- for_each_hist_key_field(i, hist_data) {
32851- key_field = hist_data->fields[i];
32852
32853- if (key_field->flags & HIST_FIELD_FL_EXECNAME)
32854- return true;
32855+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
32856+ idx = tracing_map_add_var(map);
32857+ if (idx < 0)
32858+ return idx;
32859+ hist_field->var.idx = idx;
32860+ hist_field->var.hist_data = hist_data;
32861+ }
32862 }
32863
32864- return false;
32865+ return 0;
32866 }
32867
32868 static struct hist_trigger_data *
32869 create_hist_data(unsigned int map_bits,
32870 struct hist_trigger_attrs *attrs,
32871- struct trace_event_file *file)
32872+ struct trace_event_file *file,
32873+ bool remove)
32874 {
32875 const struct tracing_map_ops *map_ops = NULL;
32876 struct hist_trigger_data *hist_data;
32877@@ -797,6 +4515,12 @@ create_hist_data(unsigned int map_bits,
32878 return ERR_PTR(-ENOMEM);
32879
32880 hist_data->attrs = attrs;
32881+ hist_data->remove = remove;
32882+ hist_data->event_file = file;
32883+
32884+ ret = parse_actions(hist_data);
32885+ if (ret)
32886+ goto free;
32887
32888 ret = create_hist_fields(hist_data, file);
32889 if (ret)
32890@@ -806,8 +4530,7 @@ create_hist_data(unsigned int map_bits,
32891 if (ret)
32892 goto free;
32893
32894- if (need_tracing_map_ops(hist_data))
32895- map_ops = &hist_trigger_elt_comm_ops;
32896+ map_ops = &hist_trigger_elt_data_ops;
32897
32898 hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
32899 map_ops, hist_data);
32900@@ -820,12 +4543,6 @@ create_hist_data(unsigned int map_bits,
32901 ret = create_tracing_map_fields(hist_data);
32902 if (ret)
32903 goto free;
32904-
32905- ret = tracing_map_init(hist_data->map);
32906- if (ret)
32907- goto free;
32908-
32909- hist_data->event_file = file;
32910 out:
32911 return hist_data;
32912 free:
32913@@ -839,18 +4556,39 @@ create_hist_data(unsigned int map_bits,
32914 }
32915
32916 static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
32917- struct tracing_map_elt *elt,
32918- void *rec)
32919+ struct tracing_map_elt *elt, void *rec,
32920+ struct ring_buffer_event *rbe,
32921+ u64 *var_ref_vals)
32922 {
32923+ struct hist_elt_data *elt_data;
32924 struct hist_field *hist_field;
32925- unsigned int i;
32926+ unsigned int i, var_idx;
32927 u64 hist_val;
32928
32929+ elt_data = elt->private_data;
32930+ elt_data->var_ref_vals = var_ref_vals;
32931+
32932 for_each_hist_val_field(i, hist_data) {
32933 hist_field = hist_data->fields[i];
32934- hist_val = hist_field->fn(hist_field, rec);
32935+ hist_val = hist_field->fn(hist_field, elt, rbe, rec);
32936+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
32937+ var_idx = hist_field->var.idx;
32938+ tracing_map_set_var(elt, var_idx, hist_val);
32939+ continue;
32940+ }
32941 tracing_map_update_sum(elt, i, hist_val);
32942 }
32943+
32944+ for_each_hist_key_field(i, hist_data) {
32945+ hist_field = hist_data->fields[i];
32946+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
32947+ hist_val = hist_field->fn(hist_field, elt, rbe, rec);
32948+ var_idx = hist_field->var.idx;
32949+ tracing_map_set_var(elt, var_idx, hist_val);
32950+ }
32951+ }
32952+
32953+ update_field_vars(hist_data, elt, rbe, rec);
32954 }
32955
32956 static inline void add_to_key(char *compound_key, void *key,
32957@@ -877,15 +4615,31 @@ static inline void add_to_key(char *compound_key, void *key,
32958 memcpy(compound_key + key_field->offset, key, size);
32959 }
32960
32961-static void event_hist_trigger(struct event_trigger_data *data, void *rec)
32962+static void
32963+hist_trigger_actions(struct hist_trigger_data *hist_data,
32964+ struct tracing_map_elt *elt, void *rec,
32965+ struct ring_buffer_event *rbe, u64 *var_ref_vals)
32966+{
32967+ struct action_data *data;
32968+ unsigned int i;
32969+
32970+ for (i = 0; i < hist_data->n_actions; i++) {
32971+ data = hist_data->actions[i];
32972+ data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
32973+ }
32974+}
32975+
32976+static void event_hist_trigger(struct event_trigger_data *data, void *rec,
32977+ struct ring_buffer_event *rbe)
32978 {
32979 struct hist_trigger_data *hist_data = data->private_data;
32980 bool use_compound_key = (hist_data->n_keys > 1);
32981 unsigned long entries[HIST_STACKTRACE_DEPTH];
32982+ u64 var_ref_vals[TRACING_MAP_VARS_MAX];
32983 char compound_key[HIST_KEY_SIZE_MAX];
32984+ struct tracing_map_elt *elt = NULL;
32985 struct stack_trace stacktrace;
32986 struct hist_field *key_field;
32987- struct tracing_map_elt *elt;
32988 u64 field_contents;
32989 void *key = NULL;
32990 unsigned int i;
32991@@ -906,7 +4660,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
32992
32993 key = entries;
32994 } else {
32995- field_contents = key_field->fn(key_field, rec);
32996+ field_contents = key_field->fn(key_field, elt, rbe, rec);
32997 if (key_field->flags & HIST_FIELD_FL_STRING) {
32998 key = (void *)(unsigned long)field_contents;
32999 use_compound_key = true;
33000@@ -921,9 +4675,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
33001 if (use_compound_key)
33002 key = compound_key;
33003
33004+ if (hist_data->n_var_refs &&
33005+ !resolve_var_refs(hist_data, key, var_ref_vals, false))
33006+ return;
33007+
33008 elt = tracing_map_insert(hist_data->map, key);
33009- if (elt)
33010- hist_trigger_elt_update(hist_data, elt, rec);
33011+ if (!elt)
33012+ return;
33013+
33014+ hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
33015+
33016+ if (resolve_var_refs(hist_data, key, var_ref_vals, true))
33017+ hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
33018 }
33019
33020 static void hist_trigger_stacktrace_print(struct seq_file *m,
33021@@ -952,6 +4715,7 @@ hist_trigger_entry_print(struct seq_file *m,
33022 struct hist_field *key_field;
33023 char str[KSYM_SYMBOL_LEN];
33024 bool multiline = false;
33025+ const char *field_name;
33026 unsigned int i;
33027 u64 uval;
33028
33029@@ -963,26 +4727,33 @@ hist_trigger_entry_print(struct seq_file *m,
33030 if (i > hist_data->n_vals)
33031 seq_puts(m, ", ");
33032
33033+ field_name = hist_field_name(key_field, 0);
33034+
33035 if (key_field->flags & HIST_FIELD_FL_HEX) {
33036 uval = *(u64 *)(key + key_field->offset);
33037- seq_printf(m, "%s: %llx",
33038- key_field->field->name, uval);
33039+ seq_printf(m, "%s: %llx", field_name, uval);
33040 } else if (key_field->flags & HIST_FIELD_FL_SYM) {
33041 uval = *(u64 *)(key + key_field->offset);
33042 sprint_symbol_no_offset(str, uval);
33043- seq_printf(m, "%s: [%llx] %-45s",
33044- key_field->field->name, uval, str);
33045+ seq_printf(m, "%s: [%llx] %-45s", field_name,
33046+ uval, str);
33047 } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
33048 uval = *(u64 *)(key + key_field->offset);
33049 sprint_symbol(str, uval);
33050- seq_printf(m, "%s: [%llx] %-55s",
33051- key_field->field->name, uval, str);
33052+ seq_printf(m, "%s: [%llx] %-55s", field_name,
33053+ uval, str);
33054 } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
33055- char *comm = elt->private_data;
33056+ struct hist_elt_data *elt_data = elt->private_data;
33057+ char *comm;
33058+
33059+ if (WARN_ON_ONCE(!elt_data))
33060+ return;
33061+
33062+ comm = elt_data->comm;
33063
33064 uval = *(u64 *)(key + key_field->offset);
33065- seq_printf(m, "%s: %-16s[%10llu]",
33066- key_field->field->name, comm, uval);
33067+ seq_printf(m, "%s: %-16s[%10llu]", field_name,
33068+ comm, uval);
33069 } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
33070 const char *syscall_name;
33071
33072@@ -991,8 +4762,8 @@ hist_trigger_entry_print(struct seq_file *m,
33073 if (!syscall_name)
33074 syscall_name = "unknown_syscall";
33075
33076- seq_printf(m, "%s: %-30s[%3llu]",
33077- key_field->field->name, syscall_name, uval);
33078+ seq_printf(m, "%s: %-30s[%3llu]", field_name,
33079+ syscall_name, uval);
33080 } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
33081 seq_puts(m, "stacktrace:\n");
33082 hist_trigger_stacktrace_print(m,
33083@@ -1000,15 +4771,14 @@ hist_trigger_entry_print(struct seq_file *m,
33084 HIST_STACKTRACE_DEPTH);
33085 multiline = true;
33086 } else if (key_field->flags & HIST_FIELD_FL_LOG2) {
33087- seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
33088+ seq_printf(m, "%s: ~ 2^%-2llu", field_name,
33089 *(u64 *)(key + key_field->offset));
33090 } else if (key_field->flags & HIST_FIELD_FL_STRING) {
33091- seq_printf(m, "%s: %-50s", key_field->field->name,
33092+ seq_printf(m, "%s: %-50s", field_name,
33093 (char *)(key + key_field->offset));
33094 } else {
33095 uval = *(u64 *)(key + key_field->offset);
33096- seq_printf(m, "%s: %10llu", key_field->field->name,
33097- uval);
33098+ seq_printf(m, "%s: %10llu", field_name, uval);
33099 }
33100 }
33101
33102@@ -1021,17 +4791,23 @@ hist_trigger_entry_print(struct seq_file *m,
33103 tracing_map_read_sum(elt, HITCOUNT_IDX));
33104
33105 for (i = 1; i < hist_data->n_vals; i++) {
33106+ field_name = hist_field_name(hist_data->fields[i], 0);
33107+
33108+ if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
33109+ hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
33110+ continue;
33111+
33112 if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
33113- seq_printf(m, " %s: %10llx",
33114- hist_data->fields[i]->field->name,
33115+ seq_printf(m, " %s: %10llx", field_name,
33116 tracing_map_read_sum(elt, i));
33117 } else {
33118- seq_printf(m, " %s: %10llu",
33119- hist_data->fields[i]->field->name,
33120+ seq_printf(m, " %s: %10llu", field_name,
33121 tracing_map_read_sum(elt, i));
33122 }
33123 }
33124
33125+ print_actions(m, hist_data, elt);
33126+
33127 seq_puts(m, "\n");
33128 }
33129
33130@@ -1102,6 +4878,11 @@ static int hist_show(struct seq_file *m, void *v)
33131 hist_trigger_show(m, data, n++);
33132 }
33133
33134+ if (have_hist_err()) {
33135+ seq_printf(m, "\nERROR: %s\n", hist_err_str);
33136+ seq_printf(m, " Last command: %s\n", last_hist_cmd);
33137+ }
33138+
33139 out_unlock:
33140 mutex_unlock(&event_mutex);
33141
33142@@ -1120,34 +4901,31 @@ const struct file_operations event_hist_fops = {
33143 .release = single_release,
33144 };
33145
33146-static const char *get_hist_field_flags(struct hist_field *hist_field)
33147+static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
33148 {
33149- const char *flags_str = NULL;
33150+ const char *field_name = hist_field_name(hist_field, 0);
33151
33152- if (hist_field->flags & HIST_FIELD_FL_HEX)
33153- flags_str = "hex";
33154- else if (hist_field->flags & HIST_FIELD_FL_SYM)
33155- flags_str = "sym";
33156- else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
33157- flags_str = "sym-offset";
33158- else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
33159- flags_str = "execname";
33160- else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
33161- flags_str = "syscall";
33162- else if (hist_field->flags & HIST_FIELD_FL_LOG2)
33163- flags_str = "log2";
33164+ if (hist_field->var.name)
33165+ seq_printf(m, "%s=", hist_field->var.name);
33166
33167- return flags_str;
33168-}
33169+ if (hist_field->flags & HIST_FIELD_FL_CPU)
33170+ seq_puts(m, "cpu");
33171+ else if (field_name) {
33172+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
33173+ hist_field->flags & HIST_FIELD_FL_ALIAS)
33174+ seq_putc(m, '$');
33175+ seq_printf(m, "%s", field_name);
33176+ } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
33177+ seq_puts(m, "common_timestamp");
33178
33179-static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
33180-{
33181- seq_printf(m, "%s", hist_field->field->name);
33182 if (hist_field->flags) {
33183- const char *flags_str = get_hist_field_flags(hist_field);
33184+ if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
33185+ !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
33186+ const char *flags = get_hist_field_flags(hist_field);
33187
33188- if (flags_str)
33189- seq_printf(m, ".%s", flags_str);
33190+ if (flags)
33191+ seq_printf(m, ".%s", flags);
33192+ }
33193 }
33194 }
33195
33196@@ -1156,7 +4934,8 @@ static int event_hist_trigger_print(struct seq_file *m,
33197 struct event_trigger_data *data)
33198 {
33199 struct hist_trigger_data *hist_data = data->private_data;
33200- struct hist_field *key_field;
33201+ struct hist_field *field;
33202+ bool have_var = false;
33203 unsigned int i;
33204
33205 seq_puts(m, "hist:");
33206@@ -1167,25 +4946,47 @@ static int event_hist_trigger_print(struct seq_file *m,
33207 seq_puts(m, "keys=");
33208
33209 for_each_hist_key_field(i, hist_data) {
33210- key_field = hist_data->fields[i];
33211+ field = hist_data->fields[i];
33212
33213 if (i > hist_data->n_vals)
33214 seq_puts(m, ",");
33215
33216- if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
33217+ if (field->flags & HIST_FIELD_FL_STACKTRACE)
33218 seq_puts(m, "stacktrace");
33219 else
33220- hist_field_print(m, key_field);
33221+ hist_field_print(m, field);
33222 }
33223
33224 seq_puts(m, ":vals=");
33225
33226 for_each_hist_val_field(i, hist_data) {
33227+ field = hist_data->fields[i];
33228+ if (field->flags & HIST_FIELD_FL_VAR) {
33229+ have_var = true;
33230+ continue;
33231+ }
33232+
33233 if (i == HITCOUNT_IDX)
33234 seq_puts(m, "hitcount");
33235 else {
33236 seq_puts(m, ",");
33237- hist_field_print(m, hist_data->fields[i]);
33238+ hist_field_print(m, field);
33239+ }
33240+ }
33241+
33242+ if (have_var) {
33243+ unsigned int n = 0;
33244+
33245+ seq_puts(m, ":");
33246+
33247+ for_each_hist_val_field(i, hist_data) {
33248+ field = hist_data->fields[i];
33249+
33250+ if (field->flags & HIST_FIELD_FL_VAR) {
33251+ if (n++)
33252+ seq_puts(m, ",");
33253+ hist_field_print(m, field);
33254+ }
33255 }
33256 }
33257
33258@@ -1193,28 +4994,36 @@ static int event_hist_trigger_print(struct seq_file *m,
33259
33260 for (i = 0; i < hist_data->n_sort_keys; i++) {
33261 struct tracing_map_sort_key *sort_key;
33262+ unsigned int idx, first_key_idx;
33263+
33264+ /* skip VAR vals */
33265+ first_key_idx = hist_data->n_vals - hist_data->n_vars;
33266
33267 sort_key = &hist_data->sort_keys[i];
33268+ idx = sort_key->field_idx;
33269+
33270+ if (WARN_ON(idx >= HIST_FIELDS_MAX))
33271+ return -EINVAL;
33272
33273 if (i > 0)
33274 seq_puts(m, ",");
33275
33276- if (sort_key->field_idx == HITCOUNT_IDX)
33277+ if (idx == HITCOUNT_IDX)
33278 seq_puts(m, "hitcount");
33279 else {
33280- unsigned int idx = sort_key->field_idx;
33281-
33282- if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
33283- return -EINVAL;
33284-
33285+ if (idx >= first_key_idx)
33286+ idx += hist_data->n_vars;
33287 hist_field_print(m, hist_data->fields[idx]);
33288 }
33289
33290 if (sort_key->descending)
33291 seq_puts(m, ".descending");
33292 }
33293-
33294 seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
33295+ if (hist_data->enable_timestamps)
33296+ seq_printf(m, ":clock=%s", hist_data->attrs->clock);
33297+
33298+ print_actions_spec(m, hist_data);
33299
33300 if (data->filter_str)
33301 seq_printf(m, " if %s", data->filter_str);
33302@@ -1242,6 +5051,21 @@ static int event_hist_trigger_init(struct event_trigger_ops *ops,
33303 return 0;
33304 }
33305
33306+static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
33307+{
33308+ struct trace_event_file *file;
33309+ unsigned int i;
33310+ char *cmd;
33311+ int ret;
33312+
33313+ for (i = 0; i < hist_data->n_field_var_hists; i++) {
33314+ file = hist_data->field_var_hists[i]->hist_data->event_file;
33315+ cmd = hist_data->field_var_hists[i]->cmd;
33316+ ret = event_hist_trigger_func(&trigger_hist_cmd, file,
33317+ "!hist", "hist", cmd);
33318+ }
33319+}
33320+
33321 static void event_hist_trigger_free(struct event_trigger_ops *ops,
33322 struct event_trigger_data *data)
33323 {
33324@@ -1254,7 +5078,13 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
33325 if (!data->ref) {
33326 if (data->name)
33327 del_named_trigger(data);
33328+
33329 trigger_data_free(data);
33330+
33331+ remove_hist_vars(hist_data);
33332+
33333+ unregister_field_var_hists(hist_data);
33334+
33335 destroy_hist_data(hist_data);
33336 }
33337 }
33338@@ -1381,6 +5211,15 @@ static bool hist_trigger_match(struct event_trigger_data *data,
33339 return false;
33340 if (key_field->offset != key_field_test->offset)
33341 return false;
33342+ if (key_field->size != key_field_test->size)
33343+ return false;
33344+ if (key_field->is_signed != key_field_test->is_signed)
33345+ return false;
33346+ if (!!key_field->var.name != !!key_field_test->var.name)
33347+ return false;
33348+ if (key_field->var.name &&
33349+ strcmp(key_field->var.name, key_field_test->var.name) != 0)
33350+ return false;
33351 }
33352
33353 for (i = 0; i < hist_data->n_sort_keys; i++) {
33354@@ -1396,6 +5235,9 @@ static bool hist_trigger_match(struct event_trigger_data *data,
33355 (strcmp(data->filter_str, data_test->filter_str) != 0))
33356 return false;
33357
33358+ if (!actions_match(hist_data, hist_data_test))
33359+ return false;
33360+
33361 return true;
33362 }
33363
33364@@ -1412,6 +5254,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
33365 if (named_data) {
33366 if (!hist_trigger_match(data, named_data, named_data,
33367 true)) {
33368+ hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name);
33369 ret = -EINVAL;
33370 goto out;
33371 }
33372@@ -1431,13 +5274,16 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
33373 test->paused = false;
33374 else if (hist_data->attrs->clear)
33375 hist_clear(test);
33376- else
33377+ else {
33378+ hist_err("Hist trigger already exists", NULL);
33379 ret = -EEXIST;
33380+ }
33381 goto out;
33382 }
33383 }
33384 new:
33385 if (hist_data->attrs->cont || hist_data->attrs->clear) {
33386+ hist_err("Can't clear or continue a nonexistent hist trigger", NULL);
33387 ret = -ENOENT;
33388 goto out;
33389 }
33390@@ -1446,7 +5292,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
33391 data->paused = true;
33392
33393 if (named_data) {
33394- destroy_hist_data(data->private_data);
33395 data->private_data = named_data->private_data;
33396 set_named_trigger_data(data, named_data);
33397 data->ops = &event_hist_trigger_named_ops;
33398@@ -1458,8 +5303,32 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
33399 goto out;
33400 }
33401
33402- list_add_rcu(&data->list, &file->triggers);
33403+ if (hist_data->enable_timestamps) {
33404+ char *clock = hist_data->attrs->clock;
33405+
33406+ ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
33407+ if (ret) {
33408+ hist_err("Couldn't set trace_clock: ", clock);
33409+ goto out;
33410+ }
33411+
33412+ tracing_set_time_stamp_abs(file->tr, true);
33413+ }
33414+
33415+ if (named_data)
33416+ destroy_hist_data(hist_data);
33417+
33418 ret++;
33419+ out:
33420+ return ret;
33421+}
33422+
33423+static int hist_trigger_enable(struct event_trigger_data *data,
33424+ struct trace_event_file *file)
33425+{
33426+ int ret = 0;
33427+
33428+ list_add_tail_rcu(&data->list, &file->triggers);
33429
33430 update_cond_flag(file);
33431
33432@@ -1468,10 +5337,55 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
33433 update_cond_flag(file);
33434 ret--;
33435 }
33436- out:
33437+
33438 return ret;
33439 }
33440
33441+static bool have_hist_trigger_match(struct event_trigger_data *data,
33442+ struct trace_event_file *file)
33443+{
33444+ struct hist_trigger_data *hist_data = data->private_data;
33445+ struct event_trigger_data *test, *named_data = NULL;
33446+ bool match = false;
33447+
33448+ if (hist_data->attrs->name)
33449+ named_data = find_named_trigger(hist_data->attrs->name);
33450+
33451+ list_for_each_entry_rcu(test, &file->triggers, list) {
33452+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33453+ if (hist_trigger_match(data, test, named_data, false)) {
33454+ match = true;
33455+ break;
33456+ }
33457+ }
33458+ }
33459+
33460+ return match;
33461+}
33462+
33463+static bool hist_trigger_check_refs(struct event_trigger_data *data,
33464+ struct trace_event_file *file)
33465+{
33466+ struct hist_trigger_data *hist_data = data->private_data;
33467+ struct event_trigger_data *test, *named_data = NULL;
33468+
33469+ if (hist_data->attrs->name)
33470+ named_data = find_named_trigger(hist_data->attrs->name);
33471+
33472+ list_for_each_entry_rcu(test, &file->triggers, list) {
33473+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33474+ if (!hist_trigger_match(data, test, named_data, false))
33475+ continue;
33476+ hist_data = test->private_data;
33477+ if (check_var_refs(hist_data))
33478+ return true;
33479+ break;
33480+ }
33481+ }
33482+
33483+ return false;
33484+}
33485+
33486 static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
33487 struct event_trigger_data *data,
33488 struct trace_event_file *file)
33489@@ -1497,17 +5411,55 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
33490
33491 if (unregistered && test->ops->free)
33492 test->ops->free(test->ops, test);
33493+
33494+ if (hist_data->enable_timestamps) {
33495+ if (!hist_data->remove || unregistered)
33496+ tracing_set_time_stamp_abs(file->tr, false);
33497+ }
33498+}
33499+
33500+static bool hist_file_check_refs(struct trace_event_file *file)
33501+{
33502+ struct hist_trigger_data *hist_data;
33503+ struct event_trigger_data *test;
33504+
33505+ list_for_each_entry_rcu(test, &file->triggers, list) {
33506+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33507+ hist_data = test->private_data;
33508+ if (check_var_refs(hist_data))
33509+ return true;
33510+ }
33511+ }
33512+
33513+ return false;
33514 }
33515
33516 static void hist_unreg_all(struct trace_event_file *file)
33517 {
33518 struct event_trigger_data *test, *n;
33519+ struct hist_trigger_data *hist_data;
33520+ struct synth_event *se;
33521+ const char *se_name;
33522+
33523+ if (hist_file_check_refs(file))
33524+ return;
33525
33526 list_for_each_entry_safe(test, n, &file->triggers, list) {
33527 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33528+ hist_data = test->private_data;
33529 list_del_rcu(&test->list);
33530 trace_event_trigger_enable_disable(file, 0);
33531+
33532+ mutex_lock(&synth_event_mutex);
33533+ se_name = trace_event_name(file->event_call);
33534+ se = find_synth_event(se_name);
33535+ if (se)
33536+ se->ref--;
33537+ mutex_unlock(&synth_event_mutex);
33538+
33539 update_cond_flag(file);
33540+ if (hist_data->enable_timestamps)
33541+ tracing_set_time_stamp_abs(file->tr, false);
33542 if (test->ops->free)
33543 test->ops->free(test->ops, test);
33544 }
33545@@ -1523,16 +5475,54 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
33546 struct hist_trigger_attrs *attrs;
33547 struct event_trigger_ops *trigger_ops;
33548 struct hist_trigger_data *hist_data;
33549- char *trigger;
33550+ struct synth_event *se;
33551+ const char *se_name;
33552+ bool remove = false;
33553+ char *trigger, *p;
33554 int ret = 0;
33555
33556+ if (glob && strlen(glob)) {
33557+ last_cmd_set(param);
33558+ hist_err_clear();
33559+ }
33560+
33561 if (!param)
33562 return -EINVAL;
33563
33564- /* separate the trigger from the filter (k:v [if filter]) */
33565- trigger = strsep(&param, " \t");
33566- if (!trigger)
33567- return -EINVAL;
33568+ if (glob[0] == '!')
33569+ remove = true;
33570+
33571+ /*
33572+ * separate the trigger from the filter (k:v [if filter])
33573+ * allowing for whitespace in the trigger
33574+ */
33575+ p = trigger = param;
33576+ do {
33577+ p = strstr(p, "if");
33578+ if (!p)
33579+ break;
33580+ if (p == param)
33581+ return -EINVAL;
33582+ if (*(p - 1) != ' ' && *(p - 1) != '\t') {
33583+ p++;
33584+ continue;
33585+ }
33586+ if (p >= param + strlen(param) - strlen("if") - 1)
33587+ return -EINVAL;
33588+ if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
33589+ p++;
33590+ continue;
33591+ }
33592+ break;
33593+ } while (p);
33594+
33595+ if (!p)
33596+ param = NULL;
33597+ else {
33598+ *(p - 1) = '\0';
33599+ param = strstrip(p);
33600+ trigger = strstrip(trigger);
33601+ }
33602
33603 attrs = parse_hist_trigger_attrs(trigger);
33604 if (IS_ERR(attrs))
33605@@ -1541,7 +5531,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
33606 if (attrs->map_bits)
33607 hist_trigger_bits = attrs->map_bits;
33608
33609- hist_data = create_hist_data(hist_trigger_bits, attrs, file);
33610+ hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove);
33611 if (IS_ERR(hist_data)) {
33612 destroy_hist_trigger_attrs(attrs);
33613 return PTR_ERR(hist_data);
33614@@ -1549,10 +5539,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
33615
33616 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
33617
33618- ret = -ENOMEM;
33619 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
33620- if (!trigger_data)
33621+ if (!trigger_data) {
33622+ ret = -ENOMEM;
33623 goto out_free;
33624+ }
33625
33626 trigger_data->count = -1;
33627 trigger_data->ops = trigger_ops;
33628@@ -1570,8 +5561,24 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
33629 goto out_free;
33630 }
33631
33632- if (glob[0] == '!') {
33633+ if (remove) {
33634+ if (!have_hist_trigger_match(trigger_data, file))
33635+ goto out_free;
33636+
33637+ if (hist_trigger_check_refs(trigger_data, file)) {
33638+ ret = -EBUSY;
33639+ goto out_free;
33640+ }
33641+
33642 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
33643+
33644+ mutex_lock(&synth_event_mutex);
33645+ se_name = trace_event_name(file->event_call);
33646+ se = find_synth_event(se_name);
33647+ if (se)
33648+ se->ref--;
33649+ mutex_unlock(&synth_event_mutex);
33650+
33651 ret = 0;
33652 goto out_free;
33653 }
33654@@ -1588,14 +5595,47 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
33655 goto out_free;
33656 } else if (ret < 0)
33657 goto out_free;
33658+
33659+ if (get_named_trigger_data(trigger_data))
33660+ goto enable;
33661+
33662+ if (has_hist_vars(hist_data))
33663+ save_hist_vars(hist_data);
33664+
33665+ ret = create_actions(hist_data, file);
33666+ if (ret)
33667+ goto out_unreg;
33668+
33669+ ret = tracing_map_init(hist_data->map);
33670+ if (ret)
33671+ goto out_unreg;
33672+enable:
33673+ ret = hist_trigger_enable(trigger_data, file);
33674+ if (ret)
33675+ goto out_unreg;
33676+
33677+ mutex_lock(&synth_event_mutex);
33678+ se_name = trace_event_name(file->event_call);
33679+ se = find_synth_event(se_name);
33680+ if (se)
33681+ se->ref++;
33682+ mutex_unlock(&synth_event_mutex);
33683+
33684 /* Just return zero, not the number of registered triggers */
33685 ret = 0;
33686 out:
33687+ if (ret == 0)
33688+ hist_err_clear();
33689+
33690 return ret;
33691+ out_unreg:
33692+ cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
33693 out_free:
33694 if (cmd_ops->set_filter)
33695 cmd_ops->set_filter(NULL, trigger_data, NULL);
33696
33697+ remove_hist_vars(hist_data);
33698+
33699 kfree(trigger_data);
33700
33701 destroy_hist_data(hist_data);
33702@@ -1625,7 +5665,8 @@ __init int register_trigger_hist_cmd(void)
33703 }
33704
33705 static void
33706-hist_enable_trigger(struct event_trigger_data *data, void *rec)
33707+hist_enable_trigger(struct event_trigger_data *data, void *rec,
33708+ struct ring_buffer_event *event)
33709 {
33710 struct enable_trigger_data *enable_data = data->private_data;
33711 struct event_trigger_data *test;
33712@@ -1641,7 +5682,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec)
33713 }
33714
33715 static void
33716-hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
33717+hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
33718+ struct ring_buffer_event *event)
33719 {
33720 if (!data->count)
33721 return;
33722@@ -1649,7 +5691,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
33723 if (data->count != -1)
33724 (data->count)--;
33725
33726- hist_enable_trigger(data, rec);
33727+ hist_enable_trigger(data, rec, event);
33728 }
33729
33730 static struct event_trigger_ops hist_enable_trigger_ops = {
33731@@ -1754,3 +5796,31 @@ __init int register_trigger_hist_enable_disable_cmds(void)
33732
33733 return ret;
33734 }
33735+
33736+static __init int trace_events_hist_init(void)
33737+{
33738+ struct dentry *entry = NULL;
33739+ struct dentry *d_tracer;
33740+ int err = 0;
33741+
33742+ d_tracer = tracing_init_dentry();
33743+ if (IS_ERR(d_tracer)) {
33744+ err = PTR_ERR(d_tracer);
33745+ goto err;
33746+ }
33747+
33748+ entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
33749+ NULL, &synth_events_fops);
33750+ if (!entry) {
33751+ err = -ENODEV;
33752+ goto err;
33753+ }
33754+
33755+ return err;
33756+ err:
33757+ pr_warn("Could not create tracefs 'synthetic_events' entry\n");
33758+
33759+ return err;
33760+}
33761+
33762+fs_initcall(trace_events_hist_init);
33763diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
33764index 43254c5e7e16..24d42350d738 100644
33765--- a/kernel/trace/trace_events_trigger.c
33766+++ b/kernel/trace/trace_events_trigger.c
33767@@ -63,7 +63,8 @@ void trigger_data_free(struct event_trigger_data *data)
33768 * any trigger that should be deferred, ETT_NONE if nothing to defer.
33769 */
33770 enum event_trigger_type
33771-event_triggers_call(struct trace_event_file *file, void *rec)
33772+event_triggers_call(struct trace_event_file *file, void *rec,
33773+ struct ring_buffer_event *event)
33774 {
33775 struct event_trigger_data *data;
33776 enum event_trigger_type tt = ETT_NONE;
33777@@ -76,7 +77,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
33778 if (data->paused)
33779 continue;
33780 if (!rec) {
33781- data->ops->func(data, rec);
33782+ data->ops->func(data, rec, event);
33783 continue;
33784 }
33785 filter = rcu_dereference_sched(data->filter);
33786@@ -86,7 +87,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
33787 tt |= data->cmd_ops->trigger_type;
33788 continue;
33789 }
33790- data->ops->func(data, rec);
33791+ data->ops->func(data, rec, event);
33792 }
33793 return tt;
33794 }
33795@@ -108,7 +109,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
33796 void
33797 event_triggers_post_call(struct trace_event_file *file,
33798 enum event_trigger_type tt,
33799- void *rec)
33800+ void *rec, struct ring_buffer_event *event)
33801 {
33802 struct event_trigger_data *data;
33803
33804@@ -116,7 +117,7 @@ event_triggers_post_call(struct trace_event_file *file,
33805 if (data->paused)
33806 continue;
33807 if (data->cmd_ops->trigger_type & tt)
33808- data->ops->func(data, rec);
33809+ data->ops->func(data, rec, event);
33810 }
33811 }
33812 EXPORT_SYMBOL_GPL(event_triggers_post_call);
33813@@ -914,8 +915,15 @@ void set_named_trigger_data(struct event_trigger_data *data,
33814 data->named_data = named_data;
33815 }
33816
33817+struct event_trigger_data *
33818+get_named_trigger_data(struct event_trigger_data *data)
33819+{
33820+ return data->named_data;
33821+}
33822+
33823 static void
33824-traceon_trigger(struct event_trigger_data *data, void *rec)
33825+traceon_trigger(struct event_trigger_data *data, void *rec,
33826+ struct ring_buffer_event *event)
33827 {
33828 if (tracing_is_on())
33829 return;
33830@@ -924,7 +932,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec)
33831 }
33832
33833 static void
33834-traceon_count_trigger(struct event_trigger_data *data, void *rec)
33835+traceon_count_trigger(struct event_trigger_data *data, void *rec,
33836+ struct ring_buffer_event *event)
33837 {
33838 if (tracing_is_on())
33839 return;
33840@@ -939,7 +948,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec)
33841 }
33842
33843 static void
33844-traceoff_trigger(struct event_trigger_data *data, void *rec)
33845+traceoff_trigger(struct event_trigger_data *data, void *rec,
33846+ struct ring_buffer_event *event)
33847 {
33848 if (!tracing_is_on())
33849 return;
33850@@ -948,7 +958,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec)
33851 }
33852
33853 static void
33854-traceoff_count_trigger(struct event_trigger_data *data, void *rec)
33855+traceoff_count_trigger(struct event_trigger_data *data, void *rec,
33856+ struct ring_buffer_event *event)
33857 {
33858 if (!tracing_is_on())
33859 return;
33860@@ -1045,7 +1056,8 @@ static struct event_command trigger_traceoff_cmd = {
33861
33862 #ifdef CONFIG_TRACER_SNAPSHOT
33863 static void
33864-snapshot_trigger(struct event_trigger_data *data, void *rec)
33865+snapshot_trigger(struct event_trigger_data *data, void *rec,
33866+ struct ring_buffer_event *event)
33867 {
33868 struct trace_event_file *file = data->private_data;
33869
33870@@ -1056,7 +1068,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec)
33871 }
33872
33873 static void
33874-snapshot_count_trigger(struct event_trigger_data *data, void *rec)
33875+snapshot_count_trigger(struct event_trigger_data *data, void *rec,
33876+ struct ring_buffer_event *event)
33877 {
33878 if (!data->count)
33879 return;
33880@@ -1064,7 +1077,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec)
33881 if (data->count != -1)
33882 (data->count)--;
33883
33884- snapshot_trigger(data, rec);
33885+ snapshot_trigger(data, rec, event);
33886 }
33887
33888 static int
33889@@ -1143,13 +1156,15 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
33890 #define STACK_SKIP 3
33891
33892 static void
33893-stacktrace_trigger(struct event_trigger_data *data, void *rec)
33894+stacktrace_trigger(struct event_trigger_data *data, void *rec,
33895+ struct ring_buffer_event *event)
33896 {
33897 trace_dump_stack(STACK_SKIP);
33898 }
33899
33900 static void
33901-stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
33902+stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
33903+ struct ring_buffer_event *event)
33904 {
33905 if (!data->count)
33906 return;
33907@@ -1157,7 +1172,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
33908 if (data->count != -1)
33909 (data->count)--;
33910
33911- stacktrace_trigger(data, rec);
33912+ stacktrace_trigger(data, rec, event);
33913 }
33914
33915 static int
33916@@ -1219,7 +1234,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
33917 }
33918
33919 static void
33920-event_enable_trigger(struct event_trigger_data *data, void *rec)
33921+event_enable_trigger(struct event_trigger_data *data, void *rec,
33922+ struct ring_buffer_event *event)
33923 {
33924 struct enable_trigger_data *enable_data = data->private_data;
33925
33926@@ -1230,7 +1246,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec)
33927 }
33928
33929 static void
33930-event_enable_count_trigger(struct event_trigger_data *data, void *rec)
33931+event_enable_count_trigger(struct event_trigger_data *data, void *rec,
33932+ struct ring_buffer_event *event)
33933 {
33934 struct enable_trigger_data *enable_data = data->private_data;
33935
33936@@ -1244,7 +1261,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec)
33937 if (data->count != -1)
33938 (data->count)--;
33939
33940- event_enable_trigger(data, rec);
33941+ event_enable_trigger(data, rec, event);
33942 }
33943
33944 int event_enable_trigger_print(struct seq_file *m,
33945diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
33946index d7c8e4ec3d9d..518c61a1bceb 100644
33947--- a/kernel/trace/trace_hwlat.c
33948+++ b/kernel/trace/trace_hwlat.c
33949@@ -279,7 +279,7 @@ static void move_to_next_cpu(void)
33950 * of this thread, than stop migrating for the duration
33951 * of the current test.
33952 */
33953- if (!cpumask_equal(current_mask, &current->cpus_allowed))
33954+ if (!cpumask_equal(current_mask, current->cpus_ptr))
33955 goto disable;
33956
33957 get_online_cpus();
33958diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
33959index ea20274a105a..3c40d4174052 100644
33960--- a/kernel/trace/trace_kprobe.c
33961+++ b/kernel/trace/trace_kprobe.c
33962@@ -918,8 +918,8 @@ static int probes_open(struct inode *inode, struct file *file)
33963 static ssize_t probes_write(struct file *file, const char __user *buffer,
33964 size_t count, loff_t *ppos)
33965 {
33966- return traceprobe_probes_write(file, buffer, count, ppos,
33967- create_trace_kprobe);
33968+ return trace_parse_run_command(file, buffer, count, ppos,
33969+ create_trace_kprobe);
33970 }
33971
33972 static const struct file_operations kprobe_events_ops = {
33973@@ -1444,9 +1444,9 @@ static __init int kprobe_trace_self_tests_init(void)
33974
33975 pr_info("Testing kprobe tracing: ");
33976
33977- ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
33978- "$stack $stack0 +0($stack)",
33979- create_trace_kprobe);
33980+ ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
33981+ "$stack $stack0 +0($stack)",
33982+ create_trace_kprobe);
33983 if (WARN_ON_ONCE(ret)) {
33984 pr_warn("error on probing function entry.\n");
33985 warn++;
33986@@ -1466,8 +1466,8 @@ static __init int kprobe_trace_self_tests_init(void)
33987 }
33988 }
33989
33990- ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
33991- "$retval", create_trace_kprobe);
33992+ ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
33993+ "$retval", create_trace_kprobe);
33994 if (WARN_ON_ONCE(ret)) {
33995 pr_warn("error on probing function return.\n");
33996 warn++;
33997@@ -1537,13 +1537,13 @@ static __init int kprobe_trace_self_tests_init(void)
33998 disable_trace_kprobe(tk, file);
33999 }
34000
34001- ret = traceprobe_command("-:testprobe", create_trace_kprobe);
34002+ ret = trace_run_command("-:testprobe", create_trace_kprobe);
34003 if (WARN_ON_ONCE(ret)) {
34004 pr_warn("error on deleting a probe.\n");
34005 warn++;
34006 }
34007
34008- ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
34009+ ret = trace_run_command("-:testprobe2", create_trace_kprobe);
34010 if (WARN_ON_ONCE(ret)) {
34011 pr_warn("error on deleting a probe.\n");
34012 warn++;
34013diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
34014index 4500b00e4e36..74a4bfc2c6b7 100644
34015--- a/kernel/trace/trace_output.c
34016+++ b/kernel/trace/trace_output.c
34017@@ -447,6 +447,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
34018 {
34019 char hardsoft_irq;
34020 char need_resched;
34021+ char need_resched_lazy;
34022 char irqs_off;
34023 int hardirq;
34024 int softirq;
34025@@ -477,6 +478,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
34026 break;
34027 }
34028
34029+ need_resched_lazy =
34030+ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
34031+
34032 hardsoft_irq =
34033 (nmi && hardirq) ? 'Z' :
34034 nmi ? 'z' :
34035@@ -485,14 +489,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
34036 softirq ? 's' :
34037 '.' ;
34038
34039- trace_seq_printf(s, "%c%c%c",
34040- irqs_off, need_resched, hardsoft_irq);
34041+ trace_seq_printf(s, "%c%c%c%c",
34042+ irqs_off, need_resched, need_resched_lazy,
34043+ hardsoft_irq);
34044
34045 if (entry->preempt_count)
34046 trace_seq_printf(s, "%x", entry->preempt_count);
34047 else
34048 trace_seq_putc(s, '.');
34049
34050+ if (entry->preempt_lazy_count)
34051+ trace_seq_printf(s, "%x", entry->preempt_lazy_count);
34052+ else
34053+ trace_seq_putc(s, '.');
34054+
34055+ if (entry->migrate_disable)
34056+ trace_seq_printf(s, "%x", entry->migrate_disable);
34057+ else
34058+ trace_seq_putc(s, '.');
34059+
34060 return !trace_seq_has_overflowed(s);
34061 }
34062
34063diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
34064index fe4513330412..daf54bda4dc8 100644
34065--- a/kernel/trace/trace_probe.c
34066+++ b/kernel/trace/trace_probe.c
34067@@ -621,92 +621,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
34068 kfree(arg->comm);
34069 }
34070
34071-int traceprobe_command(const char *buf, int (*createfn)(int, char **))
34072-{
34073- char **argv;
34074- int argc, ret;
34075-
34076- argc = 0;
34077- ret = 0;
34078- argv = argv_split(GFP_KERNEL, buf, &argc);
34079- if (!argv)
34080- return -ENOMEM;
34081-
34082- if (argc)
34083- ret = createfn(argc, argv);
34084-
34085- argv_free(argv);
34086-
34087- return ret;
34088-}
34089-
34090-#define WRITE_BUFSIZE 4096
34091-
34092-ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
34093- size_t count, loff_t *ppos,
34094- int (*createfn)(int, char **))
34095-{
34096- char *kbuf, *buf, *tmp;
34097- int ret = 0;
34098- size_t done = 0;
34099- size_t size;
34100-
34101- kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
34102- if (!kbuf)
34103- return -ENOMEM;
34104-
34105- while (done < count) {
34106- size = count - done;
34107-
34108- if (size >= WRITE_BUFSIZE)
34109- size = WRITE_BUFSIZE - 1;
34110-
34111- if (copy_from_user(kbuf, buffer + done, size)) {
34112- ret = -EFAULT;
34113- goto out;
34114- }
34115- kbuf[size] = '\0';
34116- buf = kbuf;
34117- do {
34118- tmp = strchr(buf, '\n');
34119- if (tmp) {
34120- *tmp = '\0';
34121- size = tmp - buf + 1;
34122- } else {
34123- size = strlen(buf);
34124- if (done + size < count) {
34125- if (buf != kbuf)
34126- break;
34127- /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
34128- pr_warn("Line length is too long: Should be less than %d\n",
34129- WRITE_BUFSIZE - 2);
34130- ret = -EINVAL;
34131- goto out;
34132- }
34133- }
34134- done += size;
34135-
34136- /* Remove comments */
34137- tmp = strchr(buf, '#');
34138-
34139- if (tmp)
34140- *tmp = '\0';
34141-
34142- ret = traceprobe_command(buf, createfn);
34143- if (ret)
34144- goto out;
34145- buf += size;
34146-
34147- } while (done < count);
34148- }
34149- ret = done;
34150-
34151-out:
34152- kfree(kbuf);
34153-
34154- return ret;
34155-}
34156-
34157 static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
34158 bool is_return)
34159 {
34160diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
34161index dc39472ca9e4..a0d750e3d17c 100644
34162--- a/kernel/trace/trace_probe.h
34163+++ b/kernel/trace/trace_probe.h
34164@@ -42,7 +42,6 @@
34165
34166 #define MAX_TRACE_ARGS 128
34167 #define MAX_ARGSTR_LEN 63
34168-#define MAX_EVENT_NAME_LEN 64
34169 #define MAX_STRING_SIZE PATH_MAX
34170
34171 /* Reserved field names */
34172@@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
34173
34174 extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
34175
34176-extern ssize_t traceprobe_probes_write(struct file *file,
34177- const char __user *buffer, size_t count, loff_t *ppos,
34178- int (*createfn)(int, char**));
34179-
34180-extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
34181-
34182 /* Sum up total data length for dynamic arraies (strings) */
34183 static nokprobe_inline int
34184 __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
34185diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
34186index ea0d90a31fc9..2ccfbb8efeb2 100644
34187--- a/kernel/trace/trace_uprobe.c
34188+++ b/kernel/trace/trace_uprobe.c
34189@@ -647,7 +647,7 @@ static int probes_open(struct inode *inode, struct file *file)
34190 static ssize_t probes_write(struct file *file, const char __user *buffer,
34191 size_t count, loff_t *ppos)
34192 {
34193- return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
34194+ return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
34195 }
34196
34197 static const struct file_operations uprobe_events_ops = {
34198diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
34199index 305039b122fa..5cadb1b8b5fe 100644
34200--- a/kernel/trace/tracing_map.c
34201+++ b/kernel/trace/tracing_map.c
34202@@ -66,6 +66,73 @@ u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i)
34203 return (u64)atomic64_read(&elt->fields[i].sum);
34204 }
34205
34206+/**
34207+ * tracing_map_set_var - Assign a tracing_map_elt's variable field
34208+ * @elt: The tracing_map_elt
34209+ * @i: The index of the given variable associated with the tracing_map_elt
34210+ * @n: The value to assign
34211+ *
34212+ * Assign n to variable i associated with the specified tracing_map_elt
34213+ * instance. The index i is the index returned by the call to
34214+ * tracing_map_add_var() when the tracing map was set up.
34215+ */
34216+void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n)
34217+{
34218+ atomic64_set(&elt->vars[i], n);
34219+ elt->var_set[i] = true;
34220+}
34221+
34222+/**
34223+ * tracing_map_var_set - Return whether or not a variable has been set
34224+ * @elt: The tracing_map_elt
34225+ * @i: The index of the given variable associated with the tracing_map_elt
34226+ *
34227+ * Return true if the variable has been set, false otherwise. The
34228+ * index i is the index returned by the call to tracing_map_add_var()
34229+ * when the tracing map was set up.
34230+ */
34231+bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i)
34232+{
34233+ return elt->var_set[i];
34234+}
34235+
34236+/**
34237+ * tracing_map_read_var - Return the value of a tracing_map_elt's variable field
34238+ * @elt: The tracing_map_elt
34239+ * @i: The index of the given variable associated with the tracing_map_elt
34240+ *
34241+ * Retrieve the value of the variable i associated with the specified
34242+ * tracing_map_elt instance. The index i is the index returned by the
34243+ * call to tracing_map_add_var() when the tracing map was set
34244+ * up.
34245+ *
34246+ * Return: The variable value associated with field i for elt.
34247+ */
34248+u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i)
34249+{
34250+ return (u64)atomic64_read(&elt->vars[i]);
34251+}
34252+
34253+/**
34254+ * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field
34255+ * @elt: The tracing_map_elt
34256+ * @i: The index of the given variable associated with the tracing_map_elt
34257+ *
34258+ * Retrieve the value of the variable i associated with the specified
34259+ * tracing_map_elt instance, and reset the variable to the 'not set'
34260+ * state. The index i is the index returned by the call to
34261+ * tracing_map_add_var() when the tracing map was set up. The reset
34262+ * essentially makes the variable a read-once variable if it's only
34263+ * accessed using this function.
34264+ *
34265+ * Return: The variable value associated with field i for elt.
34266+ */
34267+u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i)
34268+{
34269+ elt->var_set[i] = false;
34270+ return (u64)atomic64_read(&elt->vars[i]);
34271+}
34272+
34273 int tracing_map_cmp_string(void *val_a, void *val_b)
34274 {
34275 char *a = val_a;
34276@@ -170,6 +237,28 @@ int tracing_map_add_sum_field(struct tracing_map *map)
34277 return tracing_map_add_field(map, tracing_map_cmp_atomic64);
34278 }
34279
34280+/**
34281+ * tracing_map_add_var - Add a field describing a tracing_map var
34282+ * @map: The tracing_map
34283+ *
34284+ * Add a var to the map and return the index identifying it in the map
34285+ * and associated tracing_map_elts. This is the index used, for
34286+ * instance, to update a var for a particular tracing_map_elt using
34287+ * tracing_map_set_var() or to read it via tracing_map_read_var().
34288+ *
34289+ * Return: The index identifying the var in the map and associated
34290+ * tracing_map_elts, or -EINVAL on error.
34291+ */
34292+int tracing_map_add_var(struct tracing_map *map)
34293+{
34294+ int ret = -EINVAL;
34295+
34296+ if (map->n_vars < TRACING_MAP_VARS_MAX)
34297+ ret = map->n_vars++;
34298+
34299+ return ret;
34300+}
34301+
34302 /**
34303 * tracing_map_add_key_field - Add a field describing a tracing_map key
34304 * @map: The tracing_map
34305@@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct tracing_map_elt *elt)
34306 if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
34307 atomic64_set(&elt->fields[i].sum, 0);
34308
34309+ for (i = 0; i < elt->map->n_vars; i++) {
34310+ atomic64_set(&elt->vars[i], 0);
34311+ elt->var_set[i] = false;
34312+ }
34313+
34314 if (elt->map->ops && elt->map->ops->elt_clear)
34315 elt->map->ops->elt_clear(elt);
34316 }
34317@@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
34318 if (elt->map->ops && elt->map->ops->elt_free)
34319 elt->map->ops->elt_free(elt);
34320 kfree(elt->fields);
34321+ kfree(elt->vars);
34322+ kfree(elt->var_set);
34323 kfree(elt->key);
34324 kfree(elt);
34325 }
34326@@ -333,6 +429,18 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
34327 goto free;
34328 }
34329
34330+ elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
34331+ if (!elt->vars) {
34332+ err = -ENOMEM;
34333+ goto free;
34334+ }
34335+
34336+ elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL);
34337+ if (!elt->var_set) {
34338+ err = -ENOMEM;
34339+ goto free;
34340+ }
34341+
34342 tracing_map_elt_init_fields(elt);
34343
34344 if (map->ops && map->ops->elt_alloc) {
34345@@ -414,7 +522,9 @@ static inline struct tracing_map_elt *
34346 __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
34347 {
34348 u32 idx, key_hash, test_key;
34349+ int dup_try = 0;
34350 struct tracing_map_entry *entry;
34351+ struct tracing_map_elt *val;
34352
34353 key_hash = jhash(key, map->key_size, 0);
34354 if (key_hash == 0)
34355@@ -426,10 +536,33 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
34356 entry = TRACING_MAP_ENTRY(map->map, idx);
34357 test_key = entry->key;
34358
34359- if (test_key && test_key == key_hash && entry->val &&
34360- keys_match(key, entry->val->key, map->key_size)) {
34361- atomic64_inc(&map->hits);
34362- return entry->val;
34363+ if (test_key && test_key == key_hash) {
34364+ val = READ_ONCE(entry->val);
34365+ if (val &&
34366+ keys_match(key, val->key, map->key_size)) {
34367+ if (!lookup_only)
34368+ atomic64_inc(&map->hits);
34369+ return val;
34370+ } else if (unlikely(!val)) {
34371+ /*
34372+ * The key is present, but val (the pointer to the
34373+ * elt struct) is still NULL, which means some other
34374+ * thread is in the process of inserting an
34375+ * element.
34376+ *
34377+ * On top of that, its key_hash is the same as the
34378+ * one being inserted right now, so it is
34379+ * possible that the element has the same
34380+ * key as well.
34381+ */
34382+
34383+ dup_try++;
34384+ if (dup_try > map->map_size) {
34385+ atomic64_inc(&map->drops);
34386+ break;
34387+ }
34388+ continue;
34389+ }
34390 }
34391
34392 if (!test_key) {
34393@@ -451,6 +584,13 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
34394 atomic64_inc(&map->hits);
34395
34396 return entry->val;
34397+ } else {
34398+ /*
34399+ * cmpxchg() failed. Loop around once
34400+ * more to check what key was inserted.
34401+ */
34402+ dup_try++;
34403+ continue;
34404 }
34405 }
34406
34407@@ -815,67 +955,15 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
34408 return sort_entry;
34409 }
34410
34411-static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
34412-{
34413- struct tracing_map_elt *dup_elt;
34414- unsigned int i;
34415-
34416- dup_elt = tracing_map_elt_alloc(elt->map);
34417- if (IS_ERR(dup_elt))
34418- return NULL;
34419-
34420- if (elt->map->ops && elt->map->ops->elt_copy)
34421- elt->map->ops->elt_copy(dup_elt, elt);
34422-
34423- dup_elt->private_data = elt->private_data;
34424- memcpy(dup_elt->key, elt->key, elt->map->key_size);
34425-
34426- for (i = 0; i < elt->map->n_fields; i++) {
34427- atomic64_set(&dup_elt->fields[i].sum,
34428- atomic64_read(&elt->fields[i].sum));
34429- dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
34430- }
34431-
34432- return dup_elt;
34433-}
34434-
34435-static int merge_dup(struct tracing_map_sort_entry **sort_entries,
34436- unsigned int target, unsigned int dup)
34437-{
34438- struct tracing_map_elt *target_elt, *elt;
34439- bool first_dup = (target - dup) == 1;
34440- int i;
34441-
34442- if (first_dup) {
34443- elt = sort_entries[target]->elt;
34444- target_elt = copy_elt(elt);
34445- if (!target_elt)
34446- return -ENOMEM;
34447- sort_entries[target]->elt = target_elt;
34448- sort_entries[target]->elt_copied = true;
34449- } else
34450- target_elt = sort_entries[target]->elt;
34451-
34452- elt = sort_entries[dup]->elt;
34453-
34454- for (i = 0; i < elt->map->n_fields; i++)
34455- atomic64_add(atomic64_read(&elt->fields[i].sum),
34456- &target_elt->fields[i].sum);
34457-
34458- sort_entries[dup]->dup = true;
34459-
34460- return 0;
34461-}
34462-
34463-static int merge_dups(struct tracing_map_sort_entry **sort_entries,
34464+static void detect_dups(struct tracing_map_sort_entry **sort_entries,
34465 int n_entries, unsigned int key_size)
34466 {
34467 unsigned int dups = 0, total_dups = 0;
34468- int err, i, j;
34469+ int i;
34470 void *key;
34471
34472 if (n_entries < 2)
34473- return total_dups;
34474+ return;
34475
34476 sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
34477 (int (*)(const void *, const void *))cmp_entries_dup, NULL);
34478@@ -884,30 +972,14 @@ static int merge_dups(struct tracing_map_sort_entry **sort_entries,
34479 for (i = 1; i < n_entries; i++) {
34480 if (!memcmp(sort_entries[i]->key, key, key_size)) {
34481 dups++; total_dups++;
34482- err = merge_dup(sort_entries, i - dups, i);
34483- if (err)
34484- return err;
34485 continue;
34486 }
34487 key = sort_entries[i]->key;
34488 dups = 0;
34489 }
34490
34491- if (!total_dups)
34492- return total_dups;
34493-
34494- for (i = 0, j = 0; i < n_entries; i++) {
34495- if (!sort_entries[i]->dup) {
34496- sort_entries[j] = sort_entries[i];
34497- if (j++ != i)
34498- sort_entries[i] = NULL;
34499- } else {
34500- destroy_sort_entry(sort_entries[i]);
34501- sort_entries[i] = NULL;
34502- }
34503- }
34504-
34505- return total_dups;
34506+ WARN_ONCE(total_dups > 0,
34507+ "Duplicates detected: %d\n", total_dups);
34508 }
34509
34510 static bool is_key(struct tracing_map *map, unsigned int field_idx)
34511@@ -1033,10 +1105,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
34512 return 1;
34513 }
34514
34515- ret = merge_dups(entries, n_entries, map->key_size);
34516- if (ret < 0)
34517- goto free;
34518- n_entries -= ret;
34519+ detect_dups(entries, n_entries, map->key_size);
34520
34521 if (is_key(map, sort_keys[0].field_idx))
34522 cmp_entries_fn = cmp_entries_key;
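
The kerneldoc added above describes the new per-element variable API one accessor at a time; the kernel-context sketch below shows how the pieces combine. The surrounding helper functions, the ktime-based payload and the way 'elt' is obtained are assumptions for illustration only — this hunk adds just the accessors.

#include <linux/ktime.h>
#include "tracing_map.h"

/* Setup time: reserve one per-element variable slot in the map. */
static int example_reserve_var(struct tracing_map *map)
{
	/* Returns the slot index, or -EINVAL once TRACING_MAP_VARS_MAX is hit. */
	return tracing_map_add_var(map);
}

/* Event hit: stash a timestamp in the element's variable slot. */
static void example_record_start(struct tracing_map_elt *elt, int idx)
{
	tracing_map_set_var(elt, idx, ktime_get_ns());
}

/* Matching end event: consume the value once; the slot then reads as unset. */
static u64 example_read_delta(struct tracing_map_elt *elt, int idx)
{
	if (!tracing_map_var_set(elt, idx))
		return 0;

	return ktime_get_ns() - tracing_map_read_var_once(elt, idx);
}
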
34523diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
34524index ab0ca77331d0..053eb92b2d31 100644
34525--- a/kernel/trace/tracing_map.h
34526+++ b/kernel/trace/tracing_map.h
34527@@ -6,10 +6,11 @@
34528 #define TRACING_MAP_BITS_MAX 17
34529 #define TRACING_MAP_BITS_MIN 7
34530
34531-#define TRACING_MAP_KEYS_MAX 2
34532+#define TRACING_MAP_KEYS_MAX 3
34533 #define TRACING_MAP_VALS_MAX 3
34534 #define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
34535 TRACING_MAP_VALS_MAX)
34536+#define TRACING_MAP_VARS_MAX 16
34537 #define TRACING_MAP_SORT_KEYS_MAX 2
34538
34539 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
34540@@ -137,6 +138,8 @@ struct tracing_map_field {
34541 struct tracing_map_elt {
34542 struct tracing_map *map;
34543 struct tracing_map_field *fields;
34544+ atomic64_t *vars;
34545+ bool *var_set;
34546 void *key;
34547 void *private_data;
34548 };
34549@@ -192,6 +195,7 @@ struct tracing_map {
34550 int key_idx[TRACING_MAP_KEYS_MAX];
34551 unsigned int n_keys;
34552 struct tracing_map_sort_key sort_key;
34553+ unsigned int n_vars;
34554 atomic64_t hits;
34555 atomic64_t drops;
34556 };
34557@@ -215,11 +219,6 @@ struct tracing_map {
34558 * Element allocation occurs before tracing begins, when the
34559 * tracing_map_init() call is made by client code.
34560 *
34561- * @elt_copy: At certain points in the lifetime of an element, it may
34562- * need to be copied. The copy should include a copy of the
34563- * client-allocated data, which can be copied into the 'to'
34564- * element from the 'from' element.
34565- *
34566 * @elt_free: When a tracing_map_elt is freed, this function is called
34567 * and allows client-allocated per-element data to be freed.
34568 *
34569@@ -233,8 +232,6 @@ struct tracing_map {
34570 */
34571 struct tracing_map_ops {
34572 int (*elt_alloc)(struct tracing_map_elt *elt);
34573- void (*elt_copy)(struct tracing_map_elt *to,
34574- struct tracing_map_elt *from);
34575 void (*elt_free)(struct tracing_map_elt *elt);
34576 void (*elt_clear)(struct tracing_map_elt *elt);
34577 void (*elt_init)(struct tracing_map_elt *elt);
34578@@ -248,6 +245,7 @@ tracing_map_create(unsigned int map_bits,
34579 extern int tracing_map_init(struct tracing_map *map);
34580
34581 extern int tracing_map_add_sum_field(struct tracing_map *map);
34582+extern int tracing_map_add_var(struct tracing_map *map);
34583 extern int tracing_map_add_key_field(struct tracing_map *map,
34584 unsigned int offset,
34585 tracing_map_cmp_fn_t cmp_fn);
34586@@ -267,7 +265,13 @@ extern int tracing_map_cmp_none(void *val_a, void *val_b);
34587
34588 extern void tracing_map_update_sum(struct tracing_map_elt *elt,
34589 unsigned int i, u64 n);
34590+extern void tracing_map_set_var(struct tracing_map_elt *elt,
34591+ unsigned int i, u64 n);
34592+extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i);
34593 extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
34594+extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
34595+extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
34596+
34597 extern void tracing_map_set_field_descr(struct tracing_map *map,
34598 unsigned int i,
34599 unsigned int key_offset,
34600diff --git a/kernel/user.c b/kernel/user.c
34601index 00281add65b2..f4cf1841f2fd 100644
34602--- a/kernel/user.c
34603+++ b/kernel/user.c
34604@@ -162,11 +162,11 @@ void free_uid(struct user_struct *up)
34605 if (!up)
34606 return;
34607
34608- local_irq_save(flags);
34609+ local_irq_save_nort(flags);
34610 if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
34611 free_user(up, flags);
34612 else
34613- local_irq_restore(flags);
34614+ local_irq_restore_nort(flags);
34615 }
34616
34617 struct user_struct *alloc_uid(kuid_t uid)
34618diff --git a/kernel/watchdog.c b/kernel/watchdog.c
34619index 087994b23f8b..ea4c09109ce4 100644
34620--- a/kernel/watchdog.c
34621+++ b/kernel/watchdog.c
34622@@ -462,7 +462,7 @@ static void watchdog_enable(unsigned int cpu)
34623 * Start the timer first to prevent the NMI watchdog triggering
34624 * before the timer has a chance to fire.
34625 */
34626- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
34627+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
34628 hrtimer->function = watchdog_timer_fn;
34629 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
34630 HRTIMER_MODE_REL_PINNED);
34631diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
34632index 4ece6028007a..210dccc57c04 100644
34633--- a/kernel/watchdog_hld.c
34634+++ b/kernel/watchdog_hld.c
34635@@ -24,6 +24,8 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
34636 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
34637 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
34638 static DEFINE_PER_CPU(struct perf_event *, dead_event);
34639+static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
34640+
34641 static struct cpumask dead_events_mask;
34642
34643 static unsigned long hardlockup_allcpu_dumped;
34644@@ -134,6 +136,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
34645 /* only print hardlockups once */
34646 if (__this_cpu_read(hard_watchdog_warn) == true)
34647 return;
34648+ /*
34649+ * If early-printk is enabled then make sure we do not
34650+ * lock up in printk() and kill console logging:
34651+ */
34652+ printk_kill();
34653+
34654+ raw_spin_lock(&watchdog_output_lock);
34655
34656 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
34657 print_modules();
34658@@ -151,6 +160,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
34659 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
34660 trigger_allbutself_cpu_backtrace();
34661
34662+ raw_spin_unlock(&watchdog_output_lock);
34663 if (hardlockup_panic)
34664 nmi_panic(regs, "Hard LOCKUP");
34665
34666diff --git a/kernel/workqueue.c b/kernel/workqueue.c
34667index 08bc551976b2..76297cce5602 100644
34668--- a/kernel/workqueue.c
34669+++ b/kernel/workqueue.c
34670@@ -49,6 +49,8 @@
34671 #include <linux/moduleparam.h>
34672 #include <linux/uaccess.h>
34673 #include <linux/nmi.h>
34674+#include <linux/locallock.h>
34675+#include <linux/delay.h>
34676
34677 #include "workqueue_internal.h"
34678
34679@@ -123,11 +125,16 @@ enum {
34680 * cpu or grabbing pool->lock is enough for read access. If
34681 * POOL_DISASSOCIATED is set, it's identical to L.
34682 *
34683+ * On RT we need the extra protection via rt_lock_idle_list() for
34684+ * the list manipulations against read access from
34685+ * wq_worker_sleeping(). All other places are nicely serialized via
34686+ * pool->lock.
34687+ *
34688 * A: pool->attach_mutex protected.
34689 *
34690 * PL: wq_pool_mutex protected.
34691 *
34692- * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
34693+ * PR: wq_pool_mutex protected for writes. RCU protected for reads.
34694 *
34695 * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
34696 *
34697@@ -136,7 +143,7 @@ enum {
34698 *
34699 * WQ: wq->mutex protected.
34700 *
34701- * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
34702+ * WR: wq->mutex protected for writes. RCU protected for reads.
34703 *
34704 * MD: wq_mayday_lock protected.
34705 */
34706@@ -186,7 +193,7 @@ struct worker_pool {
34707 atomic_t nr_running ____cacheline_aligned_in_smp;
34708
34709 /*
34710- * Destruction of pool is sched-RCU protected to allow dereferences
34711+ * Destruction of pool is RCU protected to allow dereferences
34712 * from get_work_pool().
34713 */
34714 struct rcu_head rcu;
34715@@ -215,7 +222,7 @@ struct pool_workqueue {
34716 /*
34717 * Release of unbound pwq is punted to system_wq. See put_pwq()
34718 * and pwq_unbound_release_workfn() for details. pool_workqueue
34719- * itself is also sched-RCU protected so that the first pwq can be
34720+ * itself is also RCU protected so that the first pwq can be
34721 * determined without grabbing wq->mutex.
34722 */
34723 struct work_struct unbound_release_work;
34724@@ -352,6 +359,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
34725 struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
34726 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
34727
34728+static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
34729+
34730 static int worker_thread(void *__worker);
34731 static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34732
34733@@ -359,20 +368,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34734 #include <trace/events/workqueue.h>
34735
34736 #define assert_rcu_or_pool_mutex() \
34737- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
34738+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
34739 !lockdep_is_held(&wq_pool_mutex), \
34740- "sched RCU or wq_pool_mutex should be held")
34741+ "RCU or wq_pool_mutex should be held")
34742
34743 #define assert_rcu_or_wq_mutex(wq) \
34744- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
34745+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
34746 !lockdep_is_held(&wq->mutex), \
34747- "sched RCU or wq->mutex should be held")
34748+ "RCU or wq->mutex should be held")
34749
34750 #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
34751- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
34752+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
34753 !lockdep_is_held(&wq->mutex) && \
34754 !lockdep_is_held(&wq_pool_mutex), \
34755- "sched RCU, wq->mutex or wq_pool_mutex should be held")
34756+ "RCU, wq->mutex or wq_pool_mutex should be held")
34757
34758 #define for_each_cpu_worker_pool(pool, cpu) \
34759 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
34760@@ -384,7 +393,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34761 * @pool: iteration cursor
34762 * @pi: integer used for iteration
34763 *
34764- * This must be called either with wq_pool_mutex held or sched RCU read
34765+ * This must be called either with wq_pool_mutex held or RCU read
34766 * locked. If the pool needs to be used beyond the locking in effect, the
34767 * caller is responsible for guaranteeing that the pool stays online.
34768 *
34769@@ -416,7 +425,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34770 * @pwq: iteration cursor
34771 * @wq: the target workqueue
34772 *
34773- * This must be called either with wq->mutex held or sched RCU read locked.
34774+ * This must be called either with wq->mutex held or RCU read locked.
34775 * If the pwq needs to be used beyond the locking in effect, the caller is
34776 * responsible for guaranteeing that the pwq stays online.
34777 *
34778@@ -428,6 +437,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34779 if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
34780 else
34781
34782+#ifdef CONFIG_PREEMPT_RT_BASE
34783+static inline void rt_lock_idle_list(struct worker_pool *pool)
34784+{
34785+ preempt_disable();
34786+}
34787+static inline void rt_unlock_idle_list(struct worker_pool *pool)
34788+{
34789+ preempt_enable();
34790+}
34791+static inline void sched_lock_idle_list(struct worker_pool *pool) { }
34792+static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
34793+#else
34794+static inline void rt_lock_idle_list(struct worker_pool *pool) { }
34795+static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
34796+static inline void sched_lock_idle_list(struct worker_pool *pool)
34797+{
34798+ spin_lock_irq(&pool->lock);
34799+}
34800+static inline void sched_unlock_idle_list(struct worker_pool *pool)
34801+{
34802+ spin_unlock_irq(&pool->lock);
34803+}
34804+#endif
34805+
34806+
34807 #ifdef CONFIG_DEBUG_OBJECTS_WORK
34808
34809 static struct debug_obj_descr work_debug_descr;
34810@@ -552,7 +586,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
34811 * @wq: the target workqueue
34812 * @node: the node ID
34813 *
34814- * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
34815+ * This must be called with any of wq_pool_mutex, wq->mutex or RCU
34816 * read locked.
34817 * If the pwq needs to be used beyond the locking in effect, the caller is
34818 * responsible for guaranteeing that the pwq stays online.
34819@@ -696,8 +730,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
34820 * @work: the work item of interest
34821 *
34822 * Pools are created and destroyed under wq_pool_mutex, and allows read
34823- * access under sched-RCU read lock. As such, this function should be
34824- * called under wq_pool_mutex or with preemption disabled.
34825+ * access under RCU read lock. As such, this function should be
34826+ * called under wq_pool_mutex or inside of a rcu_read_lock() region.
34827 *
34828 * All fields of the returned pool are accessible as long as the above
34829 * mentioned locking is in effect. If the returned pool needs to be used
34830@@ -834,50 +868,45 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
34831 */
34832 static void wake_up_worker(struct worker_pool *pool)
34833 {
34834- struct worker *worker = first_idle_worker(pool);
34835+ struct worker *worker;
34836+
34837+ rt_lock_idle_list(pool);
34838+
34839+ worker = first_idle_worker(pool);
34840
34841 if (likely(worker))
34842 wake_up_process(worker->task);
34843+
34844+ rt_unlock_idle_list(pool);
34845 }
34846
34847 /**
34848- * wq_worker_waking_up - a worker is waking up
34849+ * wq_worker_running - a worker is running again
34850 * @task: task waking up
34851- * @cpu: CPU @task is waking up to
34852 *
34853- * This function is called during try_to_wake_up() when a worker is
34854- * being awoken.
34855- *
34856- * CONTEXT:
34857- * spin_lock_irq(rq->lock)
34858+ * This function is called when a worker returns from schedule()
34859 */
34860-void wq_worker_waking_up(struct task_struct *task, int cpu)
34861+void wq_worker_running(struct task_struct *task)
34862 {
34863 struct worker *worker = kthread_data(task);
34864
34865- if (!(worker->flags & WORKER_NOT_RUNNING)) {
34866- WARN_ON_ONCE(worker->pool->cpu != cpu);
34867+ if (!worker->sleeping)
34868+ return;
34869+ if (!(worker->flags & WORKER_NOT_RUNNING))
34870 atomic_inc(&worker->pool->nr_running);
34871- }
34872+ worker->sleeping = 0;
34873 }
34874
34875 /**
34876 * wq_worker_sleeping - a worker is going to sleep
34877 * @task: task going to sleep
34878 *
34879- * This function is called during schedule() when a busy worker is
34880- * going to sleep. Worker on the same cpu can be woken up by
34881- * returning pointer to its task.
34882- *
34883- * CONTEXT:
34884- * spin_lock_irq(rq->lock)
34885- *
34886- * Return:
34887- * Worker task on @cpu to wake up, %NULL if none.
34888+ * This function is called from schedule() when a busy worker is
34889+ * going to sleep.
34890 */
34891-struct task_struct *wq_worker_sleeping(struct task_struct *task)
34892+void wq_worker_sleeping(struct task_struct *task)
34893 {
34894- struct worker *worker = kthread_data(task), *to_wakeup = NULL;
34895+ struct worker *worker = kthread_data(task);
34896 struct worker_pool *pool;
34897
34898 /*
34899@@ -886,29 +915,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
34900 * checking NOT_RUNNING.
34901 */
34902 if (worker->flags & WORKER_NOT_RUNNING)
34903- return NULL;
34904+ return;
34905
34906 pool = worker->pool;
34907
34908- /* this can only happen on the local cpu */
34909- if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
34910- return NULL;
34911+ if (WARN_ON_ONCE(worker->sleeping))
34912+ return;
34913+
34914+ worker->sleeping = 1;
34915
34916 /*
34917 * The counterpart of the following dec_and_test, implied mb,
34918 * worklist not empty test sequence is in insert_work().
34919 * Please read comment there.
34920- *
34921- * NOT_RUNNING is clear. This means that we're bound to and
34922- * running on the local cpu w/ rq lock held and preemption
34923- * disabled, which in turn means that none else could be
34924- * manipulating idle_list, so dereferencing idle_list without pool
34925- * lock is safe.
34926 */
34927 if (atomic_dec_and_test(&pool->nr_running) &&
34928- !list_empty(&pool->worklist))
34929- to_wakeup = first_idle_worker(pool);
34930- return to_wakeup ? to_wakeup->task : NULL;
34931+ !list_empty(&pool->worklist)) {
34932+ sched_lock_idle_list(pool);
34933+ wake_up_worker(pool);
34934+ sched_unlock_idle_list(pool);
34935+ }
34936 }
34937
34938 /**
34939@@ -1102,12 +1128,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
34940 {
34941 if (pwq) {
34942 /*
34943- * As both pwqs and pools are sched-RCU protected, the
34944+ * As both pwqs and pools are RCU protected, the
34945 * following lock operations are safe.
34946 */
34947- spin_lock_irq(&pwq->pool->lock);
34948+ rcu_read_lock();
34949+ local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
34950 put_pwq(pwq);
34951- spin_unlock_irq(&pwq->pool->lock);
34952+ local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
34953+ rcu_read_unlock();
34954 }
34955 }
34956
34957@@ -1211,7 +1239,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
34958 struct worker_pool *pool;
34959 struct pool_workqueue *pwq;
34960
34961- local_irq_save(*flags);
34962+ local_lock_irqsave(pendingb_lock, *flags);
34963
34964 /* try to steal the timer if it exists */
34965 if (is_dwork) {
34966@@ -1230,6 +1258,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
34967 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
34968 return 0;
34969
34970+ rcu_read_lock();
34971 /*
34972 * The queueing is in progress, or it is already queued. Try to
34973 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
34974@@ -1268,14 +1297,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
34975 set_work_pool_and_keep_pending(work, pool->id);
34976
34977 spin_unlock(&pool->lock);
34978+ rcu_read_unlock();
34979 return 1;
34980 }
34981 spin_unlock(&pool->lock);
34982 fail:
34983- local_irq_restore(*flags);
34984+ rcu_read_unlock();
34985+ local_unlock_irqrestore(pendingb_lock, *flags);
34986 if (work_is_canceling(work))
34987 return -ENOENT;
34988- cpu_relax();
34989+ cpu_chill();
34990 return -EAGAIN;
34991 }
34992
34993@@ -1377,7 +1408,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
34994 * queued or lose PENDING. Grabbing PENDING and queueing should
34995 * happen with IRQ disabled.
34996 */
34997- WARN_ON_ONCE(!irqs_disabled());
34998+ WARN_ON_ONCE_NONRT(!irqs_disabled());
34999
35000 debug_work_activate(work);
35001
35002@@ -1385,6 +1416,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
35003 if (unlikely(wq->flags & __WQ_DRAINING) &&
35004 WARN_ON_ONCE(!is_chained_work(wq)))
35005 return;
35006+ rcu_read_lock();
35007 retry:
35008 if (req_cpu == WORK_CPU_UNBOUND)
35009 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
35010@@ -1441,10 +1473,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
35011 /* pwq determined, queue */
35012 trace_workqueue_queue_work(req_cpu, pwq, work);
35013
35014- if (WARN_ON(!list_empty(&work->entry))) {
35015- spin_unlock(&pwq->pool->lock);
35016- return;
35017- }
35018+ if (WARN_ON(!list_empty(&work->entry)))
35019+ goto out;
35020
35021 pwq->nr_in_flight[pwq->work_color]++;
35022 work_flags = work_color_to_flags(pwq->work_color);
35023@@ -1462,7 +1492,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
35024
35025 insert_work(pwq, work, worklist, work_flags);
35026
35027+out:
35028 spin_unlock(&pwq->pool->lock);
35029+ rcu_read_unlock();
35030 }
35031
35032 /**
35033@@ -1482,14 +1514,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
35034 bool ret = false;
35035 unsigned long flags;
35036
35037- local_irq_save(flags);
35038+ local_lock_irqsave(pendingb_lock,flags);
35039
35040 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
35041 __queue_work(cpu, wq, work);
35042 ret = true;
35043 }
35044
35045- local_irq_restore(flags);
35046+ local_unlock_irqrestore(pendingb_lock, flags);
35047 return ret;
35048 }
35049 EXPORT_SYMBOL(queue_work_on);
35050@@ -1498,8 +1530,11 @@ void delayed_work_timer_fn(unsigned long __data)
35051 {
35052 struct delayed_work *dwork = (struct delayed_work *)__data;
35053
35054+ /* XXX */
35055+ /* local_lock(pendingb_lock); */
35056 /* should have been called from irqsafe timer with irq already off */
35057 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
35058+ /* local_unlock(pendingb_lock); */
35059 }
35060 EXPORT_SYMBOL(delayed_work_timer_fn);
35061
35062@@ -1555,14 +1590,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
35063 unsigned long flags;
35064
35065 /* read the comment in __queue_work() */
35066- local_irq_save(flags);
35067+ local_lock_irqsave(pendingb_lock, flags);
35068
35069 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
35070 __queue_delayed_work(cpu, wq, dwork, delay);
35071 ret = true;
35072 }
35073
35074- local_irq_restore(flags);
35075+ local_unlock_irqrestore(pendingb_lock, flags);
35076 return ret;
35077 }
35078 EXPORT_SYMBOL(queue_delayed_work_on);
35079@@ -1597,7 +1632,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
35080
35081 if (likely(ret >= 0)) {
35082 __queue_delayed_work(cpu, wq, dwork, delay);
35083- local_irq_restore(flags);
35084+ local_unlock_irqrestore(pendingb_lock, flags);
35085 }
35086
35087 /* -ENOENT from try_to_grab_pending() becomes %true */
35088@@ -1630,7 +1665,9 @@ static void worker_enter_idle(struct worker *worker)
35089 worker->last_active = jiffies;
35090
35091 /* idle_list is LIFO */
35092+ rt_lock_idle_list(pool);
35093 list_add(&worker->entry, &pool->idle_list);
35094+ rt_unlock_idle_list(pool);
35095
35096 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
35097 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
35098@@ -1663,7 +1700,9 @@ static void worker_leave_idle(struct worker *worker)
35099 return;
35100 worker_clr_flags(worker, WORKER_IDLE);
35101 pool->nr_idle--;
35102+ rt_lock_idle_list(pool);
35103 list_del_init(&worker->entry);
35104+ rt_unlock_idle_list(pool);
35105 }
35106
35107 static struct worker *alloc_worker(int node)
35108@@ -1829,7 +1868,9 @@ static void destroy_worker(struct worker *worker)
35109 pool->nr_workers--;
35110 pool->nr_idle--;
35111
35112+ rt_lock_idle_list(pool);
35113 list_del_init(&worker->entry);
35114+ rt_unlock_idle_list(pool);
35115 worker->flags |= WORKER_DIE;
35116 wake_up_process(worker->task);
35117 }
35118@@ -2815,14 +2856,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
35119
35120 might_sleep();
35121
35122- local_irq_disable();
35123+ rcu_read_lock();
35124 pool = get_work_pool(work);
35125 if (!pool) {
35126- local_irq_enable();
35127+ rcu_read_unlock();
35128 return false;
35129 }
35130
35131- spin_lock(&pool->lock);
35132+ spin_lock_irq(&pool->lock);
35133 /* see the comment in try_to_grab_pending() with the same code */
35134 pwq = get_work_pwq(work);
35135 if (pwq) {
35136@@ -2853,10 +2894,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
35137 lock_map_acquire(&pwq->wq->lockdep_map);
35138 lock_map_release(&pwq->wq->lockdep_map);
35139 }
35140-
35141+ rcu_read_unlock();
35142 return true;
35143 already_gone:
35144 spin_unlock_irq(&pool->lock);
35145+ rcu_read_unlock();
35146 return false;
35147 }
35148
35149@@ -2946,7 +2988,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
35150
35151 /* tell other tasks trying to grab @work to back off */
35152 mark_work_canceling(work);
35153- local_irq_restore(flags);
35154+ local_unlock_irqrestore(pendingb_lock, flags);
35155
35156 /*
35157 * This allows canceling during early boot. We know that @work
35158@@ -3007,10 +3049,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
35159 */
35160 bool flush_delayed_work(struct delayed_work *dwork)
35161 {
35162- local_irq_disable();
35163+ local_lock_irq(pendingb_lock);
35164 if (del_timer_sync(&dwork->timer))
35165 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
35166- local_irq_enable();
35167+ local_unlock_irq(pendingb_lock);
35168 return flush_work(&dwork->work);
35169 }
35170 EXPORT_SYMBOL(flush_delayed_work);
35171@@ -3028,7 +3070,7 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
35172 return false;
35173
35174 set_work_pool_and_clear_pending(work, get_work_pool_id(work));
35175- local_irq_restore(flags);
35176+ local_unlock_irqrestore(pendingb_lock, flags);
35177 return ret;
35178 }
35179
35180@@ -3284,7 +3326,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
35181 * put_unbound_pool - put a worker_pool
35182 * @pool: worker_pool to put
35183 *
35184- * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
35185+ * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
35186 * safe manner. get_unbound_pool() calls this function on its failure path
35187 * and this function should be able to release pools which went through,
35188 * successfully or not, init_worker_pool().
35189@@ -3338,8 +3380,8 @@ static void put_unbound_pool(struct worker_pool *pool)
35190 del_timer_sync(&pool->idle_timer);
35191 del_timer_sync(&pool->mayday_timer);
35192
35193- /* sched-RCU protected to allow dereferences from get_work_pool() */
35194- call_rcu_sched(&pool->rcu, rcu_free_pool);
35195+ /* RCU protected to allow dereferences from get_work_pool() */
35196+ call_rcu(&pool->rcu, rcu_free_pool);
35197 }
35198
35199 /**
35200@@ -3446,14 +3488,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
35201 put_unbound_pool(pool);
35202 mutex_unlock(&wq_pool_mutex);
35203
35204- call_rcu_sched(&pwq->rcu, rcu_free_pwq);
35205+ call_rcu(&pwq->rcu, rcu_free_pwq);
35206
35207 /*
35208 * If we're the last pwq going away, @wq is already dead and no one
35209 * is gonna access it anymore. Schedule RCU free.
35210 */
35211 if (is_last)
35212- call_rcu_sched(&wq->rcu, rcu_free_wq);
35213+ call_rcu(&wq->rcu, rcu_free_wq);
35214 }
35215
35216 /**
35217@@ -4128,7 +4170,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
35218 * The base ref is never dropped on per-cpu pwqs. Directly
35219 * schedule RCU free.
35220 */
35221- call_rcu_sched(&wq->rcu, rcu_free_wq);
35222+ call_rcu(&wq->rcu, rcu_free_wq);
35223 } else {
35224 /*
35225 * We're the sole accessor of @wq at this point. Directly
35226@@ -4238,7 +4280,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
35227 struct pool_workqueue *pwq;
35228 bool ret;
35229
35230- rcu_read_lock_sched();
35231+ rcu_read_lock();
35232+ preempt_disable();
35233
35234 if (cpu == WORK_CPU_UNBOUND)
35235 cpu = smp_processor_id();
35236@@ -4249,7 +4292,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
35237 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
35238
35239 ret = !list_empty(&pwq->delayed_works);
35240- rcu_read_unlock_sched();
35241+ preempt_enable();
35242+ rcu_read_unlock();
35243
35244 return ret;
35245 }
35246@@ -4275,15 +4319,15 @@ unsigned int work_busy(struct work_struct *work)
35247 if (work_pending(work))
35248 ret |= WORK_BUSY_PENDING;
35249
35250- local_irq_save(flags);
35251+ rcu_read_lock();
35252 pool = get_work_pool(work);
35253 if (pool) {
35254- spin_lock(&pool->lock);
35255+ spin_lock_irqsave(&pool->lock, flags);
35256 if (find_worker_executing_work(pool, work))
35257 ret |= WORK_BUSY_RUNNING;
35258- spin_unlock(&pool->lock);
35259+ spin_unlock_irqrestore(&pool->lock, flags);
35260 }
35261- local_irq_restore(flags);
35262+ rcu_read_unlock();
35263
35264 return ret;
35265 }
35266@@ -4472,7 +4516,7 @@ void show_workqueue_state(void)
35267 unsigned long flags;
35268 int pi;
35269
35270- rcu_read_lock_sched();
35271+ rcu_read_lock();
35272
35273 pr_info("Showing busy workqueues and worker pools:\n");
35274
35275@@ -4537,7 +4581,7 @@ void show_workqueue_state(void)
35276 touch_nmi_watchdog();
35277 }
35278
35279- rcu_read_unlock_sched();
35280+ rcu_read_unlock();
35281 }
35282
35283 /*
35284@@ -4898,16 +4942,16 @@ bool freeze_workqueues_busy(void)
35285 * nr_active is monotonically decreasing. It's safe
35286 * to peek without lock.
35287 */
35288- rcu_read_lock_sched();
35289+ rcu_read_lock();
35290 for_each_pwq(pwq, wq) {
35291 WARN_ON_ONCE(pwq->nr_active < 0);
35292 if (pwq->nr_active) {
35293 busy = true;
35294- rcu_read_unlock_sched();
35295+ rcu_read_unlock();
35296 goto out_unlock;
35297 }
35298 }
35299- rcu_read_unlock_sched();
35300+ rcu_read_unlock();
35301 }
35302 out_unlock:
35303 mutex_unlock(&wq_pool_mutex);
35304@@ -5097,7 +5141,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
35305 const char *delim = "";
35306 int node, written = 0;
35307
35308- rcu_read_lock_sched();
35309+ get_online_cpus();
35310+ rcu_read_lock();
35311 for_each_node(node) {
35312 written += scnprintf(buf + written, PAGE_SIZE - written,
35313 "%s%d:%d", delim, node,
35314@@ -5105,7 +5150,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
35315 delim = " ";
35316 }
35317 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
35318- rcu_read_unlock_sched();
35319+ rcu_read_unlock();
35320+ put_online_cpus();
35321
35322 return written;
35323 }
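
Several of the workqueue hunks above replace local_irq_save()/restore() around the pending logic with the pendingb_lock local lock. A minimal sketch of that pattern is below, with a hypothetical my_pcp_lock standing in for pendingb_lock; linux/locallock.h is provided elsewhere in this -rt series and degrades to plain IRQ disabling when PREEMPT_RT_FULL is not set.

#include <linux/locallock.h>

static DEFINE_LOCAL_IRQ_LOCK(my_pcp_lock);	/* hypothetical example lock */

static void touch_per_cpu_state(void)
{
	unsigned long flags;

	/*
	 * !RT: behaves like local_irq_save(flags).
	 * RT:  takes a per-CPU sleeping spinlock instead, so the section
	 *      stays preemptible yet is still serialized against every
	 *      other my_pcp_lock user on this CPU.
	 */
	local_lock_irqsave(my_pcp_lock, flags);
	/* ... manipulate the per-CPU state guarded by my_pcp_lock ... */
	local_unlock_irqrestore(my_pcp_lock, flags);
}
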
35324diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
35325index d390d1be3748..2dbcfe9bc364 100644
35326--- a/kernel/workqueue_internal.h
35327+++ b/kernel/workqueue_internal.h
35328@@ -45,6 +45,7 @@ struct worker {
35329 unsigned long last_active; /* L: last active timestamp */
35330 unsigned int flags; /* X: flags */
35331 int id; /* I: worker id */
35332+ int sleeping; /* None */
35333
35334 /*
35335 * Opaque string set with work_set_desc(). Printed out with task
35336@@ -70,7 +71,7 @@ static inline struct worker *current_wq_worker(void)
35337 * Scheduler hooks for concurrency managed workqueue. Only to be used from
35338 * sched/core.c and workqueue.c.
35339 */
35340-void wq_worker_waking_up(struct task_struct *task, int cpu);
35341-struct task_struct *wq_worker_sleeping(struct task_struct *task);
35342+void wq_worker_running(struct task_struct *task);
35343+void wq_worker_sleeping(struct task_struct *task);
35344
35345 #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
35346diff --git a/lib/Kconfig b/lib/Kconfig
35347index b1445b22a6de..9ab51b78991a 100644
35348--- a/lib/Kconfig
35349+++ b/lib/Kconfig
35350@@ -428,6 +428,7 @@ config CHECK_SIGNATURE
35351
35352 config CPUMASK_OFFSTACK
35353 bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
35354+ depends on !PREEMPT_RT_FULL
35355 help
35356 Use dynamic allocation for cpumask_var_t, instead of putting
35357 them on the stack. This is a bit more expensive, but avoids
35358diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
35359index 62d0e25c054c..401b7ed164b5 100644
35360--- a/lib/Kconfig.debug
35361+++ b/lib/Kconfig.debug
35362@@ -1197,7 +1197,7 @@ config DEBUG_ATOMIC_SLEEP
35363
35364 config DEBUG_LOCKING_API_SELFTESTS
35365 bool "Locking API boot-time self-tests"
35366- depends on DEBUG_KERNEL
35367+ depends on DEBUG_KERNEL && !PREEMPT_RT_FULL
35368 help
35369 Say Y here if you want the kernel to run a short self-test during
35370 bootup. The self-test checks whether common types of locking bugs
35371diff --git a/lib/debugobjects.c b/lib/debugobjects.c
35372index 99308479b1c8..161da6c6e173 100644
35373--- a/lib/debugobjects.c
35374+++ b/lib/debugobjects.c
35375@@ -339,7 +339,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
35376 struct debug_obj *obj;
35377 unsigned long flags;
35378
35379- fill_pool();
35380+#ifdef CONFIG_PREEMPT_RT_FULL
35381+ if (preempt_count() == 0 && !irqs_disabled())
35382+#endif
35383+ fill_pool();
35384
35385 db = get_bucket((unsigned long) addr);
35386
35387diff --git a/lib/irq_poll.c b/lib/irq_poll.c
35388index 86a709954f5a..9c069ef83d6d 100644
35389--- a/lib/irq_poll.c
35390+++ b/lib/irq_poll.c
35391@@ -37,6 +37,7 @@ void irq_poll_sched(struct irq_poll *iop)
35392 list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
35393 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
35394 local_irq_restore(flags);
35395+ preempt_check_resched_rt();
35396 }
35397 EXPORT_SYMBOL(irq_poll_sched);
35398
35399@@ -72,6 +73,7 @@ void irq_poll_complete(struct irq_poll *iop)
35400 local_irq_save(flags);
35401 __irq_poll_complete(iop);
35402 local_irq_restore(flags);
35403+ preempt_check_resched_rt();
35404 }
35405 EXPORT_SYMBOL(irq_poll_complete);
35406
35407@@ -96,6 +98,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
35408 }
35409
35410 local_irq_enable();
35411+ preempt_check_resched_rt();
35412
35413 /* Even though interrupts have been re-enabled, this
35414 * access is safe because interrupts can only add new
35415@@ -133,6 +136,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
35416 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
35417
35418 local_irq_enable();
35419+ preempt_check_resched_rt();
35420 }
35421
35422 /**
35423@@ -196,6 +200,7 @@ static int irq_poll_cpu_dead(unsigned int cpu)
35424 this_cpu_ptr(&blk_cpu_iopoll));
35425 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
35426 local_irq_enable();
35427+ preempt_check_resched_rt();
35428
35429 return 0;
35430 }
35431diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
35432index b5c1293ce147..075e225f4111 100644
35433--- a/lib/locking-selftest.c
35434+++ b/lib/locking-selftest.c
35435@@ -742,6 +742,8 @@ GENERATE_TESTCASE(init_held_rtmutex);
35436 #include "locking-selftest-spin-hardirq.h"
35437 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
35438
35439+#ifndef CONFIG_PREEMPT_RT_FULL
35440+
35441 #include "locking-selftest-rlock-hardirq.h"
35442 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
35443
35444@@ -757,9 +759,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
35445 #include "locking-selftest-wlock-softirq.h"
35446 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
35447
35448+#endif
35449+
35450 #undef E1
35451 #undef E2
35452
35453+#ifndef CONFIG_PREEMPT_RT_FULL
35454 /*
35455 * Enabling hardirqs with a softirq-safe lock held:
35456 */
35457@@ -792,6 +797,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
35458 #undef E1
35459 #undef E2
35460
35461+#endif
35462+
35463 /*
35464 * Enabling irqs with an irq-safe lock held:
35465 */
35466@@ -815,6 +822,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
35467 #include "locking-selftest-spin-hardirq.h"
35468 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
35469
35470+#ifndef CONFIG_PREEMPT_RT_FULL
35471+
35472 #include "locking-selftest-rlock-hardirq.h"
35473 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
35474
35475@@ -830,6 +839,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
35476 #include "locking-selftest-wlock-softirq.h"
35477 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
35478
35479+#endif
35480+
35481 #undef E1
35482 #undef E2
35483
35484@@ -861,6 +872,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
35485 #include "locking-selftest-spin-hardirq.h"
35486 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
35487
35488+#ifndef CONFIG_PREEMPT_RT_FULL
35489+
35490 #include "locking-selftest-rlock-hardirq.h"
35491 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
35492
35493@@ -876,6 +889,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
35494 #include "locking-selftest-wlock-softirq.h"
35495 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
35496
35497+#endif
35498+
35499 #undef E1
35500 #undef E2
35501 #undef E3
35502@@ -909,6 +924,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
35503 #include "locking-selftest-spin-hardirq.h"
35504 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
35505
35506+#ifndef CONFIG_PREEMPT_RT_FULL
35507+
35508 #include "locking-selftest-rlock-hardirq.h"
35509 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
35510
35511@@ -924,10 +941,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
35512 #include "locking-selftest-wlock-softirq.h"
35513 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
35514
35515+#endif
35516+
35517 #undef E1
35518 #undef E2
35519 #undef E3
35520
35521+#ifndef CONFIG_PREEMPT_RT_FULL
35522+
35523 /*
35524 * read-lock / write-lock irq inversion.
35525 *
35526@@ -990,6 +1011,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
35527 #undef E2
35528 #undef E3
35529
35530+#endif
35531+
35532+#ifndef CONFIG_PREEMPT_RT_FULL
35533+
35534 /*
35535 * read-lock / write-lock recursion that is actually safe.
35536 */
35537@@ -1028,6 +1053,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
35538 #undef E2
35539 #undef E3
35540
35541+#endif
35542+
35543 /*
35544 * read-lock / write-lock recursion that is unsafe.
35545 */
35546@@ -2057,6 +2084,7 @@ void locking_selftest(void)
35547
35548 printk(" --------------------------------------------------------------------------\n");
35549
35550+#ifndef CONFIG_PREEMPT_RT_FULL
35551 /*
35552 * irq-context testcases:
35553 */
35554@@ -2069,6 +2097,28 @@ void locking_selftest(void)
35555
35556 DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
35557 // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
35558+#else
35559+ /* On -rt, we only do hardirq context test for raw spinlock */
35560+ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
35561+ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
35562+
35563+ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
35564+ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
35565+
35566+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
35567+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
35568+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
35569+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
35570+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
35571+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
35572+
35573+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
35574+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
35575+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
35576+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
35577+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
35578+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
35579+#endif
35580
35581 ww_tests();
35582
35583diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
35584index 6016f1deb1f5..cdd43086b55b 100644
35585--- a/lib/percpu_ida.c
35586+++ b/lib/percpu_ida.c
35587@@ -27,6 +27,9 @@
35588 #include <linux/string.h>
35589 #include <linux/spinlock.h>
35590 #include <linux/percpu_ida.h>
35591+#include <linux/locallock.h>
35592+
35593+static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
35594
35595 struct percpu_ida_cpu {
35596 /*
35597@@ -149,13 +152,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
35598 unsigned long flags;
35599 int tag;
35600
35601- local_irq_save(flags);
35602+ local_lock_irqsave(irq_off_lock, flags);
35603 tags = this_cpu_ptr(pool->tag_cpu);
35604
35605 /* Fastpath */
35606 tag = alloc_local_tag(tags);
35607 if (likely(tag >= 0)) {
35608- local_irq_restore(flags);
35609+ local_unlock_irqrestore(irq_off_lock, flags);
35610 return tag;
35611 }
35612
35613@@ -174,6 +177,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
35614
35615 if (!tags->nr_free)
35616 alloc_global_tags(pool, tags);
35617+
35618 if (!tags->nr_free)
35619 steal_tags(pool, tags);
35620
35621@@ -185,7 +189,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
35622 }
35623
35624 spin_unlock(&pool->lock);
35625- local_irq_restore(flags);
35626+ local_unlock_irqrestore(irq_off_lock, flags);
35627
35628 if (tag >= 0 || state == TASK_RUNNING)
35629 break;
35630@@ -197,7 +201,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
35631
35632 schedule();
35633
35634- local_irq_save(flags);
35635+ local_lock_irqsave(irq_off_lock, flags);
35636 tags = this_cpu_ptr(pool->tag_cpu);
35637 }
35638 if (state != TASK_RUNNING)
35639@@ -222,7 +226,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
35640
35641 BUG_ON(tag >= pool->nr_tags);
35642
35643- local_irq_save(flags);
35644+ local_lock_irqsave(irq_off_lock, flags);
35645 tags = this_cpu_ptr(pool->tag_cpu);
35646
35647 spin_lock(&tags->lock);
35648@@ -254,7 +258,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
35649 spin_unlock(&pool->lock);
35650 }
35651
35652- local_irq_restore(flags);
35653+ local_unlock_irqrestore(irq_off_lock, flags);
35654 }
35655 EXPORT_SYMBOL_GPL(percpu_ida_free);
35656
35657@@ -346,7 +350,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
35658 struct percpu_ida_cpu *remote;
35659 unsigned cpu, i, err = 0;
35660
35661- local_irq_save(flags);
35662+ local_lock_irqsave(irq_off_lock, flags);
35663 for_each_possible_cpu(cpu) {
35664 remote = per_cpu_ptr(pool->tag_cpu, cpu);
35665 spin_lock(&remote->lock);
35666@@ -368,7 +372,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
35667 }
35668 spin_unlock(&pool->lock);
35669 out:
35670- local_irq_restore(flags);
35671+ local_unlock_irqrestore(irq_off_lock, flags);
35672 return err;
35673 }
35674 EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
35675diff --git a/lib/radix-tree.c b/lib/radix-tree.c
35676index d172f0341b80..c1da1109a107 100644
35677--- a/lib/radix-tree.c
35678+++ b/lib/radix-tree.c
35679@@ -37,7 +37,7 @@
35680 #include <linux/rcupdate.h>
35681 #include <linux/slab.h>
35682 #include <linux/string.h>
35683-
35684+#include <linux/locallock.h>
35685
35686 /* Number of nodes in fully populated tree of given height */
35687 static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
35688@@ -86,6 +86,7 @@ struct radix_tree_preload {
35689 struct radix_tree_node *nodes;
35690 };
35691 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
35692+static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
35693
35694 static inline struct radix_tree_node *entry_to_node(void *ptr)
35695 {
35696@@ -404,12 +405,13 @@ radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
35697 * succeed in getting a node here (and never reach
35698 * kmem_cache_alloc)
35699 */
35700- rtp = this_cpu_ptr(&radix_tree_preloads);
35701+ rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
35702 if (rtp->nr) {
35703 ret = rtp->nodes;
35704 rtp->nodes = ret->parent;
35705 rtp->nr--;
35706 }
35707+ put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
35708 /*
35709 * Update the allocation stack trace as this is more useful
35710 * for debugging.
35711@@ -475,14 +477,14 @@ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
35712 */
35713 gfp_mask &= ~__GFP_ACCOUNT;
35714
35715- preempt_disable();
35716+ local_lock(radix_tree_preloads_lock);
35717 rtp = this_cpu_ptr(&radix_tree_preloads);
35718 while (rtp->nr < nr) {
35719- preempt_enable();
35720+ local_unlock(radix_tree_preloads_lock);
35721 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
35722 if (node == NULL)
35723 goto out;
35724- preempt_disable();
35725+ local_lock(radix_tree_preloads_lock);
35726 rtp = this_cpu_ptr(&radix_tree_preloads);
35727 if (rtp->nr < nr) {
35728 node->parent = rtp->nodes;
35729@@ -524,7 +526,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
35730 if (gfpflags_allow_blocking(gfp_mask))
35731 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
35732 /* Preloading doesn't help anything with this gfp mask, skip it */
35733- preempt_disable();
35734+ local_lock(radix_tree_preloads_lock);
35735 return 0;
35736 }
35737 EXPORT_SYMBOL(radix_tree_maybe_preload);
35738@@ -562,7 +564,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
35739
35740 /* Preloading doesn't help anything with this gfp mask, skip it */
35741 if (!gfpflags_allow_blocking(gfp_mask)) {
35742- preempt_disable();
35743+ local_lock(radix_tree_preloads_lock);
35744 return 0;
35745 }
35746
35747@@ -596,6 +598,12 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
35748 return __radix_tree_preload(gfp_mask, nr_nodes);
35749 }
35750
35751+void radix_tree_preload_end(void)
35752+{
35753+ local_unlock(radix_tree_preloads_lock);
35754+}
35755+EXPORT_SYMBOL(radix_tree_preload_end);
35756+
35757 static unsigned radix_tree_load_root(const struct radix_tree_root *root,
35758 struct radix_tree_node **nodep, unsigned long *maxindex)
35759 {
35760@@ -2105,10 +2113,16 @@ EXPORT_SYMBOL(radix_tree_tagged);
35761 void idr_preload(gfp_t gfp_mask)
35762 {
35763 if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
35764- preempt_disable();
35765+ local_lock(radix_tree_preloads_lock);
35766 }
35767 EXPORT_SYMBOL(idr_preload);
35768
35769+void idr_preload_end(void)
35770+{
35771+ local_unlock(radix_tree_preloads_lock);
35772+}
35773+EXPORT_SYMBOL(idr_preload_end);
35774+
35775 /**
35776 * ida_pre_get - reserve resources for ida allocation
35777 * @ida: ida handle
35778@@ -2125,7 +2139,7 @@ int ida_pre_get(struct ida *ida, gfp_t gfp)
35779 * to return to the ida_pre_get() step.
35780 */
35781 if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE))
35782- preempt_enable();
35783+ local_unlock(radix_tree_preloads_lock);
35784
35785 if (!this_cpu_read(ida_bitmap)) {
35786 struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
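
[Editorial sketch] The radix-tree change replaces the preempt_disable()/preempt_enable() pairs that protected the per-CPU preload pool with local_lock(radix_tree_preloads_lock), and therefore moves radix_tree_preload_end() and idr_preload_end() out of line so callers release the same lock they implicitly acquired. A sketch of the caller-side pairing, assuming the usual preload/insert idiom is otherwise unchanged; example_insert() and tree_lock are illustrative:

#include <linux/radix-tree.h>
#include <linux/gfp.h>
#include <linux/spinlock.h>

static int example_insert(struct radix_tree_root *root, spinlock_t *tree_lock,
			  unsigned long index, void *item)
{
	int err;

	err = radix_tree_preload(GFP_KERNEL);	/* takes the local lock on success */
	if (err)
		return err;

	spin_lock(tree_lock);
	err = radix_tree_insert(root, index, item);
	spin_unlock(tree_lock);

	radix_tree_preload_end();		/* drops the local lock */
	return err;
}
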
35787diff --git a/lib/scatterlist.c b/lib/scatterlist.c
35788index be7b4dd6b68d..d06c15d3d186 100644
35789--- a/lib/scatterlist.c
35790+++ b/lib/scatterlist.c
35791@@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
35792 flush_kernel_dcache_page(miter->page);
35793
35794 if (miter->__flags & SG_MITER_ATOMIC) {
35795- WARN_ON_ONCE(preemptible());
35796+ WARN_ON_ONCE(!pagefault_disabled());
35797 kunmap_atomic(miter->addr);
35798 } else
35799 kunmap(miter->page);
35800diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
35801index 835cc6df2776..6f4a4ae881c8 100644
35802--- a/lib/smp_processor_id.c
35803+++ b/lib/smp_processor_id.c
35804@@ -23,7 +23,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
35805 * Kernel threads bound to a single CPU can safely use
35806 * smp_processor_id():
35807 */
35808- if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu)))
35809+ if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
35810 goto out;
35811
35812 /*
35813diff --git a/lib/timerqueue.c b/lib/timerqueue.c
35814index 4a720ed4fdaf..0d54bcbc8170 100644
35815--- a/lib/timerqueue.c
35816+++ b/lib/timerqueue.c
35817@@ -33,8 +33,9 @@
35818 * @head: head of timerqueue
35819 * @node: timer node to be added
35820 *
35821- * Adds the timer node to the timerqueue, sorted by the
35822- * node's expires value.
35823+ * Adds the timer node to the timerqueue, sorted by the node's expires
35824+ * value. Returns true if the newly added timer is the first expiring timer in
35825+ * the queue.
35826 */
35827 bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
35828 {
35829@@ -70,7 +71,8 @@ EXPORT_SYMBOL_GPL(timerqueue_add);
35830 * @head: head of timerqueue
35831 * @node: timer node to be removed
35832 *
35833- * Removes the timer node from the timerqueue.
35834+ * Removes the timer node from the timerqueue. Returns true if the queue is
35835+ * not empty after the remove.
35836 */
35837 bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
35838 {
35839diff --git a/localversion-rt b/localversion-rt
35840new file mode 100644
35841index 000000000000..8a777ac42aab
35842--- /dev/null
35843+++ b/localversion-rt
35844@@ -0,0 +1 @@
35845+-rt47
35846diff --git a/mm/Kconfig b/mm/Kconfig
35847index 59efbd3337e0..3df123c0bc3f 100644
35848--- a/mm/Kconfig
35849+++ b/mm/Kconfig
35850@@ -385,7 +385,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
35851
35852 config TRANSPARENT_HUGEPAGE
35853 bool "Transparent Hugepage Support"
35854- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
35855+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
35856 select COMPACTION
35857 select RADIX_TREE_MULTIORDER
35858 help
35859diff --git a/mm/backing-dev.c b/mm/backing-dev.c
35860index 9386c98dac12..5e9d804c37cb 100644
35861--- a/mm/backing-dev.c
35862+++ b/mm/backing-dev.c
35863@@ -470,9 +470,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
35864 {
35865 unsigned long flags;
35866
35867- local_irq_save(flags);
35868+ local_irq_save_nort(flags);
35869 if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
35870- local_irq_restore(flags);
35871+ local_irq_restore_nort(flags);
35872 return;
35873 }
35874
35875diff --git a/mm/compaction.c b/mm/compaction.c
35876index 85395dc6eb13..d6c8ed009e93 100644
35877--- a/mm/compaction.c
35878+++ b/mm/compaction.c
35879@@ -1634,10 +1634,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
35880 block_start_pfn(cc->migrate_pfn, cc->order);
35881
35882 if (cc->last_migrated_pfn < current_block_start) {
35883- cpu = get_cpu();
35884+ cpu = get_cpu_light();
35885+ local_lock_irq(swapvec_lock);
35886 lru_add_drain_cpu(cpu);
35887+ local_unlock_irq(swapvec_lock);
35888 drain_local_pages(zone);
35889- put_cpu();
35890+ put_cpu_light();
35891 /* No more flushing until we migrate again */
35892 cc->last_migrated_pfn = 0;
35893 }
35894diff --git a/mm/filemap.c b/mm/filemap.c
35895index e2e738cc08b1..c47070dae8b9 100644
35896--- a/mm/filemap.c
35897+++ b/mm/filemap.c
35898@@ -110,6 +110,7 @@
35899 * ->i_mmap_rwsem
35900 * ->tasklist_lock (memory_failure, collect_procs_ao)
35901 */
35902+DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
35903
35904 static int page_cache_tree_insert(struct address_space *mapping,
35905 struct page *page, void **shadowp)
35906@@ -133,8 +134,10 @@ static int page_cache_tree_insert(struct address_space *mapping,
35907 if (shadowp)
35908 *shadowp = p;
35909 }
35910+ local_lock(shadow_nodes_lock);
35911 __radix_tree_replace(&mapping->page_tree, node, slot, page,
35912- workingset_update_node, mapping);
35913+ __workingset_update_node, mapping);
35914+ local_unlock(shadow_nodes_lock);
35915 mapping->nrpages++;
35916 return 0;
35917 }
35918@@ -151,6 +154,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
35919 VM_BUG_ON_PAGE(PageTail(page), page);
35920 VM_BUG_ON_PAGE(nr != 1 && shadow, page);
35921
35922+ local_lock(shadow_nodes_lock);
35923 for (i = 0; i < nr; i++) {
35924 struct radix_tree_node *node;
35925 void **slot;
35926@@ -162,8 +166,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
35927
35928 radix_tree_clear_tags(&mapping->page_tree, node, slot);
35929 __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
35930- workingset_update_node, mapping);
35931+ __workingset_update_node, mapping);
35932 }
35933+ local_unlock(shadow_nodes_lock);
35934
35935 if (shadow) {
35936 mapping->nrexceptional += nr;
35937diff --git a/mm/highmem.c b/mm/highmem.c
35938index 59db3223a5d6..22aa3ddbd87b 100644
35939--- a/mm/highmem.c
35940+++ b/mm/highmem.c
35941@@ -30,10 +30,11 @@
35942 #include <linux/kgdb.h>
35943 #include <asm/tlbflush.h>
35944
35945-
35946+#ifndef CONFIG_PREEMPT_RT_FULL
35947 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
35948 DEFINE_PER_CPU(int, __kmap_atomic_idx);
35949 #endif
35950+#endif
35951
35952 /*
35953 * Virtual_count is not a pure "count".
35954@@ -108,8 +109,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
35955 unsigned long totalhigh_pages __read_mostly;
35956 EXPORT_SYMBOL(totalhigh_pages);
35957
35958-
35959+#ifndef CONFIG_PREEMPT_RT_FULL
35960 EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
35961+#endif
35962
35963 unsigned int nr_free_highpages (void)
35964 {
35965diff --git a/mm/memcontrol.c b/mm/memcontrol.c
35966index 6a9a7e1066ef..3cc297730103 100644
35967--- a/mm/memcontrol.c
35968+++ b/mm/memcontrol.c
35969@@ -69,6 +69,7 @@
35970 #include <net/sock.h>
35971 #include <net/ip.h>
35972 #include "slab.h"
35973+#include <linux/locallock.h>
35974
35975 #include <linux/uaccess.h>
35976
35977@@ -94,6 +95,8 @@ int do_swap_account __read_mostly;
35978 #define do_swap_account 0
35979 #endif
35980
35981+static DEFINE_LOCAL_IRQ_LOCK(event_lock);
35982+
35983 /* Whether legacy memory+swap accounting is active */
35984 static bool do_memsw_account(void)
35985 {
35986@@ -1831,7 +1834,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
35987 * as well as workers from this path always operate on the local
35988 * per-cpu data. CPU up doesn't touch memcg_stock at all.
35989 */
35990- curcpu = get_cpu();
35991+ curcpu = get_cpu_light();
35992 for_each_online_cpu(cpu) {
35993 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
35994 struct mem_cgroup *memcg;
35995@@ -1851,7 +1854,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
35996 }
35997 css_put(&memcg->css);
35998 }
35999- put_cpu();
36000+ put_cpu_light();
36001 mutex_unlock(&percpu_charge_mutex);
36002 }
36003
36004@@ -4631,12 +4634,12 @@ static int mem_cgroup_move_account(struct page *page,
36005
36006 ret = 0;
36007
36008- local_irq_disable();
36009+ local_lock_irq(event_lock);
36010 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
36011 memcg_check_events(to, page);
36012 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
36013 memcg_check_events(from, page);
36014- local_irq_enable();
36015+ local_unlock_irq(event_lock);
36016 out_unlock:
36017 unlock_page(page);
36018 out:
36019@@ -5579,10 +5582,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
36020
36021 commit_charge(page, memcg, lrucare);
36022
36023- local_irq_disable();
36024+ local_lock_irq(event_lock);
36025 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
36026 memcg_check_events(memcg, page);
36027- local_irq_enable();
36028+ local_unlock_irq(event_lock);
36029
36030 if (do_memsw_account() && PageSwapCache(page)) {
36031 swp_entry_t entry = { .val = page_private(page) };
36032@@ -5651,7 +5654,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
36033 memcg_oom_recover(ug->memcg);
36034 }
36035
36036- local_irq_save(flags);
36037+ local_lock_irqsave(event_lock, flags);
36038 __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
36039 __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
36040 __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
36041@@ -5659,7 +5662,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
36042 __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
36043 __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
36044 memcg_check_events(ug->memcg, ug->dummy_page);
36045- local_irq_restore(flags);
36046+ local_unlock_irqrestore(event_lock, flags);
36047
36048 if (!mem_cgroup_is_root(ug->memcg))
36049 css_put_many(&ug->memcg->css, nr_pages);
36050@@ -5822,10 +5825,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
36051
36052 commit_charge(newpage, memcg, false);
36053
36054- local_irq_save(flags);
36055+ local_lock_irqsave(event_lock, flags);
36056 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
36057 memcg_check_events(memcg, newpage);
36058- local_irq_restore(flags);
36059+ local_unlock_irqrestore(event_lock, flags);
36060 }
36061
36062 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
36063@@ -6017,6 +6020,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
36064 struct mem_cgroup *memcg, *swap_memcg;
36065 unsigned int nr_entries;
36066 unsigned short oldid;
36067+ unsigned long flags;
36068
36069 VM_BUG_ON_PAGE(PageLRU(page), page);
36070 VM_BUG_ON_PAGE(page_count(page), page);
36071@@ -6062,13 +6066,17 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
36072 * important here to have the interrupts disabled because it is the
36073 * only synchronisation we have for updating the per-CPU variables.
36074 */
36075+ local_lock_irqsave(event_lock, flags);
36076+#ifndef CONFIG_PREEMPT_RT_BASE
36077 VM_BUG_ON(!irqs_disabled());
36078+#endif
36079 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
36080 -nr_entries);
36081 memcg_check_events(memcg, page);
36082
36083 if (!mem_cgroup_is_root(memcg))
36084 css_put_many(&memcg->css, nr_entries);
36085+ local_unlock_irqrestore(event_lock, flags);
36086 }
36087
36088 /**
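
[Editorial sketch] In mm/memcontrol.c the statistics and event updates that used to run under local_irq_disable() move under a dedicated event_lock local lock, and drain_all_stock() switches to get_cpu_light()/put_cpu_light() so the per-CPU walk does not disable preemption on RT. The VM_BUG_ON(!irqs_disabled()) in mem_cgroup_swapout() is compiled out on RT because the local lock no longer hard-disables interrupts there. The converted call sites all share one shape; a condensed sketch, written as if it lived inside memcontrol.c (example_commit_stats() is an illustrative wrapper, the two callees are the static helpers already used in the hunks):

static void example_commit_stats(struct mem_cgroup *memcg, struct page *page,
				 bool compound, int nr_pages)
{
	/* serialise the per-CPU counters without an IRQ-off section on RT */
	local_lock_irq(event_lock);
	mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
	memcg_check_events(memcg, page);
	local_unlock_irq(event_lock);
}
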
36089diff --git a/mm/mmu_context.c b/mm/mmu_context.c
36090index 3e612ae748e9..d0ccc070979f 100644
36091--- a/mm/mmu_context.c
36092+++ b/mm/mmu_context.c
36093@@ -25,6 +25,7 @@ void use_mm(struct mm_struct *mm)
36094 struct task_struct *tsk = current;
36095
36096 task_lock(tsk);
36097+ preempt_disable_rt();
36098 active_mm = tsk->active_mm;
36099 if (active_mm != mm) {
36100 mmgrab(mm);
36101@@ -32,6 +33,7 @@ void use_mm(struct mm_struct *mm)
36102 }
36103 tsk->mm = mm;
36104 switch_mm(active_mm, mm, tsk);
36105+ preempt_enable_rt();
36106 task_unlock(tsk);
36107 #ifdef finish_arch_post_lock_switch
36108 finish_arch_post_lock_switch();
36109diff --git a/mm/page_alloc.c b/mm/page_alloc.c
36110index a604b5da6755..525a6f2d5144 100644
36111--- a/mm/page_alloc.c
36112+++ b/mm/page_alloc.c
36113@@ -61,6 +61,7 @@
36114 #include <linux/hugetlb.h>
36115 #include <linux/sched/rt.h>
36116 #include <linux/sched/mm.h>
36117+#include <linux/locallock.h>
36118 #include <linux/page_owner.h>
36119 #include <linux/kthread.h>
36120 #include <linux/memcontrol.h>
36121@@ -286,6 +287,18 @@ EXPORT_SYMBOL(nr_node_ids);
36122 EXPORT_SYMBOL(nr_online_nodes);
36123 #endif
36124
36125+static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
36126+
36127+#ifdef CONFIG_PREEMPT_RT_BASE
36128+# define cpu_lock_irqsave(cpu, flags) \
36129+ local_lock_irqsave_on(pa_lock, flags, cpu)
36130+# define cpu_unlock_irqrestore(cpu, flags) \
36131+ local_unlock_irqrestore_on(pa_lock, flags, cpu)
36132+#else
36133+# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
36134+# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
36135+#endif
36136+
36137 int page_group_by_mobility_disabled __read_mostly;
36138
36139 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
36140@@ -1094,7 +1107,7 @@ static bool bulkfree_pcp_prepare(struct page *page)
36141 #endif /* CONFIG_DEBUG_VM */
36142
36143 /*
36144- * Frees a number of pages from the PCP lists
36145+ * Frees a number of pages which have been collected from the pcp lists.
36146 * Assumes all pages on list are in same zone, and of same order.
36147 * count is the number of pages to free.
36148 *
36149@@ -1105,15 +1118,53 @@ static bool bulkfree_pcp_prepare(struct page *page)
36150 * pinned" detection logic.
36151 */
36152 static void free_pcppages_bulk(struct zone *zone, int count,
36153- struct per_cpu_pages *pcp)
36154+ struct list_head *list)
36155 {
36156- int migratetype = 0;
36157- int batch_free = 0;
36158 bool isolated_pageblocks;
36159+ unsigned long flags;
36160
36161- spin_lock(&zone->lock);
36162+ spin_lock_irqsave(&zone->lock, flags);
36163 isolated_pageblocks = has_isolate_pageblock(zone);
36164
36165+ while (!list_empty(list)) {
36166+ struct page *page;
36167+ int mt; /* migratetype of the to-be-freed page */
36168+
36169+ page = list_first_entry(list, struct page, lru);
36170+ /* must delete as __free_one_page list manipulates */
36171+ list_del(&page->lru);
36172+
36173+ mt = get_pcppage_migratetype(page);
36174+ /* MIGRATE_ISOLATE page should not go to pcplists */
36175+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
36176+ /* Pageblock could have been isolated meanwhile */
36177+ if (unlikely(isolated_pageblocks))
36178+ mt = get_pageblock_migratetype(page);
36179+
36180+ if (bulkfree_pcp_prepare(page))
36181+ continue;
36182+
36183+ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
36184+ trace_mm_page_pcpu_drain(page, 0, mt);
36185+ count--;
36186+ }
36187+ WARN_ON(count != 0);
36188+ spin_unlock_irqrestore(&zone->lock, flags);
36189+}
36190+
36191+/*
36192+ * Moves a number of pages from the PCP lists to free list which
36193+ * is freed outside of the locked region.
36194+ *
36195+ * Assumes all pages on list are in same zone, and of same order.
36196+ * count is the number of pages to free.
36197+ */
36198+static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
36199+ struct list_head *dst)
36200+{
36201+ int migratetype = 0;
36202+ int batch_free = 0;
36203+
36204 while (count) {
36205 struct page *page;
36206 struct list_head *list;
36207@@ -1129,7 +1180,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
36208 batch_free++;
36209 if (++migratetype == MIGRATE_PCPTYPES)
36210 migratetype = 0;
36211- list = &pcp->lists[migratetype];
36212+ list = &src->lists[migratetype];
36213 } while (list_empty(list));
36214
36215 /* This is the only non-empty list. Free them all. */
36216@@ -1137,27 +1188,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
36217 batch_free = count;
36218
36219 do {
36220- int mt; /* migratetype of the to-be-freed page */
36221-
36222 page = list_last_entry(list, struct page, lru);
36223- /* must delete as __free_one_page list manipulates */
36224 list_del(&page->lru);
36225
36226- mt = get_pcppage_migratetype(page);
36227- /* MIGRATE_ISOLATE page should not go to pcplists */
36228- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
36229- /* Pageblock could have been isolated meanwhile */
36230- if (unlikely(isolated_pageblocks))
36231- mt = get_pageblock_migratetype(page);
36232-
36233- if (bulkfree_pcp_prepare(page))
36234- continue;
36235-
36236- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
36237- trace_mm_page_pcpu_drain(page, 0, mt);
36238+ list_add(&page->lru, dst);
36239 } while (--count && --batch_free && !list_empty(list));
36240 }
36241- spin_unlock(&zone->lock);
36242 }
36243
36244 static void free_one_page(struct zone *zone,
36245@@ -1165,13 +1201,15 @@ static void free_one_page(struct zone *zone,
36246 unsigned int order,
36247 int migratetype)
36248 {
36249- spin_lock(&zone->lock);
36250+ unsigned long flags;
36251+
36252+ spin_lock_irqsave(&zone->lock, flags);
36253 if (unlikely(has_isolate_pageblock(zone) ||
36254 is_migrate_isolate(migratetype))) {
36255 migratetype = get_pfnblock_migratetype(page, pfn);
36256 }
36257 __free_one_page(page, pfn, zone, order, migratetype);
36258- spin_unlock(&zone->lock);
36259+ spin_unlock_irqrestore(&zone->lock, flags);
36260 }
36261
36262 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
36263@@ -1257,10 +1295,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
36264 return;
36265
36266 migratetype = get_pfnblock_migratetype(page, pfn);
36267- local_irq_save(flags);
36268+ local_lock_irqsave(pa_lock, flags);
36269 __count_vm_events(PGFREE, 1 << order);
36270 free_one_page(page_zone(page), page, pfn, order, migratetype);
36271- local_irq_restore(flags);
36272+ local_unlock_irqrestore(pa_lock, flags);
36273 }
36274
36275 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
36276@@ -2378,16 +2416,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
36277 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
36278 {
36279 unsigned long flags;
36280+ LIST_HEAD(dst);
36281 int to_drain, batch;
36282
36283- local_irq_save(flags);
36284+ local_lock_irqsave(pa_lock, flags);
36285 batch = READ_ONCE(pcp->batch);
36286 to_drain = min(pcp->count, batch);
36287 if (to_drain > 0) {
36288- free_pcppages_bulk(zone, to_drain, pcp);
36289+ isolate_pcp_pages(to_drain, pcp, &dst);
36290 pcp->count -= to_drain;
36291 }
36292- local_irq_restore(flags);
36293+ local_unlock_irqrestore(pa_lock, flags);
36294+ free_pcppages_bulk(zone, to_drain, &dst);
36295 }
36296 #endif
36297
36298@@ -2403,16 +2443,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
36299 unsigned long flags;
36300 struct per_cpu_pageset *pset;
36301 struct per_cpu_pages *pcp;
36302+ LIST_HEAD(dst);
36303+ int count;
36304
36305- local_irq_save(flags);
36306+ cpu_lock_irqsave(cpu, flags);
36307 pset = per_cpu_ptr(zone->pageset, cpu);
36308
36309 pcp = &pset->pcp;
36310- if (pcp->count) {
36311- free_pcppages_bulk(zone, pcp->count, pcp);
36312+ count = pcp->count;
36313+ if (count) {
36314+ isolate_pcp_pages(count, pcp, &dst);
36315 pcp->count = 0;
36316 }
36317- local_irq_restore(flags);
36318+ cpu_unlock_irqrestore(cpu, flags);
36319+ if (count)
36320+ free_pcppages_bulk(zone, count, &dst);
36321 }
36322
36323 /*
36324@@ -2447,6 +2492,7 @@ void drain_local_pages(struct zone *zone)
36325 drain_pages(cpu);
36326 }
36327
36328+#ifndef CONFIG_PREEMPT_RT_BASE
36329 static void drain_local_pages_wq(struct work_struct *work)
36330 {
36331 /*
36332@@ -2460,6 +2506,7 @@ static void drain_local_pages_wq(struct work_struct *work)
36333 drain_local_pages(NULL);
36334 preempt_enable();
36335 }
36336+#endif
36337
36338 /*
36339 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
36340@@ -2526,7 +2573,14 @@ void drain_all_pages(struct zone *zone)
36341 else
36342 cpumask_clear_cpu(cpu, &cpus_with_pcps);
36343 }
36344-
36345+#ifdef CONFIG_PREEMPT_RT_BASE
36346+ for_each_cpu(cpu, &cpus_with_pcps) {
36347+ if (zone)
36348+ drain_pages_zone(cpu, zone);
36349+ else
36350+ drain_pages(cpu);
36351+ }
36352+#else
36353 for_each_cpu(cpu, &cpus_with_pcps) {
36354 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
36355 INIT_WORK(work, drain_local_pages_wq);
36356@@ -2534,6 +2588,7 @@ void drain_all_pages(struct zone *zone)
36357 }
36358 for_each_cpu(cpu, &cpus_with_pcps)
36359 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
36360+#endif
36361
36362 mutex_unlock(&pcpu_drain_mutex);
36363 }
36364@@ -2610,7 +2665,7 @@ void free_hot_cold_page(struct page *page, bool cold)
36365
36366 migratetype = get_pfnblock_migratetype(page, pfn);
36367 set_pcppage_migratetype(page, migratetype);
36368- local_irq_save(flags);
36369+ local_lock_irqsave(pa_lock, flags);
36370 __count_vm_event(PGFREE);
36371
36372 /*
36373@@ -2636,12 +2691,17 @@ void free_hot_cold_page(struct page *page, bool cold)
36374 pcp->count++;
36375 if (pcp->count >= pcp->high) {
36376 unsigned long batch = READ_ONCE(pcp->batch);
36377- free_pcppages_bulk(zone, batch, pcp);
36378+ LIST_HEAD(dst);
36379+
36380+ isolate_pcp_pages(batch, pcp, &dst);
36381 pcp->count -= batch;
36382+ local_unlock_irqrestore(pa_lock, flags);
36383+ free_pcppages_bulk(zone, batch, &dst);
36384+ return;
36385 }
36386
36387 out:
36388- local_irq_restore(flags);
36389+ local_unlock_irqrestore(pa_lock, flags);
36390 }
36391
36392 /*
36393@@ -2789,7 +2849,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
36394 struct page *page;
36395 unsigned long flags;
36396
36397- local_irq_save(flags);
36398+ local_lock_irqsave(pa_lock, flags);
36399 pcp = &this_cpu_ptr(zone->pageset)->pcp;
36400 list = &pcp->lists[migratetype];
36401 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
36402@@ -2797,7 +2857,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
36403 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
36404 zone_statistics(preferred_zone, zone);
36405 }
36406- local_irq_restore(flags);
36407+ local_unlock_irqrestore(pa_lock, flags);
36408 return page;
36409 }
36410
36411@@ -2824,7 +2884,7 @@ struct page *rmqueue(struct zone *preferred_zone,
36412 * allocate greater than order-1 page units with __GFP_NOFAIL.
36413 */
36414 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
36415- spin_lock_irqsave(&zone->lock, flags);
36416+ local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
36417
36418 do {
36419 page = NULL;
36420@@ -2844,14 +2904,14 @@ struct page *rmqueue(struct zone *preferred_zone,
36421
36422 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
36423 zone_statistics(preferred_zone, zone);
36424- local_irq_restore(flags);
36425+ local_unlock_irqrestore(pa_lock, flags);
36426
36427 out:
36428 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
36429 return page;
36430
36431 failed:
36432- local_irq_restore(flags);
36433+ local_unlock_irqrestore(pa_lock, flags);
36434 return NULL;
36435 }
36436
36437@@ -6785,8 +6845,9 @@ void __init free_area_init(unsigned long *zones_size)
36438
36439 static int page_alloc_cpu_dead(unsigned int cpu)
36440 {
36441-
36442+ local_lock_irq_on(swapvec_lock, cpu);
36443 lru_add_drain_cpu(cpu);
36444+ local_unlock_irq_on(swapvec_lock, cpu);
36445 drain_pages(cpu);
36446
36447 /*
36448@@ -7690,7 +7751,7 @@ void zone_pcp_reset(struct zone *zone)
36449 struct per_cpu_pageset *pset;
36450
36451 /* avoid races with drain_pages() */
36452- local_irq_save(flags);
36453+ local_lock_irqsave(pa_lock, flags);
36454 if (zone->pageset != &boot_pageset) {
36455 for_each_online_cpu(cpu) {
36456 pset = per_cpu_ptr(zone->pageset, cpu);
36457@@ -7699,7 +7760,7 @@ void zone_pcp_reset(struct zone *zone)
36458 free_percpu(zone->pageset);
36459 zone->pageset = &boot_pageset;
36460 }
36461- local_irq_restore(flags);
36462+ local_unlock_irqrestore(pa_lock, flags);
36463 }
36464
36465 #ifdef CONFIG_MEMORY_HOTREMOVE
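
[Editorial sketch] The page allocator conversion does two things: every pcp-list manipulation moves from local_irq_save() to the pa_lock local lock, and free_pcppages_bulk() is split so that pages are first detached onto a private list (isolate_pcp_pages()) under pa_lock and only afterwards handed back to the buddy allocator under zone->lock via spin_lock_irqsave(). This keeps the per-CPU lock hold time short and lets the __free_one_page() loop run preemptibly on RT. A condensed restatement of drain_pages_zone() from the hunks, to make the two-phase structure explicit; drain_one_pcp() is an illustrative name:

static void drain_one_pcp(struct zone *zone, struct per_cpu_pages *pcp)
{
	unsigned long flags;
	LIST_HEAD(dst);
	int count;

	/* phase 1: detach the pages under the per-CPU lock */
	local_lock_irqsave(pa_lock, flags);
	count = pcp->count;
	if (count) {
		isolate_pcp_pages(count, pcp, &dst);
		pcp->count = 0;
	}
	local_unlock_irqrestore(pa_lock, flags);

	/* phase 2: return them to the buddy allocator under zone->lock */
	if (count)
		free_pcppages_bulk(zone, count, &dst);
}

The same split shows up in drain_zone_pages() and in the pcp->high overflow path of free_hot_cold_page() above.
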
36466diff --git a/mm/slab.h b/mm/slab.h
36467index 485d9fbb8802..f3b06c48bf39 100644
36468--- a/mm/slab.h
36469+++ b/mm/slab.h
36470@@ -451,7 +451,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
36471 * The slab lists for all objects.
36472 */
36473 struct kmem_cache_node {
36474+#ifdef CONFIG_SLUB
36475+ raw_spinlock_t list_lock;
36476+#else
36477 spinlock_t list_lock;
36478+#endif
36479
36480 #ifdef CONFIG_SLAB
36481 struct list_head slabs_partial; /* partial list first, better asm code */
36482diff --git a/mm/slub.c b/mm/slub.c
36483index 220d42e592ef..9b337c28dd1f 100644
36484--- a/mm/slub.c
36485+++ b/mm/slub.c
36486@@ -1179,7 +1179,7 @@ static noinline int free_debug_processing(
36487 unsigned long uninitialized_var(flags);
36488 int ret = 0;
36489
36490- spin_lock_irqsave(&n->list_lock, flags);
36491+ raw_spin_lock_irqsave(&n->list_lock, flags);
36492 slab_lock(page);
36493
36494 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
36495@@ -1214,7 +1214,7 @@ static noinline int free_debug_processing(
36496 bulk_cnt, cnt);
36497
36498 slab_unlock(page);
36499- spin_unlock_irqrestore(&n->list_lock, flags);
36500+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36501 if (!ret)
36502 slab_fix(s, "Object at 0x%p not freed", object);
36503 return ret;
36504@@ -1342,6 +1342,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
36505
36506 #endif /* CONFIG_SLUB_DEBUG */
36507
36508+struct slub_free_list {
36509+ raw_spinlock_t lock;
36510+ struct list_head list;
36511+};
36512+static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
36513+
36514 /*
36515 * Hooks for other subsystems that check memory allocations. In a typical
36516 * production configuration these hooks all should produce no code at all.
36517@@ -1561,10 +1567,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
36518 void *start, *p;
36519 int idx, order;
36520 bool shuffle;
36521+ bool enableirqs = false;
36522
36523 flags &= gfp_allowed_mask;
36524
36525 if (gfpflags_allow_blocking(flags))
36526+ enableirqs = true;
36527+#ifdef CONFIG_PREEMPT_RT_FULL
36528+ if (system_state > SYSTEM_BOOTING)
36529+ enableirqs = true;
36530+#endif
36531+ if (enableirqs)
36532 local_irq_enable();
36533
36534 flags |= s->allocflags;
36535@@ -1623,7 +1636,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
36536 page->frozen = 1;
36537
36538 out:
36539- if (gfpflags_allow_blocking(flags))
36540+ if (enableirqs)
36541 local_irq_disable();
36542 if (!page)
36543 return NULL;
36544@@ -1681,6 +1694,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
36545 __free_pages(page, order);
36546 }
36547
36548+static void free_delayed(struct list_head *h)
36549+{
36550+ while(!list_empty(h)) {
36551+ struct page *page = list_first_entry(h, struct page, lru);
36552+
36553+ list_del(&page->lru);
36554+ __free_slab(page->slab_cache, page);
36555+ }
36556+}
36557+
36558 #define need_reserve_slab_rcu \
36559 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
36560
36561@@ -1712,6 +1735,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
36562 }
36563
36564 call_rcu(head, rcu_free_slab);
36565+ } else if (irqs_disabled()) {
36566+ struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
36567+
36568+ raw_spin_lock(&f->lock);
36569+ list_add(&page->lru, &f->list);
36570+ raw_spin_unlock(&f->lock);
36571 } else
36572 __free_slab(s, page);
36573 }
36574@@ -1819,7 +1848,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
36575 if (!n || !n->nr_partial)
36576 return NULL;
36577
36578- spin_lock(&n->list_lock);
36579+ raw_spin_lock(&n->list_lock);
36580 list_for_each_entry_safe(page, page2, &n->partial, lru) {
36581 void *t;
36582
36583@@ -1844,7 +1873,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
36584 break;
36585
36586 }
36587- spin_unlock(&n->list_lock);
36588+ raw_spin_unlock(&n->list_lock);
36589 return object;
36590 }
36591
36592@@ -2090,7 +2119,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
36593 * that acquire_slab() will see a slab page that
36594 * is frozen
36595 */
36596- spin_lock(&n->list_lock);
36597+ raw_spin_lock(&n->list_lock);
36598 }
36599 } else {
36600 m = M_FULL;
36601@@ -2101,7 +2130,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
36602 * slabs from diagnostic functions will not see
36603 * any frozen slabs.
36604 */
36605- spin_lock(&n->list_lock);
36606+ raw_spin_lock(&n->list_lock);
36607 }
36608 }
36609
36610@@ -2136,7 +2165,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
36611 goto redo;
36612
36613 if (lock)
36614- spin_unlock(&n->list_lock);
36615+ raw_spin_unlock(&n->list_lock);
36616
36617 if (m == M_FREE) {
36618 stat(s, DEACTIVATE_EMPTY);
36619@@ -2171,10 +2200,10 @@ static void unfreeze_partials(struct kmem_cache *s,
36620 n2 = get_node(s, page_to_nid(page));
36621 if (n != n2) {
36622 if (n)
36623- spin_unlock(&n->list_lock);
36624+ raw_spin_unlock(&n->list_lock);
36625
36626 n = n2;
36627- spin_lock(&n->list_lock);
36628+ raw_spin_lock(&n->list_lock);
36629 }
36630
36631 do {
36632@@ -2203,7 +2232,7 @@ static void unfreeze_partials(struct kmem_cache *s,
36633 }
36634
36635 if (n)
36636- spin_unlock(&n->list_lock);
36637+ raw_spin_unlock(&n->list_lock);
36638
36639 while (discard_page) {
36640 page = discard_page;
36641@@ -2242,14 +2271,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
36642 pobjects = oldpage->pobjects;
36643 pages = oldpage->pages;
36644 if (drain && pobjects > s->cpu_partial) {
36645+ struct slub_free_list *f;
36646 unsigned long flags;
36647+ LIST_HEAD(tofree);
36648 /*
36649 * partial array is full. Move the existing
36650 * set to the per node partial list.
36651 */
36652 local_irq_save(flags);
36653 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
36654+ f = this_cpu_ptr(&slub_free_list);
36655+ raw_spin_lock(&f->lock);
36656+ list_splice_init(&f->list, &tofree);
36657+ raw_spin_unlock(&f->lock);
36658 local_irq_restore(flags);
36659+ free_delayed(&tofree);
36660 oldpage = NULL;
36661 pobjects = 0;
36662 pages = 0;
36663@@ -2319,7 +2355,22 @@ static bool has_cpu_slab(int cpu, void *info)
36664
36665 static void flush_all(struct kmem_cache *s)
36666 {
36667+ LIST_HEAD(tofree);
36668+ int cpu;
36669+
36670 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
36671+ for_each_online_cpu(cpu) {
36672+ struct slub_free_list *f;
36673+
36674+ if (!has_cpu_slab(cpu, s))
36675+ continue;
36676+
36677+ f = &per_cpu(slub_free_list, cpu);
36678+ raw_spin_lock_irq(&f->lock);
36679+ list_splice_init(&f->list, &tofree);
36680+ raw_spin_unlock_irq(&f->lock);
36681+ free_delayed(&tofree);
36682+ }
36683 }
36684
36685 /*
36686@@ -2374,10 +2425,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
36687 unsigned long x = 0;
36688 struct page *page;
36689
36690- spin_lock_irqsave(&n->list_lock, flags);
36691+ raw_spin_lock_irqsave(&n->list_lock, flags);
36692 list_for_each_entry(page, &n->partial, lru)
36693 x += get_count(page);
36694- spin_unlock_irqrestore(&n->list_lock, flags);
36695+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36696 return x;
36697 }
36698 #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
36699@@ -2515,8 +2566,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
36700 * already disabled (which is the case for bulk allocation).
36701 */
36702 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36703- unsigned long addr, struct kmem_cache_cpu *c)
36704+ unsigned long addr, struct kmem_cache_cpu *c,
36705+ struct list_head *to_free)
36706 {
36707+ struct slub_free_list *f;
36708 void *freelist;
36709 struct page *page;
36710
36711@@ -2572,6 +2625,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36712 VM_BUG_ON(!c->page->frozen);
36713 c->freelist = get_freepointer(s, freelist);
36714 c->tid = next_tid(c->tid);
36715+
36716+out:
36717+ f = this_cpu_ptr(&slub_free_list);
36718+ raw_spin_lock(&f->lock);
36719+ list_splice_init(&f->list, to_free);
36720+ raw_spin_unlock(&f->lock);
36721+
36722 return freelist;
36723
36724 new_slab:
36725@@ -2587,7 +2647,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36726
36727 if (unlikely(!freelist)) {
36728 slab_out_of_memory(s, gfpflags, node);
36729- return NULL;
36730+ goto out;
36731 }
36732
36733 page = c->page;
36734@@ -2600,7 +2660,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36735 goto new_slab; /* Slab failed checks. Next slab needed */
36736
36737 deactivate_slab(s, page, get_freepointer(s, freelist), c);
36738- return freelist;
36739+ goto out;
36740 }
36741
36742 /*
36743@@ -2612,6 +2672,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36744 {
36745 void *p;
36746 unsigned long flags;
36747+ LIST_HEAD(tofree);
36748
36749 local_irq_save(flags);
36750 #ifdef CONFIG_PREEMPT
36751@@ -2623,8 +2684,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36752 c = this_cpu_ptr(s->cpu_slab);
36753 #endif
36754
36755- p = ___slab_alloc(s, gfpflags, node, addr, c);
36756+ p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
36757 local_irq_restore(flags);
36758+ free_delayed(&tofree);
36759 return p;
36760 }
36761
36762@@ -2810,7 +2872,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
36763
36764 do {
36765 if (unlikely(n)) {
36766- spin_unlock_irqrestore(&n->list_lock, flags);
36767+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36768 n = NULL;
36769 }
36770 prior = page->freelist;
36771@@ -2842,7 +2904,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
36772 * Otherwise the list_lock will synchronize with
36773 * other processors updating the list of slabs.
36774 */
36775- spin_lock_irqsave(&n->list_lock, flags);
36776+ raw_spin_lock_irqsave(&n->list_lock, flags);
36777
36778 }
36779 }
36780@@ -2884,7 +2946,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
36781 add_partial(n, page, DEACTIVATE_TO_TAIL);
36782 stat(s, FREE_ADD_PARTIAL);
36783 }
36784- spin_unlock_irqrestore(&n->list_lock, flags);
36785+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36786 return;
36787
36788 slab_empty:
36789@@ -2899,7 +2961,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
36790 remove_full(s, n, page);
36791 }
36792
36793- spin_unlock_irqrestore(&n->list_lock, flags);
36794+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36795 stat(s, FREE_SLAB);
36796 discard_slab(s, page);
36797 }
36798@@ -3104,6 +3166,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
36799 void **p)
36800 {
36801 struct kmem_cache_cpu *c;
36802+ LIST_HEAD(to_free);
36803 int i;
36804
36805 /* memcg and kmem_cache debug support */
36806@@ -3127,7 +3190,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
36807 * of re-populating per CPU c->freelist
36808 */
36809 p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
36810- _RET_IP_, c);
36811+ _RET_IP_, c, &to_free);
36812 if (unlikely(!p[i]))
36813 goto error;
36814
36815@@ -3139,6 +3202,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
36816 }
36817 c->tid = next_tid(c->tid);
36818 local_irq_enable();
36819+ free_delayed(&to_free);
36820
36821 /* Clear memory outside IRQ disabled fastpath loop */
36822 if (unlikely(flags & __GFP_ZERO)) {
36823@@ -3153,6 +3217,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
36824 return i;
36825 error:
36826 local_irq_enable();
36827+ free_delayed(&to_free);
36828 slab_post_alloc_hook(s, flags, i, p);
36829 __kmem_cache_free_bulk(s, i, p);
36830 return 0;
36831@@ -3286,7 +3351,7 @@ static void
36832 init_kmem_cache_node(struct kmem_cache_node *n)
36833 {
36834 n->nr_partial = 0;
36835- spin_lock_init(&n->list_lock);
36836+ raw_spin_lock_init(&n->list_lock);
36837 INIT_LIST_HEAD(&n->partial);
36838 #ifdef CONFIG_SLUB_DEBUG
36839 atomic_long_set(&n->nr_slabs, 0);
36840@@ -3640,6 +3705,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
36841 const char *text)
36842 {
36843 #ifdef CONFIG_SLUB_DEBUG
36844+#ifdef CONFIG_PREEMPT_RT_BASE
36845+ /* XXX move out of irq-off section */
36846+ slab_err(s, page, text, s->name);
36847+#else
36848 void *addr = page_address(page);
36849 void *p;
36850 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
36851@@ -3660,6 +3729,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
36852 slab_unlock(page);
36853 kfree(map);
36854 #endif
36855+#endif
36856 }
36857
36858 /*
36859@@ -3673,7 +3743,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
36860 struct page *page, *h;
36861
36862 BUG_ON(irqs_disabled());
36863- spin_lock_irq(&n->list_lock);
36864+ raw_spin_lock_irq(&n->list_lock);
36865 list_for_each_entry_safe(page, h, &n->partial, lru) {
36866 if (!page->inuse) {
36867 remove_partial(n, page);
36868@@ -3683,7 +3753,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
36869 "Objects remaining in %s on __kmem_cache_shutdown()");
36870 }
36871 }
36872- spin_unlock_irq(&n->list_lock);
36873+ raw_spin_unlock_irq(&n->list_lock);
36874
36875 list_for_each_entry_safe(page, h, &discard, lru)
36876 discard_slab(s, page);
36877@@ -3927,7 +3997,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
36878 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
36879 INIT_LIST_HEAD(promote + i);
36880
36881- spin_lock_irqsave(&n->list_lock, flags);
36882+ raw_spin_lock_irqsave(&n->list_lock, flags);
36883
36884 /*
36885 * Build lists of slabs to discard or promote.
36886@@ -3958,7 +4028,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
36887 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
36888 list_splice(promote + i, &n->partial);
36889
36890- spin_unlock_irqrestore(&n->list_lock, flags);
36891+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36892
36893 /* Release empty slabs */
36894 list_for_each_entry_safe(page, t, &discard, lru)
36895@@ -4171,6 +4241,12 @@ void __init kmem_cache_init(void)
36896 {
36897 static __initdata struct kmem_cache boot_kmem_cache,
36898 boot_kmem_cache_node;
36899+ int cpu;
36900+
36901+ for_each_possible_cpu(cpu) {
36902+ raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
36903+ INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
36904+ }
36905
36906 if (debug_guardpage_minorder())
36907 slub_max_order = 0;
36908@@ -4379,7 +4455,7 @@ static int validate_slab_node(struct kmem_cache *s,
36909 struct page *page;
36910 unsigned long flags;
36911
36912- spin_lock_irqsave(&n->list_lock, flags);
36913+ raw_spin_lock_irqsave(&n->list_lock, flags);
36914
36915 list_for_each_entry(page, &n->partial, lru) {
36916 validate_slab_slab(s, page, map);
36917@@ -4401,7 +4477,7 @@ static int validate_slab_node(struct kmem_cache *s,
36918 s->name, count, atomic_long_read(&n->nr_slabs));
36919
36920 out:
36921- spin_unlock_irqrestore(&n->list_lock, flags);
36922+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36923 return count;
36924 }
36925
36926@@ -4589,12 +4665,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
36927 if (!atomic_long_read(&n->nr_slabs))
36928 continue;
36929
36930- spin_lock_irqsave(&n->list_lock, flags);
36931+ raw_spin_lock_irqsave(&n->list_lock, flags);
36932 list_for_each_entry(page, &n->partial, lru)
36933 process_slab(&t, s, page, alloc, map);
36934 list_for_each_entry(page, &n->full, lru)
36935 process_slab(&t, s, page, alloc, map);
36936- spin_unlock_irqrestore(&n->list_lock, flags);
36937+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36938 }
36939
36940 for (i = 0; i < t.count; i++) {
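
[Editorial sketch] Two independent changes land in mm/slub.c: kmem_cache_node->list_lock becomes a raw_spinlock_t (the slab list walks must stay non-sleeping even on RT), and slabs that would otherwise be returned to the page allocator from an IRQ-off region are parked on a per-CPU slub_free_list and released later through free_delayed() once interrupts are enabled. A sketch of the deferred-free idea in isolation, written as if inside mm/slub.c and assuming the per-CPU list from the hunks; defer_free_page() and flush_deferred() are illustrative names:

static void defer_free_page(struct kmem_cache *s, struct page *page)
{
	struct slub_free_list *f = this_cpu_ptr(&slub_free_list);

	/* irqs are off here, so do not call into the page allocator directly */
	raw_spin_lock(&f->lock);
	list_add(&page->lru, &f->list);
	raw_spin_unlock(&f->lock);
}

static void flush_deferred(void)
{
	LIST_HEAD(tofree);
	int cpu;

	for_each_online_cpu(cpu) {
		struct slub_free_list *f = &per_cpu(slub_free_list, cpu);

		raw_spin_lock_irq(&f->lock);
		list_splice_init(&f->list, &tofree);
		raw_spin_unlock_irq(&f->lock);
	}
	free_delayed(&tofree);	/* __free_slab() with irqs enabled */
}

flush_all(), ___slab_alloc() and kmem_cache_alloc_bulk() in the hunks above drain the list at their natural "irqs back on" points in exactly this way.
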
36941diff --git a/mm/swap.c b/mm/swap.c
36942index a77d68f2c1b6..30d62efe001b 100644
36943--- a/mm/swap.c
36944+++ b/mm/swap.c
36945@@ -32,6 +32,7 @@
36946 #include <linux/memcontrol.h>
36947 #include <linux/gfp.h>
36948 #include <linux/uio.h>
36949+#include <linux/locallock.h>
36950 #include <linux/hugetlb.h>
36951 #include <linux/page_idle.h>
36952
36953@@ -50,6 +51,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
36954 #ifdef CONFIG_SMP
36955 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
36956 #endif
36957+static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
36958+DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
36959
36960 /*
36961 * This path almost never happens for VM activity - pages are normally
36962@@ -252,11 +255,11 @@ void rotate_reclaimable_page(struct page *page)
36963 unsigned long flags;
36964
36965 get_page(page);
36966- local_irq_save(flags);
36967+ local_lock_irqsave(rotate_lock, flags);
36968 pvec = this_cpu_ptr(&lru_rotate_pvecs);
36969 if (!pagevec_add(pvec, page) || PageCompound(page))
36970 pagevec_move_tail(pvec);
36971- local_irq_restore(flags);
36972+ local_unlock_irqrestore(rotate_lock, flags);
36973 }
36974 }
36975
36976@@ -306,12 +309,13 @@ void activate_page(struct page *page)
36977 {
36978 page = compound_head(page);
36979 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
36980- struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
36981+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
36982+ activate_page_pvecs);
36983
36984 get_page(page);
36985 if (!pagevec_add(pvec, page) || PageCompound(page))
36986 pagevec_lru_move_fn(pvec, __activate_page, NULL);
36987- put_cpu_var(activate_page_pvecs);
36988+ put_locked_var(swapvec_lock, activate_page_pvecs);
36989 }
36990 }
36991
36992@@ -338,7 +342,7 @@ void activate_page(struct page *page)
36993
36994 static void __lru_cache_activate_page(struct page *page)
36995 {
36996- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
36997+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
36998 int i;
36999
37000 /*
37001@@ -360,7 +364,7 @@ static void __lru_cache_activate_page(struct page *page)
37002 }
37003 }
37004
37005- put_cpu_var(lru_add_pvec);
37006+ put_locked_var(swapvec_lock, lru_add_pvec);
37007 }
37008
37009 /*
37010@@ -402,12 +406,12 @@ EXPORT_SYMBOL(mark_page_accessed);
37011
37012 static void __lru_cache_add(struct page *page)
37013 {
37014- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
37015+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
37016
37017 get_page(page);
37018 if (!pagevec_add(pvec, page) || PageCompound(page))
37019 __pagevec_lru_add(pvec);
37020- put_cpu_var(lru_add_pvec);
37021+ put_locked_var(swapvec_lock, lru_add_pvec);
37022 }
37023
37024 /**
37025@@ -613,9 +617,15 @@ void lru_add_drain_cpu(int cpu)
37026 unsigned long flags;
37027
37028 /* No harm done if a racing interrupt already did this */
37029- local_irq_save(flags);
37030+#ifdef CONFIG_PREEMPT_RT_BASE
37031+ local_lock_irqsave_on(rotate_lock, flags, cpu);
37032 pagevec_move_tail(pvec);
37033- local_irq_restore(flags);
37034+ local_unlock_irqrestore_on(rotate_lock, flags, cpu);
37035+#else
37036+ local_lock_irqsave(rotate_lock, flags);
37037+ pagevec_move_tail(pvec);
37038+ local_unlock_irqrestore(rotate_lock, flags);
37039+#endif
37040 }
37041
37042 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
37043@@ -647,11 +657,12 @@ void deactivate_file_page(struct page *page)
37044 return;
37045
37046 if (likely(get_page_unless_zero(page))) {
37047- struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
37048+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
37049+ lru_deactivate_file_pvecs);
37050
37051 if (!pagevec_add(pvec, page) || PageCompound(page))
37052 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
37053- put_cpu_var(lru_deactivate_file_pvecs);
37054+ put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
37055 }
37056 }
37057
37058@@ -666,21 +677,32 @@ void mark_page_lazyfree(struct page *page)
37059 {
37060 if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
37061 !PageSwapCache(page) && !PageUnevictable(page)) {
37062- struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
37063+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
37064+ lru_lazyfree_pvecs);
37065
37066 get_page(page);
37067 if (!pagevec_add(pvec, page) || PageCompound(page))
37068 pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
37069- put_cpu_var(lru_lazyfree_pvecs);
37070+ put_locked_var(swapvec_lock, lru_lazyfree_pvecs);
37071 }
37072 }
37073
37074 void lru_add_drain(void)
37075 {
37076- lru_add_drain_cpu(get_cpu());
37077- put_cpu();
37078+ lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
37079+ local_unlock_cpu(swapvec_lock);
37080+}
37081+
37082+#ifdef CONFIG_PREEMPT_RT_BASE
37083+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
37084+{
37085+ local_lock_on(swapvec_lock, cpu);
37086+ lru_add_drain_cpu(cpu);
37087+ local_unlock_on(swapvec_lock, cpu);
37088 }
37089
37090+#else
37091+
37092 static void lru_add_drain_per_cpu(struct work_struct *dummy)
37093 {
37094 lru_add_drain();
37095@@ -688,6 +710,16 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
37096
37097 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
37098
37099+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
37100+{
37101+ struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
37102+
37103+ INIT_WORK(work, lru_add_drain_per_cpu);
37104+ queue_work_on(cpu, mm_percpu_wq, work);
37105+ cpumask_set_cpu(cpu, has_work);
37106+}
37107+#endif
37108+
37109 void lru_add_drain_all_cpuslocked(void)
37110 {
37111 static DEFINE_MUTEX(lock);
37112@@ -705,21 +737,19 @@ void lru_add_drain_all_cpuslocked(void)
37113 cpumask_clear(&has_work);
37114
37115 for_each_online_cpu(cpu) {
37116- struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
37117
37118 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
37119 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
37120 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
37121 pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
37122- need_activate_page_drain(cpu)) {
37123- INIT_WORK(work, lru_add_drain_per_cpu);
37124- queue_work_on(cpu, mm_percpu_wq, work);
37125- cpumask_set_cpu(cpu, &has_work);
37126- }
37127+ need_activate_page_drain(cpu))
37128+ remote_lru_add_drain(cpu, &has_work);
37129 }
37130
37131+#ifndef CONFIG_PREEMPT_RT_BASE
37132 for_each_cpu(cpu, &has_work)
37133 flush_work(&per_cpu(lru_add_drain_work, cpu));
37134+#endif
37135
37136 mutex_unlock(&lock);
37137 }
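
[Editorial sketch] mm/swap.c protects its per-CPU pagevecs with two local locks: rotate_lock for the rotation path that used to be IRQ-protected, and swapvec_lock (also referenced from mm/compaction.c and mm/page_alloc.c above) for the former get_cpu_var() users. On RT, lru_add_drain_all_cpuslocked() drains remote CPUs directly by taking their locks with the _on() variants instead of queueing per-CPU work. The converted helpers all follow the get_locked_var()/put_locked_var() shape of __lru_cache_add(); a sketch with an illustrative wrapper name (lru_add_pvec and __pagevec_lru_add() are the symbols already used in the hunks):

static void example_add_to_lru(struct page *page)
{
	struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);

	get_page(page);
	if (!pagevec_add(pvec, page) || PageCompound(page))
		__pagevec_lru_add(pvec);
	put_locked_var(swapvec_lock, lru_add_pvec);
}
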
37138diff --git a/mm/truncate.c b/mm/truncate.c
37139index 2330223841fb..d0c8e6c8fef5 100644
37140--- a/mm/truncate.c
37141+++ b/mm/truncate.c
37142@@ -41,8 +41,10 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
37143 goto unlock;
37144 if (*slot != entry)
37145 goto unlock;
37146+ local_lock(shadow_nodes_lock);
37147 __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
37148- workingset_update_node, mapping);
37149+ __workingset_update_node, mapping);
37150+ local_unlock(shadow_nodes_lock);
37151 mapping->nrexceptional--;
37152 unlock:
37153 spin_unlock_irq(&mapping->tree_lock);
37154diff --git a/mm/vmalloc.c b/mm/vmalloc.c
37155index 9ff21a12ea00..95c83b291548 100644
37156--- a/mm/vmalloc.c
37157+++ b/mm/vmalloc.c
37158@@ -865,7 +865,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
37159 struct vmap_block *vb;
37160 struct vmap_area *va;
37161 unsigned long vb_idx;
37162- int node, err;
37163+ int node, err, cpu;
37164 void *vaddr;
37165
37166 node = numa_node_id();
37167@@ -908,11 +908,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
37168 BUG_ON(err);
37169 radix_tree_preload_end();
37170
37171- vbq = &get_cpu_var(vmap_block_queue);
37172+ cpu = get_cpu_light();
37173+ vbq = this_cpu_ptr(&vmap_block_queue);
37174 spin_lock(&vbq->lock);
37175 list_add_tail_rcu(&vb->free_list, &vbq->free);
37176 spin_unlock(&vbq->lock);
37177- put_cpu_var(vmap_block_queue);
37178+ put_cpu_light();
37179
37180 return vaddr;
37181 }
37182@@ -981,6 +982,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
37183 struct vmap_block *vb;
37184 void *vaddr = NULL;
37185 unsigned int order;
37186+ int cpu;
37187
37188 BUG_ON(offset_in_page(size));
37189 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
37190@@ -995,7 +997,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
37191 order = get_order(size);
37192
37193 rcu_read_lock();
37194- vbq = &get_cpu_var(vmap_block_queue);
37195+ cpu = get_cpu_light();
37196+ vbq = this_cpu_ptr(&vmap_block_queue);
37197 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
37198 unsigned long pages_off;
37199
37200@@ -1018,7 +1021,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
37201 break;
37202 }
37203
37204- put_cpu_var(vmap_block_queue);
37205+ put_cpu_light();
37206 rcu_read_unlock();
37207
37208 /* Allocate new block if nothing was found */
37209diff --git a/mm/vmstat.c b/mm/vmstat.c
37210index 527ae727d547..ae6446b054d3 100644
37211--- a/mm/vmstat.c
37212+++ b/mm/vmstat.c
37213@@ -249,6 +249,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
37214 long x;
37215 long t;
37216
37217+ preempt_disable_rt();
37218 x = delta + __this_cpu_read(*p);
37219
37220 t = __this_cpu_read(pcp->stat_threshold);
37221@@ -258,6 +259,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
37222 x = 0;
37223 }
37224 __this_cpu_write(*p, x);
37225+ preempt_enable_rt();
37226 }
37227 EXPORT_SYMBOL(__mod_zone_page_state);
37228
37229@@ -269,6 +271,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
37230 long x;
37231 long t;
37232
37233+ preempt_disable_rt();
37234 x = delta + __this_cpu_read(*p);
37235
37236 t = __this_cpu_read(pcp->stat_threshold);
37237@@ -278,6 +281,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
37238 x = 0;
37239 }
37240 __this_cpu_write(*p, x);
37241+ preempt_enable_rt();
37242 }
37243 EXPORT_SYMBOL(__mod_node_page_state);
37244
37245@@ -310,6 +314,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
37246 s8 __percpu *p = pcp->vm_stat_diff + item;
37247 s8 v, t;
37248
37249+ preempt_disable_rt();
37250 v = __this_cpu_inc_return(*p);
37251 t = __this_cpu_read(pcp->stat_threshold);
37252 if (unlikely(v > t)) {
37253@@ -318,6 +323,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
37254 zone_page_state_add(v + overstep, zone, item);
37255 __this_cpu_write(*p, -overstep);
37256 }
37257+ preempt_enable_rt();
37258 }
37259
37260 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37261@@ -326,6 +332,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37262 s8 __percpu *p = pcp->vm_node_stat_diff + item;
37263 s8 v, t;
37264
37265+ preempt_disable_rt();
37266 v = __this_cpu_inc_return(*p);
37267 t = __this_cpu_read(pcp->stat_threshold);
37268 if (unlikely(v > t)) {
37269@@ -334,6 +341,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37270 node_page_state_add(v + overstep, pgdat, item);
37271 __this_cpu_write(*p, -overstep);
37272 }
37273+ preempt_enable_rt();
37274 }
37275
37276 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
37277@@ -354,6 +362,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
37278 s8 __percpu *p = pcp->vm_stat_diff + item;
37279 s8 v, t;
37280
37281+ preempt_disable_rt();
37282 v = __this_cpu_dec_return(*p);
37283 t = __this_cpu_read(pcp->stat_threshold);
37284 if (unlikely(v < - t)) {
37285@@ -362,6 +371,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
37286 zone_page_state_add(v - overstep, zone, item);
37287 __this_cpu_write(*p, overstep);
37288 }
37289+ preempt_enable_rt();
37290 }
37291
37292 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37293@@ -370,6 +380,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37294 s8 __percpu *p = pcp->vm_node_stat_diff + item;
37295 s8 v, t;
37296
37297+ preempt_disable_rt();
37298 v = __this_cpu_dec_return(*p);
37299 t = __this_cpu_read(pcp->stat_threshold);
37300 if (unlikely(v < - t)) {
37301@@ -378,6 +389,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37302 node_page_state_add(v - overstep, pgdat, item);
37303 __this_cpu_write(*p, overstep);
37304 }
37305+ preempt_enable_rt();
37306 }
37307
37308 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
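
[Editorial sketch] The vmstat counters are updated with __this_cpu operations that assume the caller cannot be preempted between the read-modify-write steps. On mainline the callers already run with interrupts or preemption disabled; on RT that guarantee goes away, so each updater is bracketed with preempt_disable_rt()/preempt_enable_rt(), which compile to nothing unless PREEMPT_RT_BASE is set. Every converted helper has the same shape; a condensed sketch using the types already present in the hunks (example_mod_state() is an illustrative name):

static void example_mod_state(struct zone *zone, enum zone_stat_item item,
			      long delta)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long x, t;

	preempt_disable_rt();			/* no-op on !RT */
	x = delta + __this_cpu_read(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(x > t || x < -t)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	__this_cpu_write(*p, x);
	preempt_enable_rt();
}
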
37309diff --git a/mm/workingset.c b/mm/workingset.c
37310index b997c9de28f6..e252cc69a3d4 100644
37311--- a/mm/workingset.c
37312+++ b/mm/workingset.c
37313@@ -338,9 +338,10 @@ void workingset_activation(struct page *page)
37314 * point where they would still be useful.
37315 */
37316
37317-static struct list_lru shadow_nodes;
37318+static struct list_lru __shadow_nodes;
37319+DEFINE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
37320
37321-void workingset_update_node(struct radix_tree_node *node, void *private)
37322+void __workingset_update_node(struct radix_tree_node *node, void *private)
37323 {
37324 struct address_space *mapping = private;
37325
37326@@ -358,10 +359,10 @@ void workingset_update_node(struct radix_tree_node *node, void *private)
37327 */
37328 if (node->count && node->count == node->exceptional) {
37329 if (list_empty(&node->private_list))
37330- list_lru_add(&shadow_nodes, &node->private_list);
37331+ list_lru_add(&__shadow_nodes, &node->private_list);
37332 } else {
37333 if (!list_empty(&node->private_list))
37334- list_lru_del(&shadow_nodes, &node->private_list);
37335+ list_lru_del(&__shadow_nodes, &node->private_list);
37336 }
37337 }
37338
37339@@ -373,9 +374,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
37340 unsigned long cache;
37341
37342 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
37343- local_irq_disable();
37344- nodes = list_lru_shrink_count(&shadow_nodes, sc);
37345- local_irq_enable();
37346+ local_lock_irq(shadow_nodes_lock);
37347+ nodes = list_lru_shrink_count(&__shadow_nodes, sc);
37348+ local_unlock_irq(shadow_nodes_lock);
37349
37350 /*
37351 * Approximate a reasonable limit for the radix tree nodes
37352@@ -475,15 +476,15 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
37353 goto out_invalid;
37354 inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
37355 __radix_tree_delete_node(&mapping->page_tree, node,
37356- workingset_update_node, mapping);
37357+ __workingset_update_node, mapping);
37358
37359 out_invalid:
37360 spin_unlock(&mapping->tree_lock);
37361 ret = LRU_REMOVED_RETRY;
37362 out:
37363- local_irq_enable();
37364+ local_unlock_irq(shadow_nodes_lock);
37365 cond_resched();
37366- local_irq_disable();
37367+ local_lock_irq(shadow_nodes_lock);
37368 spin_lock(lru_lock);
37369 return ret;
37370 }
37371@@ -494,9 +495,9 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
37372 unsigned long ret;
37373
37374 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
37375- local_irq_disable();
37376- ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
37377- local_irq_enable();
37378+ local_lock_irq(shadow_nodes_lock);
37379+ ret = list_lru_shrink_walk(&__shadow_nodes, sc, shadow_lru_isolate, NULL);
37380+ local_unlock_irq(shadow_nodes_lock);
37381 return ret;
37382 }
37383
37384@@ -534,7 +535,7 @@ static int __init workingset_init(void)
37385 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
37386 timestamp_bits, max_order, bucket_order);
37387
37388- ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
37389+ ret = __list_lru_init(&__shadow_nodes, true, &shadow_nodes_key);
37390 if (ret)
37391 goto err;
37392 ret = register_shrinker(&workingset_shadow_shrinker);
37393@@ -542,7 +543,7 @@ static int __init workingset_init(void)
37394 goto err_list_lru;
37395 return 0;
37396 err_list_lru:
37397- list_lru_destroy(&shadow_nodes);
37398+ list_lru_destroy(&__shadow_nodes);
37399 err:
37400 return ret;
37401 }
37402diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
37403index 685049a9048d..8d1489fd1dbc 100644
37404--- a/mm/zsmalloc.c
37405+++ b/mm/zsmalloc.c
37406@@ -53,6 +53,7 @@
37407 #include <linux/mount.h>
37408 #include <linux/migrate.h>
37409 #include <linux/pagemap.h>
37410+#include <linux/locallock.h>
37411
37412 #define ZSPAGE_MAGIC 0x58
37413
37414@@ -70,9 +71,22 @@
37415 */
37416 #define ZS_MAX_ZSPAGE_ORDER 2
37417 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
37418-
37419 #define ZS_HANDLE_SIZE (sizeof(unsigned long))
37420
37421+#ifdef CONFIG_PREEMPT_RT_FULL
37422+
37423+struct zsmalloc_handle {
37424+ unsigned long addr;
37425+ struct mutex lock;
37426+};
37427+
37428+#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
37429+
37430+#else
37431+
37432+#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
37433+#endif
37434+
37435 /*
37436 * Object location (<PFN>, <obj_idx>) is encoded as
37437 * as single (unsigned long) handle value.
37438@@ -320,7 +334,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
37439
37440 static int create_cache(struct zs_pool *pool)
37441 {
37442- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
37443+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
37444 0, 0, NULL);
37445 if (!pool->handle_cachep)
37446 return 1;
37447@@ -344,10 +358,27 @@ static void destroy_cache(struct zs_pool *pool)
37448
37449 static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
37450 {
37451- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
37452- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
37453+ void *p;
37454+
37455+ p = kmem_cache_alloc(pool->handle_cachep,
37456+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
37457+#ifdef CONFIG_PREEMPT_RT_FULL
37458+ if (p) {
37459+ struct zsmalloc_handle *zh = p;
37460+
37461+ mutex_init(&zh->lock);
37462+ }
37463+#endif
37464+ return (unsigned long)p;
37465 }
37466
37467+#ifdef CONFIG_PREEMPT_RT_FULL
37468+static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
37469+{
37470+ return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
37471+}
37472+#endif
37473+
37474 static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
37475 {
37476 kmem_cache_free(pool->handle_cachep, (void *)handle);
37477@@ -366,12 +397,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
37478
37479 static void record_obj(unsigned long handle, unsigned long obj)
37480 {
37481+#ifdef CONFIG_PREEMPT_RT_FULL
37482+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37483+
37484+ WRITE_ONCE(zh->addr, obj);
37485+#else
37486 /*
37487 * lsb of @obj represents handle lock while other bits
37488 * represent object value the handle is pointing so
37489 * updating shouldn't do store tearing.
37490 */
37491 WRITE_ONCE(*(unsigned long *)handle, obj);
37492+#endif
37493 }
37494
37495 /* zpool driver */
37496@@ -460,6 +497,7 @@ MODULE_ALIAS("zpool-zsmalloc");
37497
37498 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
37499 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
37500+static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
37501
37502 static bool is_zspage_isolated(struct zspage *zspage)
37503 {
37504@@ -898,7 +936,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
37505
37506 static unsigned long handle_to_obj(unsigned long handle)
37507 {
37508+#ifdef CONFIG_PREEMPT_RT_FULL
37509+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37510+
37511+ return zh->addr;
37512+#else
37513 return *(unsigned long *)handle;
37514+#endif
37515 }
37516
37517 static unsigned long obj_to_head(struct page *page, void *obj)
37518@@ -912,22 +956,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
37519
37520 static inline int testpin_tag(unsigned long handle)
37521 {
37522+#ifdef CONFIG_PREEMPT_RT_FULL
37523+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37524+
37525+ return mutex_is_locked(&zh->lock);
37526+#else
37527 return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
37528+#endif
37529 }
37530
37531 static inline int trypin_tag(unsigned long handle)
37532 {
37533+#ifdef CONFIG_PREEMPT_RT_FULL
37534+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37535+
37536+ return mutex_trylock(&zh->lock);
37537+#else
37538 return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
37539+#endif
37540 }
37541
37542 static void pin_tag(unsigned long handle)
37543 {
37544+#ifdef CONFIG_PREEMPT_RT_FULL
37545+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37546+
37547+ return mutex_lock(&zh->lock);
37548+#else
37549 bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
37550+#endif
37551 }
37552
37553 static void unpin_tag(unsigned long handle)
37554 {
37555+#ifdef CONFIG_PREEMPT_RT_FULL
37556+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37557+
37558+ return mutex_unlock(&zh->lock);
37559+#else
37560 bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
37561+#endif
37562 }
37563
37564 static void reset_page(struct page *page)
37565@@ -1365,7 +1433,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
37566 class = pool->size_class[class_idx];
37567 off = (class->size * obj_idx) & ~PAGE_MASK;
37568
37569- area = &get_cpu_var(zs_map_area);
37570+ area = &get_locked_var(zs_map_area_lock, zs_map_area);
37571 area->vm_mm = mm;
37572 if (off + class->size <= PAGE_SIZE) {
37573 /* this object is contained entirely within a page */
37574@@ -1419,7 +1487,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
37575
37576 __zs_unmap_object(area, pages, off, class->size);
37577 }
37578- put_cpu_var(zs_map_area);
37579+ put_locked_var(zs_map_area_lock, zs_map_area);
37580
37581 migrate_read_unlock(zspage);
37582 unpin_tag(handle);
37583diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
37584index c10bdf63eae7..84a49f2bcfbc 100644
37585--- a/net/9p/trans_xen.c
37586+++ b/net/9p/trans_xen.c
37587@@ -38,7 +38,6 @@
37588
37589 #include <linux/module.h>
37590 #include <linux/spinlock.h>
37591-#include <linux/rwlock.h>
37592 #include <net/9p/9p.h>
37593 #include <net/9p/client.h>
37594 #include <net/9p/transport.h>
37595diff --git a/net/Kconfig b/net/Kconfig
37596index 9dba2715919d..9c7b38379c09 100644
37597--- a/net/Kconfig
37598+++ b/net/Kconfig
37599@@ -272,7 +272,7 @@ config CGROUP_NET_CLASSID
37600
37601 config NET_RX_BUSY_POLL
37602 bool
37603- default y
37604+ default y if !PREEMPT_RT_FULL
37605
37606 config BQL
37607 bool
37608diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
37609index 65d734c165bd..923e9a271872 100644
37610--- a/net/bluetooth/hci_sock.c
37611+++ b/net/bluetooth/hci_sock.c
37612@@ -251,15 +251,13 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
37613 }
37614
37615 /* Send frame to sockets with specific channel */
37616-void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37617- int flag, struct sock *skip_sk)
37618+static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37619+ int flag, struct sock *skip_sk)
37620 {
37621 struct sock *sk;
37622
37623 BT_DBG("channel %u len %d", channel, skb->len);
37624
37625- read_lock(&hci_sk_list.lock);
37626-
37627 sk_for_each(sk, &hci_sk_list.head) {
37628 struct sk_buff *nskb;
37629
37630@@ -285,6 +283,13 @@ void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37631 kfree_skb(nskb);
37632 }
37633
37634+}
37635+
37636+void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37637+ int flag, struct sock *skip_sk)
37638+{
37639+ read_lock(&hci_sk_list.lock);
37640+ __hci_send_to_channel(channel, skb, flag, skip_sk);
37641 read_unlock(&hci_sk_list.lock);
37642 }
37643
37644@@ -388,8 +393,8 @@ void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
37645 hdr->index = index;
37646 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
37647
37648- hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
37649- HCI_SOCK_TRUSTED, NULL);
37650+ __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
37651+ HCI_SOCK_TRUSTED, NULL);
37652 kfree_skb(skb);
37653 }
37654
37655diff --git a/net/can/bcm.c b/net/can/bcm.c
37656index 13690334efa3..9cc67ac257f1 100644
37657--- a/net/can/bcm.c
37658+++ b/net/can/bcm.c
37659@@ -102,7 +102,6 @@ struct bcm_op {
37660 unsigned long frames_abs, frames_filtered;
37661 struct bcm_timeval ival1, ival2;
37662 struct hrtimer timer, thrtimer;
37663- struct tasklet_struct tsklet, thrtsklet;
37664 ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
37665 int rx_ifindex;
37666 int cfsiz;
37667@@ -364,25 +363,34 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
37668 }
37669 }
37670
37671-static void bcm_tx_start_timer(struct bcm_op *op)
37672+static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt)
37673 {
37674+ ktime_t ival;
37675+
37676 if (op->kt_ival1 && op->count)
37677- hrtimer_start(&op->timer,
37678- ktime_add(ktime_get(), op->kt_ival1),
37679- HRTIMER_MODE_ABS);
37680+ ival = op->kt_ival1;
37681 else if (op->kt_ival2)
37682- hrtimer_start(&op->timer,
37683- ktime_add(ktime_get(), op->kt_ival2),
37684- HRTIMER_MODE_ABS);
37685+ ival = op->kt_ival2;
37686+ else
37687+ return false;
37688+
37689+ hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival));
37690+ return true;
37691 }
37692
37693-static void bcm_tx_timeout_tsklet(unsigned long data)
37694+static void bcm_tx_start_timer(struct bcm_op *op)
37695 {
37696- struct bcm_op *op = (struct bcm_op *)data;
37697+ if (bcm_tx_set_expiry(op, &op->timer))
37698+ hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT);
37699+}
37700+
37701+/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */
37702+static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
37703+{
37704+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37705 struct bcm_msg_head msg_head;
37706
37707 if (op->kt_ival1 && (op->count > 0)) {
37708-
37709 op->count--;
37710 if (!op->count && (op->flags & TX_COUNTEVT)) {
37711
37712@@ -399,22 +407,12 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
37713 }
37714 bcm_can_tx(op);
37715
37716- } else if (op->kt_ival2)
37717+ } else if (op->kt_ival2) {
37718 bcm_can_tx(op);
37719+ }
37720
37721- bcm_tx_start_timer(op);
37722-}
37723-
37724-/*
37725- * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions
37726- */
37727-static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
37728-{
37729- struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37730-
37731- tasklet_schedule(&op->tsklet);
37732-
37733- return HRTIMER_NORESTART;
37734+ return bcm_tx_set_expiry(op, &op->timer) ?
37735+ HRTIMER_RESTART : HRTIMER_NORESTART;
37736 }
37737
37738 /*
37739@@ -480,7 +478,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op,
37740 /* do not send the saved data - only start throttle timer */
37741 hrtimer_start(&op->thrtimer,
37742 ktime_add(op->kt_lastmsg, op->kt_ival2),
37743- HRTIMER_MODE_ABS);
37744+ HRTIMER_MODE_ABS_SOFT);
37745 return;
37746 }
37747
37748@@ -539,14 +537,21 @@ static void bcm_rx_starttimer(struct bcm_op *op)
37749 return;
37750
37751 if (op->kt_ival1)
37752- hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
37753+ hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT);
37754 }
37755
37756-static void bcm_rx_timeout_tsklet(unsigned long data)
37757+/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */
37758+static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
37759 {
37760- struct bcm_op *op = (struct bcm_op *)data;
37761+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37762 struct bcm_msg_head msg_head;
37763
37764+ /* if user wants to be informed, when cyclic CAN-Messages come back */
37765+ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
37766+ /* clear received CAN frames to indicate 'nothing received' */
37767+ memset(op->last_frames, 0, op->nframes * op->cfsiz);
37768+ }
37769+
37770 /* create notification to user */
37771 msg_head.opcode = RX_TIMEOUT;
37772 msg_head.flags = op->flags;
37773@@ -557,25 +562,6 @@ static void bcm_rx_timeout_tsklet(unsigned long data)
37774 msg_head.nframes = 0;
37775
37776 bcm_send_to_user(op, &msg_head, NULL, 0);
37777-}
37778-
37779-/*
37780- * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out
37781- */
37782-static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
37783-{
37784- struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37785-
37786- /* schedule before NET_RX_SOFTIRQ */
37787- tasklet_hi_schedule(&op->tsklet);
37788-
37789- /* no restart of the timer is done here! */
37790-
37791- /* if user wants to be informed, when cyclic CAN-Messages come back */
37792- if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
37793- /* clear received CAN frames to indicate 'nothing received' */
37794- memset(op->last_frames, 0, op->nframes * op->cfsiz);
37795- }
37796
37797 return HRTIMER_NORESTART;
37798 }
37799@@ -583,14 +569,12 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
37800 /*
37801 * bcm_rx_do_flush - helper for bcm_rx_thr_flush
37802 */
37803-static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
37804- unsigned int index)
37805+static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index)
37806 {
37807 struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
37808
37809 if ((op->last_frames) && (lcf->flags & RX_THR)) {
37810- if (update)
37811- bcm_rx_changed(op, lcf);
37812+ bcm_rx_changed(op, lcf);
37813 return 1;
37814 }
37815 return 0;
37816@@ -598,11 +582,8 @@ static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
37817
37818 /*
37819 * bcm_rx_thr_flush - Check for throttled data and send it to the userspace
37820- *
37821- * update == 0 : just check if throttled data is available (any irq context)
37822- * update == 1 : check and send throttled data to userspace (soft_irq context)
37823 */
37824-static int bcm_rx_thr_flush(struct bcm_op *op, int update)
37825+static int bcm_rx_thr_flush(struct bcm_op *op)
37826 {
37827 int updated = 0;
37828
37829@@ -611,24 +592,16 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update)
37830
37831 /* for MUX filter we start at index 1 */
37832 for (i = 1; i < op->nframes; i++)
37833- updated += bcm_rx_do_flush(op, update, i);
37834+ updated += bcm_rx_do_flush(op, i);
37835
37836 } else {
37837 /* for RX_FILTER_ID and simple filter */
37838- updated += bcm_rx_do_flush(op, update, 0);
37839+ updated += bcm_rx_do_flush(op, 0);
37840 }
37841
37842 return updated;
37843 }
37844
37845-static void bcm_rx_thr_tsklet(unsigned long data)
37846-{
37847- struct bcm_op *op = (struct bcm_op *)data;
37848-
37849- /* push the changed data to the userspace */
37850- bcm_rx_thr_flush(op, 1);
37851-}
37852-
37853 /*
37854 * bcm_rx_thr_handler - the time for blocked content updates is over now:
37855 * Check for throttled data and send it to the userspace
37856@@ -637,9 +610,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
37857 {
37858 struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);
37859
37860- tasklet_schedule(&op->thrtsklet);
37861-
37862- if (bcm_rx_thr_flush(op, 0)) {
37863+ if (bcm_rx_thr_flush(op)) {
37864 hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2);
37865 return HRTIMER_RESTART;
37866 } else {
37867@@ -735,23 +706,8 @@ static struct bcm_op *bcm_find_op(struct list_head *ops,
37868
37869 static void bcm_remove_op(struct bcm_op *op)
37870 {
37871- if (op->tsklet.func) {
37872- while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
37873- test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
37874- hrtimer_active(&op->timer)) {
37875- hrtimer_cancel(&op->timer);
37876- tasklet_kill(&op->tsklet);
37877- }
37878- }
37879-
37880- if (op->thrtsklet.func) {
37881- while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
37882- test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
37883- hrtimer_active(&op->thrtimer)) {
37884- hrtimer_cancel(&op->thrtimer);
37885- tasklet_kill(&op->thrtsklet);
37886- }
37887- }
37888+ hrtimer_cancel(&op->timer);
37889+ hrtimer_cancel(&op->thrtimer);
37890
37891 if ((op->frames) && (op->frames != &op->sframe))
37892 kfree(op->frames);
37893@@ -979,15 +935,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
37894 op->ifindex = ifindex;
37895
37896 /* initialize uninitialized (kzalloc) structure */
37897- hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37898+ hrtimer_init(&op->timer, CLOCK_MONOTONIC,
37899+ HRTIMER_MODE_REL_SOFT);
37900 op->timer.function = bcm_tx_timeout_handler;
37901
37902- /* initialize tasklet for tx countevent notification */
37903- tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet,
37904- (unsigned long) op);
37905-
37906 /* currently unused in tx_ops */
37907- hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37908+ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
37909+ HRTIMER_MODE_REL_SOFT);
37910
37911 /* add this bcm_op to the list of the tx_ops */
37912 list_add(&op->list, &bo->tx_ops);
37913@@ -1150,20 +1104,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
37914 op->rx_ifindex = ifindex;
37915
37916 /* initialize uninitialized (kzalloc) structure */
37917- hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37918+ hrtimer_init(&op->timer, CLOCK_MONOTONIC,
37919+ HRTIMER_MODE_REL_SOFT);
37920 op->timer.function = bcm_rx_timeout_handler;
37921
37922- /* initialize tasklet for rx timeout notification */
37923- tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet,
37924- (unsigned long) op);
37925-
37926- hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37927+ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
37928+ HRTIMER_MODE_REL_SOFT);
37929 op->thrtimer.function = bcm_rx_thr_handler;
37930
37931- /* initialize tasklet for rx throttle handling */
37932- tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet,
37933- (unsigned long) op);
37934-
37935 /* add this bcm_op to the list of the rx_ops */
37936 list_add(&op->list, &bo->rx_ops);
37937
37938@@ -1209,12 +1157,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
37939 */
37940 op->kt_lastmsg = 0;
37941 hrtimer_cancel(&op->thrtimer);
37942- bcm_rx_thr_flush(op, 1);
37943+ bcm_rx_thr_flush(op);
37944 }
37945
37946 if ((op->flags & STARTTIMER) && op->kt_ival1)
37947 hrtimer_start(&op->timer, op->kt_ival1,
37948- HRTIMER_MODE_REL);
37949+ HRTIMER_MODE_REL_SOFT);
37950 }
37951
37952 /* now we can register for can_ids, if we added a new bcm_op */
37953diff --git a/net/core/dev.c b/net/core/dev.c
37954index e8a66ad6d07c..fa9642bb0482 100644
37955--- a/net/core/dev.c
37956+++ b/net/core/dev.c
37957@@ -195,6 +195,7 @@ static unsigned int napi_gen_id = NR_CPUS;
37958 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
37959
37960 static seqcount_t devnet_rename_seq;
37961+static DEFINE_MUTEX(devnet_rename_mutex);
37962
37963 static inline void dev_base_seq_inc(struct net *net)
37964 {
37965@@ -217,14 +218,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
37966 static inline void rps_lock(struct softnet_data *sd)
37967 {
37968 #ifdef CONFIG_RPS
37969- spin_lock(&sd->input_pkt_queue.lock);
37970+ raw_spin_lock(&sd->input_pkt_queue.raw_lock);
37971 #endif
37972 }
37973
37974 static inline void rps_unlock(struct softnet_data *sd)
37975 {
37976 #ifdef CONFIG_RPS
37977- spin_unlock(&sd->input_pkt_queue.lock);
37978+ raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
37979 #endif
37980 }
37981
37982@@ -920,7 +921,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
37983 strcpy(name, dev->name);
37984 rcu_read_unlock();
37985 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
37986- cond_resched();
37987+ mutex_lock(&devnet_rename_mutex);
37988+ mutex_unlock(&devnet_rename_mutex);
37989 goto retry;
37990 }
37991
37992@@ -1189,20 +1191,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
37993 if (dev->flags & IFF_UP)
37994 return -EBUSY;
37995
37996- write_seqcount_begin(&devnet_rename_seq);
37997+ mutex_lock(&devnet_rename_mutex);
37998+ __raw_write_seqcount_begin(&devnet_rename_seq);
37999
38000- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
38001- write_seqcount_end(&devnet_rename_seq);
38002- return 0;
38003- }
38004+ if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
38005+ goto outunlock;
38006
38007 memcpy(oldname, dev->name, IFNAMSIZ);
38008
38009 err = dev_get_valid_name(net, dev, newname);
38010- if (err < 0) {
38011- write_seqcount_end(&devnet_rename_seq);
38012- return err;
38013- }
38014+ if (err < 0)
38015+ goto outunlock;
38016
38017 if (oldname[0] && !strchr(oldname, '%'))
38018 netdev_info(dev, "renamed from %s\n", oldname);
38019@@ -1215,11 +1214,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
38020 if (ret) {
38021 memcpy(dev->name, oldname, IFNAMSIZ);
38022 dev->name_assign_type = old_assign_type;
38023- write_seqcount_end(&devnet_rename_seq);
38024- return ret;
38025+ err = ret;
38026+ goto outunlock;
38027 }
38028
38029- write_seqcount_end(&devnet_rename_seq);
38030+ __raw_write_seqcount_end(&devnet_rename_seq);
38031+ mutex_unlock(&devnet_rename_mutex);
38032
38033 netdev_adjacent_rename_links(dev, oldname);
38034
38035@@ -1240,7 +1240,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
38036 /* err >= 0 after dev_alloc_name() or stores the first errno */
38037 if (err >= 0) {
38038 err = ret;
38039- write_seqcount_begin(&devnet_rename_seq);
38040+ mutex_lock(&devnet_rename_mutex);
38041+ __raw_write_seqcount_begin(&devnet_rename_seq);
38042 memcpy(dev->name, oldname, IFNAMSIZ);
38043 memcpy(oldname, newname, IFNAMSIZ);
38044 dev->name_assign_type = old_assign_type;
38045@@ -1253,6 +1254,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
38046 }
38047
38048 return err;
38049+
38050+outunlock:
38051+ __raw_write_seqcount_end(&devnet_rename_seq);
38052+ mutex_unlock(&devnet_rename_mutex);
38053+ return err;
38054 }
38055
38056 /**
38057@@ -2460,6 +2466,7 @@ static void __netif_reschedule(struct Qdisc *q)
38058 sd->output_queue_tailp = &q->next_sched;
38059 raise_softirq_irqoff(NET_TX_SOFTIRQ);
38060 local_irq_restore(flags);
38061+ preempt_check_resched_rt();
38062 }
38063
38064 void __netif_schedule(struct Qdisc *q)
38065@@ -2522,6 +2529,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
38066 __this_cpu_write(softnet_data.completion_queue, skb);
38067 raise_softirq_irqoff(NET_TX_SOFTIRQ);
38068 local_irq_restore(flags);
38069+ preempt_check_resched_rt();
38070 }
38071 EXPORT_SYMBOL(__dev_kfree_skb_irq);
38072
38073@@ -3197,7 +3205,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
38074 * This permits qdisc->running owner to get the lock more
38075 * often and dequeue packets faster.
38076 */
38077+#ifdef CONFIG_PREEMPT_RT_FULL
38078+ contended = true;
38079+#else
38080 contended = qdisc_is_running(q);
38081+#endif
38082 if (unlikely(contended))
38083 spin_lock(&q->busylock);
38084
38085@@ -3268,8 +3280,10 @@ static void skb_update_prio(struct sk_buff *skb)
38086 #define skb_update_prio(skb)
38087 #endif
38088
38089+#ifndef CONFIG_PREEMPT_RT_FULL
38090 DEFINE_PER_CPU(int, xmit_recursion);
38091 EXPORT_SYMBOL(xmit_recursion);
38092+#endif
38093
38094 /**
38095 * dev_loopback_xmit - loop back @skb
38096@@ -3509,9 +3523,12 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
38097 if (dev->flags & IFF_UP) {
38098 int cpu = smp_processor_id(); /* ok because BHs are off */
38099
38100+#ifdef CONFIG_PREEMPT_RT_FULL
38101+ if (txq->xmit_lock_owner != current) {
38102+#else
38103 if (txq->xmit_lock_owner != cpu) {
38104- if (unlikely(__this_cpu_read(xmit_recursion) >
38105- XMIT_RECURSION_LIMIT))
38106+#endif
38107+ if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
38108 goto recursion_alert;
38109
38110 skb = validate_xmit_skb(skb, dev);
38111@@ -3521,9 +3538,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
38112 HARD_TX_LOCK(dev, txq, cpu);
38113
38114 if (!netif_xmit_stopped(txq)) {
38115- __this_cpu_inc(xmit_recursion);
38116+ xmit_rec_inc();
38117 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
38118- __this_cpu_dec(xmit_recursion);
38119+ xmit_rec_dec();
38120 if (dev_xmit_complete(rc)) {
38121 HARD_TX_UNLOCK(dev, txq);
38122 goto out;
38123@@ -3904,6 +3921,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
38124 rps_unlock(sd);
38125
38126 local_irq_restore(flags);
38127+ preempt_check_resched_rt();
38128
38129 atomic_long_inc(&skb->dev->rx_dropped);
38130 kfree_skb(skb);
38131@@ -4056,7 +4074,7 @@ static int netif_rx_internal(struct sk_buff *skb)
38132 struct rps_dev_flow voidflow, *rflow = &voidflow;
38133 int cpu;
38134
38135- preempt_disable();
38136+ migrate_disable();
38137 rcu_read_lock();
38138
38139 cpu = get_rps_cpu(skb->dev, skb, &rflow);
38140@@ -4066,14 +4084,14 @@ static int netif_rx_internal(struct sk_buff *skb)
38141 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
38142
38143 rcu_read_unlock();
38144- preempt_enable();
38145+ migrate_enable();
38146 } else
38147 #endif
38148 {
38149 unsigned int qtail;
38150
38151- ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
38152- put_cpu();
38153+ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
38154+ put_cpu_light();
38155 }
38156 return ret;
38157 }
38158@@ -4107,11 +4125,9 @@ int netif_rx_ni(struct sk_buff *skb)
38159
38160 trace_netif_rx_ni_entry(skb);
38161
38162- preempt_disable();
38163+ local_bh_disable();
38164 err = netif_rx_internal(skb);
38165- if (local_softirq_pending())
38166- do_softirq();
38167- preempt_enable();
38168+ local_bh_enable();
38169
38170 return err;
38171 }
38172@@ -4629,7 +4645,7 @@ static void flush_backlog(struct work_struct *work)
38173 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
38174 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
38175 __skb_unlink(skb, &sd->input_pkt_queue);
38176- kfree_skb(skb);
38177+ __skb_queue_tail(&sd->tofree_queue, skb);
38178 input_queue_head_incr(sd);
38179 }
38180 }
38181@@ -4639,11 +4655,14 @@ static void flush_backlog(struct work_struct *work)
38182 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
38183 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
38184 __skb_unlink(skb, &sd->process_queue);
38185- kfree_skb(skb);
38186+ __skb_queue_tail(&sd->tofree_queue, skb);
38187 input_queue_head_incr(sd);
38188 }
38189 }
38190+ if (!skb_queue_empty(&sd->tofree_queue))
38191+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
38192 local_bh_enable();
38193+
38194 }
38195
38196 static void flush_all_backlogs(void)
38197@@ -5153,12 +5172,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
38198 sd->rps_ipi_list = NULL;
38199
38200 local_irq_enable();
38201+ preempt_check_resched_rt();
38202
38203 /* Send pending IPI's to kick RPS processing on remote cpus. */
38204 net_rps_send_ipi(remsd);
38205 } else
38206 #endif
38207 local_irq_enable();
38208+ preempt_check_resched_rt();
38209 }
38210
38211 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
38212@@ -5188,7 +5209,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
38213 while (again) {
38214 struct sk_buff *skb;
38215
38216+ local_irq_disable();
38217 while ((skb = __skb_dequeue(&sd->process_queue))) {
38218+ local_irq_enable();
38219 rcu_read_lock();
38220 __netif_receive_skb(skb);
38221 rcu_read_unlock();
38222@@ -5196,9 +5219,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
38223 if (++work >= quota)
38224 return work;
38225
38226+ local_irq_disable();
38227 }
38228
38229- local_irq_disable();
38230 rps_lock(sd);
38231 if (skb_queue_empty(&sd->input_pkt_queue)) {
38232 /*
38233@@ -5236,6 +5259,7 @@ void __napi_schedule(struct napi_struct *n)
38234 local_irq_save(flags);
38235 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
38236 local_irq_restore(flags);
38237+ preempt_check_resched_rt();
38238 }
38239 EXPORT_SYMBOL(__napi_schedule);
38240
38241@@ -5272,6 +5296,7 @@ bool napi_schedule_prep(struct napi_struct *n)
38242 }
38243 EXPORT_SYMBOL(napi_schedule_prep);
38244
38245+#ifndef CONFIG_PREEMPT_RT_FULL
38246 /**
38247 * __napi_schedule_irqoff - schedule for receive
38248 * @n: entry to schedule
38249@@ -5283,6 +5308,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
38250 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
38251 }
38252 EXPORT_SYMBOL(__napi_schedule_irqoff);
38253+#endif
38254
38255 bool napi_complete_done(struct napi_struct *n, int work_done)
38256 {
38257@@ -5637,13 +5663,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
38258 unsigned long time_limit = jiffies +
38259 usecs_to_jiffies(netdev_budget_usecs);
38260 int budget = netdev_budget;
38261+ struct sk_buff_head tofree_q;
38262+ struct sk_buff *skb;
38263 LIST_HEAD(list);
38264 LIST_HEAD(repoll);
38265
38266+ __skb_queue_head_init(&tofree_q);
38267+
38268 local_irq_disable();
38269+ skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
38270 list_splice_init(&sd->poll_list, &list);
38271 local_irq_enable();
38272
38273+ while ((skb = __skb_dequeue(&tofree_q)))
38274+ kfree_skb(skb);
38275+
38276 for (;;) {
38277 struct napi_struct *n;
38278
38279@@ -5673,7 +5707,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
38280 list_splice_tail(&repoll, &list);
38281 list_splice(&list, &sd->poll_list);
38282 if (!list_empty(&sd->poll_list))
38283- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
38284+ __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
38285
38286 net_rps_action_and_irq_enable(sd);
38287 out:
38288@@ -7502,7 +7536,7 @@ static void netdev_init_one_queue(struct net_device *dev,
38289 /* Initialize queue lock */
38290 spin_lock_init(&queue->_xmit_lock);
38291 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
38292- queue->xmit_lock_owner = -1;
38293+ netdev_queue_clear_owner(queue);
38294 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
38295 queue->dev = dev;
38296 #ifdef CONFIG_BQL
38297@@ -8442,6 +8476,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
38298
38299 raise_softirq_irqoff(NET_TX_SOFTIRQ);
38300 local_irq_enable();
38301+ preempt_check_resched_rt();
38302
38303 #ifdef CONFIG_RPS
38304 remsd = oldsd->rps_ipi_list;
38305@@ -8455,10 +8490,13 @@ static int dev_cpu_dead(unsigned int oldcpu)
38306 netif_rx_ni(skb);
38307 input_queue_head_incr(oldsd);
38308 }
38309- while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
38310+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
38311 netif_rx_ni(skb);
38312 input_queue_head_incr(oldsd);
38313 }
38314+ while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
38315+ kfree_skb(skb);
38316+ }
38317
38318 return 0;
38319 }
38320@@ -8762,8 +8800,9 @@ static int __init net_dev_init(void)
38321
38322 INIT_WORK(flush, flush_backlog);
38323
38324- skb_queue_head_init(&sd->input_pkt_queue);
38325- skb_queue_head_init(&sd->process_queue);
38326+ skb_queue_head_init_raw(&sd->input_pkt_queue);
38327+ skb_queue_head_init_raw(&sd->process_queue);
38328+ skb_queue_head_init_raw(&sd->tofree_queue);
38329 INIT_LIST_HEAD(&sd->poll_list);
38330 sd->output_queue_tailp = &sd->output_queue;
38331 #ifdef CONFIG_RPS
38332diff --git a/net/core/filter.c b/net/core/filter.c
38333index d5158a10ac8f..ad96ec78f7b8 100644
38334--- a/net/core/filter.c
38335+++ b/net/core/filter.c
38336@@ -1696,7 +1696,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
38337 {
38338 int ret;
38339
38340- if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
38341+ if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
38342 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
38343 kfree_skb(skb);
38344 return -ENETDOWN;
38345@@ -1704,9 +1704,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
38346
38347 skb->dev = dev;
38348
38349- __this_cpu_inc(xmit_recursion);
38350+ xmit_rec_inc();
38351 ret = dev_queue_xmit(skb);
38352- __this_cpu_dec(xmit_recursion);
38353+ xmit_rec_dec();
38354
38355 return ret;
38356 }
38357diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
38358index 7f980bd7426e..7250106015ef 100644
38359--- a/net/core/gen_estimator.c
38360+++ b/net/core/gen_estimator.c
38361@@ -46,7 +46,7 @@
38362 struct net_rate_estimator {
38363 struct gnet_stats_basic_packed *bstats;
38364 spinlock_t *stats_lock;
38365- seqcount_t *running;
38366+ net_seqlock_t *running;
38367 struct gnet_stats_basic_cpu __percpu *cpu_bstats;
38368 u8 ewma_log;
38369 u8 intvl_log; /* period : (250ms << intvl_log) */
38370@@ -129,7 +129,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
38371 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
38372 struct net_rate_estimator __rcu **rate_est,
38373 spinlock_t *stats_lock,
38374- seqcount_t *running,
38375+ net_seqlock_t *running,
38376 struct nlattr *opt)
38377 {
38378 struct gnet_estimator *parm = nla_data(opt);
38379@@ -222,7 +222,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
38380 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
38381 struct net_rate_estimator __rcu **rate_est,
38382 spinlock_t *stats_lock,
38383- seqcount_t *running, struct nlattr *opt)
38384+ net_seqlock_t *running, struct nlattr *opt)
38385 {
38386 return gen_new_estimator(bstats, cpu_bstats, rate_est,
38387 stats_lock, running, opt);
38388diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
38389index 441c04adedba..07f9a6a1f8e4 100644
38390--- a/net/core/gen_stats.c
38391+++ b/net/core/gen_stats.c
38392@@ -142,7 +142,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
38393 }
38394
38395 void
38396-__gnet_stats_copy_basic(const seqcount_t *running,
38397+__gnet_stats_copy_basic(net_seqlock_t *running,
38398 struct gnet_stats_basic_packed *bstats,
38399 struct gnet_stats_basic_cpu __percpu *cpu,
38400 struct gnet_stats_basic_packed *b)
38401@@ -155,10 +155,10 @@ __gnet_stats_copy_basic(const seqcount_t *running,
38402 }
38403 do {
38404 if (running)
38405- seq = read_seqcount_begin(running);
38406+ seq = net_seq_begin(running);
38407 bstats->bytes = b->bytes;
38408 bstats->packets = b->packets;
38409- } while (running && read_seqcount_retry(running, seq));
38410+ } while (running && net_seq_retry(running, seq));
38411 }
38412 EXPORT_SYMBOL(__gnet_stats_copy_basic);
38413
38414@@ -176,7 +176,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
38415 * if the room in the socket buffer was not sufficient.
38416 */
38417 int
38418-gnet_stats_copy_basic(const seqcount_t *running,
38419+gnet_stats_copy_basic(net_seqlock_t *running,
38420 struct gnet_dump *d,
38421 struct gnet_stats_basic_cpu __percpu *cpu,
38422 struct gnet_stats_basic_packed *b)
38423diff --git a/net/core/pktgen.c b/net/core/pktgen.c
38424index 6e1e10ff433a..c1ae4075e0ed 100644
38425--- a/net/core/pktgen.c
38426+++ b/net/core/pktgen.c
38427@@ -2252,7 +2252,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
38428 s64 remaining;
38429 struct hrtimer_sleeper t;
38430
38431- hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
38432+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS,
38433+ current);
38434 hrtimer_set_expires(&t.timer, spin_until);
38435
38436 remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
38437@@ -2267,7 +2268,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
38438 } while (ktime_compare(end_time, spin_until) < 0);
38439 } else {
38440 /* see do_nanosleep */
38441- hrtimer_init_sleeper(&t, current);
38442 do {
38443 set_current_state(TASK_INTERRUPTIBLE);
38444 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
38445diff --git a/net/core/skbuff.c b/net/core/skbuff.c
38446index 9f80b947f53b..c0f23b8dcfc6 100644
38447--- a/net/core/skbuff.c
38448+++ b/net/core/skbuff.c
38449@@ -63,6 +63,7 @@
38450 #include <linux/errqueue.h>
38451 #include <linux/prefetch.h>
38452 #include <linux/if_vlan.h>
38453+#include <linux/locallock.h>
38454
38455 #include <net/protocol.h>
38456 #include <net/dst.h>
38457@@ -330,6 +331,8 @@ struct napi_alloc_cache {
38458
38459 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
38460 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
38461+static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
38462+static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
38463
38464 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
38465 {
38466@@ -337,10 +340,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
38467 unsigned long flags;
38468 void *data;
38469
38470- local_irq_save(flags);
38471+ local_lock_irqsave(netdev_alloc_lock, flags);
38472 nc = this_cpu_ptr(&netdev_alloc_cache);
38473 data = page_frag_alloc(nc, fragsz, gfp_mask);
38474- local_irq_restore(flags);
38475+ local_unlock_irqrestore(netdev_alloc_lock, flags);
38476 return data;
38477 }
38478
38479@@ -359,9 +362,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
38480
38481 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
38482 {
38483- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38484+ struct napi_alloc_cache *nc;
38485+ void *data;
38486
38487- return page_frag_alloc(&nc->page, fragsz, gfp_mask);
38488+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38489+ data = page_frag_alloc(&nc->page, fragsz, gfp_mask);
38490+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38491+ return data;
38492 }
38493
38494 void *napi_alloc_frag(unsigned int fragsz)
38495@@ -408,13 +415,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
38496 if (sk_memalloc_socks())
38497 gfp_mask |= __GFP_MEMALLOC;
38498
38499- local_irq_save(flags);
38500+ local_lock_irqsave(netdev_alloc_lock, flags);
38501
38502 nc = this_cpu_ptr(&netdev_alloc_cache);
38503 data = page_frag_alloc(nc, len, gfp_mask);
38504 pfmemalloc = nc->pfmemalloc;
38505
38506- local_irq_restore(flags);
38507+ local_unlock_irqrestore(netdev_alloc_lock, flags);
38508
38509 if (unlikely(!data))
38510 return NULL;
38511@@ -455,9 +462,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
38512 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
38513 gfp_t gfp_mask)
38514 {
38515- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38516+ struct napi_alloc_cache *nc;
38517 struct sk_buff *skb;
38518 void *data;
38519+ bool pfmemalloc;
38520
38521 len += NET_SKB_PAD + NET_IP_ALIGN;
38522
38523@@ -475,7 +483,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
38524 if (sk_memalloc_socks())
38525 gfp_mask |= __GFP_MEMALLOC;
38526
38527+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38528 data = page_frag_alloc(&nc->page, len, gfp_mask);
38529+ pfmemalloc = nc->page.pfmemalloc;
38530+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38531 if (unlikely(!data))
38532 return NULL;
38533
38534@@ -486,7 +497,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
38535 }
38536
38537 /* use OR instead of assignment to avoid clearing of bits in mask */
38538- if (nc->page.pfmemalloc)
38539+ if (pfmemalloc)
38540 skb->pfmemalloc = 1;
38541 skb->head_frag = 1;
38542
38543@@ -718,23 +729,26 @@ void __consume_stateless_skb(struct sk_buff *skb)
38544
38545 void __kfree_skb_flush(void)
38546 {
38547- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38548+ struct napi_alloc_cache *nc;
38549
38550+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38551 /* flush skb_cache if containing objects */
38552 if (nc->skb_count) {
38553 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
38554 nc->skb_cache);
38555 nc->skb_count = 0;
38556 }
38557+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38558 }
38559
38560 static inline void _kfree_skb_defer(struct sk_buff *skb)
38561 {
38562- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38563+ struct napi_alloc_cache *nc;
38564
38565 /* drop skb->head and call any destructors for packet */
38566 skb_release_all(skb);
38567
38568+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38569 /* record skb to CPU local list */
38570 nc->skb_cache[nc->skb_count++] = skb;
38571
38572@@ -749,6 +763,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
38573 nc->skb_cache);
38574 nc->skb_count = 0;
38575 }
38576+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38577 }
38578 void __kfree_skb_defer(struct sk_buff *skb)
38579 {
38580diff --git a/net/core/sock.c b/net/core/sock.c
38581index 68d08ed5521e..ee242ff5d4b1 100644
38582--- a/net/core/sock.c
38583+++ b/net/core/sock.c
38584@@ -2757,12 +2757,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
38585 if (sk->sk_lock.owned)
38586 __lock_sock(sk);
38587 sk->sk_lock.owned = 1;
38588- spin_unlock(&sk->sk_lock.slock);
38589+ spin_unlock_bh(&sk->sk_lock.slock);
38590 /*
38591 * The sk_lock has mutex_lock() semantics here:
38592 */
38593 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
38594- local_bh_enable();
38595 }
38596 EXPORT_SYMBOL(lock_sock_nested);
38597
38598diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
38599index 3c1570d3e22f..0310ea93f877 100644
38600--- a/net/ipv4/icmp.c
38601+++ b/net/ipv4/icmp.c
38602@@ -77,6 +77,7 @@
38603 #include <linux/string.h>
38604 #include <linux/netfilter_ipv4.h>
38605 #include <linux/slab.h>
38606+#include <linux/locallock.h>
38607 #include <net/snmp.h>
38608 #include <net/ip.h>
38609 #include <net/route.h>
38610@@ -204,6 +205,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
38611 *
38612 * On SMP we have one ICMP socket per-cpu.
38613 */
38614+static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
38615+
38616 static struct sock *icmp_sk(struct net *net)
38617 {
38618 return *this_cpu_ptr(net->ipv4.icmp_sk);
38619@@ -214,12 +217,16 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
38620 {
38621 struct sock *sk;
38622
38623+ if (!local_trylock(icmp_sk_lock))
38624+ return NULL;
38625+
38626 sk = icmp_sk(net);
38627
38628 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
38629 /* This can happen if the output path signals a
38630 * dst_link_failure() for an outgoing ICMP packet.
38631 */
38632+ local_unlock(icmp_sk_lock);
38633 return NULL;
38634 }
38635 return sk;
38636@@ -228,6 +235,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
38637 static inline void icmp_xmit_unlock(struct sock *sk)
38638 {
38639 spin_unlock(&sk->sk_lock.slock);
38640+ local_unlock(icmp_sk_lock);
38641 }
38642
38643 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
38644diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
38645index 31b34c0c2d5f..851f241e70b5 100644
38646--- a/net/ipv4/tcp_ipv4.c
38647+++ b/net/ipv4/tcp_ipv4.c
38648@@ -62,6 +62,7 @@
38649 #include <linux/init.h>
38650 #include <linux/times.h>
38651 #include <linux/slab.h>
38652+#include <linux/locallock.h>
38653
38654 #include <net/net_namespace.h>
38655 #include <net/icmp.h>
38656@@ -580,6 +581,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
38657 }
38658 EXPORT_SYMBOL(tcp_v4_send_check);
38659
38660+static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
38661 /*
38662 * This routine will send an RST to the other tcp.
38663 *
38664@@ -710,6 +712,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
38665 arg.tos = ip_hdr(skb)->tos;
38666 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
38667 local_bh_disable();
38668+ local_lock(tcp_sk_lock);
38669 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
38670 skb, &TCP_SKB_CB(skb)->header.h4.opt,
38671 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
38672@@ -717,6 +720,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
38673
38674 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
38675 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
38676+ local_unlock(tcp_sk_lock);
38677 local_bh_enable();
38678
38679 #ifdef CONFIG_TCP_MD5SIG
38680@@ -796,12 +800,14 @@ static void tcp_v4_send_ack(const struct sock *sk,
38681 arg.tos = tos;
38682 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
38683 local_bh_disable();
38684+ local_lock(tcp_sk_lock);
38685 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
38686 skb, &TCP_SKB_CB(skb)->header.h4.opt,
38687 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
38688 &arg, arg.iov[0].iov_len);
38689
38690 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
38691+ local_unlock(tcp_sk_lock);
38692 local_bh_enable();
38693 }
38694
38695diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
38696index dddd498e1338..8f39b8162df8 100644
38697--- a/net/mac80211/rx.c
38698+++ b/net/mac80211/rx.c
38699@@ -4252,7 +4252,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
38700 struct ieee80211_supported_band *sband;
38701 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
38702
38703- WARN_ON_ONCE(softirq_count() == 0);
38704+ WARN_ON_ONCE_NONRT(softirq_count() == 0);
38705
38706 if (WARN_ON(status->band >= NUM_NL80211_BANDS))
38707 goto drop;
38708diff --git a/net/netfilter/core.c b/net/netfilter/core.c
38709index 52cd2901a097..c63e937b6676 100644
38710--- a/net/netfilter/core.c
38711+++ b/net/netfilter/core.c
38712@@ -21,6 +21,7 @@
38713 #include <linux/inetdevice.h>
38714 #include <linux/proc_fs.h>
38715 #include <linux/mutex.h>
38716+#include <linux/locallock.h>
38717 #include <linux/mm.h>
38718 #include <linux/rcupdate.h>
38719 #include <net/net_namespace.h>
38720@@ -28,6 +29,11 @@
38721
38722 #include "nf_internals.h"
38723
38724+#ifdef CONFIG_PREEMPT_RT_BASE
38725+DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
38726+EXPORT_PER_CPU_SYMBOL(xt_write_lock);
38727+#endif
38728+
38729 static DEFINE_MUTEX(afinfo_mutex);
38730
38731 const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
38732diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
38733index 8d1a7c900393..f1f56be3b061 100644
38734--- a/net/packet/af_packet.c
38735+++ b/net/packet/af_packet.c
38736@@ -63,6 +63,7 @@
38737 #include <linux/if_packet.h>
38738 #include <linux/wireless.h>
38739 #include <linux/kernel.h>
38740+#include <linux/delay.h>
38741 #include <linux/kmod.h>
38742 #include <linux/slab.h>
38743 #include <linux/vmalloc.h>
38744@@ -707,7 +708,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
38745 if (BLOCK_NUM_PKTS(pbd)) {
38746 while (atomic_read(&pkc->blk_fill_in_prog)) {
38747 /* Waiting for skb_copy_bits to finish... */
38748- cpu_relax();
38749+ cpu_chill();
38750 }
38751 }
38752
38753@@ -969,7 +970,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
38754 if (!(status & TP_STATUS_BLK_TMO)) {
38755 while (atomic_read(&pkc->blk_fill_in_prog)) {
38756 /* Waiting for skb_copy_bits to finish... */
38757- cpu_relax();
38758+ cpu_chill();
38759 }
38760 }
38761 prb_close_block(pkc, pbd, po, status);
38762diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
38763index 9a3c54e659e9..2a95f1d587ac 100644
38764--- a/net/rds/ib_rdma.c
38765+++ b/net/rds/ib_rdma.c
38766@@ -34,6 +34,7 @@
38767 #include <linux/slab.h>
38768 #include <linux/rculist.h>
38769 #include <linux/llist.h>
38770+#include <linux/delay.h>
38771
38772 #include "rds_single_path.h"
38773 #include "ib_mr.h"
38774@@ -210,7 +211,7 @@ static inline void wait_clean_list_grace(void)
38775 for_each_online_cpu(cpu) {
38776 flag = &per_cpu(clean_list_grace, cpu);
38777 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
38778- cpu_relax();
38779+ cpu_chill();
38780 }
38781 }
38782
38783diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
38784index e9f428351293..c4479afe8ae7 100644
38785--- a/net/rxrpc/security.c
38786+++ b/net/rxrpc/security.c
38787@@ -19,9 +19,6 @@
38788 #include <keys/rxrpc-type.h>
38789 #include "ar-internal.h"
38790
38791-static LIST_HEAD(rxrpc_security_methods);
38792-static DECLARE_RWSEM(rxrpc_security_sem);
38793-
38794 static const struct rxrpc_security *rxrpc_security_types[] = {
38795 [RXRPC_SECURITY_NONE] = &rxrpc_no_security,
38796 #ifdef CONFIG_RXKAD
38797diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
38798index cd69aa067543..73348ac5019f 100644
38799--- a/net/sched/sch_api.c
38800+++ b/net/sched/sch_api.c
38801@@ -1081,7 +1081,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
38802 rcu_assign_pointer(sch->stab, stab);
38803 }
38804 if (tca[TCA_RATE]) {
38805- seqcount_t *running;
38806+ net_seqlock_t *running;
38807
38808 err = -EOPNOTSUPP;
38809 if (sch->flags & TCQ_F_MQROOT)
38810diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
38811index 79549baf5804..341f7895659c 100644
38812--- a/net/sched/sch_generic.c
38813+++ b/net/sched/sch_generic.c
38814@@ -429,7 +429,11 @@ struct Qdisc noop_qdisc = {
38815 .ops = &noop_qdisc_ops,
38816 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
38817 .dev_queue = &noop_netdev_queue,
38818+#ifdef CONFIG_PREEMPT_RT_BASE
38819+ .running = __SEQLOCK_UNLOCKED(noop_qdisc.running),
38820+#else
38821 .running = SEQCNT_ZERO(noop_qdisc.running),
38822+#endif
38823 .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
38824 };
38825 EXPORT_SYMBOL(noop_qdisc);
38826@@ -628,9 +632,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
38827 lockdep_set_class(&sch->busylock,
38828 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
38829
38830+#ifdef CONFIG_PREEMPT_RT_BASE
38831+ seqlock_init(&sch->running);
38832+ lockdep_set_class(&sch->running.seqcount,
38833+ dev->qdisc_running_key ?: &qdisc_running_key);
38834+ lockdep_set_class(&sch->running.lock,
38835+ dev->qdisc_running_key ?: &qdisc_running_key);
38836+#else
38837 seqcount_init(&sch->running);
38838 lockdep_set_class(&sch->running,
38839 dev->qdisc_running_key ?: &qdisc_running_key);
38840+#endif
38841
38842 sch->ops = ops;
38843 sch->enqueue = ops->enqueue;
38844@@ -933,7 +945,7 @@ void dev_deactivate_many(struct list_head *head)
38845 /* Wait for outstanding qdisc_run calls. */
38846 list_for_each_entry(dev, head, close_list) {
38847 while (some_qdisc_is_busy(dev))
38848- yield();
38849+ msleep(1);
38850 /* The new qdisc is assigned at this point so we can safely
38851 * unwind stale skb lists and qdisc statistics
38852 */
38853diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
38854index d16a8b423c20..cedaf909eb97 100644
38855--- a/net/sunrpc/svc_xprt.c
38856+++ b/net/sunrpc/svc_xprt.c
38857@@ -396,7 +396,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
38858 goto out;
38859 }
38860
38861- cpu = get_cpu();
38862+ cpu = get_cpu_light();
38863 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
38864
38865 atomic_long_inc(&pool->sp_stats.packets);
38866@@ -432,7 +432,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
38867
38868 atomic_long_inc(&pool->sp_stats.threads_woken);
38869 wake_up_process(rqstp->rq_task);
38870- put_cpu();
38871+ put_cpu_light();
38872 goto out;
38873 }
38874 rcu_read_unlock();
38875@@ -453,7 +453,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
38876 goto redo_search;
38877 }
38878 rqstp = NULL;
38879- put_cpu();
38880+ put_cpu_light();
38881 out:
38882 trace_svc_xprt_do_enqueue(xprt, rqstp);
38883 }
38884diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
38885index 6c4ec69e11a0..77f52dc790ec 100644
38886--- a/net/xfrm/xfrm_state.c
38887+++ b/net/xfrm/xfrm_state.c
38888@@ -427,7 +427,7 @@ static void xfrm_put_mode(struct xfrm_mode *mode)
38889
38890 static void xfrm_state_gc_destroy(struct xfrm_state *x)
38891 {
38892- tasklet_hrtimer_cancel(&x->mtimer);
38893+ hrtimer_cancel(&x->mtimer);
38894 del_timer_sync(&x->rtimer);
38895 kfree(x->aead);
38896 kfree(x->aalg);
38897@@ -472,8 +472,8 @@ static void xfrm_state_gc_task(struct work_struct *work)
38898
38899 static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
38900 {
38901- struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
38902- struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
38903+ struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer);
38904+ enum hrtimer_restart ret = HRTIMER_NORESTART;
38905 unsigned long now = get_seconds();
38906 long next = LONG_MAX;
38907 int warn = 0;
38908@@ -537,7 +537,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
38909 km_state_expired(x, 0, 0);
38910 resched:
38911 if (next != LONG_MAX) {
38912- tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
38913+ hrtimer_forward_now(&x->mtimer, ktime_set(next, 0));
38914+ ret = HRTIMER_RESTART;
38915 }
38916
38917 goto out;
38918@@ -554,7 +555,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
38919
38920 out:
38921 spin_unlock(&x->lock);
38922- return HRTIMER_NORESTART;
38923+ return ret;
38924 }
38925
38926 static void xfrm_replay_timer_handler(unsigned long data);
38927@@ -573,8 +574,8 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
38928 INIT_HLIST_NODE(&x->bydst);
38929 INIT_HLIST_NODE(&x->bysrc);
38930 INIT_HLIST_NODE(&x->byspi);
38931- tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
38932- CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
38933+ hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT);
38934+ x->mtimer.function = xfrm_timer_handler;
38935 setup_timer(&x->rtimer, xfrm_replay_timer_handler,
38936 (unsigned long)x);
38937 x->curlft.add_time = get_seconds();
38938@@ -1031,7 +1032,9 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
38939 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
38940 }
38941 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
38942- tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
38943+ hrtimer_start(&x->mtimer,
38944+ ktime_set(net->xfrm.sysctl_acq_expires, 0),
38945+ HRTIMER_MODE_REL_SOFT);
38946 net->xfrm.state_num++;
38947 xfrm_hash_grow_check(net, x->bydst.next != NULL);
38948 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
38949@@ -1142,7 +1145,7 @@ static void __xfrm_state_insert(struct xfrm_state *x)
38950 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
38951 }
38952
38953- tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
38954+ hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
38955 if (x->replay_maxage)
38956 mod_timer(&x->rtimer, jiffies + x->replay_maxage);
38957
38958@@ -1246,7 +1249,9 @@ static struct xfrm_state *__find_acq_core(struct net *net,
38959 x->mark.m = m->m;
38960 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
38961 xfrm_state_hold(x);
38962- tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
38963+ hrtimer_start(&x->mtimer,
38964+ ktime_set(net->xfrm.sysctl_acq_expires, 0),
38965+ HRTIMER_MODE_REL_SOFT);
38966 list_add(&x->km.all, &net->xfrm.state_all);
38967 hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
38968 h = xfrm_src_hash(net, daddr, saddr, family);
38969@@ -1546,7 +1551,8 @@ int xfrm_state_update(struct xfrm_state *x)
38970 memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
38971 x1->km.dying = 0;
38972
38973- tasklet_hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
38974+ hrtimer_start(&x1->mtimer, ktime_set(1, 0),
38975+ HRTIMER_MODE_REL_SOFT);
38976 if (x1->curlft.use_time)
38977 xfrm_state_check_expire(x1);
38978
38979@@ -1570,7 +1576,7 @@ int xfrm_state_check_expire(struct xfrm_state *x)
38980 if (x->curlft.bytes >= x->lft.hard_byte_limit ||
38981 x->curlft.packets >= x->lft.hard_packet_limit) {
38982 x->km.state = XFRM_STATE_EXPIRED;
38983- tasklet_hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL);
38984+ hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT);
38985 return -EINVAL;
38986 }
38987
38988diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
38989index 5522692100ba..8b4be8e1802a 100644
38990--- a/samples/trace_events/trace-events-sample.c
38991+++ b/samples/trace_events/trace-events-sample.c
38992@@ -33,7 +33,7 @@ static void simple_thread_func(int cnt)
38993
38994 /* Silly tracepoints */
38995 trace_foo_bar("hello", cnt, array, random_strings[len],
38996- &current->cpus_allowed);
38997+ current->cpus_ptr);
38998
38999 trace_foo_with_template_simple("HELLO", cnt);
39000
39001diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
39002index 959199c3147e..3e68004ed345 100755
39003--- a/scripts/mkcompile_h
39004+++ b/scripts/mkcompile_h
39005@@ -5,7 +5,8 @@ TARGET=$1
39006 ARCH=$2
39007 SMP=$3
39008 PREEMPT=$4
39009-CC=$5
39010+RT=$5
39011+CC=$6
39012
39013 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
39014
39015@@ -58,6 +59,7 @@ UTS_VERSION="#$VERSION"
39016 CONFIG_FLAGS=""
39017 if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
39018 if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
39019+if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
39020 UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
39021
39022 # Truncate to maximum length
39023diff --git a/security/apparmor/include/path.h b/security/apparmor/include/path.h
39024index 05fb3305671e..b26c16b02662 100644
39025--- a/security/apparmor/include/path.h
39026+++ b/security/apparmor/include/path.h
39027@@ -39,9 +39,10 @@ struct aa_buffers {
39028 };
39029
39030 #include <linux/percpu.h>
39031-#include <linux/preempt.h>
39032+#include <linux/locallock.h>
39033
39034 DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
39035+DECLARE_LOCAL_IRQ_LOCK(aa_buffers_lock);
39036
39037 #define COUNT_ARGS(X...) COUNT_ARGS_HELPER(, ##X, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
39038 #define COUNT_ARGS_HELPER(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, X...) n
39039@@ -55,12 +56,24 @@ DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
39040
39041 #define for_each_cpu_buffer(I) for ((I) = 0; (I) < MAX_PATH_BUFFERS; (I)++)
39042
39043-#ifdef CONFIG_DEBUG_PREEMPT
39044+#ifdef CONFIG_PREEMPT_RT_BASE
39045+
39046+static inline void AA_BUG_PREEMPT_ENABLED(const char *s)
39047+{
39048+ struct local_irq_lock *lv;
39049+
39050+ lv = this_cpu_ptr(&aa_buffers_lock);
39051+ WARN_ONCE(lv->owner != current,
39052+ "__get_buffer without aa_buffers_lock\n");
39053+}
39054+
39055+#elif defined(CONFIG_DEBUG_PREEMPT)
39056 #define AA_BUG_PREEMPT_ENABLED(X) AA_BUG(preempt_count() <= 0, X)
39057 #else
39058 #define AA_BUG_PREEMPT_ENABLED(X) /* nop */
39059 #endif
39060
39061+
39062 #define __get_buffer(N) ({ \
39063 struct aa_buffers *__cpu_var; \
39064 AA_BUG_PREEMPT_ENABLED("__get_buffer without preempt disabled"); \
39065@@ -73,14 +86,14 @@ DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
39066
39067 #define get_buffers(X...) \
39068 do { \
39069- preempt_disable(); \
39070+ local_lock(aa_buffers_lock); \
39071 __get_buffers(X); \
39072 } while (0)
39073
39074 #define put_buffers(X, Y...) \
39075 do { \
39076 __put_buffers(X, Y); \
39077- preempt_enable(); \
39078+ local_unlock(aa_buffers_lock); \
39079 } while (0)
39080
39081 #endif /* __AA_PATH_H */
39082diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
39083index 1346ee5be04f..aa7e4dee107b 100644
39084--- a/security/apparmor/lsm.c
39085+++ b/security/apparmor/lsm.c
39086@@ -44,7 +44,7 @@
39087 int apparmor_initialized;
39088
39089 DEFINE_PER_CPU(struct aa_buffers, aa_buffers);
39090-
39091+DEFINE_LOCAL_IRQ_LOCK(aa_buffers_lock);
39092
39093 /*
39094 * LSM hook functions
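
The AppArmor change above swaps preempt_disable()/preempt_enable() around the per-CPU path buffers for a local lock, which degenerates to preempt_disable() on !RT but is a per-CPU sleeping lock on PREEMPT_RT, so buffer users stay preemptible. A rough, self-contained sketch of that idiom, with made-up demo_* names rather than the real aa_buffers API, could be:

#include <linux/percpu.h>
#include <linux/locallock.h>

struct demo_buffers {                   /* hypothetical per-CPU scratch buffers */
        char buf[512];
};

static DEFINE_PER_CPU(struct demo_buffers, demo_buffers);
static DEFINE_LOCAL_IRQ_LOCK(demo_buffers_lock);

static void demo_use_buffers(void)
{
        struct demo_buffers *b;

        local_lock(demo_buffers_lock);  /* preempt_disable() on !RT, per-CPU lock on RT */
        b = this_cpu_ptr(&demo_buffers);
        /* ... fill and consume b->buf; the CPU cannot change underneath us ... */
        local_unlock(demo_buffers_lock);
}
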
39095diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
39096index ab3bf36786b6..f0bb7c9aa4be 100644
39097--- a/sound/core/pcm_native.c
39098+++ b/sound/core/pcm_native.c
39099@@ -148,7 +148,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
39100 void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
39101 {
39102 if (!substream->pcm->nonatomic)
39103- local_irq_disable();
39104+ local_irq_disable_nort();
39105 snd_pcm_stream_lock(substream);
39106 }
39107 EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
39108@@ -163,7 +163,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
39109 {
39110 snd_pcm_stream_unlock(substream);
39111 if (!substream->pcm->nonatomic)
39112- local_irq_enable();
39113+ local_irq_enable_nort();
39114 }
39115 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
39116
39117@@ -171,7 +171,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
39118 {
39119 unsigned long flags = 0;
39120 if (!substream->pcm->nonatomic)
39121- local_irq_save(flags);
39122+ local_irq_save_nort(flags);
39123 snd_pcm_stream_lock(substream);
39124 return flags;
39125 }
39126@@ -189,7 +189,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
39127 {
39128 snd_pcm_stream_unlock(substream);
39129 if (!substream->pcm->nonatomic)
39130- local_irq_restore(flags);
39131+ local_irq_restore_nort(flags);
39132 }
39133 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
39134
39135diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c
39136index c0939a0164a6..549e014ecc0d 100644
39137--- a/sound/drivers/dummy.c
39138+++ b/sound/drivers/dummy.c
39139@@ -376,17 +376,9 @@ struct dummy_hrtimer_pcm {
39140 ktime_t period_time;
39141 atomic_t running;
39142 struct hrtimer timer;
39143- struct tasklet_struct tasklet;
39144 struct snd_pcm_substream *substream;
39145 };
39146
39147-static void dummy_hrtimer_pcm_elapsed(unsigned long priv)
39148-{
39149- struct dummy_hrtimer_pcm *dpcm = (struct dummy_hrtimer_pcm *)priv;
39150- if (atomic_read(&dpcm->running))
39151- snd_pcm_period_elapsed(dpcm->substream);
39152-}
39153-
39154 static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer)
39155 {
39156 struct dummy_hrtimer_pcm *dpcm;
39157@@ -394,7 +386,14 @@ static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer)
39158 dpcm = container_of(timer, struct dummy_hrtimer_pcm, timer);
39159 if (!atomic_read(&dpcm->running))
39160 return HRTIMER_NORESTART;
39161- tasklet_schedule(&dpcm->tasklet);
39162+ /*
39163+ * In cases of XRUN and draining, this calls .trigger to stop PCM
39164+ * substream.
39165+ */
39166+ snd_pcm_period_elapsed(dpcm->substream);
39167+ if (!atomic_read(&dpcm->running))
39168+ return HRTIMER_NORESTART;
39169+
39170 hrtimer_forward_now(timer, dpcm->period_time);
39171 return HRTIMER_RESTART;
39172 }
39173@@ -404,7 +403,7 @@ static int dummy_hrtimer_start(struct snd_pcm_substream *substream)
39174 struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
39175
39176 dpcm->base_time = hrtimer_cb_get_time(&dpcm->timer);
39177- hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL);
39178+ hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL_SOFT);
39179 atomic_set(&dpcm->running, 1);
39180 return 0;
39181 }
39182@@ -414,14 +413,14 @@ static int dummy_hrtimer_stop(struct snd_pcm_substream *substream)
39183 struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
39184
39185 atomic_set(&dpcm->running, 0);
39186- hrtimer_cancel(&dpcm->timer);
39187+ if (!hrtimer_callback_running(&dpcm->timer))
39188+ hrtimer_cancel(&dpcm->timer);
39189 return 0;
39190 }
39191
39192 static inline void dummy_hrtimer_sync(struct dummy_hrtimer_pcm *dpcm)
39193 {
39194 hrtimer_cancel(&dpcm->timer);
39195- tasklet_kill(&dpcm->tasklet);
39196 }
39197
39198 static snd_pcm_uframes_t
39199@@ -466,12 +465,10 @@ static int dummy_hrtimer_create(struct snd_pcm_substream *substream)
39200 if (!dpcm)
39201 return -ENOMEM;
39202 substream->runtime->private_data = dpcm;
39203- hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
39204+ hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
39205 dpcm->timer.function = dummy_hrtimer_callback;
39206 dpcm->substream = substream;
39207 atomic_set(&dpcm->running, 0);
39208- tasklet_init(&dpcm->tasklet, dummy_hrtimer_pcm_elapsed,
39209- (unsigned long)dpcm);
39210 return 0;
39211 }
39212
39213diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
39214index 6a4982d029bf..843c2b0d948e 100644
39215--- a/tools/testing/selftests/ftrace/test.d/functions
39216+++ b/tools/testing/selftests/ftrace/test.d/functions
39217@@ -70,6 +70,13 @@ disable_events() {
39218 echo 0 > events/enable
39219 }
39220
39221+clear_synthetic_events() { # reset all current synthetic events
39222+ grep -v ^# synthetic_events |
39223+ while read line; do
39224+ echo "!$line" >> synthetic_events
39225+ done
39226+}
39227+
39228 initialize_ftrace() { # Reset ftrace to initial-state
39229 # As the initial state, ftrace will be set to nop tracer,
39230 # no events, no triggers, no filters, no function filters,
39231diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc
39232new file mode 100644
39233index 000000000000..786dce7e48be
39234--- /dev/null
39235+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc
39236@@ -0,0 +1,39 @@
39237+#!/bin/sh
39238+# description: event trigger - test extended error support
39239+
39240+
39241+do_reset() {
39242+ reset_trigger
39243+ echo > set_event
39244+ clear_trace
39245+}
39246+
39247+fail() { #msg
39248+ do_reset
39249+ echo $1
39250+ exit_fail
39251+}
39252+
39253+if [ ! -f set_event ]; then
39254+ echo "event tracing is not supported"
39255+ exit_unsupported
39256+fi
39257+
39258+if [ ! -f synthetic_events ]; then
39259+ echo "synthetic event is not supported"
39260+ exit_unsupported
39261+fi
39262+
39263+reset_tracer
39264+do_reset
39265+
39266+echo "Test extended error support"
39267+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
38968+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger 2> /dev/null
39269+if ! grep -q "ERROR:" events/sched/sched_wakeup/hist; then
39270+ fail "Failed to generate extended error in histogram"
39271+fi
39272+
39273+do_reset
39274+
39275+exit 0
39276diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
39277new file mode 100644
39278index 000000000000..7fd5b4a8f060
39279--- /dev/null
39280+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
39281@@ -0,0 +1,54 @@
39282+#!/bin/sh
39283+# description: event trigger - test field variable support
39284+
39285+do_reset() {
39286+ reset_trigger
39287+ echo > set_event
39288+ clear_trace
39289+}
39290+
39291+fail() { #msg
39292+ do_reset
39293+ echo $1
39294+ exit_fail
39295+}
39296+
39297+if [ ! -f set_event ]; then
39298+ echo "event tracing is not supported"
39299+ exit_unsupported
39300+fi
39301+
39302+if [ ! -f synthetic_events ]; then
39303+ echo "synthetic event is not supported"
39304+ exit_unsupported
39305+fi
39306+
39307+clear_synthetic_events
39308+reset_tracer
39309+do_reset
39310+
39311+echo "Test field variable support"
39312+
39313+echo 'wakeup_latency u64 lat; pid_t pid; int prio; char comm[16]' > synthetic_events
39314+echo 'hist:keys=comm:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
39315+echo 'hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
39316+echo 'hist:keys=pid,prio,comm:vals=lat:sort=pid,prio' > events/synthetic/wakeup_latency/trigger
39317+
39318+ping localhost -c 3
39319+if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
39320+ fail "Failed to create inter-event histogram"
39321+fi
39322+
39323+if ! grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
39324+ fail "Failed to create histogram with field variable"
39325+fi
39326+
39327+echo '!hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
39328+
39329+if grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
39330+ fail "Failed to remove histogram with field variable"
39331+fi
39332+
39333+do_reset
39334+
39335+exit 0
39336diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
39337new file mode 100644
39338index 000000000000..c93dbe38b5df
39339--- /dev/null
39340+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
39341@@ -0,0 +1,58 @@
39342+#!/bin/sh
39343+# description: event trigger - test inter-event combined histogram trigger
39344+
39345+do_reset() {
39346+ reset_trigger
39347+ echo > set_event
39348+ clear_trace
39349+}
39350+
39351+fail() { #msg
39352+ do_reset
39353+ echo $1
39354+ exit_fail
39355+}
39356+
39357+if [ ! -f set_event ]; then
39358+ echo "event tracing is not supported"
39359+ exit_unsupported
39360+fi
39361+
39362+if [ ! -f synthetic_events ]; then
39363+ echo "synthetic event is not supported"
39364+ exit_unsupported
39365+fi
39366+
39367+reset_tracer
39368+do_reset
39369+clear_synthetic_events
39370+
39371+echo "Test create synthetic event"
39372+
39373+echo 'waking_latency u64 lat pid_t pid' > synthetic_events
39374+if [ ! -d events/synthetic/waking_latency ]; then
39375+ fail "Failed to create waking_latency synthetic event"
39376+fi
39377+
39378+echo "Test combined histogram"
39379+
39380+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
39381+echo 'hist:keys=pid:waking_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).waking_latency($waking_lat,pid) if comm=="ping"' > events/sched/sched_wakeup/trigger
39382+echo 'hist:keys=pid,lat:sort=pid,lat' > events/synthetic/waking_latency/trigger
39383+
39384+echo 'wakeup_latency u64 lat pid_t pid' >> synthetic_events
39385+echo 'hist:keys=pid:ts1=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger
39386+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts1:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid) if next_comm=="ping"' > events/sched/sched_switch/trigger
39387+
39388+echo 'waking+wakeup_latency u64 lat; pid_t pid' >> synthetic_events
39389+echo 'hist:keys=pid,lat:sort=pid,lat:ww_lat=$waking_lat+$wakeup_lat:onmatch(synthetic.wakeup_latency).waking+wakeup_latency($ww_lat,pid)' >> events/synthetic/wakeup_latency/trigger
39390+echo 'hist:keys=pid,lat:sort=pid,lat' >> events/synthetic/waking+wakeup_latency/trigger
39391+
39392+ping localhost -c 3
39393+if ! grep -q "pid:" events/synthetic/waking+wakeup_latency/hist; then
39394+ fail "Failed to create combined histogram"
39395+fi
39396+
39397+do_reset
39398+
39399+exit 0
39400diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
39401new file mode 100644
39402index 000000000000..e84e7d048566
39403--- /dev/null
39404+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
39405@@ -0,0 +1,50 @@
39406+#!/bin/sh
39407+# description: event trigger - test inter-event histogram trigger onmatch action
39408+
39409+do_reset() {
39410+ reset_trigger
39411+ echo > set_event
39412+ clear_trace
39413+}
39414+
39415+fail() { #msg
39416+ do_reset
39417+ echo $1
39418+ exit_fail
39419+}
39420+
39421+if [ ! -f set_event ]; then
39422+ echo "event tracing is not supported"
39423+ exit_unsupported
39424+fi
39425+
39426+if [ ! -f synthetic_events ]; then
39427+ echo "synthetic event is not supported"
39428+ exit_unsupported
39429+fi
39430+
39431+clear_synthetic_events
39432+reset_tracer
39433+do_reset
39434+
39435+echo "Test create synthetic event"
39436+
39437+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39438+if [ ! -d events/synthetic/wakeup_latency ]; then
39439+ fail "Failed to create wakeup_latency synthetic event"
39440+fi
39441+
39442+echo "Test create histogram for synthetic event"
39443+echo "Test histogram variables,simple expression support and onmatch action"
39444+
39445+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
39446+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
39447+echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
39448+ping localhost -c 5
39449+if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
39450+ fail "Failed to create onmatch action inter-event histogram"
39451+fi
39452+
39453+do_reset
39454+
39455+exit 0
39456diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
39457new file mode 100644
39458index 000000000000..7907d8aacde3
39459--- /dev/null
39460+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
39461@@ -0,0 +1,50 @@
39462+#!/bin/sh
39463+# description: event trigger - test inter-event histogram trigger onmatch-onmax action
39464+
39465+do_reset() {
39466+ reset_trigger
39467+ echo > set_event
39468+ clear_trace
39469+}
39470+
39471+fail() { #msg
39472+ do_reset
39473+ echo $1
39474+ exit_fail
39475+}
39476+
39477+if [ ! -f set_event ]; then
39478+ echo "event tracing is not supported"
39479+ exit_unsupported
39480+fi
39481+
39482+if [ ! -f synthetic_events ]; then
39483+ echo "synthetic event is not supported"
39484+ exit_unsupported
39485+fi
39486+
39487+clear_synthetic_events
39488+reset_tracer
39489+do_reset
39490+
39491+echo "Test create synthetic event"
39492+
39493+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39494+if [ ! -d events/synthetic/wakeup_latency ]; then
39495+ fail "Failed to create wakeup_latency synthetic event"
39496+fi
39497+
39498+echo "Test create histogram for synthetic event"
39499+echo "Test histogram variables,simple expression support and onmatch-onmax action"
39500+
39501+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
39502+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm):onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
39503+echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
39504+ping localhost -c 5
39505+if ! grep -q "ping" events/synthetic/wakeup_latency/hist || ! grep -q "max:" events/sched/sched_switch/hist; then
39506+ fail "Failed to create onmatch-onmax action inter-event histogram"
39507+fi
39508+
39509+do_reset
39510+
39511+exit 0
39512diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
39513new file mode 100644
39514index 000000000000..38b7ed6242b2
39515--- /dev/null
39516+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
39517@@ -0,0 +1,48 @@
39518+#!/bin/sh
39519+# description: event trigger - test inter-event histogram trigger onmax action
39520+
39521+do_reset() {
39522+ reset_trigger
39523+ echo > set_event
39524+ clear_trace
39525+}
39526+
39527+fail() { #msg
39528+ do_reset
39529+ echo $1
39530+ exit_fail
39531+}
39532+
39533+if [ ! -f set_event ]; then
39534+ echo "event tracing is not supported"
39535+ exit_unsupported
39536+fi
39537+
39538+if [ ! -f synthetic_events ]; then
39539+ echo "synthetic event is not supported"
39540+ exit_unsupported
39541+fi
39542+
39543+clear_synthetic_events
39544+reset_tracer
39545+do_reset
39546+
39547+echo "Test create synthetic event"
39548+
39549+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39550+if [ ! -d events/synthetic/wakeup_latency ]; then
39551+ fail "Failed to create wakeup_latency synthetic event"
39552+fi
39553+
39554+echo "Test onmax action"
39555+
39556+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_waking/trigger
39557+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
39558+ping localhost -c 3
39559+if ! grep -q "max:" events/sched/sched_switch/hist; then
39560+ fail "Failed to create onmax action inter-event histogram"
39561+fi
39562+
39563+do_reset
39564+
39565+exit 0
39566diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
39567new file mode 100644
39568index 000000000000..cef11377dcbd
39569--- /dev/null
39570+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
39571@@ -0,0 +1,54 @@
39572+#!/bin/sh
39573+# description: event trigger - test synthetic event create remove
39574+do_reset() {
39575+ reset_trigger
39576+ echo > set_event
39577+ clear_trace
39578+}
39579+
39580+fail() { #msg
39581+ do_reset
39582+ echo $1
39583+ exit_fail
39584+}
39585+
39586+if [ ! -f set_event ]; then
39587+ echo "event tracing is not supported"
39588+ exit_unsupported
39589+fi
39590+
39591+if [ ! -f synthetic_events ]; then
39592+ echo "synthetic event is not supported"
39593+ exit_unsupported
39594+fi
39595+
39596+clear_synthetic_events
39597+reset_tracer
39598+do_reset
39599+
39600+echo "Test create synthetic event"
39601+
39602+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39603+if [ ! -d events/synthetic/wakeup_latency ]; then
39604+ fail "Failed to create wakeup_latency synthetic event"
39605+fi
39606+
39607+reset_trigger
39608+
39609+echo "Test create synthetic event with an error"
39610+echo 'wakeup_latency u64 lat pid_t pid char' > synthetic_events 2> /dev/null
39611+if [ -d events/synthetic/wakeup_latency ]; then
39612+ fail "Created wakeup_latency synthetic event with an invalid format"
39613+fi
39614+
39615+reset_trigger
39616+
39617+echo "Test remove synthetic event"
39618+echo '!wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39619+if [ -d events/synthetic/wakeup_latency ]; then
39620+ fail "Failed to delete wakeup_latency synthetic event"
39621+fi
39622+
39623+do_reset
39624+
39625+exit 0
39626diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
39627index d5f1d8364571..c09e04130bfe 100644
39628--- a/virt/kvm/arm/arm.c
39629+++ b/virt/kvm/arm/arm.c
39630@@ -69,7 +69,6 @@ static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
39631
39632 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
39633 {
39634- BUG_ON(preemptible());
39635 __this_cpu_write(kvm_arm_running_vcpu, vcpu);
39636 }
39637
39638@@ -79,7 +78,6 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
39639 */
39640 struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
39641 {
39642- BUG_ON(preemptible());
39643 return __this_cpu_read(kvm_arm_running_vcpu);
39644 }
39645
39646@@ -653,7 +651,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
39647 * involves poking the GIC, which must be done in a
39648 * non-preemptible context.
39649 */
39650- preempt_disable();
39651+ migrate_disable();
39652
39653 kvm_pmu_flush_hwstate(vcpu);
39654
39655@@ -690,7 +688,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
39656 kvm_pmu_sync_hwstate(vcpu);
39657 kvm_timer_sync_hwstate(vcpu);
39658 kvm_vgic_sync_hwstate(vcpu);
39659- preempt_enable();
39660+ migrate_enable();
39661 continue;
39662 }
39663
39664@@ -745,7 +743,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
39665
39666 kvm_vgic_sync_hwstate(vcpu);
39667
39668- preempt_enable();
39669+ migrate_enable();
39670
39671 ret = handle_exit(vcpu, run, ret);
39672 }
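
The KVM/arm hunks replace preempt_disable()/preempt_enable() around the guest-entry path with migrate_disable()/migrate_enable(): the task stays pinned to its current CPU, but unlike with preemption disabled it may still be preempted and may take sleeping locks, which is what PREEMPT_RT needs here. An illustrative (not KVM-specific) sketch, assuming the migrate_disable()/migrate_enable() helpers provided by this RT patch via <linux/preempt.h>:

#include <linux/preempt.h>

static void demo_cpu_local_section(void)
{
        migrate_disable();      /* stay on this CPU, but remain preemptible */
        /*
         * Per-CPU work that may sleep on PREEMPT_RT (for example code that
         * takes a spinlock_t, which is a sleeping lock there) goes here.
         */
        migrate_enable();
}
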