1diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
2index 2cc08d4a326e..e28f7f29f2b3 100644
3--- a/Documentation/trace/events.txt
4+++ b/Documentation/trace/events.txt
5@@ -517,1550 +517,4 @@ The following commands are supported:
6 totals derived from one or more trace event format fields and/or
7 event counts (hitcount).
8
9-  The format of a hist trigger is as follows:
10-
11- hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
12- [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
13- [:clear][:name=histname1] [if <filter>]
14-
15- When a matching event is hit, an entry is added to a hash table
16- using the key(s) and value(s) named. Keys and values correspond to
17- fields in the event's format description. Values must correspond to
18- numeric fields - on an event hit, the value(s) will be added to a
19- sum kept for that field. The special string 'hitcount' can be used
20- in place of an explicit value field - this is simply a count of
21- event hits. If 'values' isn't specified, an implicit 'hitcount'
22- value will be automatically created and used as the only value.
23- Keys can be any field, or the special string 'stacktrace', which
24- will use the event's kernel stacktrace as the key. The keywords
25- 'keys' or 'key' can be used to specify keys, and the keywords
26- 'values', 'vals', or 'val' can be used to specify values. Compound
27- keys consisting of up to two fields can be specified by the 'keys'
28- keyword. Hashing a compound key produces a unique entry in the
29- table for each unique combination of component keys, and can be
30- useful for providing more fine-grained summaries of event data.
31- Additionally, sort keys consisting of up to two fields can be
32- specified by the 'sort' keyword. If more than one field is
33- specified, the result will be a 'sort within a sort': the first key
34- is taken to be the primary sort key and the second the secondary
35- key. If a hist trigger is given a name using the 'name' parameter,
36- its histogram data will be shared with other triggers of the same
37- name, and trigger hits will update this common data. Only triggers
38- with 'compatible' fields can be combined in this way; triggers are
39- 'compatible' if the fields named in the trigger share the same
40- number and type of fields and those fields also have the same names.
41- Note that any two events always share the compatible 'hitcount' and
42- 'stacktrace' fields and can therefore be combined using those
43- fields, however pointless that may be.
44-
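  (Illustrative sketch, not part of the original example set: following the
  format above, two events that share 'skbaddr' and 'len' fields could feed
  one shared, named histogram. The event names and the name 'foo' are only
  assumptions made for this sketch.)

    # echo 'hist:keys=skbaddr.hex:vals=len:name=foo' > \
          /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger

    # echo 'hist:keys=skbaddr.hex:vals=len:name=foo' > \
          /sys/kernel/debug/tracing/events/net/netif_rx/trigger

  Hits on either event would then update the same shared table, which can be
  read through either event's 'hist' file.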
45- 'hist' triggers add a 'hist' file to each event's subdirectory.
46- Reading the 'hist' file for the event will dump the hash table in
47- its entirety to stdout. If there are multiple hist triggers
48- attached to an event, there will be a table for each trigger in the
49- output. The table displayed for a named trigger will be the same as
50- any other instance having the same name. Each printed hash table
51- entry is a simple list of the keys and values comprising the entry;
52- keys are printed first and are delineated by curly braces, and are
53- followed by the set of value fields for the entry. By default,
54- numeric fields are displayed as base-10 integers. This can be
55- modified by appending any of the following modifiers to the field
56- name:
57-
58- .hex display a number as a hex value
59- .sym display an address as a symbol
60- .sym-offset display an address as a symbol and offset
61- .syscall display a syscall id as a system call name
62- .execname display a common_pid as a program name
63-
64- Note that in general the semantics of a given field aren't
65- interpreted when applying a modifier to it, but there are some
66- restrictions to be aware of in this regard:
67-
68- - only the 'hex' modifier can be used for values (because values
69- are essentially sums, and the other modifiers don't make sense
70- in that context).
71- - the 'execname' modifier can only be used on a 'common_pid'. The
72- reason for this is that the execname is simply the 'comm' value
73- saved for the 'current' process when an event was triggered,
74- which is the same as the common_pid value saved by the event
75- tracing code. Trying to apply that comm value to other pid
76- values wouldn't be correct, and typically events that care save
77- pid-specific comm fields in the event itself.
78-
79- A typical usage scenario would be the following to enable a hist
80- trigger, read its current contents, and then turn it off:
81-
82- # echo 'hist:keys=skbaddr.hex:vals=len' > \
83- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
84-
85- # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
86-
87- # echo '!hist:keys=skbaddr.hex:vals=len' > \
88- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
89-
90- The trigger file itself can be read to show the details of the
91- currently attached hist trigger. This information is also displayed
92- at the top of the 'hist' file when read.
93-
94- By default, the size of the hash table is 2048 entries. The 'size'
95- parameter can be used to specify more or fewer than that. The units
96- are in terms of hashtable entries - if a run uses more entries than
97- specified, the results will show the number of 'drops', the number
98- of hits that were ignored. The size should be a power of 2 between
99-  128 and 131072 (any non-power-of-2 number specified will be rounded
100- up).
101-
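  (A hypothetical command, not taken from the original text, just to show the
  syntax: the kmalloc trigger used in the examples below could be created with
  a larger, power-of-2 table like so.)

    # echo 'hist:key=call_site:val=bytes_req:size=8192' > \
          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger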
102- The 'sort' parameter can be used to specify a value field to sort
103- on. The default if unspecified is 'hitcount' and the default sort
104- order is 'ascending'. To sort in the opposite direction, append
105-  '.descending' to the sort key.
106-
107- The 'pause' parameter can be used to pause an existing hist trigger
108- or to start a hist trigger but not log any events until told to do
109- so. 'continue' or 'cont' can be used to start or restart a paused
110- hist trigger.
111-
112- The 'clear' parameter will clear the contents of a running hist
113-  trigger and leave its current paused/active state unchanged.
114-
115- Note that the 'pause', 'cont', and 'clear' parameters should be
116- applied using 'append' shell operator ('>>') if applied to an
117- existing trigger, rather than via the '>' operator, which will cause
118- the trigger to be removed through truncation.
119-
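  (As a hedged illustration of that point, reusing the netif_rx trigger from
  the earlier example: the command below pauses the already-attached trigger
  via '>>', whereas writing the same string with '>' would instead remove the
  existing trigger through truncation.)

    # echo 'hist:keys=skbaddr.hex:vals=len:pause' >> \
          /sys/kernel/debug/tracing/events/net/netif_rx/trigger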
120-- enable_hist/disable_hist
121-
122- The enable_hist and disable_hist triggers can be used to have one
123- event conditionally start and stop another event's already-attached
124- hist trigger. Any number of enable_hist and disable_hist triggers
125- can be attached to a given event, allowing that event to kick off
126- and stop aggregations on a host of other events.
127-
128- The format is very similar to the enable/disable_event triggers:
129-
130- enable_hist:<system>:<event>[:count]
131- disable_hist:<system>:<event>[:count]
132-
133- Instead of enabling or disabling the tracing of the target event
134- into the trace buffer as the enable/disable_event triggers do, the
135- enable/disable_hist triggers enable or disable the aggregation of
136- the target event into a hash table.
137-
138- A typical usage scenario for the enable_hist/disable_hist triggers
139- would be to first set up a paused hist trigger on some event,
140- followed by an enable_hist/disable_hist pair that turns the hist
141- aggregation on and off when conditions of interest are hit:
142-
143- # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
144- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
145-
146- # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
147- /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
148-
149- # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
150- /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
151-
152- The above sets up an initially paused hist trigger which is unpaused
153- and starts aggregating events when a given program is executed, and
154- which stops aggregating when the process exits and the hist trigger
155- is paused again.
156-
157- The examples below provide a more concrete illustration of the
158- concepts and typical usage patterns discussed above.
159-
160-
161-6.2 'hist' trigger examples
162----------------------------
163-
164- The first set of examples creates aggregations using the kmalloc
165- event. The fields that can be used for the hist trigger are listed
166- in the kmalloc event's format file:
167-
168- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
169- name: kmalloc
170- ID: 374
171- format:
172- field:unsigned short common_type; offset:0; size:2; signed:0;
173- field:unsigned char common_flags; offset:2; size:1; signed:0;
174- field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
175- field:int common_pid; offset:4; size:4; signed:1;
176-
177- field:unsigned long call_site; offset:8; size:8; signed:0;
178- field:const void * ptr; offset:16; size:8; signed:0;
179- field:size_t bytes_req; offset:24; size:8; signed:0;
180- field:size_t bytes_alloc; offset:32; size:8; signed:0;
181- field:gfp_t gfp_flags; offset:40; size:4; signed:0;
182-
183- We'll start by creating a hist trigger that generates a simple table
184- that lists the total number of bytes requested for each function in
185- the kernel that made one or more calls to kmalloc:
186-
187- # echo 'hist:key=call_site:val=bytes_req' > \
188- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
189-
190- This tells the tracing system to create a 'hist' trigger using the
191- call_site field of the kmalloc event as the key for the table, which
192- just means that each unique call_site address will have an entry
193- created for it in the table. The 'val=bytes_req' parameter tells
194- the hist trigger that for each unique entry (call_site) in the
195- table, it should keep a running total of the number of bytes
196- requested by that call_site.
197-
198-  We'll let it run for a while and then dump the contents of the 'hist'
199- file in the kmalloc event's subdirectory (for readability, a number
200- of entries have been omitted):
201-
202- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
203- # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
204-
205- { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
206- { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
207- { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
208- { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
209- { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
210- { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
211- { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
212- { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
213- { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
214- { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
215- .
216- .
217- .
218- { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
219- { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
220- { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
221- { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
222- { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
223- { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
224- { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
225- { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
226- { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
227- { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
228- { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
229- { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
230-
231- Totals:
232- Hits: 4610
233- Entries: 45
234- Dropped: 0
235-
236- The output displays a line for each entry, beginning with the key
237- specified in the trigger, followed by the value(s) also specified in
238- the trigger. At the beginning of the output is a line that displays
239- the trigger info, which can also be displayed by reading the
240- 'trigger' file:
241-
242- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
243- hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
244-
245- At the end of the output are a few lines that display the overall
246- totals for the run. The 'Hits' field shows the total number of
247- times the event trigger was hit, the 'Entries' field shows the total
248- number of used entries in the hash table, and the 'Dropped' field
249- shows the number of hits that were dropped because the number of
250- used entries for the run exceeded the maximum number of entries
251-  allowed for the table (normally 0, but if not, a hint that you may
252- want to increase the size of the table using the 'size' parameter).
253-
254- Notice in the above output that there's an extra field, 'hitcount',
255- which wasn't specified in the trigger. Also notice that in the
256- trigger info output, there's a parameter, 'sort=hitcount', which
257- wasn't specified in the trigger either. The reason for that is that
258- every trigger implicitly keeps a count of the total number of hits
259- attributed to a given entry, called the 'hitcount'. That hitcount
260- information is explicitly displayed in the output, and in the
261- absence of a user-specified sort parameter, is used as the default
262- sort field.
263-
264- The value 'hitcount' can be used in place of an explicit value in
265- the 'values' parameter if you don't really need to have any
266- particular field summed and are mainly interested in hit
267- frequencies.
268-
269- To turn the hist trigger off, simply call up the trigger in the
270- command history and re-execute it with a '!' prepended:
271-
272- # echo '!hist:key=call_site:val=bytes_req' > \
273- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
274-
275- Finally, notice that the call_site as displayed in the output above
276- isn't really very useful. It's an address, but normally addresses
277- are displayed in hex. To have a numeric field displayed as a hex
278- value, simply append '.hex' to the field name in the trigger:
279-
280- # echo 'hist:key=call_site.hex:val=bytes_req' > \
281- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
282-
283- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
284- # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
285-
286- { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
287- { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
288- { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
289- { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
290- { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
291- { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
292- { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
293- { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
294- { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
295- { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
296- { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
297- { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
298- .
299- .
300- .
301- { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
302- { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
303- { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
304- { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
305- { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
306- { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
307- { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
308- { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
309- { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
310- { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
311- { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
312-
313- Totals:
314- Hits: 4775
315- Entries: 46
316- Dropped: 0
317-
318- Even that's only marginally more useful - while hex values do look
319- more like addresses, what users are typically more interested in
320- when looking at text addresses are the corresponding symbols
321-  instead. To have an address displayed as a symbolic value,
322- simply append '.sym' or '.sym-offset' to the field name in the
323- trigger:
324-
325- # echo 'hist:key=call_site.sym:val=bytes_req' > \
326- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
327-
328- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
329- # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
330-
331- { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
332- { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
333- { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
334- { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
335- { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
336- { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
337- { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
338- { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
339- { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
340- { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
341- { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
342- { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
343- { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
344- { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
345- .
346- .
347- .
348- { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
349- { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
350- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
351- { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
352- { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
353- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
354- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
355- { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
356- { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
357- { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
358- { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
359- { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
360- { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
361-
362- Totals:
363- Hits: 109928
364- Entries: 71
365- Dropped: 0
366-
367-  Because the default sort key above is 'hitcount', the above shows
368-  the list of call_sites by increasing hitcount, so that at the bottom
369-  we see the functions that made the most kmalloc calls during the
370-  run. If instead we wanted to see the top kmalloc callers in
371- terms of the number of bytes requested rather than the number of
372- calls, and we wanted the top caller to appear at the top, we can use
373- the 'sort' parameter, along with the 'descending' modifier:
374-
375- # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
376- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
377-
378- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
379- # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
380-
381- { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
382- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
383- { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
384- { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
385- { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
386- { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
387- { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
388- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
389- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
390- { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
391- { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
392- { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
393- { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
394- .
395- .
396- .
397- { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
398- { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
399- { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
400- { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
401- { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
402- { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
403- { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
404- { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
405- { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
406- { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
407-
408- Totals:
409- Hits: 32133
410- Entries: 81
411- Dropped: 0
412-
413- To display the offset and size information in addition to the symbol
414- name, just use 'sym-offset' instead:
415-
416- # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
417- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
418-
419- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
420- # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
421-
422- { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
423- { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
424- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
425- { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
426- { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
427- { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
428- { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
429- { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
430- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
431- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
432- { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
433- { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
434- .
435- .
436- .
437- { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
438- { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
439- { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
440- { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
441- { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
442- { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
443- { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
444-
445- Totals:
446- Hits: 26098
447- Entries: 64
448- Dropped: 0
449-
450- We can also add multiple fields to the 'values' parameter. For
451- example, we might want to see the total number of bytes allocated
452- alongside bytes requested, and display the result sorted by bytes
453- allocated in a descending order:
454-
455- # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
456- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
457-
458- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
459- # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
460-
461- { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
462- { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
463- { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
464- { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
465- { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
466- { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
467- { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
468- { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
469- { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
470- { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
471- { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
472- { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
473- .
474- .
475- .
476- { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
477- { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
478- { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
479- { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
480- { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
481- { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
482- { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
483- { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
484- { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
485- { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
486-
487- Totals:
488- Hits: 66598
489- Entries: 65
490- Dropped: 0
491-
492- Finally, to finish off our kmalloc example, instead of simply having
493- the hist trigger display symbolic call_sites, we can have the hist
494- trigger additionally display the complete set of kernel stack traces
495- that led to each call_site. To do that, we simply use the special
496- value 'stacktrace' for the key parameter:
497-
498- # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
499- /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
500-
501- The above trigger will use the kernel stack trace in effect when an
502- event is triggered as the key for the hash table. This allows the
503- enumeration of every kernel callpath that led up to a particular
504- event, along with a running total of any of the event fields for
505- that event. Here we tally bytes requested and bytes allocated for
506- every callpath in the system that led up to a kmalloc (in this case
507- every callpath to a kmalloc for a kernel compile):
508-
509- # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
510- # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
511-
512- { stacktrace:
513- __kmalloc_track_caller+0x10b/0x1a0
514- kmemdup+0x20/0x50
515- hidraw_report_event+0x8a/0x120 [hid]
516- hid_report_raw_event+0x3ea/0x440 [hid]
517- hid_input_report+0x112/0x190 [hid]
518- hid_irq_in+0xc2/0x260 [usbhid]
519- __usb_hcd_giveback_urb+0x72/0x120
520- usb_giveback_urb_bh+0x9e/0xe0
521- tasklet_hi_action+0xf8/0x100
522- __do_softirq+0x114/0x2c0
523- irq_exit+0xa5/0xb0
524- do_IRQ+0x5a/0xf0
525- ret_from_intr+0x0/0x30
526- cpuidle_enter+0x17/0x20
527- cpu_startup_entry+0x315/0x3e0
528- rest_init+0x7c/0x80
529- } hitcount: 3 bytes_req: 21 bytes_alloc: 24
530- { stacktrace:
531- __kmalloc_track_caller+0x10b/0x1a0
532- kmemdup+0x20/0x50
533- hidraw_report_event+0x8a/0x120 [hid]
534- hid_report_raw_event+0x3ea/0x440 [hid]
535- hid_input_report+0x112/0x190 [hid]
536- hid_irq_in+0xc2/0x260 [usbhid]
537- __usb_hcd_giveback_urb+0x72/0x120
538- usb_giveback_urb_bh+0x9e/0xe0
539- tasklet_hi_action+0xf8/0x100
540- __do_softirq+0x114/0x2c0
541- irq_exit+0xa5/0xb0
542- do_IRQ+0x5a/0xf0
543- ret_from_intr+0x0/0x30
544- } hitcount: 3 bytes_req: 21 bytes_alloc: 24
545- { stacktrace:
546- kmem_cache_alloc_trace+0xeb/0x150
547- aa_alloc_task_context+0x27/0x40
548- apparmor_cred_prepare+0x1f/0x50
549- security_prepare_creds+0x16/0x20
550- prepare_creds+0xdf/0x1a0
551- SyS_capset+0xb5/0x200
552- system_call_fastpath+0x12/0x6a
553- } hitcount: 1 bytes_req: 32 bytes_alloc: 32
554- .
555- .
556- .
557- { stacktrace:
558- __kmalloc+0x11b/0x1b0
559- i915_gem_execbuffer2+0x6c/0x2c0 [i915]
560- drm_ioctl+0x349/0x670 [drm]
561- do_vfs_ioctl+0x2f0/0x4f0
562- SyS_ioctl+0x81/0xa0
563- system_call_fastpath+0x12/0x6a
564- } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
565- { stacktrace:
566- __kmalloc+0x11b/0x1b0
567- load_elf_phdrs+0x76/0xa0
568- load_elf_binary+0x102/0x1650
569- search_binary_handler+0x97/0x1d0
570- do_execveat_common.isra.34+0x551/0x6e0
571- SyS_execve+0x3a/0x50
572- return_from_execve+0x0/0x23
573- } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
574- { stacktrace:
575- kmem_cache_alloc_trace+0xeb/0x150
576- apparmor_file_alloc_security+0x27/0x40
577- security_file_alloc+0x16/0x20
578- get_empty_filp+0x93/0x1c0
579- path_openat+0x31/0x5f0
580- do_filp_open+0x3a/0x90
581- do_sys_open+0x128/0x220
582- SyS_open+0x1e/0x20
583- system_call_fastpath+0x12/0x6a
584- } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
585- { stacktrace:
586- __kmalloc+0x11b/0x1b0
587- seq_buf_alloc+0x1b/0x50
588- seq_read+0x2cc/0x370
589- proc_reg_read+0x3d/0x80
590- __vfs_read+0x28/0xe0
591- vfs_read+0x86/0x140
592- SyS_read+0x46/0xb0
593- system_call_fastpath+0x12/0x6a
594- } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
595-
596- Totals:
597- Hits: 6085872
598- Entries: 253
599- Dropped: 0
600-
601- If you key a hist trigger on common_pid, in order for example to
602- gather and display sorted totals for each process, you can use the
603- special .execname modifier to display the executable names for the
604- processes in the table rather than raw pids. The example below
605- keeps a per-process sum of total bytes read:
606-
607- # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
608- /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
609-
610- # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
611- # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
612-
613- { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
614- { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
615- { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
616- { common_pid: bash [ 8710] } hitcount: 3 count: 66369
617- { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
618- { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
619- { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
620- { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
621- { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
622- { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
623- { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
624- .
625- .
626- .
627- { common_pid: postgres [ 1892] } hitcount: 2 count: 32
628- { common_pid: postgres [ 1891] } hitcount: 2 count: 32
629- { common_pid: gmain [ 8704] } hitcount: 2 count: 32
630- { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
631- { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
632- { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
633- { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
634- { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
635- { common_pid: init [ 1] } hitcount: 2 count: 2
636-
637- Totals:
638- Hits: 2116
639- Entries: 51
640- Dropped: 0
641-
642- Similarly, if you key a hist trigger on syscall id, for example to
643- gather and display a list of systemwide syscall hits, you can use
644- the special .syscall modifier to display the syscall names rather
645- than raw ids. The example below keeps a running total of syscall
646- counts for the system during the run:
647-
648- # echo 'hist:key=id.syscall:val=hitcount' > \
649- /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
650-
651- # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
652- # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
653-
654- { id: sys_fsync [ 74] } hitcount: 1
655- { id: sys_newuname [ 63] } hitcount: 1
656- { id: sys_prctl [157] } hitcount: 1
657- { id: sys_statfs [137] } hitcount: 1
658- { id: sys_symlink [ 88] } hitcount: 1
659- { id: sys_sendmmsg [307] } hitcount: 1
660- { id: sys_semctl [ 66] } hitcount: 1
661- { id: sys_readlink [ 89] } hitcount: 3
662- { id: sys_bind [ 49] } hitcount: 3
663- { id: sys_getsockname [ 51] } hitcount: 3
664- { id: sys_unlink [ 87] } hitcount: 3
665- { id: sys_rename [ 82] } hitcount: 4
666- { id: unknown_syscall [ 58] } hitcount: 4
667- { id: sys_connect [ 42] } hitcount: 4
668- { id: sys_getpid [ 39] } hitcount: 4
669- .
670- .
671- .
672- { id: sys_rt_sigprocmask [ 14] } hitcount: 952
673- { id: sys_futex [202] } hitcount: 1534
674- { id: sys_write [ 1] } hitcount: 2689
675- { id: sys_setitimer [ 38] } hitcount: 2797
676- { id: sys_read [ 0] } hitcount: 3202
677- { id: sys_select [ 23] } hitcount: 3773
678- { id: sys_writev [ 20] } hitcount: 4531
679- { id: sys_poll [ 7] } hitcount: 8314
680- { id: sys_recvmsg [ 47] } hitcount: 13738
681- { id: sys_ioctl [ 16] } hitcount: 21843
682-
683- Totals:
684- Hits: 67612
685- Entries: 72
686- Dropped: 0
687-
688- The syscall counts above provide a rough overall picture of system
689- call activity on the system; we can see for example that the most
690- popular system call on this system was the 'sys_ioctl' system call.
691-
692- We can use 'compound' keys to refine that number and provide some
693- further insight as to which processes exactly contribute to the
694- overall ioctl count.
695-
696- The command below keeps a hitcount for every unique combination of
697- system call id and pid - the end result is essentially a table
698- that keeps a per-pid sum of system call hits. The results are
699- sorted using the system call id as the primary key, and the
700- hitcount sum as the secondary key:
701-
702- # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
703- /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
704-
705- # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
706- # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
707-
708- { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
709- { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
710- { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
711- { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
712- { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
713- { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
714- { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
715- { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
716- { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
717- { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
718- .
719- .
720- .
721- { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
722- { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
723- { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
724- { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
725- { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
726- .
727- .
728- .
729- { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
730- { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
731- { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
732- { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
733- { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
734- { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
735- { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
736- { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
737- { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
738- { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
739- { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
740-
741- Totals:
742- Hits: 31536
743- Entries: 323
744- Dropped: 0
745-
746- The above list does give us a breakdown of the ioctl syscall by
747- pid, but it also gives us quite a bit more than that, which we
748- don't really care about at the moment. Since we know the syscall
749- id for sys_ioctl (16, displayed next to the sys_ioctl name), we
750- can use that to filter out all the other syscalls:
751-
752- # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
753- /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
754-
755- # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
756- # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
757-
758- { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
759- { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
760- { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
761- { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
762- { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
763- { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
764- { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
765- { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
766- { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
767- .
768- .
769- .
770- { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
771- { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
772- { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
773- { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
774- { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
775- { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
776-
777- Totals:
778- Hits: 101162
779- Entries: 103
780- Dropped: 0
781-
782- The above output shows that 'compiz' and 'Xorg' are far and away
783- the heaviest ioctl callers (which might lead to questions about
784- whether they really need to be making all those calls and to
785- possible avenues for further investigation.)
786-
787- The compound key examples used a key and a sum value (hitcount) to
788- sort the output, but we can just as easily use two keys instead.
789-  Here's an example where we use a compound key composed of the
790- common_pid and size event fields. Sorting with pid as the primary
791- key and 'size' as the secondary key allows us to display an
792- ordered summary of the recvfrom sizes, with counts, received by
793- each process:
794-
795- # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
796- /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
797-
798- # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
799- # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
800-
801- { common_pid: smbd [ 784], size: 4 } hitcount: 1
802- { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
803- { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
804- { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
805- { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
806- { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
807- { common_pid: compiz [ 2994], size: 8 } hitcount: 1
808- { common_pid: compiz [ 2994], size: 20 } hitcount: 11
809- { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
810- { common_pid: firefox [ 8817], size: 4 } hitcount: 1
811- { common_pid: firefox [ 8817], size: 8 } hitcount: 5
812- { common_pid: firefox [ 8817], size: 588 } hitcount: 2
813- { common_pid: firefox [ 8817], size: 628 } hitcount: 1
814- { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
815- { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
816- { common_pid: firefox [ 8822], size: 8 } hitcount: 2
817- { common_pid: firefox [ 8822], size: 160 } hitcount: 2
818- { common_pid: firefox [ 8822], size: 320 } hitcount: 2
819- { common_pid: firefox [ 8822], size: 352 } hitcount: 1
820- .
821- .
822- .
823- { common_pid: pool [ 8923], size: 1960 } hitcount: 10
824- { common_pid: pool [ 8923], size: 2048 } hitcount: 10
825- { common_pid: pool [ 8924], size: 1960 } hitcount: 10
826- { common_pid: pool [ 8924], size: 2048 } hitcount: 10
827- { common_pid: pool [ 8928], size: 1964 } hitcount: 4
828- { common_pid: pool [ 8928], size: 1965 } hitcount: 2
829- { common_pid: pool [ 8928], size: 2048 } hitcount: 6
830- { common_pid: pool [ 8929], size: 1982 } hitcount: 1
831- { common_pid: pool [ 8929], size: 2048 } hitcount: 1
832-
833- Totals:
834- Hits: 2016
835- Entries: 224
836- Dropped: 0
837-
838- The above example also illustrates the fact that although a compound
839- key is treated as a single entity for hashing purposes, the sub-keys
840- it's composed of can be accessed independently.
841-
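  (A sketch, not an output-verified run: reusing the same recvfrom trigger,
  the 'size' sub-key of the compound key can just as well be used on its own
  as the sort key, here in descending order.)

    # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=size.descending' > \
          /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger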
842- The next example uses a string field as the hash key and
843- demonstrates how you can manually pause and continue a hist trigger.
844- In this example, we'll aggregate fork counts and don't expect a
845- large number of entries in the hash table, so we'll drop it to a
846- much smaller number, say 256:
847-
848- # echo 'hist:key=child_comm:val=hitcount:size=256' > \
849- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
850-
851- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
852- # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
853-
854- { child_comm: dconf worker } hitcount: 1
855- { child_comm: ibus-daemon } hitcount: 1
856- { child_comm: whoopsie } hitcount: 1
857- { child_comm: smbd } hitcount: 1
858- { child_comm: gdbus } hitcount: 1
859- { child_comm: kthreadd } hitcount: 1
860- { child_comm: dconf worker } hitcount: 1
861- { child_comm: evolution-alarm } hitcount: 2
862- { child_comm: Socket Thread } hitcount: 2
863- { child_comm: postgres } hitcount: 2
864- { child_comm: bash } hitcount: 3
865- { child_comm: compiz } hitcount: 3
866- { child_comm: evolution-sourc } hitcount: 4
867- { child_comm: dhclient } hitcount: 4
868- { child_comm: pool } hitcount: 5
869- { child_comm: nm-dispatcher.a } hitcount: 8
870- { child_comm: firefox } hitcount: 8
871- { child_comm: dbus-daemon } hitcount: 8
872- { child_comm: glib-pacrunner } hitcount: 10
873- { child_comm: evolution } hitcount: 23
874-
875- Totals:
876- Hits: 89
877- Entries: 20
878- Dropped: 0
879-
880- If we want to pause the hist trigger, we can simply append :pause to
881- the command that started the trigger. Notice that the trigger info
882- displays as [paused]:
883-
884- # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
885- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
886-
887- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
888- # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
889-
890- { child_comm: dconf worker } hitcount: 1
891- { child_comm: kthreadd } hitcount: 1
892- { child_comm: dconf worker } hitcount: 1
893- { child_comm: gdbus } hitcount: 1
894- { child_comm: ibus-daemon } hitcount: 1
895- { child_comm: Socket Thread } hitcount: 2
896- { child_comm: evolution-alarm } hitcount: 2
897- { child_comm: smbd } hitcount: 2
898- { child_comm: bash } hitcount: 3
899- { child_comm: whoopsie } hitcount: 3
900- { child_comm: compiz } hitcount: 3
901- { child_comm: evolution-sourc } hitcount: 4
902- { child_comm: pool } hitcount: 5
903- { child_comm: postgres } hitcount: 6
904- { child_comm: firefox } hitcount: 8
905- { child_comm: dhclient } hitcount: 10
906- { child_comm: emacs } hitcount: 12
907- { child_comm: dbus-daemon } hitcount: 20
908- { child_comm: nm-dispatcher.a } hitcount: 20
909- { child_comm: evolution } hitcount: 35
910- { child_comm: glib-pacrunner } hitcount: 59
911-
912- Totals:
913- Hits: 199
914- Entries: 21
915- Dropped: 0
916-
917- To manually continue having the trigger aggregate events, append
918- :cont instead. Notice that the trigger info displays as [active]
919- again, and the data has changed:
920-
921- # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
922- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
923-
924- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
925- # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
926-
927- { child_comm: dconf worker } hitcount: 1
928- { child_comm: dconf worker } hitcount: 1
929- { child_comm: kthreadd } hitcount: 1
930- { child_comm: gdbus } hitcount: 1
931- { child_comm: ibus-daemon } hitcount: 1
932- { child_comm: Socket Thread } hitcount: 2
933- { child_comm: evolution-alarm } hitcount: 2
934- { child_comm: smbd } hitcount: 2
935- { child_comm: whoopsie } hitcount: 3
936- { child_comm: compiz } hitcount: 3
937- { child_comm: evolution-sourc } hitcount: 4
938- { child_comm: bash } hitcount: 5
939- { child_comm: pool } hitcount: 5
940- { child_comm: postgres } hitcount: 6
941- { child_comm: firefox } hitcount: 8
942- { child_comm: dhclient } hitcount: 11
943- { child_comm: emacs } hitcount: 12
944- { child_comm: dbus-daemon } hitcount: 22
945- { child_comm: nm-dispatcher.a } hitcount: 22
946- { child_comm: evolution } hitcount: 35
947- { child_comm: glib-pacrunner } hitcount: 59
948-
949- Totals:
950- Hits: 206
951- Entries: 21
952- Dropped: 0
953-
954- The previous example showed how to start and stop a hist trigger by
955- appending 'pause' and 'continue' to the hist trigger command. A
956- hist trigger can also be started in a paused state by initially
957- starting the trigger with ':pause' appended. This allows you to
958- start the trigger only when you're ready to start collecting data
959- and not before. For example, you could start the trigger in a
960- paused state, then unpause it and do something you want to measure,
961- then pause the trigger again when done.
962-
963- Of course, doing this manually can be difficult and error-prone, but
964- it is possible to automatically start and stop a hist trigger based
965- on some condition, via the enable_hist and disable_hist triggers.
966-
967- For example, suppose we wanted to take a look at the relative
968- weights in terms of skb length for each callpath that leads to a
969-  netif_receive_skb event when downloading a decent-sized file using
970- wget.
971-
972- First we set up an initially paused stacktrace trigger on the
973- netif_receive_skb event:
974-
975- # echo 'hist:key=stacktrace:vals=len:pause' > \
976- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
977-
978- Next, we set up an 'enable_hist' trigger on the sched_process_exec
979- event, with an 'if filename==/usr/bin/wget' filter. The effect of
980- this new trigger is that it will 'unpause' the hist trigger we just
981- set up on netif_receive_skb if and only if it sees a
982- sched_process_exec event with a filename of '/usr/bin/wget'. When
983- that happens, all netif_receive_skb events are aggregated into a
984- hash table keyed on stacktrace:
985-
986- # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
987- /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
988-
989-  The aggregation continues until the netif_receive_skb hist trigger is
990-  paused again, which is what the following disable_hist event does by
991- creating a similar setup on the sched_process_exit event, using the
992- filter 'comm==wget':
993-
994- # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
995- /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
996-
997- Whenever a process exits and the comm field of the disable_hist
998- trigger filter matches 'comm==wget', the netif_receive_skb hist
999- trigger is disabled.
1000-
1001- The overall effect is that netif_receive_skb events are aggregated
1002- into the hash table for only the duration of the wget. Executing a
1003- wget command and then listing the 'hist' file will display the
1004- output generated by the wget command:
1005-
1006- $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
1007-
1008- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1009- # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
1010-
1011- { stacktrace:
1012- __netif_receive_skb_core+0x46d/0x990
1013- __netif_receive_skb+0x18/0x60
1014- netif_receive_skb_internal+0x23/0x90
1015- napi_gro_receive+0xc8/0x100
1016- ieee80211_deliver_skb+0xd6/0x270 [mac80211]
1017- ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
1018- ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
1019- ieee80211_rx+0x31d/0x900 [mac80211]
1020- iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
1021- iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
1022- iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
1023- irq_thread_fn+0x20/0x50
1024- irq_thread+0x11f/0x150
1025- kthread+0xd2/0xf0
1026- ret_from_fork+0x42/0x70
1027- } hitcount: 85 len: 28884
1028- { stacktrace:
1029- __netif_receive_skb_core+0x46d/0x990
1030- __netif_receive_skb+0x18/0x60
1031- netif_receive_skb_internal+0x23/0x90
1032- napi_gro_complete+0xa4/0xe0
1033- dev_gro_receive+0x23a/0x360
1034- napi_gro_receive+0x30/0x100
1035- ieee80211_deliver_skb+0xd6/0x270 [mac80211]
1036- ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
1037- ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
1038- ieee80211_rx+0x31d/0x900 [mac80211]
1039- iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
1040- iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
1041- iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
1042- irq_thread_fn+0x20/0x50
1043- irq_thread+0x11f/0x150
1044- kthread+0xd2/0xf0
1045- } hitcount: 98 len: 664329
1046- { stacktrace:
1047- __netif_receive_skb_core+0x46d/0x990
1048- __netif_receive_skb+0x18/0x60
1049- process_backlog+0xa8/0x150
1050- net_rx_action+0x15d/0x340
1051- __do_softirq+0x114/0x2c0
1052- do_softirq_own_stack+0x1c/0x30
1053- do_softirq+0x65/0x70
1054- __local_bh_enable_ip+0xb5/0xc0
1055- ip_finish_output+0x1f4/0x840
1056- ip_output+0x6b/0xc0
1057- ip_local_out_sk+0x31/0x40
1058- ip_send_skb+0x1a/0x50
1059- udp_send_skb+0x173/0x2a0
1060- udp_sendmsg+0x2bf/0x9f0
1061- inet_sendmsg+0x64/0xa0
1062- sock_sendmsg+0x3d/0x50
1063- } hitcount: 115 len: 13030
1064- { stacktrace:
1065- __netif_receive_skb_core+0x46d/0x990
1066- __netif_receive_skb+0x18/0x60
1067- netif_receive_skb_internal+0x23/0x90
1068- napi_gro_complete+0xa4/0xe0
1069- napi_gro_flush+0x6d/0x90
1070- iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
1071- irq_thread_fn+0x20/0x50
1072- irq_thread+0x11f/0x150
1073- kthread+0xd2/0xf0
1074- ret_from_fork+0x42/0x70
1075- } hitcount: 934 len: 5512212
1076-
1077- Totals:
1078- Hits: 1232
1079- Entries: 4
1080- Dropped: 0
1081-
1082- The above shows all the netif_receive_skb callpaths and their total
1083- lengths for the duration of the wget command.
1084-
1085- The 'clear' hist trigger param can be used to clear the hash table.
1086- Suppose we wanted to try another run of the previous example but
1087- this time also wanted to see the complete list of events that went
1088- into the histogram. In order to avoid having to set everything up
1089- again, we can just clear the histogram first:
1090-
1091- # echo 'hist:key=stacktrace:vals=len:clear' >> \
1092- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1093-
1094- Just to verify that it is in fact cleared, here's what we now see in
1095- the hist file:
1096-
1097- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1098- # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
1099-
1100- Totals:
1101- Hits: 0
1102- Entries: 0
1103- Dropped: 0
1104-
1105-  Since we want to see the detailed list of the netif_receive_skb
1106-  events occurring during the new run, which are in fact the same
1107- events being aggregated into the hash table, we add some additional
1108- 'enable_event' events to the triggering sched_process_exec and
1109- sched_process_exit events as such:
1110-
1111- # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
1112- /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1113-
1114- # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
1115- /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1116-
1117- If you read the trigger files for the sched_process_exec and
1118- sched_process_exit triggers, you should see two triggers for each:
1119- one enabling/disabling the hist aggregation and the other
1120- enabling/disabling the logging of events:
1121-
1122- # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1123- enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
1124- enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
1125-
1126- # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1127- enable_event:net:netif_receive_skb:unlimited if comm==wget
1128- disable_hist:net:netif_receive_skb:unlimited if comm==wget
1129-
1130- In other words, whenever either of the sched_process_exec or
1131- sched_process_exit events is hit and matches 'wget', it enables or
1132- disables both the histogram and the event log, and what you end up
1133- with is a hash table and set of events just covering the specified
1134- duration. Run the wget command again:
1135-
1136- $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
1137-
1138- Displaying the 'hist' file should show something similar to what you
1139- saw in the last run, but this time you should also see the
1140- individual events in the trace file:
1141-
1142- # cat /sys/kernel/debug/tracing/trace
1143-
1144- # tracer: nop
1145- #
1146- # entries-in-buffer/entries-written: 183/1426 #P:4
1147- #
1148- # _-----=> irqs-off
1149- # / _----=> need-resched
1150- # | / _---=> hardirq/softirq
1151- # || / _--=> preempt-depth
1152- # ||| / delay
1153- # TASK-PID CPU# |||| TIMESTAMP FUNCTION
1154- # | | | |||| | |
1155- wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
1156- wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
1157- dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
1158- dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
1159- ##### CPU 2 buffer started ####
1160- irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
1161- irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
1162- irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
1163- irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
1164- irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
1165- .
1166- .
1167- .
1168-
1169- The following example demonstrates how multiple hist triggers can be
1170- attached to a given event. This capability can be useful for
1171- creating a set of different summaries derived from the same set of
1172- events, or for comparing the effects of different filters, among
1173- other things.
1174-
1175- # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
1176- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1177- # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
1178- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1179- # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
1180- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1181- # echo 'hist:keys=skbaddr.hex:vals=len' >> \
1182- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1183- # echo 'hist:keys=len:vals=common_preempt_count' >> \
1184- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1185-
1186- The above set of commands create four triggers differing only in
1187- their filters, along with a completely different though fairly
1188- nonsensical trigger. Note that in order to append multiple hist
1189- triggers to the same file, you should use the '>>' operator to
1190- append them ('>' will also add the new hist trigger, but will remove
1191- any existing hist triggers beforehand).
1192-
1193- Displaying the contents of the 'hist' file for the event shows the
1194- contents of all five histograms:
1195-
1196- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1197-
1198- # event histogram
1199- #
1200- # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
1201- #
1202-
1203- { len: 176 } hitcount: 1 common_preempt_count: 0
1204- { len: 223 } hitcount: 1 common_preempt_count: 0
1205- { len: 4854 } hitcount: 1 common_preempt_count: 0
1206- { len: 395 } hitcount: 1 common_preempt_count: 0
1207- { len: 177 } hitcount: 1 common_preempt_count: 0
1208- { len: 446 } hitcount: 1 common_preempt_count: 0
1209- { len: 1601 } hitcount: 1 common_preempt_count: 0
1210- .
1211- .
1212- .
1213- { len: 1280 } hitcount: 66 common_preempt_count: 0
1214- { len: 116 } hitcount: 81 common_preempt_count: 40
1215- { len: 708 } hitcount: 112 common_preempt_count: 0
1216- { len: 46 } hitcount: 221 common_preempt_count: 0
1217- { len: 1264 } hitcount: 458 common_preempt_count: 0
1218-
1219- Totals:
1220- Hits: 1428
1221- Entries: 147
1222- Dropped: 0
1223-
1224-
1225- # event histogram
1226- #
1227- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1228- #
1229-
1230- { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
1231- { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
1232- { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
1233- { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
1234- { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
1235- { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
1236- { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
1237- { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
1238- { skbaddr: ffff880100065900 } hitcount: 1 len: 46
1239- { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
1240- { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
1241- { skbaddr: ffff880100064700 } hitcount: 1 len: 365
1242- { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
1243- .
1244- .
1245- .
1246- { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
1247- { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
1248- { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
1249- { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
1250- { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
1251- { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
1252- { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
1253- { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
1254- { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
1255-
1256- Totals:
1257- Hits: 1451
1258- Entries: 318
1259- Dropped: 0
1260-
1261-
1262- # event histogram
1263- #
1264- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
1265- #
1266-
1267-
1268- Totals:
1269- Hits: 0
1270- Entries: 0
1271- Dropped: 0
1272-
1273-
1274- # event histogram
1275- #
1276- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
1277- #
1278-
1279- { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
1280- { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
1281- { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
1282- { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
1283- { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
1284- { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
1285- { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
1286- { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
1287- { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
1288- { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
1289- { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
1290- { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
1291-
1292- Totals:
1293- Hits: 14
1294- Entries: 12
1295- Dropped: 0
1296-
1297-
1298- # event histogram
1299- #
1300- # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
1301- #
1302-
1303-
1304- Totals:
1305- Hits: 0
1306- Entries: 0
1307- Dropped: 0
1308-
1309- Named triggers can be used to have triggers share a common set of
1310- histogram data. This capability is mostly useful for combining the
1311- output of events generated by tracepoints contained inside inline
1312- functions, but names can be used in a hist trigger on any event.
1313- For example, these two triggers when hit will update the same 'len'
1314- field in the shared 'foo' histogram data:
1315-
1316- # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
1317- /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1318- # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
1319- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1320-
1321- You can see that they're updating common histogram data by reading
1322- each event's hist files at the same time:
1323-
1324- # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
1325- cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1326-
1327- # event histogram
1328- #
1329- # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1330- #
1331-
1332- { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
1333- { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
1334- { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
1335- { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
1336- { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
1337- { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
1338- { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
1339- { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
1340- { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
1341- { skbaddr: ffff880064505000 } hitcount: 1 len: 46
1342- { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
1343- { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
1344- { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
1345- { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
1346- { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
1347- { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
1348- { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
1349- { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
1350- { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
1351- { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
1352- { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
1353- { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
1354- { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
1355- { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
1356- { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
1357- { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
1358- { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
1359- { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
1360- { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
1361- { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
1362- { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
1363- { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
1364- { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
1365- { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
1366- { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
1367- { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
1368- { skbaddr: ffff880064504400 } hitcount: 4 len: 184
1369- { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
1370- { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
1371- { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
1372- { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
1373- { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
1374-
1375- Totals:
1376- Hits: 81
1377- Entries: 42
1378- Dropped: 0
1379- # event histogram
1380- #
1381- # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1382- #
1383-
1384- { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
1385- { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
1386- { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
1387- { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
1388- { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
1389- { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
1390- { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
1391- { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
1392- { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
1393- { skbaddr: ffff880064505000 } hitcount: 1 len: 46
1394- { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
1395- { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
1396- { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
1397- { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
1398- { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
1399- { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
1400- { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
1401- { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
1402- { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
1403- { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
1404- { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
1405- { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
1406- { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
1407- { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
1408- { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
1409- { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
1410- { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
1411- { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
1412- { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
1413- { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
1414- { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
1415- { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
1416- { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
1417- { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
1418- { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
1419- { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
1420- { skbaddr: ffff880064504400 } hitcount: 4 len: 184
1421- { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
1422- { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
1423- { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
1424- { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
1425- { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
1426-
1427- Totals:
1428- Hits: 81
1429- Entries: 42
1430- Dropped: 0
1431-
1432- And here's an example that shows how to combine histogram data from
1433- any two events even if they don't share any 'compatible' fields
1434- other than 'hitcount' and 'stacktrace'. These commands create a
1435- couple of triggers named 'bar' using those fields:
1436-
1437- # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
1438- /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
1439- # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
1440- /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1441-
1442- And displaying the output of either shows some interesting if
1443- somewhat confusing output:
1444-
1445- # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
1446- # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1447-
1448- # event histogram
1449- #
1450- # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
1451- #
1452-
1453- { stacktrace:
1454- _do_fork+0x18e/0x330
1455- kernel_thread+0x29/0x30
1456- kthreadd+0x154/0x1b0
1457- ret_from_fork+0x3f/0x70
1458- } hitcount: 1
1459- { stacktrace:
1460- netif_rx_internal+0xb2/0xd0
1461- netif_rx_ni+0x20/0x70
1462- dev_loopback_xmit+0xaa/0xd0
1463- ip_mc_output+0x126/0x240
1464- ip_local_out_sk+0x31/0x40
1465- igmp_send_report+0x1e9/0x230
1466- igmp_timer_expire+0xe9/0x120
1467- call_timer_fn+0x39/0xf0
1468- run_timer_softirq+0x1e1/0x290
1469- __do_softirq+0xfd/0x290
1470- irq_exit+0x98/0xb0
1471- smp_apic_timer_interrupt+0x4a/0x60
1472- apic_timer_interrupt+0x6d/0x80
1473- cpuidle_enter+0x17/0x20
1474- call_cpuidle+0x3b/0x60
1475- cpu_startup_entry+0x22d/0x310
1476- } hitcount: 1
1477- { stacktrace:
1478- netif_rx_internal+0xb2/0xd0
1479- netif_rx_ni+0x20/0x70
1480- dev_loopback_xmit+0xaa/0xd0
1481- ip_mc_output+0x17f/0x240
1482- ip_local_out_sk+0x31/0x40
1483- ip_send_skb+0x1a/0x50
1484- udp_send_skb+0x13e/0x270
1485- udp_sendmsg+0x2bf/0x980
1486- inet_sendmsg+0x67/0xa0
1487- sock_sendmsg+0x38/0x50
1488- SYSC_sendto+0xef/0x170
1489- SyS_sendto+0xe/0x10
1490- entry_SYSCALL_64_fastpath+0x12/0x6a
1491- } hitcount: 2
1492- { stacktrace:
1493- netif_rx_internal+0xb2/0xd0
1494- netif_rx+0x1c/0x60
1495- loopback_xmit+0x6c/0xb0
1496- dev_hard_start_xmit+0x219/0x3a0
1497- __dev_queue_xmit+0x415/0x4f0
1498- dev_queue_xmit_sk+0x13/0x20
1499- ip_finish_output2+0x237/0x340
1500- ip_finish_output+0x113/0x1d0
1501- ip_output+0x66/0xc0
1502- ip_local_out_sk+0x31/0x40
1503- ip_send_skb+0x1a/0x50
1504- udp_send_skb+0x16d/0x270
1505- udp_sendmsg+0x2bf/0x980
1506- inet_sendmsg+0x67/0xa0
1507- sock_sendmsg+0x38/0x50
1508- ___sys_sendmsg+0x14e/0x270
1509- } hitcount: 76
1510- { stacktrace:
1511- netif_rx_internal+0xb2/0xd0
1512- netif_rx+0x1c/0x60
1513- loopback_xmit+0x6c/0xb0
1514- dev_hard_start_xmit+0x219/0x3a0
1515- __dev_queue_xmit+0x415/0x4f0
1516- dev_queue_xmit_sk+0x13/0x20
1517- ip_finish_output2+0x237/0x340
1518- ip_finish_output+0x113/0x1d0
1519- ip_output+0x66/0xc0
1520- ip_local_out_sk+0x31/0x40
1521- ip_send_skb+0x1a/0x50
1522- udp_send_skb+0x16d/0x270
1523- udp_sendmsg+0x2bf/0x980
1524- inet_sendmsg+0x67/0xa0
1525- sock_sendmsg+0x38/0x50
1526- ___sys_sendmsg+0x269/0x270
1527- } hitcount: 77
1528- { stacktrace:
1529- netif_rx_internal+0xb2/0xd0
1530- netif_rx+0x1c/0x60
1531- loopback_xmit+0x6c/0xb0
1532- dev_hard_start_xmit+0x219/0x3a0
1533- __dev_queue_xmit+0x415/0x4f0
1534- dev_queue_xmit_sk+0x13/0x20
1535- ip_finish_output2+0x237/0x340
1536- ip_finish_output+0x113/0x1d0
1537- ip_output+0x66/0xc0
1538- ip_local_out_sk+0x31/0x40
1539- ip_send_skb+0x1a/0x50
1540- udp_send_skb+0x16d/0x270
1541- udp_sendmsg+0x2bf/0x980
1542- inet_sendmsg+0x67/0xa0
1543- sock_sendmsg+0x38/0x50
1544- SYSC_sendto+0xef/0x170
1545- } hitcount: 88
1546- { stacktrace:
1547- _do_fork+0x18e/0x330
1548- SyS_clone+0x19/0x20
1549- entry_SYSCALL_64_fastpath+0x12/0x6a
1550- } hitcount: 244
1551-
1552- Totals:
1553- Hits: 489
1554- Entries: 7
1555- Dropped: 0
1556+ See Documentation/trace/histogram.txt for details and examples.
1557diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
1558index d4601df6e72e..54213e5c23f6 100644
1559--- a/Documentation/trace/ftrace.txt
1560+++ b/Documentation/trace/ftrace.txt
1561@@ -539,6 +539,30 @@ of ftrace. Here is a list of some of the key files:
1562
1563 See events.txt for more information.
1564
1565+ timestamp_mode:
1566+
1567+ Certain tracers may change the timestamp mode used when
1568+ logging trace events into the event buffer. Events with
1569+ different modes can coexist within a buffer but the mode in
1570+ effect when an event is logged determines which timestamp mode
1571+ is used for that event. The default timestamp mode is
1572+ 'delta'.
1573+
1574+ Usual timestamp modes for tracing:
1575+
1576+ # cat timestamp_mode
1577+ [delta] absolute
1578+
1579+ The timestamp mode with the square brackets around it is the
1580+ one in effect.
1581+
1582+ delta: Default timestamp mode - timestamp is a delta against
1583+ a per-buffer timestamp.
1584+
1585+ absolute: The timestamp is a full timestamp, not a delta
1586+ against some other value. As such it takes up more
1587+ space and is less efficient.
1588+
1589 hwlat_detector:
1590
1591 Directory for the Hardware Latency Detector.
1592diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt
1593new file mode 100644
1594index 000000000000..6e05510afc28
1595--- /dev/null
1596+++ b/Documentation/trace/histogram.txt
1597@@ -0,0 +1,1995 @@
1598+ Event Histograms
1599+
1600+ Documentation written by Tom Zanussi
1601+
1602+1. Introduction
1603+===============
1604+
1605+ Histogram triggers are special event triggers that can be used to
1606+ aggregate trace event data into histograms. For information on
1607+ trace events and event triggers, see Documentation/trace/events.txt.
1608+
1609+
1610+2. Histogram Trigger Command
1611+============================
1612+
1613+ A histogram trigger command is an event trigger command that
1614+ aggregates event hits into a hash table keyed on one or more trace
1615+ event format fields (or stacktrace) and a set of running totals
1616+ derived from one or more trace event format fields and/or event
1617+ counts (hitcount).
1618+
1619+ The format of a hist trigger is as follows:
1620+
1621+ hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
1622+ [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
1623+ [:clear][:name=histname1] [if <filter>]
1624+
1625+ When a matching event is hit, an entry is added to a hash table
1626+ using the key(s) and value(s) named. Keys and values correspond to
1627+ fields in the event's format description. Values must correspond to
1628+ numeric fields - on an event hit, the value(s) will be added to a
1629+ sum kept for that field. The special string 'hitcount' can be used
1630+ in place of an explicit value field - this is simply a count of
1631+ event hits. If 'values' isn't specified, an implicit 'hitcount'
1632+ value will be automatically created and used as the only value.
1633+ Keys can be any field, or the special string 'stacktrace', which
1634+ will use the event's kernel stacktrace as the key. The keywords
1635+ 'keys' or 'key' can be used to specify keys, and the keywords
1636+ 'values', 'vals', or 'val' can be used to specify values. Compound
1637+ keys consisting of up to two fields can be specified by the 'keys'
1638+ keyword. Hashing a compound key produces a unique entry in the
1639+ table for each unique combination of component keys, and can be
1640+ useful for providing more fine-grained summaries of event data.
1641+ Additionally, sort keys consisting of up to two fields can be
1642+ specified by the 'sort' keyword. If more than one field is
1643+ specified, the result will be a 'sort within a sort': the first key
1644+ is taken to be the primary sort key and the second the secondary
1645+ key. If a hist trigger is given a name using the 'name' parameter,
1646+ its histogram data will be shared with other triggers of the same
1647+ name, and trigger hits will update this common data. Only triggers
1648+ with 'compatible' fields can be combined in this way; triggers are
1649+ 'compatible' if the fields named in the trigger share the same
1650+ number and type of fields and those fields also have the same names.
1651+ Note that any two events always share the compatible 'hitcount' and
1652+ 'stacktrace' fields and can therefore be combined using those
1653+ fields, however pointless that may be.
1654+
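+  As a quick illustration of compound keys and multi-field sorting (a
+  sketch of the same command used in the syscall examples later in
+  this document):
+
+  # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
+         /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
+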
1655+ 'hist' triggers add a 'hist' file to each event's subdirectory.
1656+ Reading the 'hist' file for the event will dump the hash table in
1657+ its entirety to stdout. If there are multiple hist triggers
1658+ attached to an event, there will be a table for each trigger in the
1659+ output. The table displayed for a named trigger will be the same as
1660+ any other instance having the same name. Each printed hash table
1661+ entry is a simple list of the keys and values comprising the entry;
1662+ keys are printed first and are delineated by curly braces, and are
1663+ followed by the set of value fields for the entry. By default,
1664+ numeric fields are displayed as base-10 integers. This can be
1665+ modified by appending any of the following modifiers to the field
1666+ name:
1667+
1668+ .hex display a number as a hex value
1669+ .sym display an address as a symbol
1670+ .sym-offset display an address as a symbol and offset
1671+ .syscall display a syscall id as a system call name
1672+ .execname display a common_pid as a program name
1673+ .log2 display log2 value rather than raw number
1674+ .usecs display a common_timestamp in microseconds
1675+
1676+ Note that in general the semantics of a given field aren't
1677+ interpreted when applying a modifier to it, but there are some
1678+ restrictions to be aware of in this regard:
1679+
1680+ - only the 'hex' modifier can be used for values (because values
1681+ are essentially sums, and the other modifiers don't make sense
1682+ in that context).
1683+ - the 'execname' modifier can only be used on a 'common_pid'. The
1684+ reason for this is that the execname is simply the 'comm' value
1685+ saved for the 'current' process when an event was triggered,
1686+ which is the same as the common_pid value saved by the event
1687+ tracing code. Trying to apply that comm value to other pid
1688+ values wouldn't be correct, and typically events that care save
1689+ pid-specific comm fields in the event itself.
1690+
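+  For instance, combining the rules above (a sketch only; the kmalloc
+  event and its fields are just convenient examples), a key can take
+  '.execname' on common_pid while a value can only take '.hex':
+
+  # echo 'hist:keys=common_pid.execname:vals=bytes_req.hex' > \
+         /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+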
1691+ A typical usage scenario would be the following: enable a hist
1692+ trigger, read its current contents, and then turn it off:
1693+
1694+ # echo 'hist:keys=skbaddr.hex:vals=len' > \
1695+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1696+
1697+ # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1698+
1699+ # echo '!hist:keys=skbaddr.hex:vals=len' > \
1700+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1701+
1702+ The trigger file itself can be read to show the details of the
1703+ currently attached hist trigger. This information is also displayed
1704+ at the top of the 'hist' file when read.
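+
+  For example, continuing the netif_rx trigger above (a sketch; the
+  exact format of the output is illustrated by the kmalloc examples
+  later in this document):
+
+  # cat /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+  hist:keys=skbaddr.hex:vals=len:sort=hitcount:size=2048 [active]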
1705+
1706+ By default, the size of the hash table is 2048 entries. The 'size'
1707+ parameter can be used to specify more or fewer than that. The units
1708+ are in terms of hashtable entries - if a run uses more entries than
1709+ specified, the results will show the number of 'drops', the number
1710+ of hits that were ignored. The size should be a power of 2 between
1711+ 128 and 131072 (any non-power-of-2 number specified will be rounded
1712+ up).
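+
+  For example, to allow up to 8192 entries (a sketch reusing the
+  kmalloc trigger from the examples later in this document):
+
+  # echo 'hist:keys=call_site:vals=bytes_req:size=8192' > \
+         /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger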
1713+
1714+ The 'sort' parameter can be used to specify a value field to sort
1715+ on. The default if unspecified is 'hitcount' and the default sort
1716+ order is 'ascending'. To sort in the opposite direction, append
1717+ '.descending' to the sort key.
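+
+  For example (the same command appears in the kmalloc examples later
+  in this document):
+
+  # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
+         /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger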
1718+
1719+ The 'pause' parameter can be used to pause an existing hist trigger
1720+ or to start a hist trigger but not log any events until told to do
1721+ so. 'continue' or 'cont' can be used to start or restart a paused
1722+ hist trigger.
1723+
1724+ The 'clear' parameter will clear the contents of a running hist
1725+ trigger and leave its current paused/active state unchanged.
1726+
1727+ Note that the 'pause', 'cont', and 'clear' parameters should be
1728+ applied using the 'append' shell operator ('>>') if applied to an
1729+ existing trigger, rather than via the '>' operator, which will cause
1730+ the trigger to be removed through truncation.
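+
+  For example (a sketch reusing the kmalloc trigger from the examples
+  later in this document), an existing trigger can be paused and then
+  resumed by appending the same specification with ':pause' or ':cont':
+
+  # echo 'hist:key=call_site.sym:val=bytes_req:pause' >> \
+         /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+
+  # echo 'hist:key=call_site.sym:val=bytes_req:cont' >> \
+         /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger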
1731+
1732+- enable_hist/disable_hist
1733+
1734+ The enable_hist and disable_hist triggers can be used to have one
1735+ event conditionally start and stop another event's already-attached
1736+ hist trigger. Any number of enable_hist and disable_hist triggers
1737+ can be attached to a given event, allowing that event to kick off
1738+ and stop aggregations on a host of other events.
1739+
1740+ The format is very similar to the enable/disable_event triggers:
1741+
1742+ enable_hist:<system>:<event>[:count]
1743+ disable_hist:<system>:<event>[:count]
1744+
1745+ Instead of enabling or disabling the tracing of the target event
1746+ into the trace buffer as the enable/disable_event triggers do, the
1747+ enable/disable_hist triggers enable or disable the aggregation of
1748+ the target event into a hash table.
1749+
1750+ A typical usage scenario for the enable_hist/disable_hist triggers
1751+ would be to first set up a paused hist trigger on some event,
1752+ followed by an enable_hist/disable_hist pair that turns the hist
1753+ aggregation on and off when conditions of interest are hit:
1754+
1755+ # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
1756+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1757+
1758+ # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
1759+ /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1760+
1761+ # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
1762+ /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1763+
1764+ The above sets up an initially paused hist trigger which is unpaused
1765+ and starts aggregating events when a given program is executed, and
1766+ which stops aggregating when the process exits and the hist trigger
1767+ is paused again.
1768+
1769+ The examples below provide a more concrete illustration of the
1770+ concepts and typical usage patterns discussed above.
1771+
1772+ 'special' event fields
1773+ ------------------------
1774+
1775+ There are a number of 'special event fields' available for use as
1776+ keys or values in a hist trigger. These look like and behave as if
1777+ they were actual event fields, but aren't really part of the event's
1778+ field definition or format file. They are however available for any
1779+ event, and can be used anywhere an actual event field could be.
1780+ They are:
1781+
1782+ common_timestamp u64 - timestamp (from ring buffer) associated
1783+ with the event, in nanoseconds. May be
1784+ modified by .usecs to have timestamps
1785+ interpreted as microseconds.
1786+ cpu int - the cpu on which the event occurred.
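+
+  For example, 'cpu' can be used as a hist key even though it doesn't
+  appear in an event's format file (a sketch; sched_switch is just an
+  arbitrary event choice here):
+
+  # echo 'hist:keys=cpu:vals=hitcount' > \
+         /sys/kernel/debug/tracing/events/sched/sched_switch/trigger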
1787+
1788+ Extended error information
1789+ --------------------------
1790+
1791+ For some error conditions encountered when invoking a hist trigger
1792+ command, extended error information is available via the
1793+ corresponding event's 'hist' file. Reading the hist file after an
1794+ error will display more detailed information about what went wrong,
1795+ if information is available. This extended error information will
1796+ be available until the next hist trigger command for that event.
1797+
1798+ If available for a given error condition, the extended error
1799+ information and usage takes the following form:
1800+
1801+ # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
1802+ echo: write error: Invalid argument
1803+
1804+ # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist
1805+ ERROR: Couldn't yyy: zzz
1806+ Last command: xxx
1807+
1808+6.2 'hist' trigger examples
1809+---------------------------
1810+
1811+ The first set of examples creates aggregations using the kmalloc
1812+ event. The fields that can be used for the hist trigger are listed
1813+ in the kmalloc event's format file:
1814+
1815+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
1816+ name: kmalloc
1817+ ID: 374
1818+ format:
1819+ field:unsigned short common_type; offset:0; size:2; signed:0;
1820+ field:unsigned char common_flags; offset:2; size:1; signed:0;
1821+ field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
1822+ field:int common_pid; offset:4; size:4; signed:1;
1823+
1824+ field:unsigned long call_site; offset:8; size:8; signed:0;
1825+ field:const void * ptr; offset:16; size:8; signed:0;
1826+ field:size_t bytes_req; offset:24; size:8; signed:0;
1827+ field:size_t bytes_alloc; offset:32; size:8; signed:0;
1828+ field:gfp_t gfp_flags; offset:40; size:4; signed:0;
1829+
1830+ We'll start by creating a hist trigger that generates a simple table
1831+ that lists the total number of bytes requested for each function in
1832+ the kernel that made one or more calls to kmalloc:
1833+
1834+ # echo 'hist:key=call_site:val=bytes_req' > \
1835+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1836+
1837+ This tells the tracing system to create a 'hist' trigger using the
1838+ call_site field of the kmalloc event as the key for the table, which
1839+ just means that each unique call_site address will have an entry
1840+ created for it in the table. The 'val=bytes_req' parameter tells
1841+ the hist trigger that for each unique entry (call_site) in the
1842+ table, it should keep a running total of the number of bytes
1843+ requested by that call_site.
1844+
1845+ We'll let it run for a while and then dump the contents of the 'hist'
1846+ file in the kmalloc event's subdirectory (for readability, a number
1847+ of entries have been omitted):
1848+
1849+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
1850+ # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
1851+
1852+ { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
1853+ { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
1854+ { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
1855+ { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
1856+ { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
1857+ { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
1858+ { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
1859+ { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
1860+ { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
1861+ { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
1862+ .
1863+ .
1864+ .
1865+ { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
1866+ { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
1867+ { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
1868+ { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
1869+ { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
1870+ { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
1871+ { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
1872+ { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
1873+ { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
1874+ { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
1875+ { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
1876+ { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
1877+
1878+ Totals:
1879+ Hits: 4610
1880+ Entries: 45
1881+ Dropped: 0
1882+
1883+ The output displays a line for each entry, beginning with the key
1884+ specified in the trigger, followed by the value(s) also specified in
1885+ the trigger. At the beginning of the output is a line that displays
1886+ the trigger info, which can also be displayed by reading the
1887+ 'trigger' file:
1888+
1889+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1890+ hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
1891+
1892+ At the end of the output are a few lines that display the overall
1893+ totals for the run. The 'Hits' field shows the total number of
1894+ times the event trigger was hit, the 'Entries' field shows the total
1895+ number of used entries in the hash table, and the 'Dropped' field
1896+ shows the number of hits that were dropped because the number of
1897+ used entries for the run exceeded the maximum number of entries
1898+ allowed for the table (normally 0; if not, it's a hint that you may
1899+ want to increase the size of the table using the 'size' parameter).
1900+
1901+ Notice in the above output that there's an extra field, 'hitcount',
1902+ which wasn't specified in the trigger. Also notice that in the
1903+ trigger info output, there's a parameter, 'sort=hitcount', which
1904+ wasn't specified in the trigger either. The reason for that is that
1905+ every trigger implicitly keeps a count of the total number of hits
1906+ attributed to a given entry, called the 'hitcount'. That hitcount
1907+ information is explicitly displayed in the output, and in the
1908+ absence of a user-specified sort parameter, is used as the default
1909+ sort field.
1910+
1911+ The value 'hitcount' can be used in place of an explicit value in
1912+ the 'values' parameter if you don't really need to have any
1913+ particular field summed and are mainly interested in hit
1914+ frequencies.
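+
+  For example (a sketch; this is just the earlier kmalloc trigger with
+  'hitcount' as its only value):
+
+  # echo 'hist:key=call_site:val=hitcount' > \
+         /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger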
1915+
1916+ To turn the hist trigger off, simply call up the trigger in the
1917+ command history and re-execute it with a '!' prepended:
1918+
1919+ # echo '!hist:key=call_site:val=bytes_req' > \
1920+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1921+
1922+ Finally, notice that the call_site as displayed in the output above
1923+ isn't really very useful. It's an address, but normally addresses
1924+ are displayed in hex. To have a numeric field displayed as a hex
1925+ value, simply append '.hex' to the field name in the trigger:
1926+
1927+ # echo 'hist:key=call_site.hex:val=bytes_req' > \
1928+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1929+
1930+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
1931+ # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
1932+
1933+ { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
1934+ { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
1935+ { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
1936+ { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
1937+ { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
1938+ { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
1939+ { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
1940+ { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
1941+ { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
1942+ { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
1943+ { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
1944+ { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
1945+ .
1946+ .
1947+ .
1948+ { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
1949+ { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
1950+ { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
1951+ { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
1952+ { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
1953+ { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
1954+ { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
1955+ { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
1956+ { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
1957+ { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
1958+ { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
1959+
1960+ Totals:
1961+ Hits: 4775
1962+ Entries: 46
1963+ Dropped: 0
1964+
1965+ Even that's only marginally more useful - while hex values do look
1966+ more like addresses, what users are typically more interested in
1967+ when looking at text addresses are the corresponding symbols
1968+ instead. To have an address displayed as a symbolic value,
1969+ simply append '.sym' or '.sym-offset' to the field name in the
1970+ trigger:
1971+
1972+ # echo 'hist:key=call_site.sym:val=bytes_req' > \
1973+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1974+
1975+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
1976+ # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
1977+
1978+ { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
1979+ { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
1980+ { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
1981+ { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
1982+ { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
1983+ { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
1984+ { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
1985+ { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
1986+ { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
1987+ { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
1988+ { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
1989+ { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
1990+ { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
1991+ { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
1992+ .
1993+ .
1994+ .
1995+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
1996+ { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
1997+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
1998+ { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
1999+ { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
2000+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
2001+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
2002+ { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
2003+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
2004+ { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
2005+ { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
2006+ { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
2007+ { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
2008+
2009+ Totals:
2010+ Hits: 109928
2011+ Entries: 71
2012+ Dropped: 0
2013+
2014+ Because the default sort key above is 'hitcount', the above shows
2015+ the list of call_sites by increasing hitcount, so that at the bottom
2016+ we see the functions that made the most kmalloc calls during the
2017+ run. If instead we wanted to see the top kmalloc callers in
2018+ terms of the number of bytes requested rather than the number of
2019+ calls, and we wanted the top caller to appear at the top, we can use
2020+ the 'sort' parameter, along with the 'descending' modifier:
2021+
2022+ # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
2023+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2024+
2025+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2026+ # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
2027+
2028+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
2029+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
2030+ { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
2031+ { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
2032+ { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
2033+ { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
2034+ { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
2035+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
2036+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
2037+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
2038+ { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
2039+ { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
2040+ { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
2041+ .
2042+ .
2043+ .
2044+ { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
2045+ { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
2046+ { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
2047+ { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
2048+ { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
2049+ { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
2050+ { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
2051+ { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
2052+ { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
2053+ { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
2054+
2055+ Totals:
2056+ Hits: 32133
2057+ Entries: 81
2058+ Dropped: 0
2059+
2060+ To display the offset and size information in addition to the symbol
2061+ name, just use 'sym-offset' instead:
2062+
2063+ # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
2064+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2065+
2066+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2067+ # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
2068+
2069+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
2070+ { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
2071+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
2072+ { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
2073+ { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
2074+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
2075+ { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
2076+ { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
2077+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
2078+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
2079+ { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
2080+ { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
2081+ .
2082+ .
2083+ .
2084+ { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
2085+ { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
2086+ { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
2087+ { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
2088+ { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
2089+ { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
2090+ { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
2091+
2092+ Totals:
2093+ Hits: 26098
2094+ Entries: 64
2095+ Dropped: 0
2096+
2097+ We can also add multiple fields to the 'values' parameter. For
2098+ example, we might want to see the total number of bytes allocated
2099+ alongside bytes requested, and display the result sorted by bytes
2100+ allocated in a descending order:
2101+
2102+ # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
2103+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2104+
2105+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2106+ # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
2107+
2108+ { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
2109+ { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
2110+ { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
2111+ { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
2112+ { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
2113+ { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
2114+ { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
2115+ { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
2116+ { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
2117+ { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
2118+ { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
2119+ { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
2120+ .
2121+ .
2122+ .
2123+ { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
2124+ { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2125+ { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2126+ { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2127+ { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
2128+ { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
2129+ { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
2130+ { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
2131+ { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
2132+ { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
2133+
2134+ Totals:
2135+ Hits: 66598
2136+ Entries: 65
2137+ Dropped: 0
2138+
2139+ Finally, to finish off our kmalloc example, instead of simply having
2140+ the hist trigger display symbolic call_sites, we can have the hist
2141+ trigger additionally display the complete set of kernel stack traces
2142+ that led to each call_site. To do that, we simply use the special
2143+ value 'stacktrace' for the key parameter:
2144+
2145+ # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
2146+ /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
2147+
2148+ The above trigger will use the kernel stack trace in effect when an
2149+ event is triggered as the key for the hash table. This allows the
2150+ enumeration of every kernel callpath that led up to a particular
2151+ event, along with a running total of any of the event fields for
2152+ that event. Here we tally bytes requested and bytes allocated for
2153+ every callpath in the system that led up to a kmalloc (in this case
2154+ every callpath to a kmalloc for a kernel compile):
2155+
2156+ # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
2157+ # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
2158+
2159+ { stacktrace:
2160+ __kmalloc_track_caller+0x10b/0x1a0
2161+ kmemdup+0x20/0x50
2162+ hidraw_report_event+0x8a/0x120 [hid]
2163+ hid_report_raw_event+0x3ea/0x440 [hid]
2164+ hid_input_report+0x112/0x190 [hid]
2165+ hid_irq_in+0xc2/0x260 [usbhid]
2166+ __usb_hcd_giveback_urb+0x72/0x120
2167+ usb_giveback_urb_bh+0x9e/0xe0
2168+ tasklet_hi_action+0xf8/0x100
2169+ __do_softirq+0x114/0x2c0
2170+ irq_exit+0xa5/0xb0
2171+ do_IRQ+0x5a/0xf0
2172+ ret_from_intr+0x0/0x30
2173+ cpuidle_enter+0x17/0x20
2174+ cpu_startup_entry+0x315/0x3e0
2175+ rest_init+0x7c/0x80
2176+ } hitcount: 3 bytes_req: 21 bytes_alloc: 24
2177+ { stacktrace:
2178+ __kmalloc_track_caller+0x10b/0x1a0
2179+ kmemdup+0x20/0x50
2180+ hidraw_report_event+0x8a/0x120 [hid]
2181+ hid_report_raw_event+0x3ea/0x440 [hid]
2182+ hid_input_report+0x112/0x190 [hid]
2183+ hid_irq_in+0xc2/0x260 [usbhid]
2184+ __usb_hcd_giveback_urb+0x72/0x120
2185+ usb_giveback_urb_bh+0x9e/0xe0
2186+ tasklet_hi_action+0xf8/0x100
2187+ __do_softirq+0x114/0x2c0
2188+ irq_exit+0xa5/0xb0
2189+ do_IRQ+0x5a/0xf0
2190+ ret_from_intr+0x0/0x30
2191+ } hitcount: 3 bytes_req: 21 bytes_alloc: 24
2192+ { stacktrace:
2193+ kmem_cache_alloc_trace+0xeb/0x150
2194+ aa_alloc_task_context+0x27/0x40
2195+ apparmor_cred_prepare+0x1f/0x50
2196+ security_prepare_creds+0x16/0x20
2197+ prepare_creds+0xdf/0x1a0
2198+ SyS_capset+0xb5/0x200
2199+ system_call_fastpath+0x12/0x6a
2200+ } hitcount: 1 bytes_req: 32 bytes_alloc: 32
2201+ .
2202+ .
2203+ .
2204+ { stacktrace:
2205+ __kmalloc+0x11b/0x1b0
2206+ i915_gem_execbuffer2+0x6c/0x2c0 [i915]
2207+ drm_ioctl+0x349/0x670 [drm]
2208+ do_vfs_ioctl+0x2f0/0x4f0
2209+ SyS_ioctl+0x81/0xa0
2210+ system_call_fastpath+0x12/0x6a
2211+ } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
2212+ { stacktrace:
2213+ __kmalloc+0x11b/0x1b0
2214+ load_elf_phdrs+0x76/0xa0
2215+ load_elf_binary+0x102/0x1650
2216+ search_binary_handler+0x97/0x1d0
2217+ do_execveat_common.isra.34+0x551/0x6e0
2218+ SyS_execve+0x3a/0x50
2219+ return_from_execve+0x0/0x23
2220+ } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
2221+ { stacktrace:
2222+ kmem_cache_alloc_trace+0xeb/0x150
2223+ apparmor_file_alloc_security+0x27/0x40
2224+ security_file_alloc+0x16/0x20
2225+ get_empty_filp+0x93/0x1c0
2226+ path_openat+0x31/0x5f0
2227+ do_filp_open+0x3a/0x90
2228+ do_sys_open+0x128/0x220
2229+ SyS_open+0x1e/0x20
2230+ system_call_fastpath+0x12/0x6a
2231+ } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
2232+ { stacktrace:
2233+ __kmalloc+0x11b/0x1b0
2234+ seq_buf_alloc+0x1b/0x50
2235+ seq_read+0x2cc/0x370
2236+ proc_reg_read+0x3d/0x80
2237+ __vfs_read+0x28/0xe0
2238+ vfs_read+0x86/0x140
2239+ SyS_read+0x46/0xb0
2240+ system_call_fastpath+0x12/0x6a
2241+ } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
2242+
2243+ Totals:
2244+ Hits: 6085872
2245+ Entries: 253
2246+ Dropped: 0
2247+
2248+ If you key a hist trigger on common_pid, for example in order to
2249+ gather and display sorted totals for each process, you can use the
2250+ special .execname modifier to display the executable names for the
2251+ processes in the table rather than raw pids. The example below
2252+ keeps a per-process sum of total bytes read:
2253+
2254+ # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
2255+ /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
2256+
2257+ # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
2258+ # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
2259+
2260+ { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
2261+ { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
2262+ { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
2263+ { common_pid: bash [ 8710] } hitcount: 3 count: 66369
2264+ { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
2265+ { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
2266+ { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
2267+ { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
2268+ { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
2269+ { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
2270+ { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
2271+ .
2272+ .
2273+ .
2274+ { common_pid: postgres [ 1892] } hitcount: 2 count: 32
2275+ { common_pid: postgres [ 1891] } hitcount: 2 count: 32
2276+ { common_pid: gmain [ 8704] } hitcount: 2 count: 32
2277+ { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
2278+ { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
2279+ { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
2280+ { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
2281+ { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
2282+ { common_pid: init [ 1] } hitcount: 2 count: 2
2283+
2284+ Totals:
2285+ Hits: 2116
2286+ Entries: 51
2287+ Dropped: 0
2288+
2289+ Similarly, if you key a hist trigger on syscall id, for example to
2290+ gather and display a list of systemwide syscall hits, you can use
2291+ the special .syscall modifier to display the syscall names rather
2292+ than raw ids. The example below keeps a running total of syscall
2293+ counts for the system during the run:
2294+
2295+ # echo 'hist:key=id.syscall:val=hitcount' > \
2296+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
2297+
2298+ # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
2299+ # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
2300+
2301+ { id: sys_fsync [ 74] } hitcount: 1
2302+ { id: sys_newuname [ 63] } hitcount: 1
2303+ { id: sys_prctl [157] } hitcount: 1
2304+ { id: sys_statfs [137] } hitcount: 1
2305+ { id: sys_symlink [ 88] } hitcount: 1
2306+ { id: sys_sendmmsg [307] } hitcount: 1
2307+ { id: sys_semctl [ 66] } hitcount: 1
2308+ { id: sys_readlink [ 89] } hitcount: 3
2309+ { id: sys_bind [ 49] } hitcount: 3
2310+ { id: sys_getsockname [ 51] } hitcount: 3
2311+ { id: sys_unlink [ 87] } hitcount: 3
2312+ { id: sys_rename [ 82] } hitcount: 4
2313+ { id: unknown_syscall [ 58] } hitcount: 4
2314+ { id: sys_connect [ 42] } hitcount: 4
2315+ { id: sys_getpid [ 39] } hitcount: 4
2316+ .
2317+ .
2318+ .
2319+ { id: sys_rt_sigprocmask [ 14] } hitcount: 952
2320+ { id: sys_futex [202] } hitcount: 1534
2321+ { id: sys_write [ 1] } hitcount: 2689
2322+ { id: sys_setitimer [ 38] } hitcount: 2797
2323+ { id: sys_read [ 0] } hitcount: 3202
2324+ { id: sys_select [ 23] } hitcount: 3773
2325+ { id: sys_writev [ 20] } hitcount: 4531
2326+ { id: sys_poll [ 7] } hitcount: 8314
2327+ { id: sys_recvmsg [ 47] } hitcount: 13738
2328+ { id: sys_ioctl [ 16] } hitcount: 21843
2329+
2330+ Totals:
2331+ Hits: 67612
2332+ Entries: 72
2333+ Dropped: 0
2334+
2335+ The syscall counts above provide a rough overall picture of system
2336+ call activity on the system; we can see for example that the most
2337+ popular system call on this system was the 'sys_ioctl' system call.
2338+
2339+ We can use 'compound' keys to refine that number and provide some
2340+ further insight as to which processes exactly contribute to the
2341+ overall ioctl count.
2342+
2343+ The command below keeps a hitcount for every unique combination of
2344+ system call id and pid - the end result is essentially a table
2345+ that keeps a per-pid sum of system call hits. The results are
2346+ sorted using the system call id as the primary key, and the
2347+ hitcount sum as the secondary key:
2348+
2349+ # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
2350+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
2351+
2352+ # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
2353+ # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
2354+
2355+ { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
2356+ { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
2357+ { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
2358+ { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
2359+ { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
2360+ { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
2361+ { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
2362+ { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
2363+ { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
2364+ { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
2365+ .
2366+ .
2367+ .
2368+ { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
2369+ { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
2370+ { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
2371+ { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
2372+ { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
2373+ .
2374+ .
2375+ .
2376+ { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
2377+ { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
2378+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
2379+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
2380+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
2381+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
2382+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
2383+ { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
2384+ { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
2385+ { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
2386+ { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
2387+
2388+ Totals:
2389+ Hits: 31536
2390+ Entries: 323
2391+ Dropped: 0
2392+
2393+ The above list does give us a breakdown of the ioctl syscall by
2394+ pid, but it also gives us quite a bit more than that, which we
2395+ don't really care about at the moment. Since we know the syscall
2396+ id for sys_ioctl (16, displayed next to the sys_ioctl name), we
2397+ can use that to filter out all the other syscalls:
2398+
2399+ # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
2400+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
2401+
2402+ # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
2403+ # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
2404+
2405+ { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
2406+ { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
2407+ { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
2408+ { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
2409+ { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
2410+ { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
2411+ { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
2412+ { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
2413+ { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
2414+ .
2415+ .
2416+ .
2417+ { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
2418+ { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
2419+ { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
2420+ { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
2421+ { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
2422+ { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
2423+
2424+ Totals:
2425+ Hits: 101162
2426+ Entries: 103
2427+ Dropped: 0
2428+
2429+ The above output shows that 'compiz' and 'Xorg' are far and away
2430+ the heaviest ioctl callers (which might lead to questions about
2431+ whether they really need to be making all those calls and to
2432+ possible avenues for further investigation.)
2433+
2434+ The compound key examples used a key and a sum value (hitcount) to
2435+ sort the output, but we can just as easily use two keys instead.
2436+ Here's an example where we use a compound key composed of the
2437+ common_pid and size event fields. Sorting with pid as the primary
2438+ key and 'size' as the secondary key allows us to display an
2439+ ordered summary of the recvfrom sizes, with counts, received by
2440+ each process:
2441+
2442+ # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
2443+ /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
2444+
2445+ # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
2446+ # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
2447+
2448+ { common_pid: smbd [ 784], size: 4 } hitcount: 1
2449+ { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
2450+ { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
2451+ { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
2452+ { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
2453+ { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
2454+ { common_pid: compiz [ 2994], size: 8 } hitcount: 1
2455+ { common_pid: compiz [ 2994], size: 20 } hitcount: 11
2456+ { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
2457+ { common_pid: firefox [ 8817], size: 4 } hitcount: 1
2458+ { common_pid: firefox [ 8817], size: 8 } hitcount: 5
2459+ { common_pid: firefox [ 8817], size: 588 } hitcount: 2
2460+ { common_pid: firefox [ 8817], size: 628 } hitcount: 1
2461+ { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
2462+ { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
2463+ { common_pid: firefox [ 8822], size: 8 } hitcount: 2
2464+ { common_pid: firefox [ 8822], size: 160 } hitcount: 2
2465+ { common_pid: firefox [ 8822], size: 320 } hitcount: 2
2466+ { common_pid: firefox [ 8822], size: 352 } hitcount: 1
2467+ .
2468+ .
2469+ .
2470+ { common_pid: pool [ 8923], size: 1960 } hitcount: 10
2471+ { common_pid: pool [ 8923], size: 2048 } hitcount: 10
2472+ { common_pid: pool [ 8924], size: 1960 } hitcount: 10
2473+ { common_pid: pool [ 8924], size: 2048 } hitcount: 10
2474+ { common_pid: pool [ 8928], size: 1964 } hitcount: 4
2475+ { common_pid: pool [ 8928], size: 1965 } hitcount: 2
2476+ { common_pid: pool [ 8928], size: 2048 } hitcount: 6
2477+ { common_pid: pool [ 8929], size: 1982 } hitcount: 1
2478+ { common_pid: pool [ 8929], size: 2048 } hitcount: 1
2479+
2480+ Totals:
2481+ Hits: 2016
2482+ Entries: 224
2483+ Dropped: 0
2484+
2485+ The above example also illustrates the fact that although a compound
2486+ key is treated as a single entity for hashing purposes, the sub-keys
2487+ it's composed of can be accessed independently.
2488+
2489+ The next example uses a string field as the hash key and
2490+ demonstrates how you can manually pause and continue a hist trigger.
2491+ In this example, we'll aggregate fork counts and don't expect a
2492+ large number of entries in the hash table, so we'll drop it to a
2493+ much smaller number, say 256:
2494+
2495+ # echo 'hist:key=child_comm:val=hitcount:size=256' > \
2496+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
2497+
2498+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
2499+ # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
2500+
2501+ { child_comm: dconf worker } hitcount: 1
2502+ { child_comm: ibus-daemon } hitcount: 1
2503+ { child_comm: whoopsie } hitcount: 1
2504+ { child_comm: smbd } hitcount: 1
2505+ { child_comm: gdbus } hitcount: 1
2506+ { child_comm: kthreadd } hitcount: 1
2507+ { child_comm: dconf worker } hitcount: 1
2508+ { child_comm: evolution-alarm } hitcount: 2
2509+ { child_comm: Socket Thread } hitcount: 2
2510+ { child_comm: postgres } hitcount: 2
2511+ { child_comm: bash } hitcount: 3
2512+ { child_comm: compiz } hitcount: 3
2513+ { child_comm: evolution-sourc } hitcount: 4
2514+ { child_comm: dhclient } hitcount: 4
2515+ { child_comm: pool } hitcount: 5
2516+ { child_comm: nm-dispatcher.a } hitcount: 8
2517+ { child_comm: firefox } hitcount: 8
2518+ { child_comm: dbus-daemon } hitcount: 8
2519+ { child_comm: glib-pacrunner } hitcount: 10
2520+ { child_comm: evolution } hitcount: 23
2521+
2522+ Totals:
2523+ Hits: 89
2524+ Entries: 20
2525+ Dropped: 0
2526+
2527+ If we want to pause the hist trigger, we can simply append :pause to
2528+ the command that started the trigger. Notice that the trigger info
2529+ displays as [paused]:
2530+
2531+ # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
2532+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
2533+
2534+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
2535+ # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
2536+
2537+ { child_comm: dconf worker } hitcount: 1
2538+ { child_comm: kthreadd } hitcount: 1
2539+ { child_comm: dconf worker } hitcount: 1
2540+ { child_comm: gdbus } hitcount: 1
2541+ { child_comm: ibus-daemon } hitcount: 1
2542+ { child_comm: Socket Thread } hitcount: 2
2543+ { child_comm: evolution-alarm } hitcount: 2
2544+ { child_comm: smbd } hitcount: 2
2545+ { child_comm: bash } hitcount: 3
2546+ { child_comm: whoopsie } hitcount: 3
2547+ { child_comm: compiz } hitcount: 3
2548+ { child_comm: evolution-sourc } hitcount: 4
2549+ { child_comm: pool } hitcount: 5
2550+ { child_comm: postgres } hitcount: 6
2551+ { child_comm: firefox } hitcount: 8
2552+ { child_comm: dhclient } hitcount: 10
2553+ { child_comm: emacs } hitcount: 12
2554+ { child_comm: dbus-daemon } hitcount: 20
2555+ { child_comm: nm-dispatcher.a } hitcount: 20
2556+ { child_comm: evolution } hitcount: 35
2557+ { child_comm: glib-pacrunner } hitcount: 59
2558+
2559+ Totals:
2560+ Hits: 199
2561+ Entries: 21
2562+ Dropped: 0
2563+
2564+ To manually continue having the trigger aggregate events, append
2565+ :cont instead. Notice that the trigger info displays as [active]
2566+ again, and the data has changed:
2567+
2568+ # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
2569+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
2570+
2571+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
2572+ # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
2573+
2574+ { child_comm: dconf worker } hitcount: 1
2575+ { child_comm: dconf worker } hitcount: 1
2576+ { child_comm: kthreadd } hitcount: 1
2577+ { child_comm: gdbus } hitcount: 1
2578+ { child_comm: ibus-daemon } hitcount: 1
2579+ { child_comm: Socket Thread } hitcount: 2
2580+ { child_comm: evolution-alarm } hitcount: 2
2581+ { child_comm: smbd } hitcount: 2
2582+ { child_comm: whoopsie } hitcount: 3
2583+ { child_comm: compiz } hitcount: 3
2584+ { child_comm: evolution-sourc } hitcount: 4
2585+ { child_comm: bash } hitcount: 5
2586+ { child_comm: pool } hitcount: 5
2587+ { child_comm: postgres } hitcount: 6
2588+ { child_comm: firefox } hitcount: 8
2589+ { child_comm: dhclient } hitcount: 11
2590+ { child_comm: emacs } hitcount: 12
2591+ { child_comm: dbus-daemon } hitcount: 22
2592+ { child_comm: nm-dispatcher.a } hitcount: 22
2593+ { child_comm: evolution } hitcount: 35
2594+ { child_comm: glib-pacrunner } hitcount: 59
2595+
2596+ Totals:
2597+ Hits: 206
2598+ Entries: 21
2599+ Dropped: 0
2600+
2601+ The previous example showed how to start and stop a hist trigger by
2602+ appending 'pause' and 'continue' to the hist trigger command. A
2603+ hist trigger can also be started in a paused state by initially
2604+ starting the trigger with ':pause' appended. This allows you to
2605+ start the trigger only when you're ready to start collecting data
2606+ and not before. For example, you could start the trigger in a
2607+ paused state, then unpause it and do something you want to measure,
2608+ then pause the trigger again when done.
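
  As a concrete sketch (it reuses the sched_process_fork trigger from
  the examples above and introduces nothing new), the manual workflow
  is to create the trigger paused, unpause it around the workload,
  then pause it again when done:

    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' > \
      /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger

    # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
      /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger

      ... run the workload you want to measure ...

    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
      /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger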
2609+
2610+ Of course, doing this manually can be difficult and error-prone, but
2611+ it is possible to automatically start and stop a hist trigger based
2612+ on some condition, via the enable_hist and disable_hist triggers.
2613+
2614+ For example, suppose we wanted to take a look at the relative
2615+ weights in terms of skb length for each callpath that leads to a
2616+ netif_receive_skb event when downloading a decent-sized file using
2617+ wget.
2618+
2619+ First we set up an initially paused stacktrace trigger on the
2620+ netif_receive_skb event:
2621+
2622+ # echo 'hist:key=stacktrace:vals=len:pause' > \
2623+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2624+
2625+ Next, we set up an 'enable_hist' trigger on the sched_process_exec
2626+ event, with an 'if filename==/usr/bin/wget' filter. The effect of
2627+ this new trigger is that it will 'unpause' the hist trigger we just
2628+ set up on netif_receive_skb if and only if it sees a
2629+ sched_process_exec event with a filename of '/usr/bin/wget'. When
2630+ that happens, all netif_receive_skb events are aggregated into a
2631+ hash table keyed on stacktrace:
2632+
2633+ # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
2634+ /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
2635+
2636+ The aggregation continues until the netif_receive_skb hist trigger is paused
2637+ again, which is what the following disable_hist event does by
2638+ creating a similar setup on the sched_process_exit event, using the
2639+ filter 'comm==wget':
2640+
2641+ # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
2642+ /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
2643+
2644+ Whenever a process exits and the comm field of the disable_hist
2645+ trigger filter matches 'comm==wget', the netif_receive_skb hist
2646+ trigger is disabled.
2647+
2648+ The overall effect is that netif_receive_skb events are aggregated
2649+ into the hash table for only the duration of the wget. Executing a
2650+ wget command and then listing the 'hist' file will display the
2651+ output generated by the wget command:
2652+
2653+ $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
2654+
2655+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
2656+ # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
2657+
2658+ { stacktrace:
2659+ __netif_receive_skb_core+0x46d/0x990
2660+ __netif_receive_skb+0x18/0x60
2661+ netif_receive_skb_internal+0x23/0x90
2662+ napi_gro_receive+0xc8/0x100
2663+ ieee80211_deliver_skb+0xd6/0x270 [mac80211]
2664+ ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
2665+ ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
2666+ ieee80211_rx+0x31d/0x900 [mac80211]
2667+ iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
2668+ iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
2669+ iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
2670+ irq_thread_fn+0x20/0x50
2671+ irq_thread+0x11f/0x150
2672+ kthread+0xd2/0xf0
2673+ ret_from_fork+0x42/0x70
2674+ } hitcount: 85 len: 28884
2675+ { stacktrace:
2676+ __netif_receive_skb_core+0x46d/0x990
2677+ __netif_receive_skb+0x18/0x60
2678+ netif_receive_skb_internal+0x23/0x90
2679+ napi_gro_complete+0xa4/0xe0
2680+ dev_gro_receive+0x23a/0x360
2681+ napi_gro_receive+0x30/0x100
2682+ ieee80211_deliver_skb+0xd6/0x270 [mac80211]
2683+ ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
2684+ ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
2685+ ieee80211_rx+0x31d/0x900 [mac80211]
2686+ iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
2687+ iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
2688+ iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
2689+ irq_thread_fn+0x20/0x50
2690+ irq_thread+0x11f/0x150
2691+ kthread+0xd2/0xf0
2692+ } hitcount: 98 len: 664329
2693+ { stacktrace:
2694+ __netif_receive_skb_core+0x46d/0x990
2695+ __netif_receive_skb+0x18/0x60
2696+ process_backlog+0xa8/0x150
2697+ net_rx_action+0x15d/0x340
2698+ __do_softirq+0x114/0x2c0
2699+ do_softirq_own_stack+0x1c/0x30
2700+ do_softirq+0x65/0x70
2701+ __local_bh_enable_ip+0xb5/0xc0
2702+ ip_finish_output+0x1f4/0x840
2703+ ip_output+0x6b/0xc0
2704+ ip_local_out_sk+0x31/0x40
2705+ ip_send_skb+0x1a/0x50
2706+ udp_send_skb+0x173/0x2a0
2707+ udp_sendmsg+0x2bf/0x9f0
2708+ inet_sendmsg+0x64/0xa0
2709+ sock_sendmsg+0x3d/0x50
2710+ } hitcount: 115 len: 13030
2711+ { stacktrace:
2712+ __netif_receive_skb_core+0x46d/0x990
2713+ __netif_receive_skb+0x18/0x60
2714+ netif_receive_skb_internal+0x23/0x90
2715+ napi_gro_complete+0xa4/0xe0
2716+ napi_gro_flush+0x6d/0x90
2717+ iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
2718+ irq_thread_fn+0x20/0x50
2719+ irq_thread+0x11f/0x150
2720+ kthread+0xd2/0xf0
2721+ ret_from_fork+0x42/0x70
2722+ } hitcount: 934 len: 5512212
2723+
2724+ Totals:
2725+ Hits: 1232
2726+ Entries: 4
2727+ Dropped: 0
2728+
2729+ The above shows all the netif_receive_skb callpaths and their total
2730+ lengths for the duration of the wget command.
2731+
2732+ The 'clear' hist trigger param can be used to clear the hash table.
2733+ Suppose we wanted to try another run of the previous example but
2734+ this time also wanted to see the complete list of events that went
2735+ into the histogram. In order to avoid having to set everything up
2736+ again, we can just clear the histogram first:
2737+
2738+ # echo 'hist:key=stacktrace:vals=len:clear' >> \
2739+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2740+
2741+ Just to verify that it is in fact cleared, here's what we now see in
2742+ the hist file:
2743+
2744+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
2745+ # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
2746+
2747+ Totals:
2748+ Hits: 0
2749+ Entries: 0
2750+ Dropped: 0
2751+
2752+ Since we want to see the detailed list of every netif_receive_skb
2753+ event occurring during the new run, which are in fact the same
2754+ events being aggregated into the hash table, we add 'enable_event'
2755+ and 'disable_event' triggers to the triggering sched_process_exec and
2756+ sched_process_exit events, like so:
2757+
2758+ # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
2759+ /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
2760+
2761+ # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
2762+ /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
2763+
2764+ If you read the trigger files for the sched_process_exec and
2765+ sched_process_exit triggers, you should see two triggers for each:
2766+ one enabling/disabling the hist aggregation and the other
2767+ enabling/disabling the logging of events:
2768+
2769+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
2770+ enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
2771+ enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
2772+
2773+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
2774+ enable_event:net:netif_receive_skb:unlimited if comm==wget
2775+ disable_hist:net:netif_receive_skb:unlimited if comm==wget
2776+
2777+ In other words, whenever either of the sched_process_exec or
2778+ sched_process_exit events is hit and matches 'wget', it enables or
2779+ disables both the histogram and the event log, and what you end up
2780+ with is a hash table and set of events just covering the specified
2781+ duration. Run the wget command again:
2782+
2783+ $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
2784+
2785+ Displaying the 'hist' file should show something similar to what you
2786+ saw in the last run, but this time you should also see the
2787+ individual events in the trace file:
2788+
2789+ # cat /sys/kernel/debug/tracing/trace
2790+
2791+ # tracer: nop
2792+ #
2793+ # entries-in-buffer/entries-written: 183/1426 #P:4
2794+ #
2795+ # _-----=> irqs-off
2796+ # / _----=> need-resched
2797+ # | / _---=> hardirq/softirq
2798+ # || / _--=> preempt-depth
2799+ # ||| / delay
2800+ # TASK-PID CPU# |||| TIMESTAMP FUNCTION
2801+ # | | | |||| | |
2802+ wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
2803+ wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
2804+ dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
2805+ dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
2806+ ##### CPU 2 buffer started ####
2807+ irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
2808+ irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
2809+ irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
2810+ irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
2811+ irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
2812+ .
2813+ .
2814+ .
2815+
2816+ The following example demonstrates how multiple hist triggers can be
2817+ attached to a given event. This capability can be useful for
2818+ creating a set of different summaries derived from the same set of
2819+ events, or for comparing the effects of different filters, among
2820+ other things.
2821+
2822+ # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
2823+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2824+ # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
2825+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2826+ # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
2827+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2828+ # echo 'hist:keys=skbaddr.hex:vals=len' >> \
2829+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2830+ # echo 'hist:keys=len:vals=common_preempt_count' >> \
2831+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2832+
2833+ The above set of commands creates four triggers differing only in
2834+ their filters, along with a completely different though fairly
2835+ nonsensical trigger. Note that in order to append multiple hist
2836+ triggers to the same file, you should use the '>>' operator to
2837+ append them ('>' will also add the new hist trigger, but will remove
2838+ any existing hist triggers beforehand).
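
  An individual hist trigger can later be removed without disturbing
  the others by echoing the original command prefixed with '!'. As an
  illustrative sketch (assuming the removal command must match the
  original trigger exactly), the following would remove just the
  unfiltered skbaddr trigger added above:

    # echo '!hist:keys=skbaddr.hex:vals=len' >> \
      /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger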
2839+
2840+ Displaying the contents of the 'hist' file for the event shows the
2841+ contents of all five histograms:
2842+
2843+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
2844+
2845+ # event histogram
2846+ #
2847+ # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
2848+ #
2849+
2850+ { len: 176 } hitcount: 1 common_preempt_count: 0
2851+ { len: 223 } hitcount: 1 common_preempt_count: 0
2852+ { len: 4854 } hitcount: 1 common_preempt_count: 0
2853+ { len: 395 } hitcount: 1 common_preempt_count: 0
2854+ { len: 177 } hitcount: 1 common_preempt_count: 0
2855+ { len: 446 } hitcount: 1 common_preempt_count: 0
2856+ { len: 1601 } hitcount: 1 common_preempt_count: 0
2857+ .
2858+ .
2859+ .
2860+ { len: 1280 } hitcount: 66 common_preempt_count: 0
2861+ { len: 116 } hitcount: 81 common_preempt_count: 40
2862+ { len: 708 } hitcount: 112 common_preempt_count: 0
2863+ { len: 46 } hitcount: 221 common_preempt_count: 0
2864+ { len: 1264 } hitcount: 458 common_preempt_count: 0
2865+
2866+ Totals:
2867+ Hits: 1428
2868+ Entries: 147
2869+ Dropped: 0
2870+
2871+
2872+ # event histogram
2873+ #
2874+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
2875+ #
2876+
2877+ { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
2878+ { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
2879+ { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
2880+ { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
2881+ { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
2882+ { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
2883+ { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
2884+ { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
2885+ { skbaddr: ffff880100065900 } hitcount: 1 len: 46
2886+ { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
2887+ { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
2888+ { skbaddr: ffff880100064700 } hitcount: 1 len: 365
2889+ { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
2890+ .
2891+ .
2892+ .
2893+ { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
2894+ { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
2895+ { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
2896+ { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
2897+ { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
2898+ { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
2899+ { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
2900+ { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
2901+ { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
2902+
2903+ Totals:
2904+ Hits: 1451
2905+ Entries: 318
2906+ Dropped: 0
2907+
2908+
2909+ # event histogram
2910+ #
2911+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
2912+ #
2913+
2914+
2915+ Totals:
2916+ Hits: 0
2917+ Entries: 0
2918+ Dropped: 0
2919+
2920+
2921+ # event histogram
2922+ #
2923+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
2924+ #
2925+
2926+ { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
2927+ { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
2928+ { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
2929+ { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
2930+ { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
2931+ { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
2932+ { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
2933+ { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
2934+ { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
2935+ { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
2936+ { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
2937+ { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
2938+
2939+ Totals:
2940+ Hits: 14
2941+ Entries: 12
2942+ Dropped: 0
2943+
2944+
2945+ # event histogram
2946+ #
2947+ # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
2948+ #
2949+
2950+
2951+ Totals:
2952+ Hits: 0
2953+ Entries: 0
2954+ Dropped: 0
2955+
2956+ Named triggers can be used to have triggers share a common set of
2957+ histogram data. This capability is mostly useful for combining the
2958+ output of events generated by tracepoints contained inside inline
2959+ functions, but names can be used in a hist trigger on any event.
2960+ For example, these two triggers when hit will update the same 'len'
2961+ field in the shared 'foo' histogram data:
2962+
2963+ # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
2964+ /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
2965+ # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
2966+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
2967+
2968+ You can see that they're updating common histogram data by reading
2969+ each event's hist files at the same time:
2970+
2971+ # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
2972+ cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
2973+
2974+ # event histogram
2975+ #
2976+ # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
2977+ #
2978+
2979+ { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
2980+ { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
2981+ { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
2982+ { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
2983+ { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
2984+ { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
2985+ { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
2986+ { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
2987+ { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
2988+ { skbaddr: ffff880064505000 } hitcount: 1 len: 46
2989+ { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
2990+ { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
2991+ { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
2992+ { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
2993+ { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
2994+ { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
2995+ { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
2996+ { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
2997+ { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
2998+ { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
2999+ { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
3000+ { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
3001+ { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
3002+ { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
3003+ { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
3004+ { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
3005+ { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
3006+ { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
3007+ { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
3008+ { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
3009+ { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
3010+ { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
3011+ { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
3012+ { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
3013+ { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
3014+ { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
3015+ { skbaddr: ffff880064504400 } hitcount: 4 len: 184
3016+ { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
3017+ { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
3018+ { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
3019+ { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
3020+ { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
3021+
3022+ Totals:
3023+ Hits: 81
3024+ Entries: 42
3025+ Dropped: 0
3026+ # event histogram
3027+ #
3028+ # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
3029+ #
3030+
3031+ { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
3032+ { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
3033+ { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
3034+ { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
3035+ { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
3036+ { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
3037+ { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
3038+ { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
3039+ { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
3040+ { skbaddr: ffff880064505000 } hitcount: 1 len: 46
3041+ { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
3042+ { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
3043+ { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
3044+ { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
3045+ { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
3046+ { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
3047+ { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
3048+ { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
3049+ { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
3050+ { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
3051+ { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
3052+ { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
3053+ { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
3054+ { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
3055+ { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
3056+ { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
3057+ { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
3058+ { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
3059+ { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
3060+ { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
3061+ { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
3062+ { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
3063+ { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
3064+ { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
3065+ { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
3066+ { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
3067+ { skbaddr: ffff880064504400 } hitcount: 4 len: 184
3068+ { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
3069+ { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
3070+ { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
3071+ { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
3072+ { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
3073+
3074+ Totals:
3075+ Hits: 81
3076+ Entries: 42
3077+ Dropped: 0
3078+
3079+ And here's an example that shows how to combine histogram data from
3080+ any two events even if they don't share any 'compatible' fields
3081+ other than 'hitcount' and 'stacktrace'. These commands create a
3082+ couple of triggers named 'bar' using those fields:
3083+
3084+ # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
3085+ /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
3086+ # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
3087+ /sys/kernel/debug/tracing/events/net/netif_rx/trigger
3088+
3089+ And displaying the output of either shows some interesting if
3090+ somewhat confusing output:
3091+
3092+ # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
3093+ # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
3094+
3095+ # event histogram
3096+ #
3097+ # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
3098+ #
3099+
3100+ { stacktrace:
3101+ _do_fork+0x18e/0x330
3102+ kernel_thread+0x29/0x30
3103+ kthreadd+0x154/0x1b0
3104+ ret_from_fork+0x3f/0x70
3105+ } hitcount: 1
3106+ { stacktrace:
3107+ netif_rx_internal+0xb2/0xd0
3108+ netif_rx_ni+0x20/0x70
3109+ dev_loopback_xmit+0xaa/0xd0
3110+ ip_mc_output+0x126/0x240
3111+ ip_local_out_sk+0x31/0x40
3112+ igmp_send_report+0x1e9/0x230
3113+ igmp_timer_expire+0xe9/0x120
3114+ call_timer_fn+0x39/0xf0
3115+ run_timer_softirq+0x1e1/0x290
3116+ __do_softirq+0xfd/0x290
3117+ irq_exit+0x98/0xb0
3118+ smp_apic_timer_interrupt+0x4a/0x60
3119+ apic_timer_interrupt+0x6d/0x80
3120+ cpuidle_enter+0x17/0x20
3121+ call_cpuidle+0x3b/0x60
3122+ cpu_startup_entry+0x22d/0x310
3123+ } hitcount: 1
3124+ { stacktrace:
3125+ netif_rx_internal+0xb2/0xd0
3126+ netif_rx_ni+0x20/0x70
3127+ dev_loopback_xmit+0xaa/0xd0
3128+ ip_mc_output+0x17f/0x240
3129+ ip_local_out_sk+0x31/0x40
3130+ ip_send_skb+0x1a/0x50
3131+ udp_send_skb+0x13e/0x270
3132+ udp_sendmsg+0x2bf/0x980
3133+ inet_sendmsg+0x67/0xa0
3134+ sock_sendmsg+0x38/0x50
3135+ SYSC_sendto+0xef/0x170
3136+ SyS_sendto+0xe/0x10
3137+ entry_SYSCALL_64_fastpath+0x12/0x6a
3138+ } hitcount: 2
3139+ { stacktrace:
3140+ netif_rx_internal+0xb2/0xd0
3141+ netif_rx+0x1c/0x60
3142+ loopback_xmit+0x6c/0xb0
3143+ dev_hard_start_xmit+0x219/0x3a0
3144+ __dev_queue_xmit+0x415/0x4f0
3145+ dev_queue_xmit_sk+0x13/0x20
3146+ ip_finish_output2+0x237/0x340
3147+ ip_finish_output+0x113/0x1d0
3148+ ip_output+0x66/0xc0
3149+ ip_local_out_sk+0x31/0x40
3150+ ip_send_skb+0x1a/0x50
3151+ udp_send_skb+0x16d/0x270
3152+ udp_sendmsg+0x2bf/0x980
3153+ inet_sendmsg+0x67/0xa0
3154+ sock_sendmsg+0x38/0x50
3155+ ___sys_sendmsg+0x14e/0x270
3156+ } hitcount: 76
3157+ { stacktrace:
3158+ netif_rx_internal+0xb2/0xd0
3159+ netif_rx+0x1c/0x60
3160+ loopback_xmit+0x6c/0xb0
3161+ dev_hard_start_xmit+0x219/0x3a0
3162+ __dev_queue_xmit+0x415/0x4f0
3163+ dev_queue_xmit_sk+0x13/0x20
3164+ ip_finish_output2+0x237/0x340
3165+ ip_finish_output+0x113/0x1d0
3166+ ip_output+0x66/0xc0
3167+ ip_local_out_sk+0x31/0x40
3168+ ip_send_skb+0x1a/0x50
3169+ udp_send_skb+0x16d/0x270
3170+ udp_sendmsg+0x2bf/0x980
3171+ inet_sendmsg+0x67/0xa0
3172+ sock_sendmsg+0x38/0x50
3173+ ___sys_sendmsg+0x269/0x270
3174+ } hitcount: 77
3175+ { stacktrace:
3176+ netif_rx_internal+0xb2/0xd0
3177+ netif_rx+0x1c/0x60
3178+ loopback_xmit+0x6c/0xb0
3179+ dev_hard_start_xmit+0x219/0x3a0
3180+ __dev_queue_xmit+0x415/0x4f0
3181+ dev_queue_xmit_sk+0x13/0x20
3182+ ip_finish_output2+0x237/0x340
3183+ ip_finish_output+0x113/0x1d0
3184+ ip_output+0x66/0xc0
3185+ ip_local_out_sk+0x31/0x40
3186+ ip_send_skb+0x1a/0x50
3187+ udp_send_skb+0x16d/0x270
3188+ udp_sendmsg+0x2bf/0x980
3189+ inet_sendmsg+0x67/0xa0
3190+ sock_sendmsg+0x38/0x50
3191+ SYSC_sendto+0xef/0x170
3192+ } hitcount: 88
3193+ { stacktrace:
3194+ _do_fork+0x18e/0x330
3195+ SyS_clone+0x19/0x20
3196+ entry_SYSCALL_64_fastpath+0x12/0x6a
3197+ } hitcount: 244
3198+
3199+ Totals:
3200+ Hits: 489
3201+ Entries: 7
3202+ Dropped: 0
3203+
3204+
3205+2.2 Inter-event hist triggers
3206+-----------------------------
3207+
3208+Inter-event hist triggers are hist triggers that combine values from
3209+one or more other events and create a histogram using that data. Data
3210+from an inter-event histogram can in turn become the source for
3211+further combined histograms, thus providing a chain of related
3212+histograms, which is important for some applications.
3213+
3214+The most important example of an inter-event quantity that can be used
3215+in this manner is latency, which is simply a difference in timestamps
3216+between two events. Although latency is the most important
3217+inter-event quantity, note that because the support is completely
3218+general across the trace event subsystem, any event field can be used
3219+in an inter-event quantity.
3220+
3221+An example of a histogram that combines data from other histograms
3222+into a useful chain would be a 'wakeupswitch latency' histogram that
3223+combines a 'wakeup latency' histogram and a 'switch latency'
3224+histogram.
3225+
3226+Normally, a hist trigger specification consists of a (possibly
3227+compound) key along with one or more numeric values, which are
3228+continually updated sums associated with that key. A histogram
3229+specification in this case consists of individual key and value
3230+specifications that refer to trace event fields associated with a
3231+single event type.
3232+
3233+The inter-event hist trigger extension allows fields from multiple
3234+events to be referenced and combined into a multi-event histogram
3235+specification. In support of this overall goal, a few enabling
3236+features have been added to the hist trigger support:
3237+
3238+ - In order to compute an inter-event quantity, a value from one
3239+ event needs to be saved and then referenced from another event. This
3240+ requires the introduction of support for histogram 'variables'.
3241+
3242+ - The computation of inter-event quantities and their combination
3243+ require some minimal amount of support for applying simple
3244+ expressions to variables (+ and -).
3245+
3246+ - A histogram consisting of inter-event quantities isn't logically a
3247+ histogram on either event (so having the 'hist' file for either
3248+ event host the histogram output doesn't really make sense). To
3249+ address the idea that the histogram is associated with a
3250+ combination of events, support is added allowing the creation of
3251+ 'synthetic' events that are events derived from other events.
3252+ These synthetic events are full-fledged events just like any other
3253+ and can be used as such, as for instance to create the
3254+ 'combination' histograms mentioned previously.
3255+
3256+ - A set of 'actions' can be associated with histogram entries -
3257+ these can be used to generate the previously mentioned synthetic
3258+ events, but can also be used for other purposes, such as for
3259+ example saving context when a 'max' latency has been hit.
3260+
3261+ - Trace events don't have a 'timestamp' associated with them, but
3262+ there is an implicit timestamp saved along with an event in the
3263+ underlying ftrace ring buffer. This timestamp is now exposed as a
3264+ synthetic field named 'common_timestamp' which can be used in
3265+ histograms as if it were any other event field; it isn't an actual
3266+ field in the trace format but rather is a synthesized value that
3267+ nonetheless can be used as if it were an actual field. By default
3268+ it is in units of nanoseconds; appending '.usecs' to a
3269+ common_timestamp field changes the units to microseconds.
3270+
3271+A note on inter-event timestamps: If common_timestamp is used in a
3272+histogram, the trace buffer is automatically switched over to using
3273+absolute timestamps and the "global" trace clock, in order to avoid
3274+bogus timestamp differences with other clocks that aren't coherent
3275+across CPUs. This can be overridden by specifying one of the other
3276+trace clocks instead, using the "clock=XXX" hist trigger attribute,
3277+where XXX is any of the clocks listed in the tracing/trace_clock
3278+pseudo-file.
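
As an illustrative sketch (using the 'global' clock mentioned above;
the trailing '...' stands for the rest of the trigger specification),
a trigger that saves a microsecond-resolution timestamp while
explicitly selecting a clock could look something like this:

  # echo 'hist:keys=pid:ts0=common_timestamp.usecs:clock=global ...' >> \
          event/trigger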
3279+
3280+These features are described in more detail in the following sections.
3281+
3282+2.2.1 Histogram Variables
3283+-------------------------
3284+
3285+Variables are simply named locations used for saving and retrieving
3286+values between matching events. A 'matching' event is defined as an
3287+event that has a matching key - if a variable is saved for a histogram
3288+entry corresponding to that key, any subsequent event with a matching
3289+key can access that variable.
3290+
3291+A variable's value is normally available to any subsequent event until
3292+it is set to something else by a subsequent event. The one exception
3293+to that rule is that any variable used in an expression is essentially
3294+'read-once' - once it's used by an expression in a subsequent event,
3295+it's reset to its 'unset' state, which means it can't be used again
3296+unless it's set again. This ensures not only that an event doesn't
3297+use an uninitialized variable in a calculation, but that that variable
3298+is used only once and not for any unrelated subsequent match.
3299+
3300+The basic syntax for saving a variable is to simply prefix any event
3301+field with a unique variable name (one not corresponding to any
3302+keyword) followed by an '=' sign.
3303+
3304+Either keys or values can be saved and retrieved in this way. This
3305+creates a variable named 'ts0' for a histogram entry with the key
3306+'next_pid':
3307+
3308+ # echo 'hist:keys=next_pid:vals=$ts0:ts0=common_timestamp ... >> \
3309+ event/trigger
3310+
3311+The ts0 variable can be accessed by any subsequent event having the
3312+same pid as 'next_pid'.
3313+
3314+Variable references are formed by prepending the variable name with
3315+the '$' sign. Thus for example, the ts0 variable above would be
3316+referenced as '$ts0' in expressions.
3317+
3318+Because 'vals=' is used, the common_timestamp variable value above
3319+will also be summed as a normal histogram value would (though for a
3320+timestamp it makes little sense).
3321+
3322+The below shows that a key value can also be saved in the same way:
3323+
3324+ # echo 'hist:timer_pid=common_pid:key=timer_pid ...' >> event/trigger
3325+
3326+If a variable isn't a key variable or prefixed with 'vals=', the
3327+associated event field will be saved in a variable but won't be summed
3328+as a value:
3329+
3330+ # echo 'hist:keys=next_pid:ts1=common_timestamp ... >> event/trigger
3331+
3332+Multiple variables can be assigned at the same time. The below would
3333+result in both ts0 and b being created as variables, with both
3334+common_timestamp and field1 additionally being summed as values:
3335+
3336+ # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ... >> \
3337+ event/trigger
3338+
3339+Note that variable assignments can appear either preceding or
3340+following their use. The command below behaves identically to the
3341+command above:
3342+
3343+ # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ... >> \
3344+ event/trigger
3345+
3346+Any number of variables not bound to a 'vals=' prefix can also be
3347+assigned by simply separating them with colons. Below is the same
3348+thing but without the values being summed in the histogram:
3349+
3350+ # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ... >> event/trigger
3351+
3352+Variables set as above can be referenced and used in expressions on
3353+another event.
3354+
3355+For example, here's how a latency can be calculated:
3356+
3357+ # echo 'hist:keys=pid,prio:ts0=common_timestamp ... >> event1/trigger
3358+ # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ... >> event2/trigger
3359+
3360+In the first line above, the event's timestamp is saved into the
3361+variable ts0. In the next line, ts0 is subtracted from the second
3362+event's timestamp to produce the latency, which is then assigned into
3363+yet another variable, 'wakeup_lat'. The hist trigger below in turn
3364+makes use of the wakeup_lat variable to compute a combined latency
3365+using the same key and variable from yet another event:
3366+
3367+ # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ... >> event3/trigger
3368+
3369+2.2.2 Synthetic Events
3370+----------------------
3371+
3372+Synthetic events are user-defined events generated from hist trigger
3373+variables or fields associated with one or more other events. Their
3374+purpose is to provide a mechanism for displaying data spanning
3375+multiple events consistent with the existing and already familiar
3376+usage for normal events.
3377+
3378+To define a synthetic event, the user writes a simple specification
3379+to the tracing/synthetic_events file, consisting of the name of the
3380+new event along with one or more variables and their types (any valid
3381+field type), separated by semicolons.
3382+
3383+For instance, the following creates a new event named 'wakeup_latency'
3384+with 3 fields: lat, pid, and prio. Each of those fields is simply a
3385+variable reference to a variable on another event:
3386+
3387+ # echo 'wakeup_latency \
3388+ u64 lat; \
3389+ pid_t pid; \
3390+ int prio' >> \
3391+ /sys/kernel/debug/tracing/synthetic_events
3392+
3393+Reading the tracing/synthetic_events file lists all the currently
3394+defined synthetic events, in this case the event defined above:
3395+
3396+ # cat /sys/kernel/debug/tracing/synthetic_events
3397+ wakeup_latency u64 lat; pid_t pid; int prio
3398+
3399+An existing synthetic event definition can be removed by prepending
3400+the command that defined it with a '!':
3401+
3402+ # echo '!wakeup_latency u64 lat pid_t pid int prio' >> \
3403+ /sys/kernel/debug/tracing/synthetic_events
3404+
3405+At this point, there isn't yet an actual 'wakeup_latency' event
3406+instantiated in the event subsystem - for this to happen, a 'hist
3407+trigger action' needs to be instantiated and bound to actual fields
3408+and variables defined on other events (see Section 2.2.3 below).
3409+
3410+Once that is done, an event instance is created, and a histogram can
3411+be defined using it:
3412+
3413+ # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \
3414+ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
3415+
3416+The new event is created under the tracing/events/synthetic/ directory
3417+and looks and behaves just like any other event:
3418+
3419+ # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency
3420+ enable filter format hist id trigger
3421+
3422+Like any other event, once a histogram is enabled for the event, the
3423+output can be displayed by reading the event's 'hist' file.
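
For example, once the wakeup_latency histogram defined above has
accumulated some data, it can be read back just like any other event's
histogram:

  # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist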
3424+
3425+2.2.3 Hist trigger 'actions'
3426+----------------------------
3427+
3428+A hist trigger 'action' is a function that's executed whenever a
3429+histogram entry is added or updated.
3430+
3431+The default 'action' if no special function is explicitly specified is
3432+as it always has been, to simply update the set of values associated
3433+with an entry. Some applications, however, may want to perform
3434+additional actions at that point, such as generate another event, or
3435+compare and save a maximum.
3436+
3437+The following additional actions are available. To specify an action
3438+for a given event, simply specify the action between colons in the
3439+hist trigger specification.
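
Schematically, an action simply occupies one more colon-separated field
of an ordinary hist trigger; in the sketch below the angle-bracketed
names are placeholders, not literal syntax:

  # echo 'hist:keys=<key>:<var>=<expr>:<action> [if <filter>]' >> event/trigger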
3440+
3441+ - onmatch(matching.event).<synthetic_event_name>(param list)
3442+
3443+ The 'onmatch(matching.event).<synthetic_event_name>(params)' hist
3444+ trigger action is invoked whenever an event matches and the
3445+ histogram entry would be added or updated. It causes the named
3446+ synthetic event to be generated with the values given in the
3447+ 'param list'. The result is the generation of a synthetic event
3448+ that consists of the values contained in those variables at the
3449+ time the invoking event was hit.
3450+
3451+ The 'param list' consists of one or more parameters which may be
3452+ either variables or fields defined on either the 'matching.event'
3453+ or the target event. The variables or fields specified in the
3454+ param list may be either fully-qualified or unqualified. If a
3455+ variable is specified as unqualified, it must be unique between
3456+ the two events. A field name used as a param can be unqualified
3457+ if it refers to the target event, but must be fully qualified if
3458+ it refers to the matching event. A fully-qualified name is of the
3459+ form 'system.event_name.$var_name' or 'system.event_name.field'.
3460+
3461+ The 'matching.event' specification is simply the fully qualified
3462+ event name of the event that matches the target event for the
3463+ onmatch() functionality, in the form 'system.event_name'.
3464+
3465+ Finally, the number and type of variables/fields in the 'param
3466+ list' must match the number and types of the fields in the
3467+ synthetic event being generated.
3468+
3469+ As an example the below defines a simple synthetic event and uses
3470+ a variable defined on the sched_wakeup_new event as a parameter
3471+ when invoking the synthetic event. Here we define the synthetic
3472+ event:
3473+
3474+ # echo 'wakeup_new_test pid_t pid' >> \
3475+ /sys/kernel/debug/tracing/synthetic_events
3476+
3477+ # cat /sys/kernel/debug/tracing/synthetic_events
3478+ wakeup_new_test pid_t pid
3479+
3480+ The following hist trigger both defines the missing testpid
3481+ variable and specifies an onmatch() action that generates a
3482+ wakeup_new_test synthetic event whenever a sched_wakeup_new event
3483+ occurs, which because of the 'if comm == "cyclictest"' filter only
3484+ happens when the executable is cyclictest:
3485+
3486+ # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\
3487+ wakeup_new_test($testpid) if comm=="cyclictest"' >> \
3488+ /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger
3489+
3490+ Creating and displaying a histogram based on those events is now
3491+ just a matter of using the fields and new synthetic event in the
3492+ tracing/events/synthetic directory, as usual:
3493+
3494+ # echo 'hist:keys=pid:sort=pid' >> \
3495+ /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/trigger
3496+
3497+ Running 'cyclictest' should cause wakeup_new events to generate
3498+ wakeup_new_test synthetic events which should result in histogram
3499+ output in the wakeup_new_test event's hist file:
3500+
3501+ # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/hist
3502+
3503+ A more typical usage would be to use two events to calculate a
3504+ latency. The following example uses a set of hist triggers to
3505+ produce a 'wakeup_latency' histogram:
3506+
3507+ First, we define a 'wakeup_latency' synthetic event:
3508+
3509+ # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \
3510+ /sys/kernel/debug/tracing/synthetic_events
3511+
3512+ Next, we specify that whenever we see a sched_waking event for a
3513+ cyclictest thread, save the timestamp in a 'ts0' variable:
3514+
3515+ # echo 'hist:keys=$saved_pid:saved_pid=pid:ts0=common_timestamp.usecs \
3516+ if comm=="cyclictest"' >> \
3517+ /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
3518+
3519+ Then, when the corresponding thread is actually scheduled onto the
3520+ CPU by a sched_switch event, calculate the latency and use that
3521+ along with another variable and an event field to generate a
3522+ wakeup_latency synthetic event:
3523+
3524+ # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\
3525+ onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,\
3526+ $saved_pid,next_prio) if next_comm=="cyclictest"' >> \
3527+ /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
3528+
3529+ We also need to create a histogram on the wakeup_latency synthetic
3530+ event in order to aggregate the generated synthetic event data:
3531+
3532+ # echo 'hist:keys=pid,prio,lat:sort=pid,lat' >> \
3533+ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
3534+
3535+ Finally, once we've run cyclictest to actually generate some
3536+ events, we can see the output by looking at the wakeup_latency
3537+ synthetic event's hist file:
3538+
3539+ # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist
3540+
3541+ - onmax(var).save(field,...)
3542+
3543+ The 'onmax(var).save(field,...)' hist trigger action is invoked
3544+ whenever the value of 'var' associated with a histogram entry
3545+ exceeds the current maximum contained in that variable.
3546+
3547+ The end result is that the trace event fields specified as the
3548+ onmax.save() params will be saved if 'var' exceeds the current
3549+ maximum for that hist trigger entry. This allows context from the
3550+ event that exhibited the new maximum to be saved for later
3551+ reference. When the histogram is displayed, additional fields
3552+ displaying the saved values will be printed.
3553+
3554+ As an example the below defines a couple of hist triggers, one for
3555+ sched_waking and another for sched_switch, keyed on pid. Whenever
3556+ a sched_waking occurs, the timestamp is saved in the entry
3557+ corresponding to the current pid, and when the scheduler switches
3558+ back to that pid, the timestamp difference is calculated. If the
3559+ resulting latency, stored in wakeup_lat, exceeds the current
3560+ maximum latency, the values specified in the save() fields are
3561+ recorded:
3562+
3563+ # echo 'hist:keys=pid:ts0=common_timestamp.usecs \
3564+ if comm=="cyclictest"' >> \
3565+ /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
3566+
3567+ # echo 'hist:keys=next_pid:\
3568+ wakeup_lat=common_timestamp.usecs-$ts0:\
3569+ onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \
3570+ if next_comm=="cyclictest"' >> \
3571+ /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
3572+
3573+ When the histogram is displayed, the max value and the saved
3574+ values corresponding to the max are displayed following the rest
3575+ of the fields:
3576+
3577+ # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist
3578+ { next_pid: 2255 } hitcount: 239
3579+ common_timestamp-ts0: 0
3580+ max: 27
3581+ next_comm: cyclictest
3582+ prev_pid: 0 prev_prio: 120 prev_comm: swapper/1
e4b2b4a8 3583+
3584+ { next_pid: 2256 } hitcount: 2355
3585+ common_timestamp-ts0: 0
3586+ max: 49 next_comm: cyclictest
3587+ prev_pid: 0 prev_prio: 120 prev_comm: swapper/0
e4b2b4a8 3588+
3589+ Totals:
3590+ Hits: 12970
3591+ Entries: 2
3592+ Dropped: 0
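     As with other event triggers, each hist trigger above can be removed
     once it is no longer needed by writing the same trigger string prefixed
     with '!' to the same trigger file (triggers whose variables are still
     referenced elsewhere may need to be removed last). A hedged sketch for
     the onmax() sched_switch trigger defined above:

       # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\
             onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \
             if next_comm=="cyclictest"' >> \
             /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
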
3593diff --git a/arch/Kconfig b/arch/Kconfig
3594index 40dc31fea90c..7c6108479209 100644
3595--- a/arch/Kconfig
3596+++ b/arch/Kconfig
3597@@ -20,6 +20,7 @@ config OPROFILE
3598 tristate "OProfile system profiling"
3599 depends on PROFILING
3600 depends on HAVE_OPROFILE
3601+ depends on !PREEMPT_RT_FULL
3602 select RING_BUFFER
3603 select RING_BUFFER_ALLOW_SWAP
3604 help
3605diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h
3606index 1d5716bc060b..6883bc952d22 100644
3607--- a/arch/alpha/include/asm/spinlock_types.h
3608+++ b/arch/alpha/include/asm/spinlock_types.h
3609@@ -2,10 +2,6 @@
3610 #ifndef _ALPHA_SPINLOCK_TYPES_H
3611 #define _ALPHA_SPINLOCK_TYPES_H
3612
3613-#ifndef __LINUX_SPINLOCK_TYPES_H
3614-# error "please don't include this file directly"
3615-#endif
3616-
3617 typedef struct {
3618 volatile unsigned int lock;
3619 } arch_spinlock_t;
3620diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
3621index d1346a160760..558b0995e94a 100644
3622--- a/arch/arm/Kconfig
3623+++ b/arch/arm/Kconfig
3624@@ -45,7 +45,7 @@ config ARM
3625 select HARDIRQS_SW_RESEND
3626 select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
3627 select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
3628- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
3629+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
3630 select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
3631 select HAVE_ARCH_MMAP_RND_BITS if MMU
3632 select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
3633@@ -85,6 +85,7 @@ config ARM
3634 select HAVE_PERF_EVENTS
3635 select HAVE_PERF_REGS
3636 select HAVE_PERF_USER_STACK_DUMP
3637+ select HAVE_PREEMPT_LAZY
3638 select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
3639 select HAVE_REGS_AND_STACK_ACCESS_API
3640 select HAVE_SYSCALL_TRACEPOINTS
3641@@ -2164,7 +2165,7 @@ config NEON
3642
3643 config KERNEL_MODE_NEON
3644 bool "Support for NEON in kernel mode"
3645- depends on NEON && AEABI
3646+ depends on NEON && AEABI && !PREEMPT_RT_BASE
3647 help
3648 Say Y to include support for NEON in kernel mode.
3649
3650diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
3651index b6f319606e30..ad377ef73739 100644
3652--- a/arch/arm/include/asm/irq.h
3653+++ b/arch/arm/include/asm/irq.h
3654@@ -23,6 +23,8 @@
3655 #endif
3656
3657 #ifndef __ASSEMBLY__
3658+#include <linux/cpumask.h>
e4b2b4a8 3659+
3660 struct irqaction;
3661 struct pt_regs;
3662 extern void migrate_irqs(void);
3663diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h
3664index 5976958647fe..a37c0803954b 100644
3665--- a/arch/arm/include/asm/spinlock_types.h
3666+++ b/arch/arm/include/asm/spinlock_types.h
3667@@ -2,10 +2,6 @@
3668 #ifndef __ASM_SPINLOCK_TYPES_H
3669 #define __ASM_SPINLOCK_TYPES_H
3670
3671-#ifndef __LINUX_SPINLOCK_TYPES_H
3672-# error "please don't include this file directly"
3673-#endif
3674-
3675 #define TICKET_SHIFT 16
3676
3677 typedef struct {
3678diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
3679index d3e937dcee4d..6ab96a2ce1f8 100644
3680--- a/arch/arm/include/asm/switch_to.h
3681+++ b/arch/arm/include/asm/switch_to.h
3682@@ -4,6 +4,13 @@
3683
3684 #include <linux/thread_info.h>
3685
3686+#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
3687+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
3688+#else
3689+static inline void
3690+switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
3691+#endif
e4b2b4a8 3692+
3693 /*
3694 * For v7 SMP cores running a preemptible kernel we may be pre-empted
3695 * during a TLB maintenance operation, so execute an inner-shareable dsb
3696@@ -26,6 +33,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
3697 #define switch_to(prev,next,last) \
3698 do { \
3699 __complete_pending_tlbi(); \
3700+ switch_kmaps(prev, next); \
3701 last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \
3702 } while (0)
3703
3704diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
5dd41b01 3705index 57d2ad9c75ca..cdfb6855943b 100644
3706--- a/arch/arm/include/asm/thread_info.h
3707+++ b/arch/arm/include/asm/thread_info.h
3708@@ -49,6 +49,7 @@ struct cpu_context_save {
3709 struct thread_info {
3710 unsigned long flags; /* low level flags */
3711 int preempt_count; /* 0 => preemptable, <0 => bug */
3712+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
3713 mm_segment_t addr_limit; /* address limit */
3714 struct task_struct *task; /* main task structure */
3715 __u32 cpu; /* cpu */
5dd41b01 3716@@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
3717 #define TIF_SYSCALL_TRACE 4 /* syscall trace active */
3718 #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
3719 #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
3720-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
3721+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
3722+#define TIF_NEED_RESCHED_LAZY 7
3723
3724 #define TIF_NOHZ 12 /* in adaptive nohz mode */
3725 #define TIF_USING_IWMMXT 17
5dd41b01 3726@@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
3727 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
3728 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
3729 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
3730+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
3731 #define _TIF_UPROBE (1 << TIF_UPROBE)
3732 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
3733 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
5dd41b01 3734@@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *,
3735 * Change these and you break ASM code in entry-common.S
3736 */
3737 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
3738- _TIF_NOTIFY_RESUME | _TIF_UPROBE)
3739+ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
3740+ _TIF_NEED_RESCHED_LAZY)
3741
3742 #endif /* __KERNEL__ */
3743 #endif /* __ASM_ARM_THREAD_INFO_H */
3744diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
3745index 608008229c7d..3866da3f7bb7 100644
3746--- a/arch/arm/kernel/asm-offsets.c
3747+++ b/arch/arm/kernel/asm-offsets.c
3748@@ -65,6 +65,7 @@ int main(void)
3749 BLANK();
3750 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
3751 DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
3752+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
3753 DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
3754 DEFINE(TI_TASK, offsetof(struct thread_info, task));
3755 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
3756diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
3757index fbc707626b3e..b434c59d2b64 100644
3758--- a/arch/arm/kernel/entry-armv.S
3759+++ b/arch/arm/kernel/entry-armv.S
3760@@ -220,11 +220,18 @@ __irq_svc:
3761
3762 #ifdef CONFIG_PREEMPT
3763 ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
3764- ldr r0, [tsk, #TI_FLAGS] @ get flags
3765 teq r8, #0 @ if preempt count != 0
3766+ bne 1f @ return from exception
3767+ ldr r0, [tsk, #TI_FLAGS] @ get flags
3768+ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
3769+ blne svc_preempt @ preempt!
e4b2b4a8 3770+
3771+ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
3772+ teq r8, #0 @ if preempt lazy count != 0
3773 movne r0, #0 @ force flags to 0
3774- tst r0, #_TIF_NEED_RESCHED
3775+ tst r0, #_TIF_NEED_RESCHED_LAZY
3776 blne svc_preempt
3777+1:
3778 #endif
3779
3780 svc_exit r5, irq = 1 @ return from exception
3781@@ -239,8 +246,14 @@ svc_preempt:
3782 1: bl preempt_schedule_irq @ irq en/disable is done inside
3783 ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
3784 tst r0, #_TIF_NEED_RESCHED
3785+ bne 1b
3786+ tst r0, #_TIF_NEED_RESCHED_LAZY
3787 reteq r8 @ go again
3788- b 1b
3789+ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
3790+ teq r0, #0 @ if preempt lazy count != 0
3791+ beq 1b
3792+ ret r8 @ go again
e4b2b4a8 3793+
3794 #endif
3795
3796 __und_fault:
3797diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
5dd41b01 3798index 54c10503d71f..3fdeade24e3f 100644
3799--- a/arch/arm/kernel/entry-common.S
3800+++ b/arch/arm/kernel/entry-common.S
3801@@ -53,7 +53,9 @@ ret_fast_syscall:
3802 cmp r2, #TASK_SIZE
3803 blne addr_limit_check_failed
3804 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
3805- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
3806+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
3807+ bne fast_work_pending
3808+ tst r1, #_TIF_SECCOMP
3809 bne fast_work_pending
3810
3811
3812@@ -83,8 +85,11 @@ ret_fast_syscall:
3813 cmp r2, #TASK_SIZE
3814 blne addr_limit_check_failed
3815 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
3816- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
3817+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
3818+ bne do_slower_path
3819+ tst r1, #_TIF_SECCOMP
3820 beq no_work_pending
3821+do_slower_path:
3822 UNWIND(.fnend )
3823 ENDPROC(ret_fast_syscall)
3824
3825diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
3826index a50dc00d79a2..d0a05a3bdb96 100644
3827--- a/arch/arm/kernel/patch.c
3828+++ b/arch/arm/kernel/patch.c
3829@@ -16,7 +16,7 @@ struct patch {
3830 unsigned int insn;
3831 };
3832
3833-static DEFINE_SPINLOCK(patch_lock);
3834+static DEFINE_RAW_SPINLOCK(patch_lock);
3835
3836 static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
3837 __acquires(&patch_lock)
3838@@ -33,7 +33,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
3839 return addr;
3840
3841 if (flags)
3842- spin_lock_irqsave(&patch_lock, *flags);
3843+ raw_spin_lock_irqsave(&patch_lock, *flags);
3844 else
3845 __acquire(&patch_lock);
3846
3847@@ -48,7 +48,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
3848 clear_fixmap(fixmap);
3849
3850 if (flags)
3851- spin_unlock_irqrestore(&patch_lock, *flags);
3852+ raw_spin_unlock_irqrestore(&patch_lock, *flags);
3853 else
3854 __release(&patch_lock);
3855 }
3856diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
3857index d96714e1858c..cf4e1452d4b4 100644
3858--- a/arch/arm/kernel/process.c
3859+++ b/arch/arm/kernel/process.c
3860@@ -325,6 +325,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
3861 }
3862
3863 #ifdef CONFIG_MMU
3864+/*
3865+ * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not
3866+ * initialized by pgtable_page_ctor() then a coredump of the vector page will
3867+ * fail.
3868+ */
3869+static int __init vectors_user_mapping_init_page(void)
3870+{
3871+ struct page *page;
3872+ unsigned long addr = 0xffff0000;
3873+ pgd_t *pgd;
3874+ pud_t *pud;
3875+ pmd_t *pmd;
e4b2b4a8 3876+
3877+ pgd = pgd_offset_k(addr);
3878+ pud = pud_offset(pgd, addr);
3879+ pmd = pmd_offset(pud, addr);
3880+ page = pmd_page(*(pmd));
e4b2b4a8 3881+
3882+ pgtable_page_ctor(page);
3883+
3884+ return 0;
3885+}
3886+late_initcall(vectors_user_mapping_init_page);
3887+
3888 #ifdef CONFIG_KUSER_HELPERS
3889 /*
3890 * The vectors page is always readable from user space for the
3891diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
5dd41b01 3892index cdfe52b15a0a..198cf8bf0b37 100644
3893--- a/arch/arm/kernel/signal.c
3894+++ b/arch/arm/kernel/signal.c
3895@@ -615,7 +615,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
3896 */
3897 trace_hardirqs_off();
3898 do {
3899- if (likely(thread_flags & _TIF_NEED_RESCHED)) {
3900+ if (likely(thread_flags & (_TIF_NEED_RESCHED |
3901+ _TIF_NEED_RESCHED_LAZY))) {
3902 schedule();
3903 } else {
3904 if (unlikely(!user_mode(regs)))
3905diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
5dd41b01 3906index e61af0600133..d8f2e77d5651 100644
3907--- a/arch/arm/kernel/smp.c
3908+++ b/arch/arm/kernel/smp.c
5dd41b01 3909@@ -237,8 +237,6 @@ int __cpu_disable(void)
3910 flush_cache_louis();
3911 local_flush_tlb_all();
3912
3913- clear_tasks_mm_cpumask(cpu);
3914-
3915 return 0;
3916 }
3917
5dd41b01 3918@@ -256,6 +254,7 @@ void __cpu_die(unsigned int cpu)
3919 }
3920 pr_debug("CPU%u: shutdown\n", cpu);
3921
3922+ clear_tasks_mm_cpumask(cpu);
3923 /*
3924 * platform_cpu_kill() is generally expected to do the powering off
3925 * and/or cutting of clocks to the dying CPU. Optionally, this may
3926diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
3927index 0bee233fef9a..314cfb232a63 100644
3928--- a/arch/arm/kernel/unwind.c
3929+++ b/arch/arm/kernel/unwind.c
3930@@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
3931 static const struct unwind_idx *__origin_unwind_idx;
3932 extern const struct unwind_idx __stop_unwind_idx[];
3933
3934-static DEFINE_SPINLOCK(unwind_lock);
3935+static DEFINE_RAW_SPINLOCK(unwind_lock);
3936 static LIST_HEAD(unwind_tables);
3937
3938 /* Convert a prel31 symbol to an absolute address */
3939@@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
3940 /* module unwind tables */
3941 struct unwind_table *table;
3942
3943- spin_lock_irqsave(&unwind_lock, flags);
3944+ raw_spin_lock_irqsave(&unwind_lock, flags);
3945 list_for_each_entry(table, &unwind_tables, list) {
3946 if (addr >= table->begin_addr &&
3947 addr < table->end_addr) {
3948@@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
3949 break;
3950 }
3951 }
3952- spin_unlock_irqrestore(&unwind_lock, flags);
3953+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
3954 }
3955
3956 pr_debug("%s: idx = %p\n", __func__, idx);
3957@@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
3958 tab->begin_addr = text_addr;
3959 tab->end_addr = text_addr + text_size;
3960
3961- spin_lock_irqsave(&unwind_lock, flags);
3962+ raw_spin_lock_irqsave(&unwind_lock, flags);
3963 list_add_tail(&tab->list, &unwind_tables);
3964- spin_unlock_irqrestore(&unwind_lock, flags);
3965+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
3966
3967 return tab;
3968 }
3969@@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
3970 if (!tab)
3971 return;
3972
3973- spin_lock_irqsave(&unwind_lock, flags);
3974+ raw_spin_lock_irqsave(&unwind_lock, flags);
3975 list_del(&tab->list);
3976- spin_unlock_irqrestore(&unwind_lock, flags);
3977+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
3978
3979 kfree(tab);
3980 }
3981diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
3982index 5a03bffe7226..3080ea833d19 100644
3983--- a/arch/arm/mach-exynos/platsmp.c
3984+++ b/arch/arm/mach-exynos/platsmp.c
3985@@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void)
3986 return (void __iomem *)(S5P_VA_SCU);
3987 }
3988
3989-static DEFINE_SPINLOCK(boot_lock);
3990+static DEFINE_RAW_SPINLOCK(boot_lock);
3991
3992 static void exynos_secondary_init(unsigned int cpu)
3993 {
3994@@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu)
3995 /*
3996 * Synchronise with the boot thread.
3997 */
3998- spin_lock(&boot_lock);
3999- spin_unlock(&boot_lock);
4000+ raw_spin_lock(&boot_lock);
4001+ raw_spin_unlock(&boot_lock);
4002 }
4003
4004 int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
4005@@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
4006 * Set synchronisation state between this boot processor
4007 * and the secondary one
4008 */
4009- spin_lock(&boot_lock);
4010+ raw_spin_lock(&boot_lock);
4011
4012 /*
4013 * The secondary processor is waiting to be released from
4014@@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
4015
4016 if (timeout == 0) {
4017 printk(KERN_ERR "cpu1 power enable failed");
4018- spin_unlock(&boot_lock);
4019+ raw_spin_unlock(&boot_lock);
4020 return -ETIMEDOUT;
4021 }
4022 }
4023@@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
4024 * calibrations, then wait for it to finish
4025 */
4026 fail:
4027- spin_unlock(&boot_lock);
4028+ raw_spin_unlock(&boot_lock);
4029
4030 return pen_release != -1 ? ret : 0;
4031 }
4032diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
4033index f66815c3dd07..00524abd963f 100644
4034--- a/arch/arm/mach-hisi/platmcpm.c
4035+++ b/arch/arm/mach-hisi/platmcpm.c
4036@@ -61,7 +61,7 @@
4037
4038 static void __iomem *sysctrl, *fabric;
4039 static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
4040-static DEFINE_SPINLOCK(boot_lock);
4041+static DEFINE_RAW_SPINLOCK(boot_lock);
4042 static u32 fabric_phys_addr;
4043 /*
4044 * [0]: bootwrapper physical address
4045@@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
4046 if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
4047 return -EINVAL;
4048
4049- spin_lock_irq(&boot_lock);
4050+ raw_spin_lock_irq(&boot_lock);
4051
4052 if (hip04_cpu_table[cluster][cpu])
4053 goto out;
4054@@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
4055
4056 out:
4057 hip04_cpu_table[cluster][cpu]++;
4058- spin_unlock_irq(&boot_lock);
4059+ raw_spin_unlock_irq(&boot_lock);
4060
4061 return 0;
4062 }
4063@@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
4064 cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
4065 cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
4066
4067- spin_lock(&boot_lock);
4068+ raw_spin_lock(&boot_lock);
4069 hip04_cpu_table[cluster][cpu]--;
4070 if (hip04_cpu_table[cluster][cpu] == 1) {
4071 /* A power_up request went ahead of us. */
4072- spin_unlock(&boot_lock);
4073+ raw_spin_unlock(&boot_lock);
4074 return;
4075 } else if (hip04_cpu_table[cluster][cpu] > 1) {
4076 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
4077@@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
4078 }
4079
4080 last_man = hip04_cluster_is_down(cluster);
4081- spin_unlock(&boot_lock);
4082+ raw_spin_unlock(&boot_lock);
4083 if (last_man) {
4084 /* Since it's Cortex A15, disable L2 prefetching. */
4085 asm volatile(
4086@@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
4087 cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
4088
4089 count = TIMEOUT_MSEC / POLL_MSEC;
4090- spin_lock_irq(&boot_lock);
4091+ raw_spin_lock_irq(&boot_lock);
4092 for (tries = 0; tries < count; tries++) {
4093 if (hip04_cpu_table[cluster][cpu])
4094 goto err;
4095@@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
4096 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
4097 if (data & CORE_WFI_STATUS(cpu))
4098 break;
4099- spin_unlock_irq(&boot_lock);
4100+ raw_spin_unlock_irq(&boot_lock);
4101 /* Wait for clean L2 when the whole cluster is down. */
4102 msleep(POLL_MSEC);
4103- spin_lock_irq(&boot_lock);
4104+ raw_spin_lock_irq(&boot_lock);
4105 }
4106 if (tries >= count)
4107 goto err;
4108@@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
4109 goto err;
4110 if (hip04_cluster_is_down(cluster))
4111 hip04_set_snoop_filter(cluster, 0);
4112- spin_unlock_irq(&boot_lock);
4113+ raw_spin_unlock_irq(&boot_lock);
4114 return 1;
4115 err:
4116- spin_unlock_irq(&boot_lock);
4117+ raw_spin_unlock_irq(&boot_lock);
4118 return 0;
4119 }
4120 #endif
4121diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
4122index 1c73694c871a..ac4d2f030b87 100644
4123--- a/arch/arm/mach-omap2/omap-smp.c
4124+++ b/arch/arm/mach-omap2/omap-smp.c
4125@@ -69,7 +69,7 @@ static const struct omap_smp_config omap5_cfg __initconst = {
4126 .startup_addr = omap5_secondary_startup,
4127 };
4128
4129-static DEFINE_SPINLOCK(boot_lock);
4130+static DEFINE_RAW_SPINLOCK(boot_lock);
4131
4132 void __iomem *omap4_get_scu_base(void)
4133 {
4134@@ -177,8 +177,8 @@ static void omap4_secondary_init(unsigned int cpu)
4135 /*
4136 * Synchronise with the boot thread.
4137 */
4138- spin_lock(&boot_lock);
4139- spin_unlock(&boot_lock);
4140+ raw_spin_lock(&boot_lock);
4141+ raw_spin_unlock(&boot_lock);
4142 }
4143
4144 static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
4145@@ -191,7 +191,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
4146 * Set synchronisation state between this boot processor
4147 * and the secondary one
4148 */
4149- spin_lock(&boot_lock);
4150+ raw_spin_lock(&boot_lock);
4151
4152 /*
4153 * Update the AuxCoreBoot0 with boot state for secondary core.
4154@@ -270,7 +270,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
4155 * Now the secondary core is starting up let it run its
4156 * calibrations, then wait for it to finish
4157 */
4158- spin_unlock(&boot_lock);
4159+ raw_spin_unlock(&boot_lock);
4160
4161 return 0;
4162 }
4163diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
4164index 75ef5d4be554..c17c86e5d860 100644
4165--- a/arch/arm/mach-prima2/platsmp.c
4166+++ b/arch/arm/mach-prima2/platsmp.c
4167@@ -22,7 +22,7 @@
4168
4169 static void __iomem *clk_base;
4170
4171-static DEFINE_SPINLOCK(boot_lock);
4172+static DEFINE_RAW_SPINLOCK(boot_lock);
4173
4174 static void sirfsoc_secondary_init(unsigned int cpu)
4175 {
4176@@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
4177 /*
4178 * Synchronise with the boot thread.
4179 */
4180- spin_lock(&boot_lock);
4181- spin_unlock(&boot_lock);
4182+ raw_spin_lock(&boot_lock);
4183+ raw_spin_unlock(&boot_lock);
4184 }
4185
4186 static const struct of_device_id clk_ids[] = {
4187@@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
4188 /* make sure write buffer is drained */
4189 mb();
4190
4191- spin_lock(&boot_lock);
4192+ raw_spin_lock(&boot_lock);
4193
4194 /*
4195 * The secondary processor is waiting to be released from
4196@@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
4197 * now the secondary core is starting up let it run its
4198 * calibrations, then wait for it to finish
4199 */
4200- spin_unlock(&boot_lock);
4201+ raw_spin_unlock(&boot_lock);
4202
4203 return pen_release != -1 ? -ENOSYS : 0;
4204 }
4205diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
4206index 5494c9e0c909..e8ce157d3548 100644
4207--- a/arch/arm/mach-qcom/platsmp.c
4208+++ b/arch/arm/mach-qcom/platsmp.c
4209@@ -46,7 +46,7 @@
4210
4211 extern void secondary_startup_arm(void);
4212
4213-static DEFINE_SPINLOCK(boot_lock);
4214+static DEFINE_RAW_SPINLOCK(boot_lock);
4215
4216 #ifdef CONFIG_HOTPLUG_CPU
4217 static void qcom_cpu_die(unsigned int cpu)
4218@@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
4219 /*
4220 * Synchronise with the boot thread.
4221 */
4222- spin_lock(&boot_lock);
4223- spin_unlock(&boot_lock);
4224+ raw_spin_lock(&boot_lock);
4225+ raw_spin_unlock(&boot_lock);
4226 }
4227
4228 static int scss_release_secondary(unsigned int cpu)
4229@@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
4230 * set synchronisation state between this boot processor
4231 * and the secondary one
4232 */
4233- spin_lock(&boot_lock);
4234+ raw_spin_lock(&boot_lock);
4235
4236 /*
4237 * Send the secondary CPU a soft interrupt, thereby causing
4238@@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
4239 * now the secondary core is starting up let it run its
4240 * calibrations, then wait for it to finish
4241 */
4242- spin_unlock(&boot_lock);
4243+ raw_spin_unlock(&boot_lock);
4244
4245 return ret;
4246 }
4247diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
4248index 39038a03836a..6da5c93872bf 100644
4249--- a/arch/arm/mach-spear/platsmp.c
4250+++ b/arch/arm/mach-spear/platsmp.c
4251@@ -32,7 +32,7 @@ static void write_pen_release(int val)
4252 sync_cache_w(&pen_release);
4253 }
4254
4255-static DEFINE_SPINLOCK(boot_lock);
4256+static DEFINE_RAW_SPINLOCK(boot_lock);
4257
4258 static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
4259
4260@@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
4261 /*
4262 * Synchronise with the boot thread.
4263 */
4264- spin_lock(&boot_lock);
4265- spin_unlock(&boot_lock);
4266+ raw_spin_lock(&boot_lock);
4267+ raw_spin_unlock(&boot_lock);
4268 }
4269
4270 static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
4271@@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
4272 * set synchronisation state between this boot processor
4273 * and the secondary one
4274 */
4275- spin_lock(&boot_lock);
4276+ raw_spin_lock(&boot_lock);
4277
4278 /*
4279 * The secondary processor is waiting to be released from
4280@@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
4281 * now the secondary core is starting up let it run its
4282 * calibrations, then wait for it to finish
4283 */
4284- spin_unlock(&boot_lock);
4285+ raw_spin_unlock(&boot_lock);
4286
4287 return pen_release != -1 ? -ENOSYS : 0;
4288 }
4289diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
4290index 231f19e17436..a3419b7003e6 100644
4291--- a/arch/arm/mach-sti/platsmp.c
4292+++ b/arch/arm/mach-sti/platsmp.c
4293@@ -35,7 +35,7 @@ static void write_pen_release(int val)
4294 sync_cache_w(&pen_release);
4295 }
4296
4297-static DEFINE_SPINLOCK(boot_lock);
4298+static DEFINE_RAW_SPINLOCK(boot_lock);
4299
4300 static void sti_secondary_init(unsigned int cpu)
4301 {
4302@@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
4303 /*
4304 * Synchronise with the boot thread.
4305 */
4306- spin_lock(&boot_lock);
4307- spin_unlock(&boot_lock);
4308+ raw_spin_lock(&boot_lock);
4309+ raw_spin_unlock(&boot_lock);
4310 }
4311
4312 static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
4313@@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
4314 * set synchronisation state between this boot processor
4315 * and the secondary one
4316 */
4317- spin_lock(&boot_lock);
4318+ raw_spin_lock(&boot_lock);
4319
4320 /*
4321 * The secondary processor is waiting to be released from
4322@@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
4323 * now the secondary core is starting up let it run its
4324 * calibrations, then wait for it to finish
4325 */
4326- spin_unlock(&boot_lock);
4327+ raw_spin_unlock(&boot_lock);
4328
4329 return pen_release != -1 ? -ENOSYS : 0;
4330 }
4331diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
5dd41b01 4332index 49b1b8048635..b261967ea028 100644
4333--- a/arch/arm/mm/fault.c
4334+++ b/arch/arm/mm/fault.c
5dd41b01 4335@@ -437,6 +437,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
4336 if (addr < TASK_SIZE)
4337 return do_page_fault(addr, fsr, regs);
4338
4339+ if (interrupts_enabled(regs))
4340+ local_irq_enable();
e4b2b4a8 4341+
4342 if (user_mode(regs))
4343 goto bad_area;
4344
5dd41b01 4345@@ -504,6 +507,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
4346 static int
4347 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
4348 {
4349+ if (interrupts_enabled(regs))
4350+ local_irq_enable();
e4b2b4a8 4351+
4352 do_bad_area(addr, fsr, regs);
4353 return 0;
4354 }
4355diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
4356index d02f8187b1cc..542692dbd40a 100644
4357--- a/arch/arm/mm/highmem.c
4358+++ b/arch/arm/mm/highmem.c
4359@@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
4360 return *ptep;
4361 }
4362
4363+static unsigned int fixmap_idx(int type)
4364+{
4365+ return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4366+}
e4b2b4a8 4367+
4368 void *kmap(struct page *page)
4369 {
4370 might_sleep();
4371@@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
4372
4373 void *kmap_atomic(struct page *page)
4374 {
4375+ pte_t pte = mk_pte(page, kmap_prot);
4376 unsigned int idx;
4377 unsigned long vaddr;
4378 void *kmap;
4379 int type;
4380
4381- preempt_disable();
4382+ preempt_disable_nort();
4383 pagefault_disable();
4384 if (!PageHighMem(page))
4385 return page_address(page);
4386@@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
4387
4388 type = kmap_atomic_idx_push();
4389
4390- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4391+ idx = fixmap_idx(type);
4392 vaddr = __fix_to_virt(idx);
4393 #ifdef CONFIG_DEBUG_HIGHMEM
4394 /*
4395@@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
4396 * in place, so the contained TLB flush ensures the TLB is updated
4397 * with the new mapping.
4398 */
4399- set_fixmap_pte(idx, mk_pte(page, kmap_prot));
4400+#ifdef CONFIG_PREEMPT_RT_FULL
4401+ current->kmap_pte[type] = pte;
4402+#endif
4403+ set_fixmap_pte(idx, pte);
4404
4405 return (void *)vaddr;
4406 }
4407@@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
4408
4409 if (kvaddr >= (void *)FIXADDR_START) {
4410 type = kmap_atomic_idx();
4411- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4412+ idx = fixmap_idx(type);
4413
4414 if (cache_is_vivt())
4415 __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
4416+#ifdef CONFIG_PREEMPT_RT_FULL
4417+ current->kmap_pte[type] = __pte(0);
4418+#endif
4419 #ifdef CONFIG_DEBUG_HIGHMEM
4420 BUG_ON(vaddr != __fix_to_virt(idx));
4421- set_fixmap_pte(idx, __pte(0));
4422 #else
4423 (void) idx; /* to kill a warning */
4424 #endif
4425+ set_fixmap_pte(idx, __pte(0));
4426 kmap_atomic_idx_pop();
4427 } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
4428 /* this address was obtained through kmap_high_get() */
4429 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
4430 }
4431 pagefault_enable();
4432- preempt_enable();
4433+ preempt_enable_nort();
4434 }
4435 EXPORT_SYMBOL(__kunmap_atomic);
4436
4437 void *kmap_atomic_pfn(unsigned long pfn)
4438 {
4439+ pte_t pte = pfn_pte(pfn, kmap_prot);
4440 unsigned long vaddr;
4441 int idx, type;
4442 struct page *page = pfn_to_page(pfn);
4443
4444- preempt_disable();
4445+ preempt_disable_nort();
4446 pagefault_disable();
4447 if (!PageHighMem(page))
4448 return page_address(page);
4449
4450 type = kmap_atomic_idx_push();
4451- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
4452+ idx = fixmap_idx(type);
4453 vaddr = __fix_to_virt(idx);
4454 #ifdef CONFIG_DEBUG_HIGHMEM
4455 BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
4456 #endif
4457- set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
4458+#ifdef CONFIG_PREEMPT_RT_FULL
4459+ current->kmap_pte[type] = pte;
4460+#endif
4461+ set_fixmap_pte(idx, pte);
4462
4463 return (void *)vaddr;
4464 }
4465+#if defined CONFIG_PREEMPT_RT_FULL
4466+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
4467+{
4468+ int i;
e4b2b4a8 4469+
4470+ /*
4471+ * Clear @prev's kmap_atomic mappings
4472+ */
4473+ for (i = 0; i < prev_p->kmap_idx; i++) {
4474+ int idx = fixmap_idx(i);
e4b2b4a8 4475+
4476+ set_fixmap_pte(idx, __pte(0));
4477+ }
4478+ /*
4479+ * Restore @next_p's kmap_atomic mappings
4480+ */
4481+ for (i = 0; i < next_p->kmap_idx; i++) {
4482+ int idx = fixmap_idx(i);
e4b2b4a8 4483+
4484+ if (!pte_none(next_p->kmap_pte[i]))
4485+ set_fixmap_pte(idx, next_p->kmap_pte[i]);
4486+ }
4487+}
4488+#endif
4489diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
4490index c2366510187a..6b60f582b738 100644
4491--- a/arch/arm/plat-versatile/platsmp.c
4492+++ b/arch/arm/plat-versatile/platsmp.c
4493@@ -32,7 +32,7 @@ static void write_pen_release(int val)
4494 sync_cache_w(&pen_release);
4495 }
4496
4497-static DEFINE_SPINLOCK(boot_lock);
4498+static DEFINE_RAW_SPINLOCK(boot_lock);
4499
4500 void versatile_secondary_init(unsigned int cpu)
4501 {
4502@@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu)
4503 /*
4504 * Synchronise with the boot thread.
4505 */
4506- spin_lock(&boot_lock);
4507- spin_unlock(&boot_lock);
4508+ raw_spin_lock(&boot_lock);
4509+ raw_spin_unlock(&boot_lock);
4510 }
4511
4512 int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
4513@@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
4514 * Set synchronisation state between this boot processor
4515 * and the secondary one
4516 */
4517- spin_lock(&boot_lock);
4518+ raw_spin_lock(&boot_lock);
4519
4520 /*
4521 * This is really belt and braces; we hold unintended secondary
4522@@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
4523 * now the secondary core is starting up let it run its
4524 * calibrations, then wait for it to finish
4525 */
4526- spin_unlock(&boot_lock);
4527+ raw_spin_unlock(&boot_lock);
4528
4529 return pen_release != -1 ? -ENOSYS : 0;
4530 }
4531diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
4532index c30cd78b6918..458d2033ffde 100644
4533--- a/arch/arm64/Kconfig
4534+++ b/arch/arm64/Kconfig
4535@@ -103,6 +103,7 @@ config ARM64
4536 select HAVE_PERF_EVENTS
4537 select HAVE_PERF_REGS
4538 select HAVE_PERF_USER_STACK_DUMP
4539+ select HAVE_PREEMPT_LAZY
4540 select HAVE_REGS_AND_STACK_ACCESS_API
4541 select HAVE_RCU_TABLE_FREE
4542 select HAVE_SYSCALL_TRACEPOINTS
4543diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
4544index 70c517aa4501..2a5f05b5a19a 100644
4545--- a/arch/arm64/crypto/Kconfig
4546+++ b/arch/arm64/crypto/Kconfig
4547@@ -19,19 +19,19 @@ config CRYPTO_SHA512_ARM64
4548
4549 config CRYPTO_SHA1_ARM64_CE
4550 tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
4551- depends on KERNEL_MODE_NEON
4552+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4553 select CRYPTO_HASH
4554 select CRYPTO_SHA1
4555
4556 config CRYPTO_SHA2_ARM64_CE
4557 tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
4558- depends on KERNEL_MODE_NEON
4559+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4560 select CRYPTO_HASH
4561 select CRYPTO_SHA256_ARM64
4562
4563 config CRYPTO_GHASH_ARM64_CE
4564 tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
4565- depends on KERNEL_MODE_NEON
4566+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4567 select CRYPTO_HASH
4568 select CRYPTO_GF128MUL
4569 select CRYPTO_AES
4570@@ -39,7 +39,7 @@ config CRYPTO_GHASH_ARM64_CE
4571
4572 config CRYPTO_CRCT10DIF_ARM64_CE
4573 tristate "CRCT10DIF digest algorithm using PMULL instructions"
4574- depends on KERNEL_MODE_NEON && CRC_T10DIF
4575+ depends on KERNEL_MODE_NEON && CRC_T10DIF && !PREEMPT_RT_BASE
4576 select CRYPTO_HASH
4577
4578 config CRYPTO_CRC32_ARM64_CE
4579@@ -53,13 +53,13 @@ config CRYPTO_AES_ARM64
4580
4581 config CRYPTO_AES_ARM64_CE
4582 tristate "AES core cipher using ARMv8 Crypto Extensions"
4583- depends on ARM64 && KERNEL_MODE_NEON
4584+ depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4585 select CRYPTO_ALGAPI
4586 select CRYPTO_AES_ARM64
4587
4588 config CRYPTO_AES_ARM64_CE_CCM
4589 tristate "AES in CCM mode using ARMv8 Crypto Extensions"
4590- depends on ARM64 && KERNEL_MODE_NEON
4591+ depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4592 select CRYPTO_ALGAPI
4593 select CRYPTO_AES_ARM64_CE
4594 select CRYPTO_AES_ARM64
4595@@ -67,7 +67,7 @@ config CRYPTO_AES_ARM64_CE_CCM
4596
4597 config CRYPTO_AES_ARM64_CE_BLK
4598 tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
4599- depends on KERNEL_MODE_NEON
4600+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4601 select CRYPTO_BLKCIPHER
4602 select CRYPTO_AES_ARM64_CE
4603 select CRYPTO_AES_ARM64
4604@@ -75,7 +75,7 @@ config CRYPTO_AES_ARM64_CE_BLK
4605
4606 config CRYPTO_AES_ARM64_NEON_BLK
4607 tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
4608- depends on KERNEL_MODE_NEON
4609+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4610 select CRYPTO_BLKCIPHER
4611 select CRYPTO_AES_ARM64
4612 select CRYPTO_AES
4613@@ -83,13 +83,13 @@ config CRYPTO_AES_ARM64_NEON_BLK
4614
4615 config CRYPTO_CHACHA20_NEON
4616 tristate "NEON accelerated ChaCha20 symmetric cipher"
4617- depends on KERNEL_MODE_NEON
4618+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4619 select CRYPTO_BLKCIPHER
4620 select CRYPTO_CHACHA20
4621
4622 config CRYPTO_AES_ARM64_BS
4623 tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
4624- depends on KERNEL_MODE_NEON
4625+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
4626 select CRYPTO_BLKCIPHER
4627 select CRYPTO_AES_ARM64_NEON_BLK
4628 select CRYPTO_AES_ARM64
4629diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
4630index 34b4e3d46aab..ae055cdad8cf 100644
4631--- a/arch/arm64/crypto/crc32-ce-glue.c
4632+++ b/arch/arm64/crypto/crc32-ce-glue.c
4633@@ -208,7 +208,8 @@ static struct shash_alg crc32_pmull_algs[] = { {
4634
4635 static int __init crc32_pmull_mod_init(void)
4636 {
4637- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
4638+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
4639+ !IS_ENABLED(CONFIG_PREEMPT_RT_BASE) && (elf_hwcap & HWCAP_PMULL)) {
4640 crc32_pmull_algs[0].update = crc32_pmull_update;
4641 crc32_pmull_algs[1].update = crc32c_pmull_update;
4642
4643diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h
4644index 55be59a35e3f..ba0cf1361f65 100644
4645--- a/arch/arm64/include/asm/spinlock_types.h
4646+++ b/arch/arm64/include/asm/spinlock_types.h
4647@@ -16,10 +16,6 @@
4648 #ifndef __ASM_SPINLOCK_TYPES_H
4649 #define __ASM_SPINLOCK_TYPES_H
4650
4651-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H)
4652-# error "please don't include this file directly"
4653-#endif
4654-
4655 #include <linux/types.h>
4656
4657 #define TICKET_SHIFT 16
4658diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
4659index fc786d344e46..b833258b7594 100644
4660--- a/arch/arm64/include/asm/thread_info.h
4661+++ b/arch/arm64/include/asm/thread_info.h
4662@@ -43,6 +43,7 @@ struct thread_info {
4663 u64 ttbr0; /* saved TTBR0_EL1 */
4664 #endif
4665 int preempt_count; /* 0 => preemptable, <0 => bug */
4666+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
4667 };
4668
4669 #define INIT_THREAD_INFO(tsk) \
4670@@ -82,6 +83,7 @@ void arch_setup_new_exec(void);
4671 #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
4672 #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
4673 #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */
4674+#define TIF_NEED_RESCHED_LAZY 6
4675 #define TIF_NOHZ 7
4676 #define TIF_SYSCALL_TRACE 8
4677 #define TIF_SYSCALL_AUDIT 9
4678@@ -98,6 +100,7 @@ void arch_setup_new_exec(void);
4679 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
4680 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
4681 #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
4682+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
4683 #define _TIF_NOHZ (1 << TIF_NOHZ)
4684 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
4685 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
4686@@ -109,8 +112,9 @@ void arch_setup_new_exec(void);
4687
4688 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
4689 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
4690- _TIF_UPROBE | _TIF_FSCHECK)
4691+ _TIF_UPROBE | _TIF_FSCHECK | _TIF_NEED_RESCHED_LAZY)
4692
4693+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
4694 #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
4695 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
4696 _TIF_NOHZ)
4697diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
4698index b5e43b01b396..ae26a1664436 100644
4699--- a/arch/arm64/kernel/asm-offsets.c
4700+++ b/arch/arm64/kernel/asm-offsets.c
4701@@ -39,6 +39,7 @@ int main(void)
4702 BLANK();
4703 DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
4704 DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
4705+ DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count));
4706 DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit));
4707 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
4708 DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
4709diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
4710index c1ffa95c0ad2..c60ecb5a3916 100644
4711--- a/arch/arm64/kernel/entry.S
4712+++ b/arch/arm64/kernel/entry.S
4713@@ -637,11 +637,16 @@ el1_irq:
4714
4715 #ifdef CONFIG_PREEMPT
4716 ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count
4717- cbnz w24, 1f // preempt count != 0
4718+ cbnz w24, 2f // preempt count != 0
4719 ldr x0, [tsk, #TSK_TI_FLAGS] // get flags
4720- tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
4721- bl el1_preempt
4722+ tbnz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
e4b2b4a8 4723+
4724+ ldr w24, [tsk, #TSK_TI_PREEMPT_LAZY] // get preempt lazy count
4725+ cbnz w24, 2f // preempt lazy count != 0
4726+ tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling?
4727 1:
4728+ bl el1_preempt
4729+2:
4730 #endif
4731 #ifdef CONFIG_TRACE_IRQFLAGS
4732 bl trace_hardirqs_on
4733@@ -655,6 +660,7 @@ el1_preempt:
4734 1: bl preempt_schedule_irq // irq en/disable is done inside
4735 ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS
4736 tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
4737+ tbnz x0, #TIF_NEED_RESCHED_LAZY, 1b // needs rescheduling?
4738 ret x24
4739 #endif
4740
4741diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
4742index 43442b3a463f..81bf9545a589 100644
4743--- a/arch/arm64/kernel/signal.c
4744+++ b/arch/arm64/kernel/signal.c
4745@@ -756,7 +756,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
4746 /* Check valid user FS if needed */
4747 addr_limit_user_check();
4748
4749- if (thread_flags & _TIF_NEED_RESCHED) {
4750+ if (thread_flags & _TIF_NEED_RESCHED_MASK) {
4751 schedule();
4752 } else {
4753 local_irq_enable();
4754diff --git a/arch/blackfin/include/asm/spinlock_types.h b/arch/blackfin/include/asm/spinlock_types.h
4755index 1a33608c958b..103b34d3dcf6 100644
4756--- a/arch/blackfin/include/asm/spinlock_types.h
4757+++ b/arch/blackfin/include/asm/spinlock_types.h
4758@@ -7,10 +7,6 @@
4759 #ifndef __ASM_SPINLOCK_TYPES_H
4760 #define __ASM_SPINLOCK_TYPES_H
4761
4762-#ifndef __LINUX_SPINLOCK_TYPES_H
4763-# error "please don't include this file directly"
4764-#endif
4765-
4766 #include <asm/rwlock.h>
4767
4768 typedef struct {
4769diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h
4770index 7a906b5214a4..d8f596fec022 100644
4771--- a/arch/hexagon/include/asm/spinlock_types.h
4772+++ b/arch/hexagon/include/asm/spinlock_types.h
4773@@ -21,10 +21,6 @@
4774 #ifndef _ASM_SPINLOCK_TYPES_H
4775 #define _ASM_SPINLOCK_TYPES_H
4776
4777-#ifndef __LINUX_SPINLOCK_TYPES_H
4778-# error "please don't include this file directly"
4779-#endif
4780-
4781 typedef struct {
4782 volatile unsigned int lock;
4783 } arch_spinlock_t;
4784diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h
4785index 6e345fefcdca..681408d6816f 100644
4786--- a/arch/ia64/include/asm/spinlock_types.h
4787+++ b/arch/ia64/include/asm/spinlock_types.h
4788@@ -2,10 +2,6 @@
4789 #ifndef _ASM_IA64_SPINLOCK_TYPES_H
4790 #define _ASM_IA64_SPINLOCK_TYPES_H
4791
4792-#ifndef __LINUX_SPINLOCK_TYPES_H
4793-# error "please don't include this file directly"
4794-#endif
4795-
4796 typedef struct {
4797 volatile unsigned int lock;
4798 } arch_spinlock_t;
4799diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
4800index 555b11180156..6866201a7603 100644
4801--- a/arch/ia64/kernel/mca.c
4802+++ b/arch/ia64/kernel/mca.c
4803@@ -1824,7 +1824,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
4804 ti->cpu = cpu;
4805 p->stack = ti;
4806 p->state = TASK_UNINTERRUPTIBLE;
4807- cpumask_set_cpu(cpu, &p->cpus_allowed);
4808+ cpumask_set_cpu(cpu, &p->cpus_mask);
4809 INIT_LIST_HEAD(&p->tasks);
4810 p->parent = p->real_parent = p->group_leader = p;
4811 INIT_LIST_HEAD(&p->children);
4812diff --git a/arch/m32r/include/asm/spinlock_types.h b/arch/m32r/include/asm/spinlock_types.h
4813index bb0d17b64198..fc6afa42fe11 100644
4814--- a/arch/m32r/include/asm/spinlock_types.h
4815+++ b/arch/m32r/include/asm/spinlock_types.h
4816@@ -2,10 +2,6 @@
4817 #ifndef _ASM_M32R_SPINLOCK_TYPES_H
4818 #define _ASM_M32R_SPINLOCK_TYPES_H
4819
4820-#ifndef __LINUX_SPINLOCK_TYPES_H
4821-# error "please don't include this file directly"
4822-#endif
4823-
4824 typedef struct {
4825 volatile int slock;
4826 } arch_spinlock_t;
4827diff --git a/arch/metag/include/asm/spinlock_types.h b/arch/metag/include/asm/spinlock_types.h
4828index cd197f1bed59..adc26e9797c5 100644
4829--- a/arch/metag/include/asm/spinlock_types.h
4830+++ b/arch/metag/include/asm/spinlock_types.h
4831@@ -2,10 +2,6 @@
4832 #ifndef _ASM_METAG_SPINLOCK_TYPES_H
4833 #define _ASM_METAG_SPINLOCK_TYPES_H
4834
4835-#ifndef __LINUX_SPINLOCK_TYPES_H
4836-# error "please don't include this file directly"
4837-#endif
4838-
4839 typedef struct {
4840 volatile unsigned int lock;
4841 } arch_spinlock_t;
4842diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
4843index c82457b0e733..7bb1838508de 100644
4844--- a/arch/mips/Kconfig
4845+++ b/arch/mips/Kconfig
4846@@ -2519,7 +2519,7 @@ config MIPS_ASID_BITS_VARIABLE
4847 #
4848 config HIGHMEM
4849 bool "High Memory Support"
4850- depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
4851+ depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
4852
4853 config CPU_SUPPORTS_HIGHMEM
4854 bool
4855diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h
4856index e610473d61b8..1428b4febbc9 100644
4857--- a/arch/mips/include/asm/switch_to.h
4858+++ b/arch/mips/include/asm/switch_to.h
4859@@ -42,7 +42,7 @@ extern struct task_struct *ll_task;
4860 * inline to try to keep the overhead down. If we have been forced to run on
4861 * a "CPU" with an FPU because of a previous high level of FP computation,
4862 * but did not actually use the FPU during the most recent time-slice (CU1
4863- * isn't set), we undo the restriction on cpus_allowed.
4864+ * isn't set), we undo the restriction on cpus_mask.
4865 *
4866 * We're not calling set_cpus_allowed() here, because we have no need to
4867 * force prompt migration - we're already switching the current CPU to a
4868@@ -57,7 +57,7 @@ do { \
4869 test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \
4870 (!(KSTK_STATUS(prev) & ST0_CU1))) { \
4871 clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \
4872- prev->cpus_allowed = prev->thread.user_cpus_allowed; \
4873+ prev->cpus_mask = prev->thread.user_cpus_allowed; \
4874 } \
4875 next->thread.emulated_fp = 0; \
4876 } while(0)
4877diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
4878index a7c0f97e4b0d..1a08428eedcf 100644
4879--- a/arch/mips/kernel/mips-mt-fpaff.c
4880+++ b/arch/mips/kernel/mips-mt-fpaff.c
4881@@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
4882 if (retval)
4883 goto out_unlock;
4884
4885- cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed);
4886+ cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
4887 cpumask_and(&mask, &allowed, cpu_active_mask);
4888
4889 out_unlock:
4890diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
4891index 583aed906933..24ad7aaca5eb 100644
4892--- a/arch/mips/kernel/traps.c
4893+++ b/arch/mips/kernel/traps.c
4894@@ -1193,12 +1193,12 @@ static void mt_ase_fp_affinity(void)
4895 * restricted the allowed set to exclude any CPUs with FPUs,
4896 * we'll skip the procedure.
4897 */
4898- if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) {
4899+ if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
4900 cpumask_t tmask;
4901
4902 current->thread.user_cpus_allowed
4903- = current->cpus_allowed;
4904- cpumask_and(&tmask, &current->cpus_allowed,
4905+ = current->cpus_mask;
4906+ cpumask_and(&tmask, &current->cpus_mask,
4907 &mt_fpu_cpumask);
4908 set_cpus_allowed_ptr(current, &tmask);
4909 set_thread_flag(TIF_FPUBOUND);
4910diff --git a/arch/mn10300/include/asm/spinlock_types.h b/arch/mn10300/include/asm/spinlock_types.h
4911index 32abdc89bbc7..c45230a12d60 100644
4912--- a/arch/mn10300/include/asm/spinlock_types.h
4913+++ b/arch/mn10300/include/asm/spinlock_types.h
4914@@ -2,10 +2,6 @@
4915 #ifndef _ASM_SPINLOCK_TYPES_H
4916 #define _ASM_SPINLOCK_TYPES_H
4917
4918-#ifndef __LINUX_SPINLOCK_TYPES_H
4919-# error "please don't include this file directly"
4920-#endif
4921-
4922 typedef struct arch_spinlock {
4923 unsigned int slock;
4924 } arch_spinlock_t;
4925diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
4926index fe418226df7f..b5658e925465 100644
4927--- a/arch/powerpc/Kconfig
4928+++ b/arch/powerpc/Kconfig
4929@@ -111,10 +111,11 @@ config LOCKDEP_SUPPORT
4930
4931 config RWSEM_GENERIC_SPINLOCK
4932 bool
4933+ default y if PREEMPT_RT_FULL
4934
4935 config RWSEM_XCHGADD_ALGORITHM
4936 bool
4937- default y
4938+ default y if !PREEMPT_RT_FULL
4939
4940 config GENERIC_LOCKBREAK
4941 bool
4942@@ -215,6 +216,7 @@ config PPC
4943 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
4944 select HAVE_PERF_REGS
4945 select HAVE_PERF_USER_STACK_DUMP
4946+ select HAVE_PREEMPT_LAZY
4947 select HAVE_RCU_TABLE_FREE if SMP
4948 select HAVE_REGS_AND_STACK_ACCESS_API
4949 select HAVE_SYSCALL_TRACEPOINTS
4950@@ -390,7 +392,7 @@ menu "Kernel options"
4951
4952 config HIGHMEM
4953 bool "High memory support"
4954- depends on PPC32
4955+ depends on PPC32 && !PREEMPT_RT_FULL
4956
4957 source kernel/Kconfig.hz
4958 source kernel/Kconfig.preempt
4959diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
4960index 87adaf13b7e8..7305cb6a53e4 100644
4961--- a/arch/powerpc/include/asm/spinlock_types.h
4962+++ b/arch/powerpc/include/asm/spinlock_types.h
4963@@ -2,10 +2,6 @@
4964 #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H
4965 #define _ASM_POWERPC_SPINLOCK_TYPES_H
4966
4967-#ifndef __LINUX_SPINLOCK_TYPES_H
4968-# error "please don't include this file directly"
4969-#endif
4970-
4971 typedef struct {
4972 volatile unsigned int slock;
4973 } arch_spinlock_t;
4974diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
4975index a264c3ad366b..020afb8329a1 100644
4976--- a/arch/powerpc/include/asm/thread_info.h
4977+++ b/arch/powerpc/include/asm/thread_info.h
4978@@ -36,6 +36,8 @@ struct thread_info {
4979 int cpu; /* cpu we're on */
4980 int preempt_count; /* 0 => preemptable,
4981 <0 => BUG */
4982+ int preempt_lazy_count; /* 0 => preemptable,
4983+ <0 => BUG */
4984 unsigned long local_flags; /* private flags for thread */
4985 #ifdef CONFIG_LIVEPATCH
4986 unsigned long *livepatch_sp;
4987@@ -81,8 +83,7 @@ static inline struct thread_info *current_thread_info(void)
4988 #define TIF_SYSCALL_TRACE 0 /* syscall trace active */
4989 #define TIF_SIGPENDING 1 /* signal pending */
4990 #define TIF_NEED_RESCHED 2 /* rescheduling necessary */
4991-#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling
4992- TIF_NEED_RESCHED */
4993+#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling necessary */
4994 #define TIF_32BIT 4 /* 32 bit binary */
4995 #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */
4996 #define TIF_PATCH_PENDING 6 /* pending live patching update */
4997@@ -101,6 +102,8 @@ static inline struct thread_info *current_thread_info(void)
4998 #if defined(CONFIG_PPC64)
4999 #define TIF_ELF2ABI 18 /* function descriptors must die! */
5000 #endif
5001+#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling
5002+ TIF_NEED_RESCHED */
5003
5004 /* as above, but as bit values */
5005 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
5006@@ -120,14 +123,16 @@ static inline struct thread_info *current_thread_info(void)
5007 #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
5008 #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE)
5009 #define _TIF_NOHZ (1<<TIF_NOHZ)
5010+#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
5011 #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
5012 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
5013 _TIF_NOHZ)
5014
5015 #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
5016 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
5017- _TIF_RESTORE_TM | _TIF_PATCH_PENDING)
5018+ _TIF_RESTORE_TM | _TIF_PATCH_PENDING | _TIF_NEED_RESCHED_LAZY)
5019 #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
5020+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
5021
5022 /* Bits in local_flags */
5023 /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
5024diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
5025index 2e5ea300258a..a2cb40098d7c 100644
5026--- a/arch/powerpc/kernel/asm-offsets.c
5027+++ b/arch/powerpc/kernel/asm-offsets.c
5028@@ -156,6 +156,7 @@ int main(void)
5029 OFFSET(TI_FLAGS, thread_info, flags);
5030 OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags);
5031 OFFSET(TI_PREEMPT, thread_info, preempt_count);
5032+ OFFSET(TI_PREEMPT_LAZY, thread_info, preempt_lazy_count);
5033 OFFSET(TI_TASK, thread_info, task);
5034 OFFSET(TI_CPU, thread_info, cpu);
5035
5036diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
5037index e780e1fbf6c2..dc7fe90ff6a9 100644
5038--- a/arch/powerpc/kernel/entry_32.S
5039+++ b/arch/powerpc/kernel/entry_32.S
5040@@ -866,7 +866,14 @@ resume_kernel:
5041 cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
5042 bne restore
5043 andi. r8,r8,_TIF_NEED_RESCHED
5044+ bne+ 1f
5045+ lwz r0,TI_PREEMPT_LAZY(r9)
5046+ cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
5047+ bne restore
5048+ lwz r0,TI_FLAGS(r9)
5049+ andi. r0,r0,_TIF_NEED_RESCHED_LAZY
5050 beq+ restore
5051+1:
5052 lwz r3,_MSR(r1)
5053 andi. r0,r3,MSR_EE /* interrupts off? */
5054 beq restore /* don't schedule if so */
5055@@ -877,11 +884,11 @@ resume_kernel:
5056 */
5057 bl trace_hardirqs_off
5058 #endif
5059-1: bl preempt_schedule_irq
5060+2: bl preempt_schedule_irq
5061 CURRENT_THREAD_INFO(r9, r1)
5062 lwz r3,TI_FLAGS(r9)
5063- andi. r0,r3,_TIF_NEED_RESCHED
5064- bne- 1b
5065+ andi. r0,r3,_TIF_NEED_RESCHED_MASK
5066+ bne- 2b
5067 #ifdef CONFIG_TRACE_IRQFLAGS
5068 /* And now, to properly rebalance the above, we tell lockdep they
5069 * are being turned back on, which will happen when we return
5070@@ -1204,7 +1211,7 @@ global_dbcr0:
5071 #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
5072
5073 do_work: /* r10 contains MSR_KERNEL here */
5074- andi. r0,r9,_TIF_NEED_RESCHED
5075+ andi. r0,r9,_TIF_NEED_RESCHED_MASK
5076 beq do_user_signal
5077
5078 do_resched: /* r10 contains MSR_KERNEL here */
5079@@ -1225,7 +1232,7 @@ recheck:
5080 MTMSRD(r10) /* disable interrupts */
5081 CURRENT_THREAD_INFO(r9, r1)
5082 lwz r9,TI_FLAGS(r9)
5083- andi. r0,r9,_TIF_NEED_RESCHED
5084+ andi. r0,r9,_TIF_NEED_RESCHED_MASK
5085 bne- do_resched
5086 andi. r0,r9,_TIF_USER_WORK_MASK
5087 beq restore_user
5088diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
5089index c194f4c8e66b..117c1f6cab66 100644
5090--- a/arch/powerpc/kernel/entry_64.S
5091+++ b/arch/powerpc/kernel/entry_64.S
5092@@ -690,7 +690,7 @@ _GLOBAL(ret_from_except_lite)
5093 bl restore_math
5094 b restore
5095 #endif
5096-1: andi. r0,r4,_TIF_NEED_RESCHED
5097+1: andi. r0,r4,_TIF_NEED_RESCHED_MASK
5098 beq 2f
5099 bl restore_interrupts
5100 SCHEDULE_USER
5101@@ -752,10 +752,18 @@ resume_kernel:
5102
5103 #ifdef CONFIG_PREEMPT
5104 /* Check if we need to preempt */
5105+ lwz r8,TI_PREEMPT(r9)
5106+ cmpwi 0,r8,0 /* if non-zero, just restore regs and return */
5107+ bne restore
5108 andi. r0,r4,_TIF_NEED_RESCHED
5109+ bne+ check_count
e4b2b4a8 5110+
5111+ andi. r0,r4,_TIF_NEED_RESCHED_LAZY
5112 beq+ restore
5113+ lwz r8,TI_PREEMPT_LAZY(r9)
e4b2b4a8 5114+
5115 /* Check that preempt_count() == 0 and interrupts are enabled */
5116- lwz r8,TI_PREEMPT(r9)
5117+check_count:
5118 cmpwi cr1,r8,0
5119 ld r0,SOFTE(r1)
5120 cmpdi r0,0
5121@@ -772,7 +780,7 @@ resume_kernel:
5122 /* Re-test flags and eventually loop */
5123 CURRENT_THREAD_INFO(r9, r1)
5124 ld r4,TI_FLAGS(r9)
5125- andi. r0,r4,_TIF_NEED_RESCHED
5126+ andi. r0,r4,_TIF_NEED_RESCHED_MASK
5127 bne 1b
5128
5129 /*
5130diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
5131index 0ce8b0e5d7ba..375adb3048fc 100644
5132--- a/arch/powerpc/kernel/irq.c
5133+++ b/arch/powerpc/kernel/irq.c
5134@@ -693,6 +693,7 @@ void irq_ctx_init(void)
5135 }
5136 }
5137
5138+#ifndef CONFIG_PREEMPT_RT_FULL
5139 void do_softirq_own_stack(void)
5140 {
5141 struct thread_info *curtp, *irqtp;
5142@@ -710,6 +711,7 @@ void do_softirq_own_stack(void)
5143 if (irqtp->flags)
5144 set_bits(irqtp->flags, &curtp->flags);
5145 }
5146+#endif
5147
5148 irq_hw_number_t virq_to_hw(unsigned int virq)
5149 {
5150diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
5151index 3f7a9a2d2435..1795359d27b6 100644
5152--- a/arch/powerpc/kernel/misc_32.S
5153+++ b/arch/powerpc/kernel/misc_32.S
5154@@ -41,6 +41,7 @@
5155 * We store the saved ksp_limit in the unused part
5156 * of the STACK_FRAME_OVERHEAD
5157 */
5158+#ifndef CONFIG_PREEMPT_RT_FULL
5159 _GLOBAL(call_do_softirq)
5160 mflr r0
5161 stw r0,4(r1)
5162@@ -57,6 +58,7 @@ _GLOBAL(call_do_softirq)
5163 stw r10,THREAD+KSP_LIMIT(r2)
5164 mtlr r0
5165 blr
5166+#endif
5167
5168 /*
5169 * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
5170diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
5171index 3280953a82cf..dd2a80d190c4 100644
5172--- a/arch/powerpc/kernel/misc_64.S
5173+++ b/arch/powerpc/kernel/misc_64.S
5174@@ -31,6 +31,7 @@
5175
5176 .text
5177
5178+#ifndef CONFIG_PREEMPT_RT_FULL
5179 _GLOBAL(call_do_softirq)
5180 mflr r0
5181 std r0,16(r1)
5182@@ -41,6 +42,7 @@ _GLOBAL(call_do_softirq)
5183 ld r0,16(r1)
5184 mtlr r0
5185 blr
5186+#endif
5187
5188 _GLOBAL(call_do_irq)
5189 mflr r0
5190diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
5191index 648160334abf..9d24331fc9b4 100644
5192--- a/arch/powerpc/kvm/Kconfig
5193+++ b/arch/powerpc/kvm/Kconfig
5194@@ -177,6 +177,7 @@ config KVM_E500MC
5195 config KVM_MPIC
5196 bool "KVM in-kernel MPIC emulation"
5197 depends on KVM && E500
5198+ depends on !PREEMPT_RT_FULL
5199 select HAVE_KVM_IRQCHIP
5200 select HAVE_KVM_IRQFD
5201 select HAVE_KVM_IRQ_ROUTING
5202diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
5203index 1fbb5da17dd2..ca86366d5424 100644
5204--- a/arch/powerpc/platforms/cell/spufs/sched.c
5205+++ b/arch/powerpc/platforms/cell/spufs/sched.c
5206@@ -141,7 +141,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
5207 * runqueue. The context will be rescheduled on the proper node
5208 * if it is timesliced or preempted.
5209 */
5210- cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed);
5211+ cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
5212
5213 /* Save the current cpu id for spu interrupt routing. */
5214 ctx->last_ran = raw_smp_processor_id();
5215diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
5216index e48462447ff0..2670cee66064 100644
5217--- a/arch/powerpc/platforms/ps3/device-init.c
5218+++ b/arch/powerpc/platforms/ps3/device-init.c
5219@@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
5220 }
5221 pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
5222
5223- res = wait_event_interruptible(dev->done.wait,
5224+ res = swait_event_interruptible(dev->done.wait,
5225 dev->done.done || kthread_should_stop());
5226 if (kthread_should_stop())
5227 res = -EINTR;
5228diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h
5229index 1861a0c5dd47..74092ebaca3c 100644
5230--- a/arch/s390/include/asm/spinlock_types.h
5231+++ b/arch/s390/include/asm/spinlock_types.h
5232@@ -2,10 +2,6 @@
5233 #ifndef __ASM_SPINLOCK_TYPES_H
5234 #define __ASM_SPINLOCK_TYPES_H
5235
5236-#ifndef __LINUX_SPINLOCK_TYPES_H
5237-# error "please don't include this file directly"
5238-#endif
5239-
5240 typedef struct {
5241 int lock;
5242 } __attribute__ ((aligned (4))) arch_spinlock_t;
5243diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h
5244index e82369f286a2..22ca9a98bbb8 100644
5245--- a/arch/sh/include/asm/spinlock_types.h
5246+++ b/arch/sh/include/asm/spinlock_types.h
5247@@ -2,10 +2,6 @@
5248 #ifndef __ASM_SH_SPINLOCK_TYPES_H
5249 #define __ASM_SH_SPINLOCK_TYPES_H
5250
5251-#ifndef __LINUX_SPINLOCK_TYPES_H
5252-# error "please don't include this file directly"
5253-#endif
5254-
5255 typedef struct {
5256 volatile unsigned int lock;
5257 } arch_spinlock_t;
5258diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
5259index 245dbeb20afe..e298c82d2a69 100644
5260--- a/arch/sh/kernel/irq.c
5261+++ b/arch/sh/kernel/irq.c
5262@@ -148,6 +148,7 @@ void irq_ctx_exit(int cpu)
5263 hardirq_ctx[cpu] = NULL;
5264 }
5265
5266+#ifndef CONFIG_PREEMPT_RT_FULL
5267 void do_softirq_own_stack(void)
5268 {
5269 struct thread_info *curctx;
5270@@ -175,6 +176,7 @@ void do_softirq_own_stack(void)
5271 "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
5272 );
5273 }
5274+#endif
5275 #else
5276 static inline void handle_one_irq(unsigned int irq)
5277 {
5278diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
5279index 4e83f950713e..7f9d71523763 100644
5280--- a/arch/sparc/Kconfig
5281+++ b/arch/sparc/Kconfig
5282@@ -206,12 +206,10 @@ config NR_CPUS
5283 source kernel/Kconfig.hz
5284
5285 config RWSEM_GENERIC_SPINLOCK
5286- bool
5287- default y if SPARC32
5288+ def_bool PREEMPT_RT_FULL
5289
5290 config RWSEM_XCHGADD_ALGORITHM
5291- bool
5292- default y if SPARC64
5293+ def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
5294
5295 config GENERIC_HWEIGHT
5296 bool
5297diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
5298index d66dde833f5e..f87b3f8f4d43 100644
5299--- a/arch/sparc/kernel/irq_64.c
5300+++ b/arch/sparc/kernel/irq_64.c
5301@@ -855,6 +855,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
5302 set_irq_regs(old_regs);
5303 }
5304
5305+#ifndef CONFIG_PREEMPT_RT_FULL
5306 void do_softirq_own_stack(void)
5307 {
5308 void *orig_sp, *sp = softirq_stack[smp_processor_id()];
5309@@ -869,6 +870,7 @@ void do_softirq_own_stack(void)
5310 __asm__ __volatile__("mov %0, %%sp"
5311 : : "r" (orig_sp));
5312 }
5313+#endif
5314
5315 #ifdef CONFIG_HOTPLUG_CPU
5316 void fixup_irqs(void)
5317diff --git a/arch/tile/include/asm/setup.h b/arch/tile/include/asm/setup.h
5318index 2a0347af0702..670fa2f4cfc3 100644
5319--- a/arch/tile/include/asm/setup.h
5320+++ b/arch/tile/include/asm/setup.h
5321@@ -49,7 +49,7 @@ int hardwall_ipi_valid(int cpu);
5322
5323 /* Hook hardwall code into changes in affinity. */
5324 #define arch_set_cpus_allowed(p, new_mask) do { \
5325- if (!cpumask_equal(&p->cpus_allowed, new_mask)) \
5326+ if (!cpumask_equal(p->cpus_ptr, new_mask)) \
5327 hardwall_deactivate_all(p); \
5328 } while (0)
5329 #endif
5330diff --git a/arch/tile/include/asm/spinlock_types.h b/arch/tile/include/asm/spinlock_types.h
5331index a71f59b49c50..9311c6ff2abc 100644
5332--- a/arch/tile/include/asm/spinlock_types.h
5333+++ b/arch/tile/include/asm/spinlock_types.h
5334@@ -15,10 +15,6 @@
5335 #ifndef _ASM_TILE_SPINLOCK_TYPES_H
5336 #define _ASM_TILE_SPINLOCK_TYPES_H
5337
5338-#ifndef __LINUX_SPINLOCK_TYPES_H
5339-# error "please don't include this file directly"
5340-#endif
5341-
5342 #ifdef __tilegx__
5343
5344 /* Low 15 bits are "next"; high 15 bits are "current". */
5345diff --git a/arch/tile/kernel/hardwall.c b/arch/tile/kernel/hardwall.c
5346index 2fd1694ac1d0..98f4fb696289 100644
5347--- a/arch/tile/kernel/hardwall.c
5348+++ b/arch/tile/kernel/hardwall.c
5349@@ -590,12 +590,12 @@ static int hardwall_activate(struct hardwall_info *info)
5350 * Get our affinity; if we're not bound to this tile uniquely,
5351 * we can't access the network registers.
5352 */
5353- if (cpumask_weight(&p->cpus_allowed) != 1)
5354+ if (p->nr_cpus_allowed != 1)
5355 return -EPERM;
5356
5357 /* Make sure we are bound to a cpu assigned to this resource. */
5358 cpu = smp_processor_id();
5359- BUG_ON(cpumask_first(&p->cpus_allowed) != cpu);
5360+ BUG_ON(cpumask_first(p->cpus_ptr) != cpu);
5361 if (!cpumask_test_cpu(cpu, &info->cpumask))
5362 return -EINVAL;
5363
5364@@ -621,17 +621,17 @@ static int hardwall_activate(struct hardwall_info *info)
5365 * Deactivate a task's hardwall. Must hold lock for hardwall_type.
5366 * This method may be called from exit_thread(), so we don't want to
5367 * rely on too many fields of struct task_struct still being valid.
5368- * We assume the cpus_allowed, pid, and comm fields are still valid.
5369+ * We assume the nr_cpus_allowed, pid, and comm fields are still valid.
5370 */
5371 static void _hardwall_deactivate(struct hardwall_type *hwt,
5372 struct task_struct *task)
5373 {
5374 struct thread_struct *ts = &task->thread;
5375
5376- if (cpumask_weight(&task->cpus_allowed) != 1) {
5377+ if (task->nr_cpus_allowed != 1) {
5378 pr_err("pid %d (%s) releasing %s hardwall with an affinity mask containing %d cpus!\n",
5379 task->pid, task->comm, hwt->name,
5380- cpumask_weight(&task->cpus_allowed));
5381+ task->nr_cpus_allowed);
5382 BUG();
5383 }
5384
5385diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
5386index 2af0af33362a..7764f936d6ab 100644
5387--- a/arch/x86/Kconfig
5388+++ b/arch/x86/Kconfig
5389@@ -169,6 +169,7 @@ config X86
5390 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
5391 select HAVE_PERF_REGS
5392 select HAVE_PERF_USER_STACK_DUMP
5393+ select HAVE_PREEMPT_LAZY
5394 select HAVE_RCU_TABLE_FREE
5395 select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE
5396 select HAVE_REGS_AND_STACK_ACCESS_API
5397@@ -257,8 +258,11 @@ config ARCH_MAY_HAVE_PC_FDC
5398 def_bool y
5399 depends on ISA_DMA_API
5400
5401+config RWSEM_GENERIC_SPINLOCK
5402+ def_bool PREEMPT_RT_FULL
5403+
5404 config RWSEM_XCHGADD_ALGORITHM
5405- def_bool y
5406+ def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
5407
5408 config GENERIC_CALIBRATE_DELAY
5409 def_bool y
5410@@ -933,7 +937,7 @@ config IOMMU_HELPER
5411 config MAXSMP
5412 bool "Enable Maximum number of SMP Processors and NUMA Nodes"
5413 depends on X86_64 && SMP && DEBUG_KERNEL
5414- select CPUMASK_OFFSTACK
5415+ select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
5416 ---help---
5417 Enable maximum number of CPUS and NUMA Nodes for this architecture.
5418 If unsure, say N.
5419diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
5420index c690ddc78c03..7a3138d33e33 100644
5421--- a/arch/x86/crypto/aesni-intel_glue.c
5422+++ b/arch/x86/crypto/aesni-intel_glue.c
5423@@ -387,14 +387,14 @@ static int ecb_encrypt(struct skcipher_request *req)
5424
5425 err = skcipher_walk_virt(&walk, req, true);
5426
5427- kernel_fpu_begin();
5428 while ((nbytes = walk.nbytes)) {
5429+ kernel_fpu_begin();
5430 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5431 nbytes & AES_BLOCK_MASK);
5432+ kernel_fpu_end();
5433 nbytes &= AES_BLOCK_SIZE - 1;
5434 err = skcipher_walk_done(&walk, nbytes);
5435 }
5436- kernel_fpu_end();
5437
5438 return err;
5439 }
5440@@ -409,14 +409,14 @@ static int ecb_decrypt(struct skcipher_request *req)
5441
5442 err = skcipher_walk_virt(&walk, req, true);
5443
5444- kernel_fpu_begin();
5445 while ((nbytes = walk.nbytes)) {
5446+ kernel_fpu_begin();
5447 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5448 nbytes & AES_BLOCK_MASK);
5449+ kernel_fpu_end();
5450 nbytes &= AES_BLOCK_SIZE - 1;
5451 err = skcipher_walk_done(&walk, nbytes);
5452 }
5453- kernel_fpu_end();
5454
5455 return err;
5456 }
5457@@ -431,14 +431,14 @@ static int cbc_encrypt(struct skcipher_request *req)
5458
5459 err = skcipher_walk_virt(&walk, req, true);
5460
5461- kernel_fpu_begin();
5462 while ((nbytes = walk.nbytes)) {
5463+ kernel_fpu_begin();
5464 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5465 nbytes & AES_BLOCK_MASK, walk.iv);
5466+ kernel_fpu_end();
5467 nbytes &= AES_BLOCK_SIZE - 1;
5468 err = skcipher_walk_done(&walk, nbytes);
5469 }
5470- kernel_fpu_end();
5471
5472 return err;
5473 }
5474@@ -453,14 +453,14 @@ static int cbc_decrypt(struct skcipher_request *req)
5475
5476 err = skcipher_walk_virt(&walk, req, true);
5477
5478- kernel_fpu_begin();
5479 while ((nbytes = walk.nbytes)) {
5480+ kernel_fpu_begin();
5481 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5482 nbytes & AES_BLOCK_MASK, walk.iv);
5483+ kernel_fpu_end();
5484 nbytes &= AES_BLOCK_SIZE - 1;
5485 err = skcipher_walk_done(&walk, nbytes);
5486 }
5487- kernel_fpu_end();
5488
5489 return err;
5490 }
5491@@ -510,18 +510,20 @@ static int ctr_crypt(struct skcipher_request *req)
5492
5493 err = skcipher_walk_virt(&walk, req, true);
5494
5495- kernel_fpu_begin();
5496 while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
5497+ kernel_fpu_begin();
5498 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
5499 nbytes & AES_BLOCK_MASK, walk.iv);
5500+ kernel_fpu_end();
5501 nbytes &= AES_BLOCK_SIZE - 1;
5502 err = skcipher_walk_done(&walk, nbytes);
5503 }
5504 if (walk.nbytes) {
5505+ kernel_fpu_begin();
5506 ctr_crypt_final(ctx, &walk);
5507+ kernel_fpu_end();
5508 err = skcipher_walk_done(&walk, 0);
5509 }
5510- kernel_fpu_end();
5511
5512 return err;
5513 }
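The aesni conversion above is the plainest form of the RT-friendly FPU pattern: instead of one kernel_fpu_begin()/kernel_fpu_end() bracket around the whole walk, every walk step gets its own bracket, so the kernel stays preemptible while skcipher_walk_done() runs. Each converted mode follows the same shape; in the sketch below, do_one_step() is an illustrative stand-in for the per-step SIMD routine such as aesni_ecb_enc():

        err = skcipher_walk_virt(&walk, req, true);
        while ((nbytes = walk.nbytes)) {
                kernel_fpu_begin();             /* FPU held only for the SIMD work */
                do_one_step(ctx, &walk, nbytes & AES_BLOCK_MASK);
                kernel_fpu_end();               /* preemptible again here */
                nbytes &= AES_BLOCK_SIZE - 1;
                err = skcipher_walk_done(&walk, nbytes);
        }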
5514diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
5515index 60907c139c4e..0902db7d326a 100644
5516--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
5517+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
5518@@ -206,6 +206,20 @@ struct crypt_priv {
5519 bool fpu_enabled;
5520 };
5521
5522+#ifdef CONFIG_PREEMPT_RT_FULL
5523+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
5524+{
5525+ bool fpu_enabled = ctx->fpu_enabled;
5526+
5527+ if (!fpu_enabled)
5528+ return;
5529+ camellia_fpu_end(fpu_enabled);
5530+ ctx->fpu_enabled = false;
5531+}
5532+#else
5533+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
5534+#endif
5535+
5536 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5537 {
5538 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
5539@@ -221,16 +235,19 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5540 }
5541
5542 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
5543+ kernel_fpu_resched();
5544 camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
5545 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5546 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5547 }
5548
5549 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5550+ kernel_fpu_resched();
5551 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
5552 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5553 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5554 }
5555+ camellia_fpu_end_rt(ctx);
5556
5557 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5558 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
5559@@ -251,16 +268,19 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5560 }
5561
5562 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
5563+ kernel_fpu_resched();
5564 camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
5565 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5566 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
5567 }
5568
5569 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5570+ kernel_fpu_resched();
5571 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
5572 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5573 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5574 }
5575+ camellia_fpu_end_rt(ctx);
5576
5577 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5578 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
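The same two helpers recur in every SIMD glue driver touched below: a *_fpu_end_rt() that closes the FPU section early on PREEMPT_RT so the scalar tail runs preemptibly, and kernel_fpu_resched() between parallel blocks so large requests do not keep preemption disabled for their whole length. A minimal sketch of the RT helper, assuming a driver-local struct crypt_priv with an fpu_enabled flag as above (my_fpu_end() stands in for the cipher's own fpu_end routine):

        #ifdef CONFIG_PREEMPT_RT_FULL
        /* End the FPU region as soon as the parallel blocks are done. */
        static void my_fpu_end_rt(struct crypt_priv *ctx)
        {
                if (!ctx->fpu_enabled)
                        return;
                my_fpu_end(ctx->fpu_enabled);
                ctx->fpu_enabled = false;
        }
        #else
        static void my_fpu_end_rt(struct crypt_priv *ctx) { }
        #endif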
5579diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
5580index d96429da88eb..3b8e91841039 100644
5581--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
5582+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
5583@@ -210,6 +210,21 @@ struct crypt_priv {
5584 bool fpu_enabled;
5585 };
5586
5587+#ifdef CONFIG_PREEMPT_RT_FULL
5588+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
5589+{
5590+ bool fpu_enabled = ctx->fpu_enabled;
5591+
5592+ if (!fpu_enabled)
5593+ return;
5594+ camellia_fpu_end(fpu_enabled);
5595+ ctx->fpu_enabled = false;
5596+}
5597+
5598+#else
5599+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
5600+#endif
5601+
5602 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5603 {
5604 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
5605@@ -225,10 +240,12 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5606 }
5607
5608 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5609+ kernel_fpu_resched();
5610 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
5611 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5612 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5613 }
5614+ camellia_fpu_end_rt(ctx);
5615
5616 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5617 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
5618@@ -249,10 +266,12 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5619 }
5620
5621 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
5622+ kernel_fpu_resched();
5623 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
5624 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
5625 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
5626 }
5627+ camellia_fpu_end_rt(ctx);
5628
5629 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5630 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
5631diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
5632index 575292a33bdf..0a4b0a222b18 100644
5633--- a/arch/x86/crypto/cast5_avx_glue.c
5634+++ b/arch/x86/crypto/cast5_avx_glue.c
5635@@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
5636 static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
5637 bool enc)
5638 {
5639- bool fpu_enabled = false;
5640+ bool fpu_enabled;
5641 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
5642 const unsigned int bsize = CAST5_BLOCK_SIZE;
5643 unsigned int nbytes;
5644@@ -73,7 +73,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
5645 u8 *wsrc = walk->src.virt.addr;
5646 u8 *wdst = walk->dst.virt.addr;
5647
5648- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
5649+ fpu_enabled = cast5_fpu_begin(false, nbytes);
5650
5651 /* Process multi-block batch */
5652 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
5653@@ -102,10 +102,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
5654 } while (nbytes >= bsize);
5655
5656 done:
5657+ cast5_fpu_end(fpu_enabled);
5658 err = blkcipher_walk_done(desc, walk, nbytes);
5659 }
5660-
5661- cast5_fpu_end(fpu_enabled);
5662 return err;
5663 }
5664
5665@@ -226,7 +225,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
5666 static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5667 struct scatterlist *src, unsigned int nbytes)
5668 {
5669- bool fpu_enabled = false;
5670+ bool fpu_enabled;
5671 struct blkcipher_walk walk;
5672 int err;
5673
5674@@ -235,12 +234,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5675 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
5676
5677 while ((nbytes = walk.nbytes)) {
5678- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
5679+ fpu_enabled = cast5_fpu_begin(false, nbytes);
5680 nbytes = __cbc_decrypt(desc, &walk);
5681+ cast5_fpu_end(fpu_enabled);
5682 err = blkcipher_walk_done(desc, &walk, nbytes);
5683 }
5684-
5685- cast5_fpu_end(fpu_enabled);
5686 return err;
5687 }
5688
5689@@ -309,7 +307,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
5690 static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5691 struct scatterlist *src, unsigned int nbytes)
5692 {
5693- bool fpu_enabled = false;
5694+ bool fpu_enabled;
5695 struct blkcipher_walk walk;
5696 int err;
5697
5698@@ -318,13 +316,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
5699 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
5700
5701 while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
5702- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
5703+ fpu_enabled = cast5_fpu_begin(false, nbytes);
5704 nbytes = __ctr_crypt(desc, &walk);
5705+ cast5_fpu_end(fpu_enabled);
5706 err = blkcipher_walk_done(desc, &walk, nbytes);
5707 }
5708
5709- cast5_fpu_end(fpu_enabled);
5710-
5711 if (walk.nbytes) {
5712 ctr_crypt_final(desc, &walk);
5713 err = blkcipher_walk_done(desc, &walk, 0);
5714diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
5715index 50e684768c55..8caf9ba8c1da 100644
5716--- a/arch/x86/crypto/cast6_avx_glue.c
5717+++ b/arch/x86/crypto/cast6_avx_glue.c
5718@@ -205,19 +205,33 @@ struct crypt_priv {
5719 bool fpu_enabled;
5720 };
5721
5722+#ifdef CONFIG_PREEMPT_RT_FULL
5723+static void cast6_fpu_end_rt(struct crypt_priv *ctx)
5724+{
5725+ bool fpu_enabled = ctx->fpu_enabled;
5726+
5727+ if (!fpu_enabled)
5728+ return;
5729+ cast6_fpu_end(fpu_enabled);
5730+ ctx->fpu_enabled = false;
5731+}
5732+
5733+#else
5734+static void cast6_fpu_end_rt(struct crypt_priv *ctx) { }
5735+#endif
5736+
5737 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5738 {
5739 const unsigned int bsize = CAST6_BLOCK_SIZE;
5740 struct crypt_priv *ctx = priv;
5741 int i;
5742
5743- ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5744-
5745 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
5746+ ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5747 cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
5748+ cast6_fpu_end_rt(ctx);
5749 return;
5750 }
5751-
5752 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5753 __cast6_encrypt(ctx->ctx, srcdst, srcdst);
5754 }
5755@@ -228,10 +242,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5756 struct crypt_priv *ctx = priv;
5757 int i;
5758
5759- ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5760-
5761 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
5762+ ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
5763 cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
5764+ cast6_fpu_end_rt(ctx);
5765 return;
5766 }
5767
5768diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
5769index 1e6af1b35f7b..e7809fd2a4fd 100644
5770--- a/arch/x86/crypto/chacha20_glue.c
5771+++ b/arch/x86/crypto/chacha20_glue.c
5772@@ -81,23 +81,24 @@ static int chacha20_simd(struct skcipher_request *req)
5773
5774 crypto_chacha20_init(state, ctx, walk.iv);
5775
5776- kernel_fpu_begin();
5777-
5778 while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
5779+ kernel_fpu_begin();
5780+
5781 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
5782 rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
5783+ kernel_fpu_end();
5784 err = skcipher_walk_done(&walk,
5785 walk.nbytes % CHACHA20_BLOCK_SIZE);
5786 }
5787
5788 if (walk.nbytes) {
5789+ kernel_fpu_begin();
5790 chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
5791 walk.nbytes);
5792+ kernel_fpu_end();
5793 err = skcipher_walk_done(&walk, 0);
5794 }
5795
5796- kernel_fpu_end();
5797-
5798 return err;
5799 }
5800
5801diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
5802index d61e57960fe0..c67560d9718a 100644
5803--- a/arch/x86/crypto/glue_helper.c
5804+++ b/arch/x86/crypto/glue_helper.c
5805@@ -40,7 +40,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
5806 void *ctx = crypto_blkcipher_ctx(desc->tfm);
5807 const unsigned int bsize = 128 / 8;
5808 unsigned int nbytes, i, func_bytes;
5809- bool fpu_enabled = false;
5810+ bool fpu_enabled;
5811 int err;
5812
5813 err = blkcipher_walk_virt(desc, walk);
5814@@ -50,7 +50,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
5815 u8 *wdst = walk->dst.virt.addr;
5816
5817 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5818- desc, fpu_enabled, nbytes);
5819+ desc, false, nbytes);
5820
5821 for (i = 0; i < gctx->num_funcs; i++) {
5822 func_bytes = bsize * gctx->funcs[i].num_blocks;
5823@@ -72,10 +72,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
5824 }
5825
5826 done:
5827+ glue_fpu_end(fpu_enabled);
5828 err = blkcipher_walk_done(desc, walk, nbytes);
5829 }
5830
5831- glue_fpu_end(fpu_enabled);
5832 return err;
5833 }
5834
5835@@ -192,7 +192,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
5836 struct scatterlist *src, unsigned int nbytes)
5837 {
5838 const unsigned int bsize = 128 / 8;
5839- bool fpu_enabled = false;
5840+ bool fpu_enabled;
5841 struct blkcipher_walk walk;
5842 int err;
5843
5844@@ -201,12 +201,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
5845
5846 while ((nbytes = walk.nbytes)) {
5847 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5848- desc, fpu_enabled, nbytes);
5849+ desc, false, nbytes);
5850 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
5851+ glue_fpu_end(fpu_enabled);
5852 err = blkcipher_walk_done(desc, &walk, nbytes);
5853 }
5854
5855- glue_fpu_end(fpu_enabled);
5856 return err;
5857 }
5858 EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
5859@@ -275,7 +275,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
5860 struct scatterlist *src, unsigned int nbytes)
5861 {
5862 const unsigned int bsize = 128 / 8;
5863- bool fpu_enabled = false;
5864+ bool fpu_enabled;
5865 struct blkcipher_walk walk;
5866 int err;
5867
5868@@ -284,13 +284,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
5869
5870 while ((nbytes = walk.nbytes) >= bsize) {
5871 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5872- desc, fpu_enabled, nbytes);
5873+ desc, false, nbytes);
5874 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
5875+ glue_fpu_end(fpu_enabled);
5876 err = blkcipher_walk_done(desc, &walk, nbytes);
5877 }
5878
5879- glue_fpu_end(fpu_enabled);
5880-
5881 if (walk.nbytes) {
5882 glue_ctr_crypt_final_128bit(
5883 gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
5884@@ -380,7 +379,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
5885 void *tweak_ctx, void *crypt_ctx)
5886 {
5887 const unsigned int bsize = 128 / 8;
5888- bool fpu_enabled = false;
5889+ bool fpu_enabled;
5890 struct blkcipher_walk walk;
5891 int err;
5892
5893@@ -393,21 +392,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
5894
5895 /* set minimum length to bsize, for tweak_fn */
5896 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5897- desc, fpu_enabled,
5898+ desc, false,
5899 nbytes < bsize ? bsize : nbytes);
5900-
5901 /* calculate first value of T */
5902 tweak_fn(tweak_ctx, walk.iv, walk.iv);
5903+ glue_fpu_end(fpu_enabled);
5904
5905 while (nbytes) {
5906+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
5907+ desc, false, nbytes);
5908 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
5909
5910+ glue_fpu_end(fpu_enabled);
5911 err = blkcipher_walk_done(desc, &walk, nbytes);
5912 nbytes = walk.nbytes;
5913 }
5914-
5915- glue_fpu_end(fpu_enabled);
5916-
5917 return err;
5918 }
5919 EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
5920diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
5921index 870f6d812a2d..5c806bf39f1d 100644
5922--- a/arch/x86/crypto/serpent_avx2_glue.c
5923+++ b/arch/x86/crypto/serpent_avx2_glue.c
5924@@ -184,6 +184,21 @@ struct crypt_priv {
5925 bool fpu_enabled;
5926 };
5927
5928+#ifdef CONFIG_PREEMPT_RT_FULL
5929+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
5930+{
5931+ bool fpu_enabled = ctx->fpu_enabled;
5932+
5933+ if (!fpu_enabled)
5934+ return;
5935+ serpent_fpu_end(fpu_enabled);
5936+ ctx->fpu_enabled = false;
5937+}
5938+
5939+#else
5940+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
5941+#endif
5942+
5943 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5944 {
5945 const unsigned int bsize = SERPENT_BLOCK_SIZE;
5946@@ -199,10 +214,12 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5947 }
5948
5949 while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
5950+ kernel_fpu_resched();
5951 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
5952 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
5953 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
5954 }
5955+ serpent_fpu_end_rt(ctx);
5956
5957 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5958 __serpent_encrypt(ctx->ctx, srcdst, srcdst);
5959@@ -223,10 +240,12 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5960 }
5961
5962 while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
5963+ kernel_fpu_resched();
5964 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
5965 srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
5966 nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
5967 }
5968+ serpent_fpu_end_rt(ctx);
5969
5970 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
5971 __serpent_decrypt(ctx->ctx, srcdst, srcdst);
5972diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
5973index 6f778d3daa22..46dcbdbd0518 100644
5974--- a/arch/x86/crypto/serpent_avx_glue.c
5975+++ b/arch/x86/crypto/serpent_avx_glue.c
5976@@ -218,16 +218,31 @@ struct crypt_priv {
5977 bool fpu_enabled;
5978 };
5979
5980+#ifdef CONFIG_PREEMPT_RT_FULL
5981+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
5982+{
5983+ bool fpu_enabled = ctx->fpu_enabled;
5984+
5985+ if (!fpu_enabled)
5986+ return;
5987+ serpent_fpu_end(fpu_enabled);
5988+ ctx->fpu_enabled = false;
5989+}
5990+
5991+#else
5992+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
5993+#endif
5994+
5995 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
5996 {
5997 const unsigned int bsize = SERPENT_BLOCK_SIZE;
5998 struct crypt_priv *ctx = priv;
5999 int i;
6000
6001- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6002-
6003 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6004+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6005 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
6006+ serpent_fpu_end_rt(ctx);
6007 return;
6008 }
6009
6010@@ -241,10 +256,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6011 struct crypt_priv *ctx = priv;
6012 int i;
6013
6014- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6015-
6016 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6017+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6018 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
6019+ serpent_fpu_end_rt(ctx);
6020 return;
6021 }
6022
6023diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
6024index ac0e831943f5..d35f607d067f 100644
6025--- a/arch/x86/crypto/serpent_sse2_glue.c
6026+++ b/arch/x86/crypto/serpent_sse2_glue.c
6027@@ -187,16 +187,31 @@ struct crypt_priv {
6028 bool fpu_enabled;
6029 };
6030
6031+#ifdef CONFIG_PREEMPT_RT_FULL
6032+static void serpent_fpu_end_rt(struct crypt_priv *ctx)
6033+{
6034+ bool fpu_enabled = ctx->fpu_enabled;
6035+
6036+ if (!fpu_enabled)
6037+ return;
6038+ serpent_fpu_end(fpu_enabled);
6039+ ctx->fpu_enabled = false;
6040+}
6041+
6042+#else
6043+static void serpent_fpu_end_rt(struct crypt_priv *ctx) { }
6044+#endif
6045+
6046 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6047 {
6048 const unsigned int bsize = SERPENT_BLOCK_SIZE;
6049 struct crypt_priv *ctx = priv;
6050 int i;
6051
6052- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6053-
6054 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6055+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6056 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
6057+ serpent_fpu_end_rt(ctx);
6058 return;
6059 }
6060
6061@@ -210,10 +225,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6062 struct crypt_priv *ctx = priv;
6063 int i;
6064
6065- ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6066-
6067 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
6068+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
6069 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
6070+ serpent_fpu_end_rt(ctx);
6071 return;
6072 }
6073
6074diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
6075index b7a3904b953c..de00fe24927e 100644
6076--- a/arch/x86/crypto/twofish_avx_glue.c
6077+++ b/arch/x86/crypto/twofish_avx_glue.c
6078@@ -218,6 +218,21 @@ struct crypt_priv {
6079 bool fpu_enabled;
6080 };
6081
6082+#ifdef CONFIG_PREEMPT_RT_FULL
6083+static void twofish_fpu_end_rt(struct crypt_priv *ctx)
6084+{
6085+ bool fpu_enabled = ctx->fpu_enabled;
6086+
6087+ if (!fpu_enabled)
6088+ return;
6089+ twofish_fpu_end(fpu_enabled);
6090+ ctx->fpu_enabled = false;
6091+}
6092+
6093+#else
6094+static void twofish_fpu_end_rt(struct crypt_priv *ctx) { }
6095+#endif
6096+
6097 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6098 {
6099 const unsigned int bsize = TF_BLOCK_SIZE;
6100@@ -228,12 +243,16 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6101
6102 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
6103 twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
6104+ twofish_fpu_end_rt(ctx);
6105 return;
6106 }
6107
6108- for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
6109+ for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
6110+ kernel_fpu_resched();
6111 twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
6112+ }
6113
6114+ twofish_fpu_end_rt(ctx);
6115 nbytes %= bsize * 3;
6116
6117 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
6118@@ -250,11 +269,15 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
6119
6120 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
6121 twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
6122+ twofish_fpu_end_rt(ctx);
6123 return;
6124 }
6125
6126- for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
6127+ for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) {
6128+ kernel_fpu_resched();
6129 twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
6130+ }
6131+ twofish_fpu_end_rt(ctx);
6132
6133 nbytes %= bsize * 3;
6134
6135diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
6136index 60e21ccfb6d6..0e27f35febe7 100644
6137--- a/arch/x86/entry/common.c
6138+++ b/arch/x86/entry/common.c
6139@@ -133,7 +133,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
6140
6141 #define EXIT_TO_USERMODE_LOOP_FLAGS \
6142 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
6143- _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
6144+ _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
6145
6146 static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
6147 {
6148@@ -148,9 +148,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
6149 /* We have work to do. */
6150 local_irq_enable();
6151
6152- if (cached_flags & _TIF_NEED_RESCHED)
6153+ if (cached_flags & _TIF_NEED_RESCHED_MASK)
6154 schedule();
6155
6156+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
6157+ if (unlikely(current->forced_info.si_signo)) {
6158+ struct task_struct *t = current;
6159+ force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
6160+ t->forced_info.si_signo = 0;
6161+ }
6162+#endif
6163 if (cached_flags & _TIF_UPROBE)
6164 uprobe_notify_resume(regs);
6165
6166diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
6167index 60c4c342316c..cd0c7c56e2dd 100644
6168--- a/arch/x86/entry/entry_32.S
6169+++ b/arch/x86/entry/entry_32.S
6170@@ -350,8 +350,25 @@ END(ret_from_exception)
6171 ENTRY(resume_kernel)
6172 DISABLE_INTERRUPTS(CLBR_ANY)
6173 .Lneed_resched:
6174+ # preempt count == 0 + NEED_RS set?
6175 cmpl $0, PER_CPU_VAR(__preempt_count)
6176+#ifndef CONFIG_PREEMPT_LAZY
6177 jnz restore_all
6178+#else
6179+ jz test_int_off
6180+
6181+	# at least preempt count == 0 ?
6182+ cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
6183+ jne restore_all
6184+
6185+ movl PER_CPU_VAR(current_task), %ebp
6186+ cmpl $0,TASK_TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ?
6187+ jnz restore_all
6188+
6189+ testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
6190+ jz restore_all
6191+test_int_off:
6192+#endif
6193 testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
6194 jz restore_all
6195 call preempt_schedule_irq
6196diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
6197index 164cd7529f0b..75d42cb8a7c9 100644
6198--- a/arch/x86/entry/entry_64.S
6199+++ b/arch/x86/entry/entry_64.S
6200@@ -633,7 +633,23 @@ retint_kernel:
6201 btl $9, EFLAGS(%rsp) /* were interrupts off? */
6202 jnc 1f
6203 0: cmpl $0, PER_CPU_VAR(__preempt_count)
6204+#ifndef CONFIG_PREEMPT_LAZY
6205 jnz 1f
6206+#else
6207+ jz do_preempt_schedule_irq
6208+
6209+	# at least preempt count == 0 ?
6210+ cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
6211+ jnz 1f
6212+
6213+ movq PER_CPU_VAR(current_task), %rcx
6214+ cmpl $0, TASK_TI_preempt_lazy_count(%rcx)
6215+ jnz 1f
6216+
6217+ bt $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
6218+ jnc 1f
6219+do_preempt_schedule_irq:
6220+#endif
6221 call preempt_schedule_irq
6222 jmp 0b
6223 1:
6224@@ -988,6 +1004,7 @@ bad_gs:
6225 jmp 2b
6226 .previous
6227
6228+#ifndef CONFIG_PREEMPT_RT_FULL
6229 /* Call softirq on interrupt stack. Interrupts are off. */
6230 ENTRY(do_softirq_own_stack)
6231 pushq %rbp
6232@@ -998,6 +1015,7 @@ ENTRY(do_softirq_own_stack)
6233 leaveq
6234 ret
6235 ENDPROC(do_softirq_own_stack)
6236+#endif
6237
6238 #ifdef CONFIG_XEN
6239 idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
6240diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
6241index a9caac9d4a72..18b31f22ca5d 100644
6242--- a/arch/x86/include/asm/fpu/api.h
6243+++ b/arch/x86/include/asm/fpu/api.h
6244@@ -25,6 +25,7 @@ extern void __kernel_fpu_begin(void);
6245 extern void __kernel_fpu_end(void);
6246 extern void kernel_fpu_begin(void);
6247 extern void kernel_fpu_end(void);
6248+extern void kernel_fpu_resched(void);
6249 extern bool irq_fpu_usable(void);
6250
6251 /*
6252diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
6253index 7f2dbd91fc74..22992c837795 100644
6254--- a/arch/x86/include/asm/preempt.h
6255+++ b/arch/x86/include/asm/preempt.h
6256@@ -86,17 +86,46 @@ static __always_inline void __preempt_count_sub(int val)
6257 * a decrement which hits zero means we have no preempt_count and should
6258 * reschedule.
6259 */
6260-static __always_inline bool __preempt_count_dec_and_test(void)
6261+static __always_inline bool ____preempt_count_dec_and_test(void)
6262 {
6263 GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
6264 }
6265
6266+static __always_inline bool __preempt_count_dec_and_test(void)
6267+{
6268+ if (____preempt_count_dec_and_test())
6269+ return true;
6270+#ifdef CONFIG_PREEMPT_LAZY
6271+ if (current_thread_info()->preempt_lazy_count)
6272+ return false;
6273+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
6274+#else
6275+ return false;
6276+#endif
6277+}
6278+
6279 /*
6280 * Returns true when we need to resched and can (barring IRQ state).
6281 */
6282 static __always_inline bool should_resched(int preempt_offset)
6283 {
6284+#ifdef CONFIG_PREEMPT_LAZY
6285+ u32 tmp;
6286+
6287+ tmp = raw_cpu_read_4(__preempt_count);
6288+ if (tmp == preempt_offset)
6289+ return true;
6290+
6291+ /* preempt count == 0 ? */
6292+ tmp &= ~PREEMPT_NEED_RESCHED;
6293+ if (tmp)
6294+ return false;
6295+ if (current_thread_info()->preempt_lazy_count)
6296+ return false;
6297+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
6298+#else
6299 return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
6300+#endif
6301 }
6302
6303 #ifdef CONFIG_PREEMPT
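Spelled out in C, the CONFIG_PREEMPT_LAZY variant of should_resched() above makes three checks: an exact preempt-count match still means an immediate reschedule, any remaining preempt count (ignoring the PREEMPT_NEED_RESCHED bit) or a non-zero preempt_lazy_count vetoes it, and otherwise TIF_NEED_RESCHED_LAZY decides. A condensed restatement of that branch, for readability only:

        /* Condensed restatement of the CONFIG_PREEMPT_LAZY path above. */
        static __always_inline bool should_resched_lazy(int preempt_offset)
        {
                u32 count = raw_cpu_read_4(__preempt_count);

                if (count == preempt_offset)
                        return true;                    /* immediate resched */
                if (count & ~PREEMPT_NEED_RESCHED)
                        return false;                   /* preemption disabled */
                if (current_thread_info()->preempt_lazy_count)
                        return false;                   /* lazy section active */
                return test_thread_flag(TIF_NEED_RESCHED_LAZY);
        }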
6304diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
6305index 5f9012ff52ed..39117e57caf2 100644
6306--- a/arch/x86/include/asm/signal.h
6307+++ b/arch/x86/include/asm/signal.h
6308@@ -28,6 +28,19 @@ typedef struct {
6309 #define SA_IA32_ABI 0x02000000u
6310 #define SA_X32_ABI 0x01000000u
6311
6312+/*
6313+ * Because some traps use the IST stack, we must keep preemption
6314+ * disabled while calling do_trap(), but do_trap() may call
6315+ * force_sig_info() which will grab the signal spin_locks for the
6316+ * task, which in PREEMPT_RT_FULL are mutexes. By defining
6317+ * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
6318+ * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
6319+ * trap.
6320+ */
6321+#if defined(CONFIG_PREEMPT_RT_FULL)
6322+#define ARCH_RT_DELAYS_SIGNAL_SEND
6323+#endif
6324+
6325 #ifndef CONFIG_COMPAT
6326 typedef sigset_t compat_sigset_t;
6327 #endif
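The delivery half of ARCH_RT_DELAYS_SIGNAL_SEND is the exit_to_usermode_loop() hunk earlier in this patch; the sending half lives in force_sig_info() (kernel/signal.c, not shown in this section) and parks the signal on the task instead of taking the signal locks from trap context. A hedged sketch of that sending side, using only the forced_info field and TIF_NOTIFY_RESUME behaviour described in the comment above (assumed shape, not a quote of this hunk):

        /* Sketch: deferral done by force_sig_info() under RT when called
         * with preemption disabled. */
        if (in_atomic()) {
                current->forced_info = *info;   /* remember the pending signal */
                set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
                return 0;                       /* delivered on return to user */
        }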
6328diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
6329index 371b3a4af000..06613a805b25 100644
6330--- a/arch/x86/include/asm/stackprotector.h
6331+++ b/arch/x86/include/asm/stackprotector.h
6332@@ -60,7 +60,7 @@
6333 */
6334 static __always_inline void boot_init_stack_canary(void)
6335 {
6336- u64 canary;
6337+ u64 uninitialized_var(canary);
6338 u64 tsc;
6339
6340 #ifdef CONFIG_X86_64
6341@@ -71,8 +71,14 @@ static __always_inline void boot_init_stack_canary(void)
6342 * of randomness. The TSC only matters for very early init,
6343 * there it already has some randomness on most systems. Later
6344 * on during the bootup the random pool has true entropy too.
6345+ * For preempt-rt we need to weaken the randomness a bit, as
6346+ * we can't call into the random generator from atomic context
6347+ * due to locking constraints. We just leave canary
6348+ * uninitialized and use the TSC based randomness on top of it.
6349 */
6350+#ifndef CONFIG_PREEMPT_RT_FULL
6351 get_random_bytes(&canary, sizeof(canary));
6352+#endif
6353 tsc = rdtsc();
6354 canary += tsc + (tsc << 32UL);
6355 canary &= CANARY_MASK;
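With get_random_bytes() skipped on PREEMPT_RT_FULL, the canary produced above is derived from the TSC alone; isolated, the arithmetic is simply:

        /* What boot_init_stack_canary() computes in the RT case above:
         * TSC-only entropy, deliberately weaker than the non-RT path. */
        u64 canary = 0;                     /* left "uninitialized" upstream */
        u64 tsc = rdtsc();
        canary += tsc + (tsc << 32UL);      /* spread the TSC over both halves */
        canary &= CANARY_MASK;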
6356diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
6357index 95ff2d7f553f..b1c9129f64fc 100644
6358--- a/arch/x86/include/asm/thread_info.h
6359+++ b/arch/x86/include/asm/thread_info.h
6360@@ -56,11 +56,14 @@ struct task_struct;
6361 struct thread_info {
6362 unsigned long flags; /* low level flags */
6363 u32 status; /* thread synchronous flags */
6364+ int preempt_lazy_count; /* 0 => lazy preemptable
6365+ <0 => BUG */
6366 };
6367
6368 #define INIT_THREAD_INFO(tsk) \
6369 { \
6370 .flags = 0, \
6371+ .preempt_lazy_count = 0, \
6372 }
6373
6374 #define init_stack (init_thread_union.stack)
6375@@ -69,6 +72,10 @@ struct thread_info {
6376
6377 #include <asm/asm-offsets.h>
6378
6379+#define GET_THREAD_INFO(reg) \
6380+ _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
6381+ _ASM_SUB $(THREAD_SIZE),reg ;
6382+
6383 #endif
6384
6385 /*
6386@@ -85,6 +92,7 @@ struct thread_info {
6387 #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
6388 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
6389 #define TIF_SECCOMP 8 /* secure computing */
6390+#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
6391 #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
6392 #define TIF_UPROBE 12 /* breakpointed or singlestepping */
6393 #define TIF_PATCH_PENDING 13 /* pending live patching update */
6394@@ -112,6 +120,7 @@ struct thread_info {
6395 #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
6396 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
6397 #define _TIF_SECCOMP (1 << TIF_SECCOMP)
6398+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
6399 #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
6400 #define _TIF_UPROBE (1 << TIF_UPROBE)
6401 #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
6402@@ -153,6 +162,8 @@ struct thread_info {
6403 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
6404 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
6405
6406+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
6407+
6408 #define STACK_WARN (THREAD_SIZE/8)
6409
6410 /*
6411diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
6412index 96a8a68f9c79..c9af5afebc4a 100644
6413--- a/arch/x86/kernel/apic/io_apic.c
6414+++ b/arch/x86/kernel/apic/io_apic.c
6415@@ -1688,19 +1688,20 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
6416 return false;
6417 }
6418
6419-static inline bool ioapic_irqd_mask(struct irq_data *data)
6420+static inline bool ioapic_prepare_move(struct irq_data *data)
6421 {
6422 /* If we are moving the irq we need to mask it */
6423 if (unlikely(irqd_is_setaffinity_pending(data))) {
6424- mask_ioapic_irq(data);
6425+ if (!irqd_irq_masked(data))
6426+ mask_ioapic_irq(data);
6427 return true;
6428 }
6429 return false;
6430 }
6431
6432-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
6433+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
6434 {
6435- if (unlikely(masked)) {
6436+ if (unlikely(moveit)) {
6437 /* Only migrate the irq if the ack has been received.
6438 *
6439 * On rare occasions the broadcast level triggered ack gets
6440@@ -1729,15 +1730,17 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
6441 */
6442 if (!io_apic_level_ack_pending(data->chip_data))
6443 irq_move_masked_irq(data);
6444- unmask_ioapic_irq(data);
6445+ /* If the irq is masked in the core, leave it */
6446+ if (!irqd_irq_masked(data))
6447+ unmask_ioapic_irq(data);
6448 }
6449 }
6450 #else
6451-static inline bool ioapic_irqd_mask(struct irq_data *data)
6452+static inline bool ioapic_prepare_move(struct irq_data *data)
6453 {
6454 return false;
6455 }
6456-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
6457+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
6458 {
6459 }
6460 #endif
6461@@ -1746,11 +1749,11 @@ static void ioapic_ack_level(struct irq_data *irq_data)
6462 {
6463 struct irq_cfg *cfg = irqd_cfg(irq_data);
6464 unsigned long v;
6465- bool masked;
6466+ bool moveit;
6467 int i;
6468
6469 irq_complete_move(cfg);
6470- masked = ioapic_irqd_mask(irq_data);
6471+ moveit = ioapic_prepare_move(irq_data);
6472
6473 /*
6474 * It appears there is an erratum which affects at least version 0x11
6475@@ -1805,7 +1808,7 @@ static void ioapic_ack_level(struct irq_data *irq_data)
6476 eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
6477 }
6478
6479- ioapic_irqd_unmask(irq_data, masked);
6480+ ioapic_finish_move(irq_data, moveit);
6481 }
6482
6483 static void ioapic_ir_ack_level(struct irq_data *irq_data)
6484diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
6485index 76417a9aab73..62c3e27c8e1c 100644
6486--- a/arch/x86/kernel/asm-offsets.c
6487+++ b/arch/x86/kernel/asm-offsets.c
6488@@ -38,6 +38,7 @@ void common(void) {
6489
6490 BLANK();
6491 OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
6492+ OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
6493 OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
6494
6495 BLANK();
6496@@ -94,6 +95,7 @@ void common(void) {
6497
6498 BLANK();
6499 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
6500+ DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
6501
6502 /* TLB state for the entry code */
6503 OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
6504diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
6505index 7f85b76f43bc..9e74b805070f 100644
6506--- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
6507+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
6508@@ -14,6 +14,7 @@
6509 #include <linux/slab.h>
6510 #include <linux/kmod.h>
6511 #include <linux/poll.h>
6512+#include <linux/swork.h>
6513
6514 #include "mce-internal.h"
6515
6516@@ -86,13 +87,43 @@ static void mce_do_trigger(struct work_struct *work)
6517
6518 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
6519
6520-
6521-void mce_work_trigger(void)
6522+static void __mce_work_trigger(struct swork_event *event)
6523 {
6524 if (mce_helper[0])
6525 schedule_work(&mce_trigger_work);
6526 }
6527
6528+#ifdef CONFIG_PREEMPT_RT_FULL
6529+static bool notify_work_ready __read_mostly;
6530+static struct swork_event notify_work;
6531+
6532+static int mce_notify_work_init(void)
6533+{
6534+ int err;
6535+
6536+ err = swork_get();
6537+ if (err)
6538+ return err;
6539+
6540+ INIT_SWORK(&notify_work, __mce_work_trigger);
6541+ notify_work_ready = true;
6542+ return 0;
6543+}
6544+
6545+void mce_work_trigger(void)
6546+{
6547+ if (notify_work_ready)
6548+ swork_queue(&notify_work);
6549+}
6550+
6551+#else
6552+void mce_work_trigger(void)
6553+{
6554+ __mce_work_trigger(NULL);
6555+}
6556+static inline int mce_notify_work_init(void) { return 0; }
6557+#endif
6558+
6559 static ssize_t
6560 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
6561 {
6562@@ -356,7 +387,7 @@ static __init int dev_mcelog_init_device(void)
6563
6564 return err;
6565 }
6566-
6567+ mce_notify_work_init();
6568 mce_register_decode_chain(&dev_mcelog_nb);
6569 return 0;
6570 }
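On RT, mce_work_trigger() may be reached from a context where queueing regular work is not safe, so the hunk above routes it through the simple-work (swork) layer added elsewhere in this patch: swork_get() plus INIT_SWORK() once at init, swork_queue() at trigger time. A condensed usage sketch of that API (names other than the swork calls are illustrative):

        static struct swork_event my_event;

        static void my_callback(struct swork_event *event)
        {
                /* runs later, in a fully schedulable context */
        }

        static int my_init(void)
        {
                int err = swork_get();          /* bring up the swork worker */

                if (err)
                        return err;
                INIT_SWORK(&my_event, my_callback);
                return 0;
        }

        static void my_trigger(void)            /* safe from atomic context */
        {
                swork_queue(&my_event);
        }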
6571diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
6572index 98e4e4dc4a3b..5cce2ee3b9f6 100644
6573--- a/arch/x86/kernel/cpu/mcheck/mce.c
6574+++ b/arch/x86/kernel/cpu/mcheck/mce.c
6575@@ -42,6 +42,7 @@
6576 #include <linux/debugfs.h>
6577 #include <linux/irq_work.h>
6578 #include <linux/export.h>
6579+#include <linux/jiffies.h>
6580 #include <linux/jump_label.h>
6581
6582 #include <asm/intel-family.h>
6583@@ -1365,7 +1366,7 @@ int memory_failure(unsigned long pfn, int vector, int flags)
6584 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
6585
6586 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
6587-static DEFINE_PER_CPU(struct timer_list, mce_timer);
6588+static DEFINE_PER_CPU(struct hrtimer, mce_timer);
6589
6590 static unsigned long mce_adjust_timer_default(unsigned long interval)
6591 {
6592@@ -1374,27 +1375,19 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
6593
6594 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
6595
6596-static void __start_timer(struct timer_list *t, unsigned long interval)
6597+static void __start_timer(struct hrtimer *t, unsigned long iv)
6598 {
6599- unsigned long when = jiffies + interval;
6600- unsigned long flags;
6601-
6602- local_irq_save(flags);
6603-
6604- if (!timer_pending(t) || time_before(when, t->expires))
6605- mod_timer(t, round_jiffies(when));
6606+ if (!iv)
6607+ return;
6608
6609- local_irq_restore(flags);
6610+ hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
6611+ 0, HRTIMER_MODE_REL_PINNED);
6612 }
6613
6614-static void mce_timer_fn(unsigned long data)
6615+static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
6616 {
6617- struct timer_list *t = this_cpu_ptr(&mce_timer);
6618- int cpu = smp_processor_id();
6619 unsigned long iv;
6620
6621- WARN_ON(cpu != data);
6622-
6623 iv = __this_cpu_read(mce_next_interval);
6624
6625 if (mce_available(this_cpu_ptr(&cpu_info))) {
6626@@ -1417,7 +1410,11 @@ static void mce_timer_fn(unsigned long data)
6627
6628 done:
6629 __this_cpu_write(mce_next_interval, iv);
6630- __start_timer(t, iv);
6631+ if (!iv)
6632+ return HRTIMER_NORESTART;
6633+
6634+ hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(iv)));
6635+ return HRTIMER_RESTART;
6636 }
6637
6638 /*
6639@@ -1425,7 +1422,7 @@ static void mce_timer_fn(unsigned long data)
6640 */
6641 void mce_timer_kick(unsigned long interval)
6642 {
6643- struct timer_list *t = this_cpu_ptr(&mce_timer);
6644+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
6645 unsigned long iv = __this_cpu_read(mce_next_interval);
6646
6647 __start_timer(t, interval);
6648@@ -1440,7 +1437,7 @@ static void mce_timer_delete_all(void)
6649 int cpu;
6650
6651 for_each_online_cpu(cpu)
6652- del_timer_sync(&per_cpu(mce_timer, cpu));
6653+ hrtimer_cancel(&per_cpu(mce_timer, cpu));
6654 }
6655
6656 /*
6657@@ -1769,7 +1766,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
6658 }
6659 }
6660
6661-static void mce_start_timer(struct timer_list *t)
6662+static void mce_start_timer(struct hrtimer *t)
6663 {
6664 unsigned long iv = check_interval * HZ;
6665
6666@@ -1782,18 +1779,19 @@ static void mce_start_timer(struct timer_list *t)
6667
6668 static void __mcheck_cpu_setup_timer(void)
6669 {
6670- struct timer_list *t = this_cpu_ptr(&mce_timer);
6671- unsigned int cpu = smp_processor_id();
6672+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
6673
6674- setup_pinned_timer(t, mce_timer_fn, cpu);
6675+ hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6676+ t->function = mce_timer_fn;
6677 }
6678
6679 static void __mcheck_cpu_init_timer(void)
6680 {
6681- struct timer_list *t = this_cpu_ptr(&mce_timer);
6682- unsigned int cpu = smp_processor_id();
6683+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
6684+
6685+ hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6686+ t->function = mce_timer_fn;
6687
6688- setup_pinned_timer(t, mce_timer_fn, cpu);
6689 mce_start_timer(t);
6690 }
6691
6692@@ -2309,7 +2307,7 @@ static int mce_cpu_dead(unsigned int cpu)
6693
6694 static int mce_cpu_online(unsigned int cpu)
6695 {
6696- struct timer_list *t = this_cpu_ptr(&mce_timer);
6697+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
6698 int ret;
6699
6700 mce_device_create(cpu);
6701@@ -2326,10 +2324,10 @@ static int mce_cpu_online(unsigned int cpu)
6702
6703 static int mce_cpu_pre_down(unsigned int cpu)
6704 {
6705- struct timer_list *t = this_cpu_ptr(&mce_timer);
6706+ struct hrtimer *t = this_cpu_ptr(&mce_timer);
6707
6708 mce_disable_cpu();
6709- del_timer_sync(t);
6710+ hrtimer_cancel(t);
6711 mce_threshold_remove_device(cpu);
6712 mce_device_remove(cpu);
6713 return 0;
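The MCE poller conversion above follows the standard timer_list-to-hrtimer recipe: hrtimer_init() plus assigning ->function at setup, hrtimer_start_range_ns() (relative, pinned) to arm it, hrtimer_cancel() to tear it down, and re-arming from the callback by returning HRTIMER_RESTART after hrtimer_forward_now(). Stripped of the MCE specifics, the skeleton is (compute_next_interval() is an illustrative placeholder):

        static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
        {
                unsigned long iv = compute_next_interval();

                if (!iv)
                        return HRTIMER_NORESTART;       /* polling disabled */
                hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(iv)));
                return HRTIMER_RESTART;
        }

        static void my_timer_setup(struct hrtimer *t, unsigned long iv)
        {
                hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                t->function = my_timer_fn;
                if (iv)
                        hrtimer_start_range_ns(t,
                                        ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
                                        0, HRTIMER_MODE_REL_PINNED);
        }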
6714diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
6715index 2ea85b32421a..6914dc569d1e 100644
6716--- a/arch/x86/kernel/fpu/core.c
6717+++ b/arch/x86/kernel/fpu/core.c
6718@@ -138,6 +138,18 @@ void kernel_fpu_end(void)
6719 }
6720 EXPORT_SYMBOL_GPL(kernel_fpu_end);
6721
6722+void kernel_fpu_resched(void)
6723+{
6724+ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
6725+
6726+ if (should_resched(PREEMPT_OFFSET)) {
6727+ kernel_fpu_end();
6728+ cond_resched();
6729+ kernel_fpu_begin();
6730+ }
6731+}
6732+EXPORT_SYMBOL_GPL(kernel_fpu_resched);
6733+
6734 /*
6735 * Save the FPU state (mark it for reload if necessary):
6736 *
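kernel_fpu_resched() is the helper the crypto glue callbacks earlier in this patch lean on: called inside an open kernel_fpu_begin()/kernel_fpu_end() section, it checks should_resched() and, if a reschedule is due, closes the FPU section, runs cond_resched(), and reopens it. Typical use in a long SIMD loop looks like this (process_chunk() and CHUNK are illustrative):

        kernel_fpu_begin();
        while (nbytes >= CHUNK) {
                kernel_fpu_resched();           /* give up the CPU if needed */
                process_chunk(dst, src);
                dst += CHUNK;
                src += CHUNK;
                nbytes -= CHUNK;
        }
        kernel_fpu_end();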
6737diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
6738index 95600a99ae93..9192d76085ba 100644
6739--- a/arch/x86/kernel/irq_32.c
6740+++ b/arch/x86/kernel/irq_32.c
6741@@ -130,6 +130,7 @@ void irq_ctx_init(int cpu)
6742 cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
6743 }
6744
6745+#ifndef CONFIG_PREEMPT_RT_FULL
6746 void do_softirq_own_stack(void)
6747 {
6748 struct irq_stack *irqstk;
6749@@ -146,6 +147,7 @@ void do_softirq_own_stack(void)
6750
6751 call_on_stack(__do_softirq, isp);
6752 }
6753+#endif
6754
6755 bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
6756 {
6757diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
6758index 5224c6099184..9b2b1f0409c5 100644
6759--- a/arch/x86/kernel/process_32.c
6760+++ b/arch/x86/kernel/process_32.c
6761@@ -38,6 +38,7 @@
6762 #include <linux/io.h>
6763 #include <linux/kdebug.h>
6764 #include <linux/syscalls.h>
6765+#include <linux/highmem.h>
6766
6767 #include <asm/pgtable.h>
6768 #include <asm/ldt.h>
6769@@ -198,6 +199,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
6770 }
6771 EXPORT_SYMBOL_GPL(start_thread);
6772
6773+#ifdef CONFIG_PREEMPT_RT_FULL
6774+static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
6775+{
6776+ int i;
6777+
6778+ /*
6779+ * Clear @prev's kmap_atomic mappings
6780+ */
6781+ for (i = 0; i < prev_p->kmap_idx; i++) {
6782+ int idx = i + KM_TYPE_NR * smp_processor_id();
6783+ pte_t *ptep = kmap_pte - idx;
6784+
6785+ kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
6786+ }
6787+ /*
6788+ * Restore @next_p's kmap_atomic mappings
6789+ */
6790+ for (i = 0; i < next_p->kmap_idx; i++) {
6791+ int idx = i + KM_TYPE_NR * smp_processor_id();
6792+
6793+ if (!pte_none(next_p->kmap_pte[i]))
6794+ set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
6795+ }
6796+}
6797+#else
6798+static inline void
6799+switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
6800+#endif
6801+
6802
6803 /*
6804 * switch_to(x,y) should switch tasks from x to y.
6805@@ -273,6 +303,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
6806 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
6807 __switch_to_xtra(prev_p, next_p, tss);
6808
6809+ switch_kmaps(prev_p, next_p);
6810+
6811 /*
6812 * Leave lazy mode, flushing any hypercalls made here.
6813 * This must be done before restoring TLS segments so
6814diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
6815index 13dfb55b84db..dd66f629d1d0 100644
6816--- a/arch/x86/kvm/lapic.c
6817+++ b/arch/x86/kvm/lapic.c
6818@@ -2136,7 +2136,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
6819 apic->vcpu = vcpu;
6820
6821 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
6822- HRTIMER_MODE_ABS_PINNED);
6823+ HRTIMER_MODE_ABS_PINNED_HARD);
6824 apic->lapic_timer.timer.function = apic_timer_fn;
6825
6826 /*
6827diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
6828index 3856828ee1dc..407658146ae1 100644
6829--- a/arch/x86/kvm/x86.c
6830+++ b/arch/x86/kvm/x86.c
6831@@ -6287,6 +6287,13 @@ int kvm_arch_init(void *opaque)
6832 goto out;
6833 }
6834
6835+#ifdef CONFIG_PREEMPT_RT_FULL
6836+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
6837+ printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
6838+ return -EOPNOTSUPP;
6839+ }
6840+#endif
6841+
6842 r = kvm_mmu_module_init();
6843 if (r)
6844 goto out_free_percpu;
6845diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
6846index 6d18b70ed5a9..f752724c22e8 100644
6847--- a/arch/x86/mm/highmem_32.c
6848+++ b/arch/x86/mm/highmem_32.c
6849@@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
6850 */
6851 void *kmap_atomic_prot(struct page *page, pgprot_t prot)
6852 {
6853+ pte_t pte = mk_pte(page, prot);
6854 unsigned long vaddr;
6855 int idx, type;
6856
6857- preempt_disable();
6858+ preempt_disable_nort();
6859 pagefault_disable();
6860
6861 if (!PageHighMem(page))
6862@@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
6863 idx = type + KM_TYPE_NR*smp_processor_id();
6864 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
6865 BUG_ON(!pte_none(*(kmap_pte-idx)));
6866- set_pte(kmap_pte-idx, mk_pte(page, prot));
6867+#ifdef CONFIG_PREEMPT_RT_FULL
6868+ current->kmap_pte[type] = pte;
6869+#endif
6870+ set_pte(kmap_pte-idx, pte);
6871 arch_flush_lazy_mmu_mode();
6872
6873 return (void *)vaddr;
6874@@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
6875 * is a bad idea also, in case the page changes cacheability
6876 * attributes or becomes a protected page in a hypervisor.
6877 */
6878+#ifdef CONFIG_PREEMPT_RT_FULL
6879+ current->kmap_pte[type] = __pte(0);
6880+#endif
6881 kpte_clear_flush(kmap_pte-idx, vaddr);
6882 kmap_atomic_idx_pop();
6883 arch_flush_lazy_mmu_mode();
6884@@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
6885 #endif
6886
6887 pagefault_enable();
6888- preempt_enable();
6889+ preempt_enable_nort();
6890 }
6891 EXPORT_SYMBOL(__kunmap_atomic);
6892
6893diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
6894index ada98b39b8ad..585f6829653b 100644
6895--- a/arch/x86/mm/iomap_32.c
6896+++ b/arch/x86/mm/iomap_32.c
6897@@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
6898
6899 void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
6900 {
6901+ pte_t pte = pfn_pte(pfn, prot);
6902 unsigned long vaddr;
6903 int idx, type;
6904
6905@@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
6906 type = kmap_atomic_idx_push();
6907 idx = type + KM_TYPE_NR * smp_processor_id();
6908 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
6909- set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
6910+ WARN_ON(!pte_none(*(kmap_pte - idx)));
6911+
6912+#ifdef CONFIG_PREEMPT_RT_FULL
6913+ current->kmap_pte[type] = pte;
6914+#endif
6915+ set_pte(kmap_pte - idx, pte);
6916 arch_flush_lazy_mmu_mode();
6917
6918 return (void *)vaddr;
6919@@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
6920 * is a bad idea also, in case the page changes cacheability
6921 * attributes or becomes a protected page in a hypervisor.
6922 */
6923+#ifdef CONFIG_PREEMPT_RT_FULL
6924+ current->kmap_pte[type] = __pte(0);
6925+#endif
6926 kpte_clear_flush(kmap_pte-idx, vaddr);
6927 kmap_atomic_idx_pop();
6928 }
6929diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h
6930index bb1fe6c1816e..8a22f1e7b6c9 100644
6931--- a/arch/xtensa/include/asm/spinlock_types.h
6932+++ b/arch/xtensa/include/asm/spinlock_types.h
6933@@ -2,10 +2,6 @@
6934 #ifndef __ASM_SPINLOCK_TYPES_H
6935 #define __ASM_SPINLOCK_TYPES_H
6936
6937-#ifndef __LINUX_SPINLOCK_TYPES_H
6938-# error "please don't include this file directly"
6939-#endif
6940-
6941 typedef struct {
6942 volatile unsigned int slock;
6943 } arch_spinlock_t;
6944diff --git a/block/blk-core.c b/block/blk-core.c
6945index 6aa2bc4e9652..f005077ae291 100644
6946--- a/block/blk-core.c
6947+++ b/block/blk-core.c
6948@@ -116,6 +116,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
6949
6950 INIT_LIST_HEAD(&rq->queuelist);
6951 INIT_LIST_HEAD(&rq->timeout_list);
6952+#ifdef CONFIG_PREEMPT_RT_FULL
6953+ INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
6954+#endif
6955 rq->cpu = -1;
6956 rq->q = q;
6957 rq->__sector = (sector_t) -1;
6958@@ -280,7 +283,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
6959 void blk_start_queue(struct request_queue *q)
6960 {
6961 lockdep_assert_held(q->queue_lock);
6962- WARN_ON(!in_interrupt() && !irqs_disabled());
6963+ WARN_ON_NONRT(!in_interrupt() && !irqs_disabled());
6964 WARN_ON_ONCE(q->mq_ops);
6965
6966 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
6967@@ -812,12 +815,21 @@ void blk_queue_exit(struct request_queue *q)
6968 percpu_ref_put(&q->q_usage_counter);
6969 }
6970
6971+static void blk_queue_usage_counter_release_swork(struct swork_event *sev)
6972+{
6973+ struct request_queue *q =
6974+ container_of(sev, struct request_queue, mq_pcpu_wake);
6975+
6976+ wake_up_all(&q->mq_freeze_wq);
6977+}
6978+
6979 static void blk_queue_usage_counter_release(struct percpu_ref *ref)
6980 {
6981 struct request_queue *q =
6982 container_of(ref, struct request_queue, q_usage_counter);
6983
6984- wake_up_all(&q->mq_freeze_wq);
6985+ if (wq_has_sleeper(&q->mq_freeze_wq))
6986+ swork_queue(&q->mq_pcpu_wake);
6987 }
6988
6989 static void blk_rq_timed_out_timer(unsigned long data)
6990@@ -894,6 +906,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
6991 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
6992
6993 init_waitqueue_head(&q->mq_freeze_wq);
6994+ INIT_SWORK(&q->mq_pcpu_wake, blk_queue_usage_counter_release_swork);
6995
6996 /*
6997 * Init percpu_ref in atomic mode so that it's faster to shutdown.
6998@@ -3313,7 +3326,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
6999 blk_run_queue_async(q);
7000 else
7001 __blk_run_queue(q);
7002- spin_unlock(q->queue_lock);
7003+ spin_unlock_irq(q->queue_lock);
7004 }
7005
7006 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
7007@@ -3361,7 +3374,6 @@ EXPORT_SYMBOL(blk_check_plugged);
7008 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7009 {
7010 struct request_queue *q;
7011- unsigned long flags;
7012 struct request *rq;
7013 LIST_HEAD(list);
7014 unsigned int depth;
7015@@ -3381,11 +3393,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7016 q = NULL;
7017 depth = 0;
7018
7019- /*
7020- * Save and disable interrupts here, to avoid doing it for every
7021- * queue lock we have to take.
7022- */
7023- local_irq_save(flags);
7024 while (!list_empty(&list)) {
7025 rq = list_entry_rq(list.next);
7026 list_del_init(&rq->queuelist);
7027@@ -3398,7 +3405,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7028 queue_unplugged(q, depth, from_schedule);
7029 q = rq->q;
7030 depth = 0;
7031- spin_lock(q->queue_lock);
7032+ spin_lock_irq(q->queue_lock);
7033 }
7034
7035 /*
7036@@ -3425,8 +3432,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
7037 */
7038 if (q)
7039 queue_unplugged(q, depth, from_schedule);
7040-
7041- local_irq_restore(flags);
7042 }
7043
7044 void blk_finish_plug(struct blk_plug *plug)
7045@@ -3638,6 +3643,8 @@ int __init blk_dev_init(void)
7046 if (!kblockd_workqueue)
7047 panic("Failed to create kblockd\n");
7048
7049+ BUG_ON(swork_get());
7050+
7051 request_cachep = kmem_cache_create("blkdev_requests",
7052 sizeof(struct request), 0, SLAB_PANIC, NULL);
7053
7054diff --git a/block/blk-ioc.c b/block/blk-ioc.c
7055index f23311e4b201..ca9ea624f159 100644
7056--- a/block/blk-ioc.c
7057+++ b/block/blk-ioc.c
7058@@ -9,6 +9,7 @@
7059 #include <linux/blkdev.h>
7060 #include <linux/slab.h>
7061 #include <linux/sched/task.h>
7062+#include <linux/delay.h>
7063
7064 #include "blk.h"
7065
7066@@ -118,7 +119,7 @@ static void ioc_release_fn(struct work_struct *work)
7067 spin_unlock(q->queue_lock);
7068 } else {
7069 spin_unlock_irqrestore(&ioc->lock, flags);
7070- cpu_relax();
7071+ cpu_chill();
7072 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
7073 }
7074 }
7075@@ -202,7 +203,7 @@ void put_io_context_active(struct io_context *ioc)
7076 spin_unlock(icq->q->queue_lock);
7077 } else {
7078 spin_unlock_irqrestore(&ioc->lock, flags);
7079- cpu_relax();
7080+ cpu_chill();
7081 goto retry;
7082 }
7083 }
7084diff --git a/block/blk-mq.c b/block/blk-mq.c
7085index eac444804736..a6314b82273e 100644
7086--- a/block/blk-mq.c
7087+++ b/block/blk-mq.c
7088@@ -339,6 +339,9 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
7089 /* tag was already set */
7090 rq->extra_len = 0;
7091
7092+#ifdef CONFIG_PREEMPT_RT_FULL
7093+ INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
7094+#endif
7095 INIT_LIST_HEAD(&rq->timeout_list);
7096 rq->timeout = 0;
7097
7098@@ -533,12 +536,24 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
7099 }
7100 EXPORT_SYMBOL(blk_mq_end_request);
7101
7102+#ifdef CONFIG_PREEMPT_RT_FULL
7103+
7104+void __blk_mq_complete_request_remote_work(struct work_struct *work)
7105+{
7106+ struct request *rq = container_of(work, struct request, work);
7107+
7108+ rq->q->softirq_done_fn(rq);
7109+}
7110+
7111+#else
7112+
7113 static void __blk_mq_complete_request_remote(void *data)
7114 {
7115 struct request *rq = data;
7116
7117 rq->q->softirq_done_fn(rq);
7118 }
7119+#endif
7120
7121 static void __blk_mq_complete_request(struct request *rq)
7122 {
7123@@ -558,19 +573,27 @@ static void __blk_mq_complete_request(struct request *rq)
7124 return;
7125 }
7126
7127- cpu = get_cpu();
7128+ cpu = get_cpu_light();
7129 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
7130 shared = cpus_share_cache(cpu, ctx->cpu);
7131
7132 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
7133+#ifdef CONFIG_PREEMPT_RT_FULL
7134+ /*
7135+ * We could force QUEUE_FLAG_SAME_FORCE then we would not get in
7136+ * here. But we could try to invoke it one the CPU like this.
7137+ */
7138+ schedule_work_on(ctx->cpu, &rq->work);
7139+#else
7140 rq->csd.func = __blk_mq_complete_request_remote;
7141 rq->csd.info = rq;
7142 rq->csd.flags = 0;
7143 smp_call_function_single_async(ctx->cpu, &rq->csd);
7144+#endif
7145 } else {
7146 rq->q->softirq_done_fn(rq);
7147 }
7148- put_cpu();
7149+ put_cpu_light();
7150 }
7151
7152 /**
7153@@ -1238,14 +1261,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
7154 return;
7155
7156 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
7157- int cpu = get_cpu();
7158+ int cpu = get_cpu_light();
7159 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
7160 __blk_mq_run_hw_queue(hctx);
7161- put_cpu();
7162+ put_cpu_light();
7163 return;
7164 }
7165
7166- put_cpu();
7167+ put_cpu_light();
7168 }
7169
7170 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
7171@@ -2863,10 +2886,9 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
7172 kt = nsecs;
7173
7174 mode = HRTIMER_MODE_REL;
7175- hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
7176+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current);
7177 hrtimer_set_expires(&hs.timer, kt);
7178
7179- hrtimer_init_sleeper(&hs, current);
7180 do {
7181 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
7182 break;
7183diff --git a/block/blk-mq.h b/block/blk-mq.h
7184index 877237e09083..d944750bade0 100644
7185--- a/block/blk-mq.h
7186+++ b/block/blk-mq.h
7187@@ -98,12 +98,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
7188 */
7189 static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
7190 {
7191- return __blk_mq_get_ctx(q, get_cpu());
7192+ return __blk_mq_get_ctx(q, get_cpu_light());
7193 }
7194
7195 static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
7196 {
7197- put_cpu();
7198+ put_cpu_light();
7199 }
7200
7201 struct blk_mq_alloc_data {
7202diff --git a/block/blk-softirq.c b/block/blk-softirq.c
7203index 01e2b353a2b9..e8c0d4945f5a 100644
7204--- a/block/blk-softirq.c
7205+++ b/block/blk-softirq.c
7206@@ -53,6 +53,7 @@ static void trigger_softirq(void *data)
7207 raise_softirq_irqoff(BLOCK_SOFTIRQ);
7208
7209 local_irq_restore(flags);
7210+ preempt_check_resched_rt();
7211 }
7212
7213 /*
7214@@ -91,6 +92,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
7215 this_cpu_ptr(&blk_cpu_done));
7216 raise_softirq_irqoff(BLOCK_SOFTIRQ);
7217 local_irq_enable();
7218+ preempt_check_resched_rt();
7219
7220 return 0;
7221 }
7222@@ -143,6 +145,7 @@ void __blk_complete_request(struct request *req)
7223 goto do_local;
7224
7225 local_irq_restore(flags);
7226+ preempt_check_resched_rt();
7227 }
7228
7229 /**
7230diff --git a/block/bounce.c b/block/bounce.c
7231index 1d05c422c932..0101ffefddc4 100644
7232--- a/block/bounce.c
7233+++ b/block/bounce.c
7234@@ -66,11 +66,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
7235 unsigned long flags;
7236 unsigned char *vto;
7237
7238- local_irq_save(flags);
7239+ local_irq_save_nort(flags);
7240 vto = kmap_atomic(to->bv_page);
7241 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
7242 kunmap_atomic(vto);
7243- local_irq_restore(flags);
7244+ local_irq_restore_nort(flags);
7245 }
7246
7247 #else /* CONFIG_HIGHMEM */
7248diff --git a/crypto/algapi.c b/crypto/algapi.c
7249index 50eb828db767..7bce92a6599a 100644
7250--- a/crypto/algapi.c
7251+++ b/crypto/algapi.c
7252@@ -731,13 +731,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
7253
7254 int crypto_register_notifier(struct notifier_block *nb)
7255 {
7256- return blocking_notifier_chain_register(&crypto_chain, nb);
7257+ return srcu_notifier_chain_register(&crypto_chain, nb);
7258 }
7259 EXPORT_SYMBOL_GPL(crypto_register_notifier);
7260
7261 int crypto_unregister_notifier(struct notifier_block *nb)
7262 {
7263- return blocking_notifier_chain_unregister(&crypto_chain, nb);
7264+ return srcu_notifier_chain_unregister(&crypto_chain, nb);
7265 }
7266 EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
7267
7268diff --git a/crypto/api.c b/crypto/api.c
7269index e485aed11ad0..089e648d2fa9 100644
7270--- a/crypto/api.c
7271+++ b/crypto/api.c
7272@@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
7273 DECLARE_RWSEM(crypto_alg_sem);
7274 EXPORT_SYMBOL_GPL(crypto_alg_sem);
7275
7276-BLOCKING_NOTIFIER_HEAD(crypto_chain);
7277+SRCU_NOTIFIER_HEAD(crypto_chain);
7278 EXPORT_SYMBOL_GPL(crypto_chain);
7279
7280 static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
7281@@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
7282 {
7283 int ok;
7284
7285- ok = blocking_notifier_call_chain(&crypto_chain, val, v);
7286+ ok = srcu_notifier_call_chain(&crypto_chain, val, v);
7287 if (ok == NOTIFY_DONE) {
7288 request_module("cryptomgr");
7289- ok = blocking_notifier_call_chain(&crypto_chain, val, v);
7290+ ok = srcu_notifier_call_chain(&crypto_chain, val, v);
7291 }
7292
7293 return ok;
7294diff --git a/crypto/cryptd.c b/crypto/cryptd.c
7295index 248f6ba41688..54b7985c8caa 100644
7296--- a/crypto/cryptd.c
7297+++ b/crypto/cryptd.c
7298@@ -37,6 +37,7 @@
7299 struct cryptd_cpu_queue {
7300 struct crypto_queue queue;
7301 struct work_struct work;
7302+ spinlock_t qlock;
7303 };
7304
7305 struct cryptd_queue {
7306@@ -115,6 +116,7 @@ static int cryptd_init_queue(struct cryptd_queue *queue,
7307 cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu);
7308 crypto_init_queue(&cpu_queue->queue, max_cpu_qlen);
7309 INIT_WORK(&cpu_queue->work, cryptd_queue_worker);
7310+ spin_lock_init(&cpu_queue->qlock);
7311 }
7312 return 0;
7313 }
7314@@ -139,8 +141,10 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue,
7315 atomic_t *refcnt;
7316 bool may_backlog;
7317
7318- cpu = get_cpu();
7319- cpu_queue = this_cpu_ptr(queue->cpu_queue);
7320+ cpu_queue = raw_cpu_ptr(queue->cpu_queue);
7321+ spin_lock_bh(&cpu_queue->qlock);
7322+ cpu = smp_processor_id();
7323+
7324 err = crypto_enqueue_request(&cpu_queue->queue, request);
7325
7326 refcnt = crypto_tfm_ctx(request->tfm);
7327@@ -157,7 +161,7 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue,
7328 atomic_inc(refcnt);
7329
7330 out_put_cpu:
7331- put_cpu();
7332+ spin_unlock_bh(&cpu_queue->qlock);
7333
7334 return err;
7335 }
7336@@ -173,16 +177,11 @@ static void cryptd_queue_worker(struct work_struct *work)
7337 cpu_queue = container_of(work, struct cryptd_cpu_queue, work);
7338 /*
7339 * Only handle one request at a time to avoid hogging crypto workqueue.
7340- * preempt_disable/enable is used to prevent being preempted by
7341- * cryptd_enqueue_request(). local_bh_disable/enable is used to prevent
7342- * cryptd_enqueue_request() being accessed from software interrupts.
7343 */
7344- local_bh_disable();
7345- preempt_disable();
7346+ spin_lock_bh(&cpu_queue->qlock);
7347 backlog = crypto_get_backlog(&cpu_queue->queue);
7348 req = crypto_dequeue_request(&cpu_queue->queue);
7349- preempt_enable();
7350- local_bh_enable();
7351+ spin_unlock_bh(&cpu_queue->qlock);
7352
7353 if (!req)
7354 return;
7355diff --git a/crypto/internal.h b/crypto/internal.h
7356index f07320423191..333d985088fe 100644
7357--- a/crypto/internal.h
7358+++ b/crypto/internal.h
7359@@ -47,7 +47,7 @@ struct crypto_larval {
7360
7361 extern struct list_head crypto_alg_list;
7362 extern struct rw_semaphore crypto_alg_sem;
7363-extern struct blocking_notifier_head crypto_chain;
7364+extern struct srcu_notifier_head crypto_chain;
7365
7366 #ifdef CONFIG_PROC_FS
7367 void __init crypto_init_proc(void);
7368@@ -143,7 +143,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
7369
7370 static inline void crypto_notify(unsigned long val, void *v)
7371 {
7372- blocking_notifier_call_chain(&crypto_chain, val, v);
7373+ srcu_notifier_call_chain(&crypto_chain, val, v);
7374 }
7375
7376 #endif /* _CRYPTO_INTERNAL_H */
7377diff --git a/crypto/scompress.c b/crypto/scompress.c
7378index 2075e2c4e7df..c6b4e265c6bf 100644
7379--- a/crypto/scompress.c
7380+++ b/crypto/scompress.c
7381@@ -24,6 +24,7 @@
7382 #include <linux/cryptouser.h>
7383 #include <net/netlink.h>
7384 #include <linux/scatterlist.h>
7385+#include <linux/locallock.h>
7386 #include <crypto/scatterwalk.h>
7387 #include <crypto/internal/acompress.h>
7388 #include <crypto/internal/scompress.h>
7389@@ -34,6 +35,7 @@ static void * __percpu *scomp_src_scratches;
7390 static void * __percpu *scomp_dst_scratches;
7391 static int scomp_scratch_users;
7392 static DEFINE_MUTEX(scomp_lock);
7393+static DEFINE_LOCAL_IRQ_LOCK(scomp_scratches_lock);
7394
7395 #ifdef CONFIG_NET
7396 static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg)
7397@@ -193,7 +195,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
7398 void **tfm_ctx = acomp_tfm_ctx(tfm);
7399 struct crypto_scomp *scomp = *tfm_ctx;
7400 void **ctx = acomp_request_ctx(req);
7401- const int cpu = get_cpu();
7402+ const int cpu = local_lock_cpu(scomp_scratches_lock);
7403 u8 *scratch_src = *per_cpu_ptr(scomp_src_scratches, cpu);
7404 u8 *scratch_dst = *per_cpu_ptr(scomp_dst_scratches, cpu);
7405 int ret;
7406@@ -228,7 +230,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
7407 1);
7408 }
7409 out:
7410- put_cpu();
7411+ local_unlock_cpu(scomp_scratches_lock);
7412 return ret;
7413 }
7414
7415diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
7416index 95eed442703f..50bc5b61d899 100644
7417--- a/drivers/acpi/acpica/acglobal.h
7418+++ b/drivers/acpi/acpica/acglobal.h
7419@@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
7420 * interrupt level
7421 */
7422 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
7423-ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
7424+ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
7425 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
7426
7427 /* Mutex for _OSI support */
7428diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
7429index acb417b58bbb..ea49e08c263f 100644
7430--- a/drivers/acpi/acpica/hwregs.c
7431+++ b/drivers/acpi/acpica/hwregs.c
7432@@ -428,14 +428,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
7433 ACPI_BITMASK_ALL_FIXED_STATUS,
7434 ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
7435
7436- lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7437+ raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7438
7439 /* Clear the fixed events in PM1 A/B */
7440
7441 status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
7442 ACPI_BITMASK_ALL_FIXED_STATUS);
7443
7444- acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7445+ raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7446
7447 if (ACPI_FAILURE(status)) {
7448 goto exit;
7449diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
7450index 34684ae89981..fb84983e1839 100644
7451--- a/drivers/acpi/acpica/hwxface.c
7452+++ b/drivers/acpi/acpica/hwxface.c
7453@@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
7454 return_ACPI_STATUS(AE_BAD_PARAMETER);
7455 }
7456
7457- lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
7458+ raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
7459
7460 /*
7461 * At this point, we know that the parent register is one of the
7462@@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
7463
7464 unlock_and_exit:
7465
7466- acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
7467+ raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
7468 return_ACPI_STATUS(status);
7469 }
7470
7471diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
7472index 586354788018..3a3c2a86437f 100644
7473--- a/drivers/acpi/acpica/utmutex.c
7474+++ b/drivers/acpi/acpica/utmutex.c
7475@@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
7476 return_ACPI_STATUS (status);
7477 }
7478
7479- status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
7480+ status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
7481 if (ACPI_FAILURE (status)) {
7482 return_ACPI_STATUS (status);
7483 }
7484@@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void)
7485 /* Delete the spinlocks */
7486
7487 acpi_os_delete_lock(acpi_gbl_gpe_lock);
7488- acpi_os_delete_lock(acpi_gbl_hardware_lock);
7489+ acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
7490 acpi_os_delete_lock(acpi_gbl_reference_count_lock);
7491
7492 /* Delete the reader/writer lock */
7493diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
7494index cc2f2e35f4c2..0f0bc86e02df 100644
7495--- a/drivers/ata/libata-sff.c
7496+++ b/drivers/ata/libata-sff.c
7497@@ -679,9 +679,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_queued_cmd *qc, unsigned char *b
7498 unsigned long flags;
7499 unsigned int consumed;
7500
7501- local_irq_save(flags);
7502+ local_irq_save_nort(flags);
7503 consumed = ata_sff_data_xfer32(qc, buf, buflen, rw);
7504- local_irq_restore(flags);
7505+ local_irq_restore_nort(flags);
7506
7507 return consumed;
7508 }
7509diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
7510index cdd6f256da59..2269d379c92f 100644
7511--- a/drivers/base/power/wakeup.c
7512+++ b/drivers/base/power/wakeup.c
7513@@ -52,7 +52,7 @@ static void split_counters(unsigned int *cnt, unsigned int *inpr)
7514 /* A preserved old value of the events counter. */
7515 static unsigned int saved_count;
7516
7517-static DEFINE_SPINLOCK(events_lock);
7518+static DEFINE_RAW_SPINLOCK(events_lock);
7519
7520 static void pm_wakeup_timer_fn(unsigned long data);
7521
7522@@ -180,9 +180,9 @@ void wakeup_source_add(struct wakeup_source *ws)
7523 ws->active = false;
7524 ws->last_time = ktime_get();
7525
7526- spin_lock_irqsave(&events_lock, flags);
7527+ raw_spin_lock_irqsave(&events_lock, flags);
7528 list_add_rcu(&ws->entry, &wakeup_sources);
7529- spin_unlock_irqrestore(&events_lock, flags);
7530+ raw_spin_unlock_irqrestore(&events_lock, flags);
7531 }
7532 EXPORT_SYMBOL_GPL(wakeup_source_add);
7533
7534@@ -197,9 +197,9 @@ void wakeup_source_remove(struct wakeup_source *ws)
7535 if (WARN_ON(!ws))
7536 return;
7537
7538- spin_lock_irqsave(&events_lock, flags);
7539+ raw_spin_lock_irqsave(&events_lock, flags);
7540 list_del_rcu(&ws->entry);
7541- spin_unlock_irqrestore(&events_lock, flags);
7542+ raw_spin_unlock_irqrestore(&events_lock, flags);
7543 synchronize_srcu(&wakeup_srcu);
7544 }
7545 EXPORT_SYMBOL_GPL(wakeup_source_remove);
7546@@ -844,7 +844,7 @@ bool pm_wakeup_pending(void)
7547 unsigned long flags;
7548 bool ret = false;
7549
7550- spin_lock_irqsave(&events_lock, flags);
7551+ raw_spin_lock_irqsave(&events_lock, flags);
7552 if (events_check_enabled) {
7553 unsigned int cnt, inpr;
7554
7555@@ -852,7 +852,7 @@ bool pm_wakeup_pending(void)
7556 ret = (cnt != saved_count || inpr > 0);
7557 events_check_enabled = !ret;
7558 }
7559- spin_unlock_irqrestore(&events_lock, flags);
7560+ raw_spin_unlock_irqrestore(&events_lock, flags);
7561
7562 if (ret) {
7563 pr_info("PM: Wakeup pending, aborting suspend\n");
7564@@ -941,13 +941,13 @@ bool pm_save_wakeup_count(unsigned int count)
7565 unsigned long flags;
7566
7567 events_check_enabled = false;
7568- spin_lock_irqsave(&events_lock, flags);
7569+ raw_spin_lock_irqsave(&events_lock, flags);
7570 split_counters(&cnt, &inpr);
7571 if (cnt == count && inpr == 0) {
7572 saved_count = count;
7573 events_check_enabled = true;
7574 }
7575- spin_unlock_irqrestore(&events_lock, flags);
7576+ raw_spin_unlock_irqrestore(&events_lock, flags);
7577 return events_check_enabled;
7578 }
7579
7580diff --git a/drivers/block/brd.c b/drivers/block/brd.c
7581index 2d7178f7754e..c1cf87718c2e 100644
7582--- a/drivers/block/brd.c
7583+++ b/drivers/block/brd.c
7584@@ -60,7 +60,6 @@ struct brd_device {
7585 /*
7586 * Look up and return a brd's page for a given sector.
7587 */
7588-static DEFINE_MUTEX(brd_mutex);
7589 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
7590 {
7591 pgoff_t idx;
7592diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
7593index 5b8992beffec..40345483a022 100644
7594--- a/drivers/block/zram/zcomp.c
7595+++ b/drivers/block/zram/zcomp.c
7596@@ -116,12 +116,20 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
7597
7598 struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
7599 {
7600- return *get_cpu_ptr(comp->stream);
7601+ struct zcomp_strm *zstrm;
7602+
7603+	zstrm = *get_local_ptr(comp->stream);
7604+ spin_lock(&zstrm->zcomp_lock);
7605+ return zstrm;
7606 }
7607
7608 void zcomp_stream_put(struct zcomp *comp)
7609 {
7610- put_cpu_ptr(comp->stream);
7611+ struct zcomp_strm *zstrm;
7612+
7613+ zstrm = *this_cpu_ptr(comp->stream);
7614+ spin_unlock(&zstrm->zcomp_lock);
7615+	put_local_ptr(zstrm);
7616 }
7617
7618 int zcomp_compress(struct zcomp_strm *zstrm,
7619@@ -171,6 +179,7 @@ int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7620 pr_err("Can't allocate a compression stream\n");
7621 return -ENOMEM;
7622 }
7623+ spin_lock_init(&zstrm->zcomp_lock);
7624 *per_cpu_ptr(comp->stream, cpu) = zstrm;
7625 return 0;
7626 }
7627diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
7628index 41c1002a7d7d..d424eafcbf8e 100644
7629--- a/drivers/block/zram/zcomp.h
7630+++ b/drivers/block/zram/zcomp.h
7631@@ -14,6 +14,7 @@ struct zcomp_strm {
7632 /* compression/decompression buffer */
7633 void *buffer;
7634 struct crypto_comp *tfm;
7635+ spinlock_t zcomp_lock;
7636 };
7637
7638 /* dynamic per-device compression frontend */
7639diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
7640index 1e2648e4c286..c5d61209eb05 100644
7641--- a/drivers/block/zram/zram_drv.c
7642+++ b/drivers/block/zram/zram_drv.c
7643@@ -761,6 +761,30 @@ static DEVICE_ATTR_RO(io_stat);
7644 static DEVICE_ATTR_RO(mm_stat);
7645 static DEVICE_ATTR_RO(debug_stat);
7646
7647+#ifdef CONFIG_PREEMPT_RT_BASE
7648+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages)
7649+{
7650+ size_t index;
7651+
7652+ for (index = 0; index < num_pages; index++)
7653+ spin_lock_init(&zram->table[index].lock);
7654+}
7655+
7656+static void zram_slot_lock(struct zram *zram, u32 index)
7657+{
7658+ spin_lock(&zram->table[index].lock);
7659+ __set_bit(ZRAM_ACCESS, &zram->table[index].value);
7660+}
7661+
7662+static void zram_slot_unlock(struct zram *zram, u32 index)
7663+{
7664+ __clear_bit(ZRAM_ACCESS, &zram->table[index].value);
7665+ spin_unlock(&zram->table[index].lock);
7666+}
7667+
7668+#else
7669+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }
7670+
7671 static void zram_slot_lock(struct zram *zram, u32 index)
7672 {
7673 bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
7674@@ -770,6 +794,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index)
7675 {
7676 bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
7677 }
7678+#endif
7679
7680 static void zram_meta_free(struct zram *zram, u64 disksize)
7681 {
7682@@ -799,6 +824,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
7683 return false;
7684 }
7685
7686+ zram_meta_init_table_locks(zram, num_pages);
7687 return true;
7688 }
7689
7690@@ -850,6 +876,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
7691 unsigned long handle;
7692 unsigned int size;
7693 void *src, *dst;
7694+ struct zcomp_strm *zstrm;
7695
7696 if (zram_wb_enabled(zram)) {
7697 zram_slot_lock(zram, index);
7698@@ -884,6 +911,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
7699
7700 size = zram_get_obj_size(zram, index);
7701
7702+ zstrm = zcomp_stream_get(zram->comp);
7703 src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
7704 if (size == PAGE_SIZE) {
7705 dst = kmap_atomic(page);
7706@@ -891,14 +919,13 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
7707 kunmap_atomic(dst);
7708 ret = 0;
7709 } else {
7710- struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
7711
7712 dst = kmap_atomic(page);
7713 ret = zcomp_decompress(zstrm, src, size, dst);
7714 kunmap_atomic(dst);
7715- zcomp_stream_put(zram->comp);
7716 }
7717 zs_unmap_object(zram->mem_pool, handle);
7718+ zcomp_stream_put(zram->comp);
7719 zram_slot_unlock(zram, index);
7720
7721 /* Should NEVER happen. Return bio error if it does. */
7722diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
7723index 31762db861e3..a417c96b8f3f 100644
7724--- a/drivers/block/zram/zram_drv.h
7725+++ b/drivers/block/zram/zram_drv.h
7726@@ -77,6 +77,9 @@ struct zram_table_entry {
7727 unsigned long element;
7728 };
7729 unsigned long value;
7730+#ifdef CONFIG_PREEMPT_RT_BASE
7731+ spinlock_t lock;
7732+#endif
7733 };
7734
7735 struct zram_stats {
7736diff --git a/drivers/char/random.c b/drivers/char/random.c
7737index ea4dbfa30657..c72a7f0b4494 100644
7738--- a/drivers/char/random.c
7739+++ b/drivers/char/random.c
7740@@ -265,6 +265,7 @@
7741 #include <linux/syscalls.h>
7742 #include <linux/completion.h>
7743 #include <linux/uuid.h>
7744+#include <linux/locallock.h>
7745 #include <crypto/chacha20.h>
7746
7747 #include <asm/processor.h>
7748@@ -856,7 +857,7 @@ static int crng_fast_load(const char *cp, size_t len)
7749 invalidate_batched_entropy();
7750 crng_init = 1;
7751 wake_up_interruptible(&crng_init_wait);
7752- pr_notice("random: fast init done\n");
7753+ /* pr_notice("random: fast init done\n"); */
7754 }
7755 return 1;
7756 }
7757@@ -941,17 +942,21 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
7758 crng_init = 2;
7759 process_random_ready_list();
7760 wake_up_interruptible(&crng_init_wait);
7761- pr_notice("random: crng init done\n");
7762+ /* pr_notice("random: crng init done\n"); */
7763 if (unseeded_warning.missed) {
7764+#if 0
7765 pr_notice("random: %d get_random_xx warning(s) missed "
7766 "due to ratelimiting\n",
7767 unseeded_warning.missed);
7768+#endif
7769 unseeded_warning.missed = 0;
7770 }
7771 if (urandom_warning.missed) {
7772+#if 0
7773 pr_notice("random: %d urandom warning(s) missed "
7774 "due to ratelimiting\n",
7775 urandom_warning.missed);
7776+#endif
7777 urandom_warning.missed = 0;
7778 }
7779 }
7780@@ -1122,8 +1127,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
7781 } sample;
7782 long delta, delta2, delta3;
7783
7784- preempt_disable();
7785-
7786 sample.jiffies = jiffies;
7787 sample.cycles = random_get_entropy();
7788 sample.num = num;
7789@@ -1164,7 +1167,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
7790 */
7791 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
7792 }
7793- preempt_enable();
7794 }
7795
7796 void add_input_randomness(unsigned int type, unsigned int code,
7797@@ -1221,28 +1223,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
7798 return *ptr;
7799 }
7800
7801-void add_interrupt_randomness(int irq, int irq_flags)
7802+void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
7803 {
7804 struct entropy_store *r;
7805 struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
7806- struct pt_regs *regs = get_irq_regs();
7807 unsigned long now = jiffies;
7808 cycles_t cycles = random_get_entropy();
7809 __u32 c_high, j_high;
7810- __u64 ip;
7811 unsigned long seed;
7812 int credit = 0;
7813
7814 if (cycles == 0)
7815- cycles = get_reg(fast_pool, regs);
7816+ cycles = get_reg(fast_pool, NULL);
7817 c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
7818 j_high = (sizeof(now) > 4) ? now >> 32 : 0;
7819 fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
7820 fast_pool->pool[1] ^= now ^ c_high;
7821- ip = regs ? instruction_pointer(regs) : _RET_IP_;
7822+ if (!ip)
7823+ ip = _RET_IP_;
7824 fast_pool->pool[2] ^= ip;
7825 fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
7826- get_reg(fast_pool, regs);
7827+ get_reg(fast_pool, NULL);
7828
7829 fast_mix(fast_pool);
7830 add_interrupt_bench(cycles);
7831@@ -2200,6 +2201,7 @@ static rwlock_t batched_entropy_reset_lock = __RW_LOCK_UNLOCKED(batched_entropy_
7832 * at any point prior.
7833 */
7834 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64);
7835+static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u64_lock);
7836 u64 get_random_u64(void)
7837 {
7838 u64 ret;
7839@@ -2220,7 +2222,7 @@ u64 get_random_u64(void)
7840 warn_unseeded_randomness(&previous);
7841
7842 use_lock = READ_ONCE(crng_init) < 2;
7843- batch = &get_cpu_var(batched_entropy_u64);
7844+ batch = &get_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7845 if (use_lock)
7846 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7847 if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) {
7848@@ -2230,12 +2232,13 @@ u64 get_random_u64(void)
7849 ret = batch->entropy_u64[batch->position++];
7850 if (use_lock)
7851 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7852- put_cpu_var(batched_entropy_u64);
7853+ put_locked_var(batched_entropy_u64_lock, batched_entropy_u64);
7854 return ret;
7855 }
7856 EXPORT_SYMBOL(get_random_u64);
7857
7858 static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32);
7859+static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u32_lock);
7860 u32 get_random_u32(void)
7861 {
7862 u32 ret;
7863@@ -2250,7 +2253,7 @@ u32 get_random_u32(void)
7864 warn_unseeded_randomness(&previous);
7865
7866 use_lock = READ_ONCE(crng_init) < 2;
7867- batch = &get_cpu_var(batched_entropy_u32);
7868+ batch = &get_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7869 if (use_lock)
7870 read_lock_irqsave(&batched_entropy_reset_lock, flags);
7871 if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) {
7872@@ -2260,7 +2263,7 @@ u32 get_random_u32(void)
7873 ret = batch->entropy_u32[batch->position++];
7874 if (use_lock)
7875 read_unlock_irqrestore(&batched_entropy_reset_lock, flags);
7876- put_cpu_var(batched_entropy_u32);
7877+ put_locked_var(batched_entropy_u32_lock, batched_entropy_u32);
7878 return ret;
7879 }
7880 EXPORT_SYMBOL(get_random_u32);
7881diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
7882index 50b59a69dc33..cbdb0a6c5337 100644
7883--- a/drivers/char/tpm/tpm_tis.c
7884+++ b/drivers/char/tpm/tpm_tis.c
7885@@ -52,6 +52,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da
7886 return container_of(data, struct tpm_tis_tcg_phy, priv);
7887 }
7888
7889+#ifdef CONFIG_PREEMPT_RT_FULL
7890+/*
7891+ * Flushes previous write operations to chip so that a subsequent
7892+ * ioread*()s won't stall a cpu.
7893+ */
7894+static inline void tpm_tis_flush(void __iomem *iobase)
7895+{
7896+ ioread8(iobase + TPM_ACCESS(0));
7897+}
7898+#else
7899+#define tpm_tis_flush(iobase) do { } while (0)
7900+#endif
7901+
7902+static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
7903+{
7904+ iowrite8(b, iobase + addr);
7905+ tpm_tis_flush(iobase);
7906+}
7907+
7908+static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
7909+{
7910+ iowrite32(b, iobase + addr);
7911+ tpm_tis_flush(iobase);
7912+}
7913+
7914 static bool interrupts = true;
7915 module_param(interrupts, bool, 0444);
7916 MODULE_PARM_DESC(interrupts, "Enable interrupts");
7917@@ -149,7 +174,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len,
7918 struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7919
7920 while (len--)
7921- iowrite8(*value++, phy->iobase + addr);
7922+ tpm_tis_iowrite8(*value++, phy->iobase, addr);
7923
7924 return 0;
7925 }
7926@@ -176,7 +201,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value)
7927 {
7928 struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
7929
7930- iowrite32(value, phy->iobase + addr);
7931+ tpm_tis_iowrite32(value, phy->iobase, addr);
7932
7933 return 0;
7934 }
7935diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
7936index 9de47d4d2d9e..05f4b88bb955 100644
7937--- a/drivers/clocksource/tcb_clksrc.c
7938+++ b/drivers/clocksource/tcb_clksrc.c
7939@@ -25,8 +25,7 @@
7940 * this 32 bit free-running counter. the second channel is not used.
7941 *
7942 * - The third channel may be used to provide a 16-bit clockevent
7943- * source, used in either periodic or oneshot mode. This runs
7944- * at 32 KiHZ, and can handle delays of up to two seconds.
7945+ * source, used in either periodic or oneshot mode.
7946 *
7947 * A boot clocksource and clockevent source are also currently needed,
7948 * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
7949@@ -126,6 +125,8 @@ static struct clocksource clksrc = {
7950 struct tc_clkevt_device {
7951 struct clock_event_device clkevt;
7952 struct clk *clk;
7953+ bool clk_enabled;
7954+ u32 freq;
7955 void __iomem *regs;
7956 };
7957
7958@@ -134,15 +135,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
7959 return container_of(clkevt, struct tc_clkevt_device, clkevt);
7960 }
7961
7962-/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
7963- * because using one of the divided clocks would usually mean the
7964- * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
7965- *
7966- * A divided clock could be good for high resolution timers, since
7967- * 30.5 usec resolution can seem "low".
7968- */
7969 static u32 timer_clock;
7970
7971+static void tc_clk_disable(struct clock_event_device *d)
7972+{
7973+ struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7974+
7975+ clk_disable(tcd->clk);
7976+ tcd->clk_enabled = false;
7977+}
7978+
7979+static void tc_clk_enable(struct clock_event_device *d)
7980+{
7981+ struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7982+
7983+ if (tcd->clk_enabled)
7984+ return;
7985+ clk_enable(tcd->clk);
7986+ tcd->clk_enabled = true;
7987+}
7988+
7989 static int tc_shutdown(struct clock_event_device *d)
7990 {
7991 struct tc_clkevt_device *tcd = to_tc_clkevt(d);
7992@@ -150,8 +162,14 @@ static int tc_shutdown(struct clock_event_device *d)
7993
7994 writel(0xff, regs + ATMEL_TC_REG(2, IDR));
7995 writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
7996+ return 0;
7997+}
7998+
7999+static int tc_shutdown_clk_off(struct clock_event_device *d)
8000+{
8001+ tc_shutdown(d);
8002 if (!clockevent_state_detached(d))
8003- clk_disable(tcd->clk);
8004+ tc_clk_disable(d);
8005
8006 return 0;
8007 }
8008@@ -164,9 +182,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
8009 if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
8010 tc_shutdown(d);
8011
8012- clk_enable(tcd->clk);
8013+ tc_clk_enable(d);
8014
8015- /* slow clock, count up to RC, then irq and stop */
8016+ /* count up to RC, then irq and stop */
8017 writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
8018 ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
8019 writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8020@@ -186,12 +204,12 @@ static int tc_set_periodic(struct clock_event_device *d)
8021 /* By not making the gentime core emulate periodic mode on top
8022 * of oneshot, we get lower overhead and improved accuracy.
8023 */
8024- clk_enable(tcd->clk);
8025+ tc_clk_enable(d);
8026
8027- /* slow clock, count up to RC, then irq and restart */
8028+ /* count up to RC, then irq and restart */
8029 writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
8030 regs + ATMEL_TC_REG(2, CMR));
8031- writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8032+ writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8033
8034 /* Enable clock and interrupts on RC compare */
8035 writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8036@@ -218,9 +236,13 @@ static struct tc_clkevt_device clkevt = {
8037 .features = CLOCK_EVT_FEAT_PERIODIC |
8038 CLOCK_EVT_FEAT_ONESHOT,
8039 /* Should be lower than at91rm9200's system timer */
8040+#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8041 .rating = 125,
8042+#else
8043+ .rating = 200,
8044+#endif
8045 .set_next_event = tc_next_event,
8046- .set_state_shutdown = tc_shutdown,
8047+ .set_state_shutdown = tc_shutdown_clk_off,
8048 .set_state_periodic = tc_set_periodic,
8049 .set_state_oneshot = tc_set_oneshot,
8050 },
8051@@ -240,8 +262,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
8052 return IRQ_NONE;
8053 }
8054
8055-static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8056+static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
8057 {
8058+ unsigned divisor = atmel_tc_divisors[divisor_idx];
8059 int ret;
8060 struct clk *t2_clk = tc->clk[2];
8061 int irq = tc->irq[2];
8062@@ -262,7 +285,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8063 clkevt.regs = tc->regs;
8064 clkevt.clk = t2_clk;
8065
8066- timer_clock = clk32k_divisor_idx;
8067+ timer_clock = divisor_idx;
8068+ if (!divisor)
8069+ clkevt.freq = 32768;
8070+ else
8071+ clkevt.freq = clk_get_rate(t2_clk) / divisor;
8072
8073 clkevt.clkevt.cpumask = cpumask_of(0);
8074
8075@@ -273,7 +300,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8076 return ret;
8077 }
8078
8079- clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
8080+ clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
8081
8082 return ret;
8083 }
8084@@ -410,7 +437,11 @@ static int __init tcb_clksrc_init(void)
8085 goto err_disable_t1;
8086
8087 /* channel 2: periodic and oneshot timer support */
8088+#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8089 ret = setup_clkevents(tc, clk32k_divisor_idx);
8090+#else
8091+ ret = setup_clkevents(tc, best_divisor_idx);
8092+#endif
8093 if (ret)
8094 goto err_unregister_clksrc;
8095
8096diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
8097index 2fab18fae4fc..98460c1bdec0 100644
8098--- a/drivers/clocksource/timer-atmel-pit.c
8099+++ b/drivers/clocksource/timer-atmel-pit.c
8100@@ -46,6 +46,7 @@ struct pit_data {
8101 u32 cycle;
8102 u32 cnt;
8103 unsigned int irq;
8104+ bool irq_requested;
8105 struct clk *mck;
8106 };
8107
8108@@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
8109
8110 /* disable irq, leaving the clocksource active */
8111 pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
8112+ if (data->irq_requested) {
8113+ free_irq(data->irq, data);
8114+ data->irq_requested = false;
8115+ }
8116 return 0;
8117 }
8118
8119+static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
8120 /*
8121 * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16)
8122 */
8123 static int pit_clkevt_set_periodic(struct clock_event_device *dev)
8124 {
8125 struct pit_data *data = clkevt_to_pit_data(dev);
8126+ int ret;
8127+
8128+ ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8129+ IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8130+ "at91_tick", data);
8131+ if (ret)
8132+ panic(pr_fmt("Unable to setup IRQ\n"));
8133+
8134+ data->irq_requested = true;
8135
8136 /* update clocksource counter */
8137 data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
8138@@ -233,16 +248,6 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node)
8139 goto exit;
8140 }
8141
8142- /* Set up irq handler */
8143- ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8144- IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8145- "at91_tick", data);
8146- if (ret) {
8147- pr_err("Unable to setup IRQ\n");
8148- clocksource_unregister(&data->clksrc);
8149- goto exit;
8150- }
8151-
8152 /* Set up and register clockevents */
8153 data->clkevt.name = "pit";
8154 data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
8155diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
8156index d2e660f475af..c63b96cfc23e 100644
8157--- a/drivers/clocksource/timer-atmel-st.c
8158+++ b/drivers/clocksource/timer-atmel-st.c
8159@@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
8160 last_crtr = read_CRTR();
8161 }
8162
8163+static int atmel_st_irq;
8164+
8165 static int clkevt32k_shutdown(struct clock_event_device *evt)
8166 {
8167 clkdev32k_disable_and_flush_irq();
8168 irqmask = 0;
8169 regmap_write(regmap_st, AT91_ST_IER, irqmask);
8170+ free_irq(atmel_st_irq, regmap_st);
8171 return 0;
8172 }
8173
8174 static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8175 {
8176+ int ret;
8177+
8178 clkdev32k_disable_and_flush_irq();
8179
8180+ ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8181+ IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8182+ "at91_tick", regmap_st);
8183+ if (ret)
8184+ panic(pr_fmt("Unable to setup IRQ\n"));
8185+
8186 /*
8187 * ALM for oneshot irqs, set by next_event()
8188 * before 32 seconds have passed.
8189@@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8190
8191 static int clkevt32k_set_periodic(struct clock_event_device *dev)
8192 {
8193+ int ret;
8194+
8195 clkdev32k_disable_and_flush_irq();
8196
8197+ ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8198+ IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8199+ "at91_tick", regmap_st);
8200+ if (ret)
8201+ panic(pr_fmt("Unable to setup IRQ\n"));
8202+
8203 /* PIT for periodic irqs; fixed rate of 1/HZ */
8204 irqmask = AT91_ST_PITS;
8205 regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
8206@@ -198,7 +217,7 @@ static int __init atmel_st_timer_init(struct device_node *node)
8207 {
8208 struct clk *sclk;
8209 unsigned int sclk_rate, val;
8210- int irq, ret;
8211+ int ret;
8212
8213 regmap_st = syscon_node_to_regmap(node);
8214 if (IS_ERR(regmap_st)) {
8215@@ -212,21 +231,12 @@ static int __init atmel_st_timer_init(struct device_node *node)
8216 regmap_read(regmap_st, AT91_ST_SR, &val);
8217
8218 /* Get the interrupts property */
8219- irq = irq_of_parse_and_map(node, 0);
8220- if (!irq) {
8221+ atmel_st_irq = irq_of_parse_and_map(node, 0);
8222+ if (!atmel_st_irq) {
8223 pr_err("Unable to get IRQ from DT\n");
8224 return -EINVAL;
8225 }
8226
8227- /* Make IRQs happen for the system timer */
8228- ret = request_irq(irq, at91rm9200_timer_interrupt,
8229- IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8230- "at91_tick", regmap_st);
8231- if (ret) {
8232- pr_err("Unable to setup IRQ\n");
8233- return ret;
8234- }
8235-
8236 sclk = of_clk_get(node, 0);
8237 if (IS_ERR(sclk)) {
8238 pr_err("Unable to get slow clock\n");
8239diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
8240index a782ce87715c..19d265948526 100644
8241--- a/drivers/connector/cn_proc.c
8242+++ b/drivers/connector/cn_proc.c
8243@@ -32,6 +32,7 @@
8244 #include <linux/pid_namespace.h>
8245
8246 #include <linux/cn_proc.h>
8247+#include <linux/locallock.h>
8248
8249 /*
8250 * Size of a cn_msg followed by a proc_event structure. Since the
8251@@ -54,10 +55,11 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
8252
8253 /* proc_event_counts is used as the sequence number of the netlink message */
8254 static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
8255+static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
8256
8257 static inline void send_msg(struct cn_msg *msg)
8258 {
8259- preempt_disable();
8260+ local_lock(send_msg_lock);
8261
8262 msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
8263 ((struct proc_event *)msg->data)->cpu = smp_processor_id();
8264@@ -70,7 +72,7 @@ static inline void send_msg(struct cn_msg *msg)
8265 */
8266 cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
8267
8268- preempt_enable();
8269+ local_unlock(send_msg_lock);
8270 }
8271
8272 void proc_fork_connector(struct task_struct *task)
8273diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
8274index 35f71825b7f3..bb4a6160d0f7 100644
8275--- a/drivers/cpufreq/Kconfig.x86
8276+++ b/drivers/cpufreq/Kconfig.x86
8277@@ -125,7 +125,7 @@ config X86_POWERNOW_K7_ACPI
8278
8279 config X86_POWERNOW_K8
8280 tristate "AMD Opteron/Athlon64 PowerNow!"
8281- depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
8282+ depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
8283 help
8284 This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
8285 Support for K10 and newer processors is now in acpi-cpufreq.
8286diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
8287index c3eefa126e3b..47093745a53c 100644
8288--- a/drivers/firmware/efi/efi.c
8289+++ b/drivers/firmware/efi/efi.c
8290@@ -74,7 +74,7 @@ static unsigned long *efi_tables[] = {
8291 &efi.mem_attr_table,
8292 };
8293
8294-static bool disable_runtime;
8295+static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT_BASE);
8296 static int __init setup_noefi(char *arg)
8297 {
8298 disable_runtime = true;
8299@@ -100,6 +100,9 @@ static int __init parse_efi_cmdline(char *str)
8300 if (parse_option_str(str, "noruntime"))
8301 disable_runtime = true;
8302
8303+ if (parse_option_str(str, "runtime"))
8304+ disable_runtime = false;
8305+
8306 return 0;
8307 }
8308 early_param("efi", parse_efi_cmdline);
8309diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c
8310index c597ce277a04..c1108d3921f8 100644
8311--- a/drivers/gpu/drm/i915/i915_gem_timeline.c
8312+++ b/drivers/gpu/drm/i915/i915_gem_timeline.c
8313@@ -33,11 +33,8 @@ static void __intel_timeline_init(struct intel_timeline *tl,
8314 {
8315 tl->fence_context = context;
8316 tl->common = parent;
8317-#ifdef CONFIG_DEBUG_SPINLOCK
8318- __raw_spin_lock_init(&tl->lock.rlock, lockname, lockclass);
8319-#else
8320 spin_lock_init(&tl->lock);
8321-#endif
8322+ lockdep_set_class_and_name(&tl->lock, lockclass, lockname);
8323 init_request_active(&tl->last_request, NULL);
8324 INIT_LIST_HEAD(&tl->requests);
8325 i915_syncmap_init(&tl->sync);
8326diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
8327index 20a471ad0ad2..5d34d48a8b7b 100644
8328--- a/drivers/gpu/drm/i915/i915_irq.c
8329+++ b/drivers/gpu/drm/i915/i915_irq.c
8330@@ -867,6 +867,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8331 spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
8332
8333 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8334+ preempt_disable_rt();
8335
8336 /* Get optional system timestamp before query. */
8337 if (stime)
8338@@ -918,6 +919,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8339 *etime = ktime_get();
8340
8341 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8342+ preempt_enable_rt();
8343
8344 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
8345
8346diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
8347index 41e31a454604..7e0cadf51b31 100644
8348--- a/drivers/gpu/drm/i915/intel_sprite.c
8349+++ b/drivers/gpu/drm/i915/intel_sprite.c
8350@@ -36,6 +36,7 @@
8351 #include <drm/drm_rect.h>
8352 #include <drm/drm_atomic.h>
8353 #include <drm/drm_plane_helper.h>
8354+#include <linux/locallock.h>
8355 #include "intel_drv.h"
8356 #include "intel_frontbuffer.h"
8357 #include <drm/i915_drm.h>
8358@@ -67,7 +68,7 @@ int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
8359 }
8360
8361 #define VBLANK_EVASION_TIME_US 100
8362-
8363+static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
8364 /**
8365 * intel_pipe_update_start() - start update of a set of display registers
8366 * @crtc: the crtc of which the registers are going to be updated
8367@@ -102,7 +103,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8368 VBLANK_EVASION_TIME_US);
8369 max = vblank_start - 1;
8370
8371- local_irq_disable();
8372+ local_lock_irq(pipe_update_lock);
8373
8374 if (min <= 0 || max <= 0)
8375 return;
8376@@ -132,11 +133,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8377 break;
8378 }
8379
8380- local_irq_enable();
8381+ local_unlock_irq(pipe_update_lock);
8382
8383 timeout = schedule_timeout(timeout);
8384
8385- local_irq_disable();
8386+ local_lock_irq(pipe_update_lock);
8387 }
8388
8389 finish_wait(wq, &wait);
8390@@ -201,7 +202,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc)
8391 crtc->base.state->event = NULL;
8392 }
8393
8394- local_irq_enable();
8395+ local_unlock_irq(pipe_update_lock);
8396
8397 if (intel_vgpu_active(dev_priv))
8398 return;
8399diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
8400index ddfe91efa61e..3157bcf6428f 100644
8401--- a/drivers/gpu/drm/radeon/radeon_display.c
8402+++ b/drivers/gpu/drm/radeon/radeon_display.c
8403@@ -1839,6 +1839,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8404 struct radeon_device *rdev = dev->dev_private;
8405
8406 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8407+ preempt_disable_rt();
8408
8409 /* Get optional system timestamp before query. */
8410 if (stime)
8411@@ -1931,6 +1932,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8412 *etime = ktime_get();
8413
8414 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8415+ preempt_enable_rt();
8416
8417 /* Decode into vertical and horizontal scanout position. */
8418 *vpos = position & 0x1fff;
8419diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
8420index 49569f8fe038..a3608cd52805 100644
8421--- a/drivers/hv/hyperv_vmbus.h
8422+++ b/drivers/hv/hyperv_vmbus.h
8423@@ -30,6 +30,7 @@
8424 #include <linux/atomic.h>
8425 #include <linux/hyperv.h>
8426 #include <linux/interrupt.h>
8427+#include <linux/irq.h>
8428
8429 /*
8430 * Timeout for services such as KVP and fcopy.
8431diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
8432index 2cd134dd94d2..cedf225d4182 100644
8433--- a/drivers/hv/vmbus_drv.c
8434+++ b/drivers/hv/vmbus_drv.c
8435@@ -966,6 +966,8 @@ static void vmbus_isr(void)
8436 void *page_addr = hv_cpu->synic_event_page;
8437 struct hv_message *msg;
8438 union hv_synic_event_flags *event;
8439+ struct pt_regs *regs = get_irq_regs();
8440+ u64 ip = regs ? instruction_pointer(regs) : 0;
8441 bool handled = false;
8442
8443 if (unlikely(page_addr == NULL))
8444@@ -1009,7 +1011,7 @@ static void vmbus_isr(void)
8445 tasklet_schedule(&hv_cpu->msg_dpc);
8446 }
8447
8448- add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
8449+ add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
8450 }
8451
8452
8453diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
8454index 36f76e28a0bf..394f142f90c7 100644
8455--- a/drivers/ide/alim15x3.c
8456+++ b/drivers/ide/alim15x3.c
8457@@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8458
8459 isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
8460
8461- local_irq_save(flags);
8462+ local_irq_save_nort(flags);
8463
8464 if (m5229_revision < 0xC2) {
8465 /*
8466@@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8467 }
8468 pci_dev_put(north);
8469 pci_dev_put(isa_dev);
8470- local_irq_restore(flags);
8471+ local_irq_restore_nort(flags);
8472 return 0;
8473 }
8474
8475diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
8476index 4b5dc0162e67..590cc7d64622 100644
8477--- a/drivers/ide/hpt366.c
8478+++ b/drivers/ide/hpt366.c
8479@@ -1236,7 +1236,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8480
8481 dma_old = inb(base + 2);
8482
8483- local_irq_save(flags);
8484+ local_irq_save_nort(flags);
8485
8486 dma_new = dma_old;
8487 pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
8488@@ -1247,7 +1247,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8489 if (dma_new != dma_old)
8490 outb(dma_new, base + 2);
8491
8492- local_irq_restore(flags);
8493+ local_irq_restore_nort(flags);
8494
8495 printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n",
8496 hwif->name, base, base + 7);
8497diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
8498index 19763977568c..4169433faab5 100644
8499--- a/drivers/ide/ide-io-std.c
8500+++ b/drivers/ide/ide-io-std.c
8501@@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8502 unsigned long uninitialized_var(flags);
8503
8504 if ((io_32bit & 2) && !mmio) {
8505- local_irq_save(flags);
8506+ local_irq_save_nort(flags);
8507 ata_vlb_sync(io_ports->nsect_addr);
8508 }
8509
8510@@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8511 insl(data_addr, buf, words);
8512
8513 if ((io_32bit & 2) && !mmio)
8514- local_irq_restore(flags);
8515+ local_irq_restore_nort(flags);
8516
8517 if (((len + 1) & 3) < 2)
8518 return;
8519@@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8520 unsigned long uninitialized_var(flags);
8521
8522 if ((io_32bit & 2) && !mmio) {
8523- local_irq_save(flags);
8524+ local_irq_save_nort(flags);
8525 ata_vlb_sync(io_ports->nsect_addr);
8526 }
8527
8528@@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8529 outsl(data_addr, buf, words);
8530
8531 if ((io_32bit & 2) && !mmio)
8532- local_irq_restore(flags);
8533+ local_irq_restore_nort(flags);
8534
8535 if (((len + 1) & 3) < 2)
8536 return;
8537diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
8538index 3a234701d92c..420e4e645856 100644
8539--- a/drivers/ide/ide-io.c
8540+++ b/drivers/ide/ide-io.c
8541@@ -660,7 +660,7 @@ void ide_timer_expiry (unsigned long data)
8542 /* disable_irq_nosync ?? */
8543 disable_irq(hwif->irq);
8544 /* local CPU only, as if we were handling an interrupt */
8545- local_irq_disable();
8546+ local_irq_disable_nort();
8547 if (hwif->polling) {
8548 startstop = handler(drive);
8549 } else if (drive_is_ready(drive)) {
8550diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
8551index 210a0887dd29..7bf05b6147e8 100644
8552--- a/drivers/ide/ide-iops.c
8553+++ b/drivers/ide/ide-iops.c
8554@@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
8555 if ((stat & ATA_BUSY) == 0)
8556 break;
8557
8558- local_irq_restore(flags);
8559+ local_irq_restore_nort(flags);
8560 *rstat = stat;
8561 return -EBUSY;
8562 }
8563 }
8564- local_irq_restore(flags);
8565+ local_irq_restore_nort(flags);
8566 }
8567 /*
8568 * Allow status to settle, then read it again.
8569diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
8570index eaf39e5db08b..be4c941eaa83 100644
8571--- a/drivers/ide/ide-probe.c
8572+++ b/drivers/ide/ide-probe.c
8573@@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
8574 int bswap = 1;
8575
8576 /* local CPU only; some systems need this */
8577- local_irq_save(flags);
8578+ local_irq_save_nort(flags);
8579 /* read 512 bytes of id info */
8580 hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
8581- local_irq_restore(flags);
8582+ local_irq_restore_nort(flags);
8583
8584 drive->dev_flags |= IDE_DFLAG_ID_READ;
8585 #ifdef DEBUG
8586diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
8587index 4efe4c6e956c..7eae3aa1def7 100644
8588--- a/drivers/ide/ide-taskfile.c
8589+++ b/drivers/ide/ide-taskfile.c
8590@@ -251,7 +251,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8591
8592 page_is_high = PageHighMem(page);
8593 if (page_is_high)
8594- local_irq_save(flags);
8595+ local_irq_save_nort(flags);
8596
8597 buf = kmap_atomic(page) + offset;
8598
8599@@ -272,7 +272,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
8600 kunmap_atomic(buf);
8601
8602 if (page_is_high)
8603- local_irq_restore(flags);
8604+ local_irq_restore_nort(flags);
8605
8606 len -= nr_bytes;
8607 }
8608@@ -415,7 +415,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
8609 }
8610
8611 if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
8612- local_irq_disable();
8613+ local_irq_disable_nort();
8614
8615 ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
8616
8617diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
8618index b197e925fe36..95ac319c8e69 100644
8619--- a/drivers/infiniband/hw/hfi1/affinity.c
8620+++ b/drivers/infiniband/hw/hfi1/affinity.c
8621@@ -593,7 +593,7 @@ int hfi1_get_proc_affinity(int node)
8622 struct hfi1_affinity_node *entry;
8623 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
8624 const struct cpumask *node_mask,
8625- *proc_mask = &current->cpus_allowed;
8626+ *proc_mask = current->cpus_ptr;
8627 struct hfi1_affinity_node_list *affinity = &node_affinity;
8628 struct cpu_mask_set *set = &affinity->proc;
8629
8630@@ -601,7 +601,7 @@ int hfi1_get_proc_affinity(int node)
8631 * check whether process/context affinity has already
8632 * been set
8633 */
8634- if (cpumask_weight(proc_mask) == 1) {
8635+ if (current->nr_cpus_allowed == 1) {
8636 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
8637 current->pid, current->comm,
8638 cpumask_pr_args(proc_mask));
8639@@ -612,7 +612,7 @@ int hfi1_get_proc_affinity(int node)
8640 cpu = cpumask_first(proc_mask);
8641 cpumask_set_cpu(cpu, &set->used);
8642 goto done;
8643- } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
8644+ } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
8645 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
8646 current->pid, current->comm,
8647 cpumask_pr_args(proc_mask));
8648diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
8649index 6781bcdb10b3..d069ad261572 100644
8650--- a/drivers/infiniband/hw/hfi1/sdma.c
8651+++ b/drivers/infiniband/hw/hfi1/sdma.c
8652@@ -856,14 +856,13 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
8653 {
8654 struct sdma_rht_node *rht_node;
8655 struct sdma_engine *sde = NULL;
8656- const struct cpumask *current_mask = &current->cpus_allowed;
8657 unsigned long cpu_id;
8658
8659 /*
8660 * To ensure that always the same sdma engine(s) will be
8661 * selected make sure the process is pinned to this CPU only.
8662 */
8663- if (cpumask_weight(current_mask) != 1)
8664+ if (current->nr_cpus_allowed != 1)
8665 goto out;
8666
8667 cpu_id = smp_processor_id();
8668diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
8669index 40efc9151ec4..12924aad90cc 100644
8670--- a/drivers/infiniband/hw/qib/qib_file_ops.c
8671+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
8672@@ -1167,7 +1167,7 @@ static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt)
8673 static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
8674 {
8675 struct qib_filedata *fd = fp->private_data;
8676- const unsigned int weight = cpumask_weight(&current->cpus_allowed);
8677+ const unsigned int weight = current->nr_cpus_allowed;
8678 const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
8679 int local_cpu;
8680
8681@@ -1648,9 +1648,8 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
8682 ret = find_free_ctxt(i_minor - 1, fp, uinfo);
8683 else {
8684 int unit;
8685- const unsigned int cpu = cpumask_first(&current->cpus_allowed);
8686- const unsigned int weight =
8687- cpumask_weight(&current->cpus_allowed);
8688+ const unsigned int cpu = cpumask_first(current->cpus_ptr);
8689+ const unsigned int weight = current->nr_cpus_allowed;
8690
8691 if (weight == 1 && !test_bit(cpu, qib_cpulist))
8692 if (!find_hca(cpu, &unit) && unit >= 0)
8693diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8694index 9b3f47ae2016..8327b598d909 100644
8695--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8696+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
8697@@ -898,7 +898,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
8698
8699 ipoib_dbg_mcast(priv, "restarting multicast task\n");
8700
8701- local_irq_save(flags);
8702+ local_irq_save_nort(flags);
8703 netif_addr_lock(dev);
8704 spin_lock(&priv->lock);
8705
8706@@ -980,7 +980,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
8707
8708 spin_unlock(&priv->lock);
8709 netif_addr_unlock(dev);
8710- local_irq_restore(flags);
8711+ local_irq_restore_nort(flags);
8712
8713 ipoib_mcast_remove_list(&remove_list);
8714
8715diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
8716index cedc665364cd..4a4fdef151aa 100644
8717--- a/drivers/input/gameport/gameport.c
8718+++ b/drivers/input/gameport/gameport.c
8719@@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
8720 tx = ~0;
8721
8722 for (i = 0; i < 50; i++) {
8723- local_irq_save(flags);
8724+ local_irq_save_nort(flags);
8725 t1 = ktime_get_ns();
8726 for (t = 0; t < 50; t++)
8727 gameport_read(gameport);
8728 t2 = ktime_get_ns();
8729 t3 = ktime_get_ns();
8730- local_irq_restore(flags);
8731+ local_irq_restore_nort(flags);
8732 udelay(i * 10);
8733 t = (t2 - t1) - (t3 - t2);
8734 if (t < tx)
8735@@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
8736 tx = 1 << 30;
8737
8738 for(i = 0; i < 50; i++) {
8739- local_irq_save(flags);
8740+ local_irq_save_nort(flags);
8741 GET_TIME(t1);
8742 for (t = 0; t < 50; t++) gameport_read(gameport);
8743 GET_TIME(t2);
8744 GET_TIME(t3);
8745- local_irq_restore(flags);
8746+ local_irq_restore_nort(flags);
8747 udelay(i * 10);
8748 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
8749 }
8750@@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
8751 tx = 1 << 30;
8752
8753 for(i = 0; i < 50; i++) {
8754- local_irq_save(flags);
8755+ local_irq_save_nort(flags);
8756 t1 = rdtsc();
8757 for (t = 0; t < 50; t++) gameport_read(gameport);
8758 t2 = rdtsc();
8759- local_irq_restore(flags);
8760+ local_irq_restore_nort(flags);
8761 udelay(i * 10);
8762 if (t2 - t1 < tx) tx = t2 - t1;
8763 }
8764diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
8765index efa6cd2500b9..7d9d41f803d1 100644
8766--- a/drivers/iommu/amd_iommu.c
8767+++ b/drivers/iommu/amd_iommu.c
8768@@ -81,11 +81,12 @@
8769 */
8770 #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38))
8771
8772-static DEFINE_RWLOCK(amd_iommu_devtable_lock);
8773+static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
8774+static DEFINE_SPINLOCK(pd_bitmap_lock);
8775+static DEFINE_SPINLOCK(iommu_table_lock);
8776
8777 /* List of all available dev_data structures */
8778-static LIST_HEAD(dev_data_list);
8779-static DEFINE_SPINLOCK(dev_data_list_lock);
8780+static LLIST_HEAD(dev_data_list);
8781
8782 LIST_HEAD(ioapic_map);
8783 LIST_HEAD(hpet_map);
8784@@ -204,40 +205,33 @@ static struct dma_ops_domain* to_dma_ops_domain(struct protection_domain *domain
8785 static struct iommu_dev_data *alloc_dev_data(u16 devid)
8786 {
8787 struct iommu_dev_data *dev_data;
8788- unsigned long flags;
8789
8790 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
8791 if (!dev_data)
8792 return NULL;
8793
8794 dev_data->devid = devid;
8795-
8796- spin_lock_irqsave(&dev_data_list_lock, flags);
8797- list_add_tail(&dev_data->dev_data_list, &dev_data_list);
8798- spin_unlock_irqrestore(&dev_data_list_lock, flags);
8799-
8800 ratelimit_default_init(&dev_data->rs);
8801
8802+ llist_add(&dev_data->dev_data_list, &dev_data_list);
8803 return dev_data;
8804 }
8805
8806 static struct iommu_dev_data *search_dev_data(u16 devid)
8807 {
8808 struct iommu_dev_data *dev_data;
8809- unsigned long flags;
8810+ struct llist_node *node;
8811+
8812+ if (llist_empty(&dev_data_list))
8813+ return NULL;
8814
8815- spin_lock_irqsave(&dev_data_list_lock, flags);
8816- list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
8817+ node = dev_data_list.first;
8818+ llist_for_each_entry(dev_data, node, dev_data_list) {
8819 if (dev_data->devid == devid)
8820- goto out_unlock;
8821+ return dev_data;
8822 }
8823
8824- dev_data = NULL;
8825-
8826-out_unlock:
8827- spin_unlock_irqrestore(&dev_data_list_lock, flags);
8828-
8829- return dev_data;
8830+ return NULL;
8831 }
8832
8833 static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
8834@@ -1062,9 +1056,9 @@ static int iommu_queue_command_sync(struct amd_iommu *iommu,
8835 unsigned long flags;
8836 int ret;
8837
8838- spin_lock_irqsave(&iommu->lock, flags);
8839+ raw_spin_lock_irqsave(&iommu->lock, flags);
8840 ret = __iommu_queue_command_sync(iommu, cmd, sync);
8841- spin_unlock_irqrestore(&iommu->lock, flags);
8842+ raw_spin_unlock_irqrestore(&iommu->lock, flags);
8843
8844 return ret;
8845 }
8846@@ -1090,7 +1084,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
8847
8848 build_completion_wait(&cmd, (u64)&iommu->cmd_sem);
8849
8850- spin_lock_irqsave(&iommu->lock, flags);
8851+ raw_spin_lock_irqsave(&iommu->lock, flags);
8852
8853 iommu->cmd_sem = 0;
8854
8855@@ -1101,7 +1095,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
8856 ret = wait_on_sem(&iommu->cmd_sem);
8857
8858 out_unlock:
8859- spin_unlock_irqrestore(&iommu->lock, flags);
8860+ raw_spin_unlock_irqrestore(&iommu->lock, flags);
8861
8862 return ret;
8863 }
8864@@ -1610,29 +1604,26 @@ static void del_domain_from_list(struct protection_domain *domain)
8865
8866 static u16 domain_id_alloc(void)
8867 {
8868- unsigned long flags;
8869 int id;
8870
8871- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8872+ spin_lock(&pd_bitmap_lock);
8873 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
8874 BUG_ON(id == 0);
8875 if (id > 0 && id < MAX_DOMAIN_ID)
8876 __set_bit(id, amd_iommu_pd_alloc_bitmap);
8877 else
8878 id = 0;
8879- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8880+ spin_unlock(&pd_bitmap_lock);
8881
8882 return id;
8883 }
8884
8885 static void domain_id_free(int id)
8886 {
8887- unsigned long flags;
8888-
8889- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8890+ spin_lock(&pd_bitmap_lock);
8891 if (id > 0 && id < MAX_DOMAIN_ID)
8892 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
8893- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8894+ spin_unlock(&pd_bitmap_lock);
8895 }
8896
8897 #define DEFINE_FREE_PT_FN(LVL, FN) \
8898@@ -1952,10 +1943,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
8899 int ret;
8900
8901 /*
8902- * Must be called with IRQs disabled. Warn here to detect early
8903- * when its not.
8904+ * Must be called with IRQs disabled on a non RT kernel. Warn here to
8905+ * detect early when its not.
8906 */
8907- WARN_ON(!irqs_disabled());
8908+ WARN_ON_NONRT(!irqs_disabled());
8909
8910 /* lock domain */
8911 spin_lock(&domain->lock);
8912@@ -2101,9 +2092,9 @@ static int attach_device(struct device *dev,
8913 }
8914
8915 skip_ats_check:
8916- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8917+ spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8918 ret = __attach_device(dev_data, domain);
8919- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8920+ spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8921
8922 /*
8923 * We might boot into a crash-kernel here. The crashed kernel
8924@@ -2123,10 +2114,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
8925 struct protection_domain *domain;
8926
8927 /*
8928- * Must be called with IRQs disabled. Warn here to detect early
8929- * when its not.
8930+ * Must be called with IRQs disabled on a non RT kernel. Warn here to
8931+ * detect early when its not.
8932 */
8933- WARN_ON(!irqs_disabled());
8934+ WARN_ON_NONRT(!irqs_disabled());
8935
8936 if (WARN_ON(!dev_data->domain))
8937 return;
8938@@ -2153,9 +2144,9 @@ static void detach_device(struct device *dev)
8939 domain = dev_data->domain;
8940
8941 /* lock device table */
8942- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8943+ spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8944 __detach_device(dev_data);
8945- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8946+ spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8947
8948 if (!dev_is_pci(dev))
8949 return;
8950@@ -2819,7 +2810,7 @@ static void cleanup_domain(struct protection_domain *domain)
8951 struct iommu_dev_data *entry;
8952 unsigned long flags;
8953
8954- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
8955+ spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
8956
8957 while (!list_empty(&domain->dev_list)) {
8958 entry = list_first_entry(&domain->dev_list,
8959@@ -2827,7 +2818,7 @@ static void cleanup_domain(struct protection_domain *domain)
8960 __detach_device(entry);
8961 }
8962
8963- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8964+ spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
8965 }
8966
8967 static void protection_domain_free(struct protection_domain *domain)
8968@@ -3594,14 +3585,62 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
8969 amd_iommu_dev_table[devid].data[2] = dte;
8970 }
8971
8972-static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
8973+static struct irq_remap_table *get_irq_table(u16 devid)
8974+{
8975+ struct irq_remap_table *table;
8976+
8977+ if (WARN_ONCE(!amd_iommu_rlookup_table[devid],
8978+ "%s: no iommu for devid %x\n", __func__, devid))
8979+ return NULL;
8980+
8981+ table = irq_lookup_table[devid];
8982+ if (WARN_ONCE(!table, "%s: no table for devid %x\n", __func__, devid))
8983+ return NULL;
8984+
8985+ return table;
8986+}
8987+
8988+static struct irq_remap_table *__alloc_irq_table(void)
8989+{
8990+ struct irq_remap_table *table;
8991+
8992+ table = kzalloc(sizeof(*table), GFP_KERNEL);
8993+ if (!table)
8994+ return NULL;
8995+
8996+ table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
8997+ if (!table->table) {
8998+ kfree(table);
8999+ return NULL;
9000+ }
9001+ raw_spin_lock_init(&table->lock);
9002+
9003+ if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
9004+ memset(table->table, 0,
9005+ MAX_IRQS_PER_TABLE * sizeof(u32));
9006+ else
9007+ memset(table->table, 0,
9008+ (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
9009+ return table;
9010+}
9011+
9012+static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
9013+ struct irq_remap_table *table)
9014+{
9015+ irq_lookup_table[devid] = table;
9016+ set_dte_irq_entry(devid, table);
9017+ iommu_flush_dte(iommu, devid);
9018+}
9019+
9020+static struct irq_remap_table *alloc_irq_table(u16 devid)
9021 {
9022 struct irq_remap_table *table = NULL;
9023+ struct irq_remap_table *new_table = NULL;
9024 struct amd_iommu *iommu;
9025 unsigned long flags;
9026 u16 alias;
9027
9028- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
9029+ spin_lock_irqsave(&iommu_table_lock, flags);
9030
9031 iommu = amd_iommu_rlookup_table[devid];
9032 if (!iommu)
9033@@ -3614,60 +3653,45 @@ static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
9034 alias = amd_iommu_alias_table[devid];
9035 table = irq_lookup_table[alias];
9036 if (table) {
9037- irq_lookup_table[devid] = table;
9038- set_dte_irq_entry(devid, table);
9039- iommu_flush_dte(iommu, devid);
9040- goto out;
9041+ set_remap_table_entry(iommu, devid, table);
9042+ goto out_wait;
9043 }
9044+ spin_unlock_irqrestore(&iommu_table_lock, flags);
9045
9046 /* Nothing there yet, allocate new irq remapping table */
9047- table = kzalloc(sizeof(*table), GFP_ATOMIC);
9048- if (!table)
9049- goto out_unlock;
9050-
9051- /* Initialize table spin-lock */
9052- spin_lock_init(&table->lock);
9053+ new_table = __alloc_irq_table();
9054+ if (!new_table)
9055+ return NULL;
9056
9057- if (ioapic)
9058- /* Keep the first 32 indexes free for IOAPIC interrupts */
9059- table->min_index = 32;
9060+ spin_lock_irqsave(&iommu_table_lock, flags);
9061
9062- table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC);
9063- if (!table->table) {
9064- kfree(table);
9065- table = NULL;
9066+ table = irq_lookup_table[devid];
9067+ if (table)
9068 goto out_unlock;
9069- }
9070-
9071- if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
9072- memset(table->table, 0,
9073- MAX_IRQS_PER_TABLE * sizeof(u32));
9074- else
9075- memset(table->table, 0,
9076- (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
9077
9078- if (ioapic) {
9079- int i;
9080-
9081- for (i = 0; i < 32; ++i)
9082- iommu->irte_ops->set_allocated(table, i);
9083+ table = irq_lookup_table[alias];
9084+ if (table) {
9085+ set_remap_table_entry(iommu, devid, table);
9086+ goto out_wait;
9087 }
9088
9089- irq_lookup_table[devid] = table;
9090- set_dte_irq_entry(devid, table);
9091- iommu_flush_dte(iommu, devid);
9092- if (devid != alias) {
9093- irq_lookup_table[alias] = table;
9094- set_dte_irq_entry(alias, table);
9095- iommu_flush_dte(iommu, alias);
9096- }
9097+ table = new_table;
9098+ new_table = NULL;
9099
9100-out:
9101+ set_remap_table_entry(iommu, devid, table);
9102+ if (devid != alias)
9103+ set_remap_table_entry(iommu, alias, table);
9104+
9105+out_wait:
9106 iommu_completion_wait(iommu);
9107
9108 out_unlock:
9109- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
9110+ spin_unlock_irqrestore(&iommu_table_lock, flags);
9111
9112+ if (new_table) {
9113+ kmem_cache_free(amd_iommu_irq_cache, new_table->table);
9114+ kfree(new_table);
9115+ }
9116 return table;
9117 }
9118
9119@@ -3681,11 +3705,11 @@ static int alloc_irq_index(u16 devid, int count)
9120 if (!iommu)
9121 return -ENODEV;
9122
9123- table = get_irq_table(devid, false);
9124+ table = alloc_irq_table(devid);
9125 if (!table)
9126 return -ENODEV;
9127
9128- spin_lock_irqsave(&table->lock, flags);
9129+ raw_spin_lock_irqsave(&table->lock, flags);
9130
9131 /* Scan table for free entries */
9132 for (c = 0, index = table->min_index;
9133@@ -3708,7 +3732,7 @@ static int alloc_irq_index(u16 devid, int count)
9134 index = -ENOSPC;
9135
9136 out:
9137- spin_unlock_irqrestore(&table->lock, flags);
9138+ raw_spin_unlock_irqrestore(&table->lock, flags);
9139
9140 return index;
9141 }
9142@@ -3725,11 +3749,11 @@ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
9143 if (iommu == NULL)
9144 return -EINVAL;
9145
9146- table = get_irq_table(devid, false);
9147+ table = get_irq_table(devid);
9148 if (!table)
9149 return -ENOMEM;
9150
9151- spin_lock_irqsave(&table->lock, flags);
9152+ raw_spin_lock_irqsave(&table->lock, flags);
9153
9154 entry = (struct irte_ga *)table->table;
9155 entry = &entry[index];
9156@@ -3740,7 +3764,7 @@ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte,
9157 if (data)
9158 data->ref = entry;
9159
9160- spin_unlock_irqrestore(&table->lock, flags);
9161+ raw_spin_unlock_irqrestore(&table->lock, flags);
9162
9163 iommu_flush_irt(iommu, devid);
9164 iommu_completion_wait(iommu);
9165@@ -3758,13 +3782,13 @@ static int modify_irte(u16 devid, int index, union irte *irte)
9166 if (iommu == NULL)
9167 return -EINVAL;
9168
9169- table = get_irq_table(devid, false);
9170+ table = get_irq_table(devid);
9171 if (!table)
9172 return -ENOMEM;
9173
9174- spin_lock_irqsave(&table->lock, flags);
9175+ raw_spin_lock_irqsave(&table->lock, flags);
9176 table->table[index] = irte->val;
9177- spin_unlock_irqrestore(&table->lock, flags);
9178+ raw_spin_unlock_irqrestore(&table->lock, flags);
9179
9180 iommu_flush_irt(iommu, devid);
9181 iommu_completion_wait(iommu);
9182@@ -3782,13 +3806,13 @@ static void free_irte(u16 devid, int index)
9183 if (iommu == NULL)
9184 return;
9185
9186- table = get_irq_table(devid, false);
9187+ table = get_irq_table(devid);
9188 if (!table)
9189 return;
9190
9191- spin_lock_irqsave(&table->lock, flags);
9192+ raw_spin_lock_irqsave(&table->lock, flags);
9193 iommu->irte_ops->clear_allocated(table, index);
9194- spin_unlock_irqrestore(&table->lock, flags);
9195+ raw_spin_unlock_irqrestore(&table->lock, flags);
9196
9197 iommu_flush_irt(iommu, devid);
9198 iommu_completion_wait(iommu);
9199@@ -3869,10 +3893,8 @@ static void irte_ga_set_affinity(void *entry, u16 devid, u16 index,
9200 u8 vector, u32 dest_apicid)
9201 {
9202 struct irte_ga *irte = (struct irte_ga *) entry;
9203- struct iommu_dev_data *dev_data = search_dev_data(devid);
9204
9205- if (!dev_data || !dev_data->use_vapic ||
9206- !irte->lo.fields_remap.guest_mode) {
9207+ if (!irte->lo.fields_remap.guest_mode) {
9208 irte->hi.fields.vector = vector;
9209 irte->lo.fields_remap.destination = dest_apicid;
9210 modify_irte_ga(devid, index, irte, NULL);
9211@@ -4078,7 +4100,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
9212 struct amd_ir_data *data = NULL;
9213 struct irq_cfg *cfg;
9214 int i, ret, devid;
9215- int index = -1;
9216+ int index;
9217
9218 if (!info)
9219 return -EINVAL;
9220@@ -4102,10 +4124,26 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
9221 return ret;
9222
9223 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
9224- if (get_irq_table(devid, true))
9225+ struct irq_remap_table *table;
9226+ struct amd_iommu *iommu;
9227+
9228+ table = alloc_irq_table(devid);
9229+ if (table) {
9230+ if (!table->min_index) {
9231+ /*
9232+ * Keep the first 32 indexes free for IOAPIC
9233+ * interrupts.
9234+ */
9235+ table->min_index = 32;
9236+ iommu = amd_iommu_rlookup_table[devid];
9237+ for (i = 0; i < 32; ++i)
9238+ iommu->irte_ops->set_allocated(table, i);
9239+ }
9240+ WARN_ON(table->min_index != 32);
9241 index = info->ioapic_pin;
9242- else
9243- ret = -ENOMEM;
9244+ } else {
9245+ index = -ENOMEM;
9246+ }
9247 } else {
9248 index = alloc_irq_index(devid, nr_irqs);
9249 }
9250@@ -4349,7 +4387,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
9251 {
9252 unsigned long flags;
9253 struct amd_iommu *iommu;
9254- struct irq_remap_table *irt;
9255+ struct irq_remap_table *table;
9256 struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
9257 int devid = ir_data->irq_2_irte.devid;
9258 struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
9259@@ -4363,11 +4401,11 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
9260 if (!iommu)
9261 return -ENODEV;
9262
9263- irt = get_irq_table(devid, false);
9264- if (!irt)
9265+ table = get_irq_table(devid);
9266+ if (!table)
9267 return -ENODEV;
9268
9269- spin_lock_irqsave(&irt->lock, flags);
9270+ raw_spin_lock_irqsave(&table->lock, flags);
9271
9272 if (ref->lo.fields_vapic.guest_mode) {
9273 if (cpu >= 0)
9274@@ -4376,7 +4414,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
9275 barrier();
9276 }
9277
9278- spin_unlock_irqrestore(&irt->lock, flags);
9279+ raw_spin_unlock_irqrestore(&table->lock, flags);
9280
9281 iommu_flush_irt(iommu, devid);
9282 iommu_completion_wait(iommu);
9283diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
9284index 6fe2d0346073..e3cd81b32a33 100644
9285--- a/drivers/iommu/amd_iommu_init.c
9286+++ b/drivers/iommu/amd_iommu_init.c
9287@@ -1474,7 +1474,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
9288 {
9289 int ret;
9290
9291- spin_lock_init(&iommu->lock);
9292+ raw_spin_lock_init(&iommu->lock);
9293
9294 /* Add IOMMU to internal data structures */
9295 list_add_tail(&iommu->list, &amd_iommu_list);
9296diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
9297index f6b24c7d8b70..16b1404da58c 100644
9298--- a/drivers/iommu/amd_iommu_types.h
9299+++ b/drivers/iommu/amd_iommu_types.h
9300@@ -406,7 +406,7 @@ extern bool amd_iommu_iotlb_sup;
9301 #define IRQ_TABLE_ALIGNMENT 128
9302
9303 struct irq_remap_table {
9304- spinlock_t lock;
9305+ raw_spinlock_t lock;
9306 unsigned min_index;
9307 u32 *table;
9308 };
9309@@ -488,7 +488,7 @@ struct amd_iommu {
9310 int index;
9311
9312 /* locks the accesses to the hardware */
9313- spinlock_t lock;
9314+ raw_spinlock_t lock;
9315
9316 /* Pointer to PCI device of this IOMMU */
9317 struct pci_dev *dev;
9318@@ -625,7 +625,7 @@ struct devid_map {
9319 */
9320 struct iommu_dev_data {
9321 struct list_head list; /* For domain->dev_list */
9322- struct list_head dev_data_list; /* For global dev_data_list */
9323+ struct llist_node dev_data_list; /* For global dev_data_list */
9324 struct protection_domain *domain; /* Domain the device is bound to */
9325 u16 devid; /* PCI Device ID */
9326 u16 alias; /* Alias Device ID */
9327diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
9328index 33edfa794ae9..b30900025c62 100644
9329--- a/drivers/iommu/iova.c
9330+++ b/drivers/iommu/iova.c
9331@@ -570,7 +570,7 @@ void queue_iova(struct iova_domain *iovad,
9332 unsigned long pfn, unsigned long pages,
9333 unsigned long data)
9334 {
9335- struct iova_fq *fq = get_cpu_ptr(iovad->fq);
9336+ struct iova_fq *fq = raw_cpu_ptr(iovad->fq);
9337 unsigned long flags;
9338 unsigned idx;
9339
9340@@ -600,8 +600,6 @@ void queue_iova(struct iova_domain *iovad,
9341 if (atomic_cmpxchg(&iovad->fq_timer_on, 0, 1) == 0)
9342 mod_timer(&iovad->fq_timer,
9343 jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
9344-
9345- put_cpu_ptr(iovad->fq);
9346 }
9347 EXPORT_SYMBOL_GPL(queue_iova);
9348
9349diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
9350index 2ea39a83737f..a3e23d0fc4af 100644
9351--- a/drivers/irqchip/irq-gic-v3-its.c
9352+++ b/drivers/irqchip/irq-gic-v3-its.c
9353@@ -148,7 +148,7 @@ static struct {
9354 } vpe_proxy;
9355
9356 static LIST_HEAD(its_nodes);
9357-static DEFINE_SPINLOCK(its_lock);
9358+static DEFINE_RAW_SPINLOCK(its_lock);
9359 static struct rdists *gic_rdists;
9360 static struct irq_domain *its_parent;
9361
9362@@ -165,6 +165,7 @@ static DEFINE_RAW_SPINLOCK(vmovp_lock);
9363 static DEFINE_IDA(its_vpeid_ida);
9364
9365 #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist))
9366+#define gic_data_rdist_cpu(cpu) (per_cpu_ptr(gic_rdists->rdist, cpu))
9367 #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base)
9368 #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K)
9369
9370@@ -1432,7 +1433,7 @@ static void its_free_prop_table(struct page *prop_page)
9371 get_order(LPI_PROPBASE_SZ));
9372 }
9373
9374-static int __init its_alloc_lpi_tables(void)
9375+static int __init its_alloc_lpi_prop_table(void)
9376 {
9377 phys_addr_t paddr;
9378
9379@@ -1758,30 +1759,47 @@ static void its_free_pending_table(struct page *pt)
9380 get_order(max_t(u32, LPI_PENDBASE_SZ, SZ_64K)));
9381 }
9382
9383-static void its_cpu_init_lpis(void)
9384+static int __init allocate_lpi_tables(void)
9385 {
9386- void __iomem *rbase = gic_data_rdist_rd_base();
9387- struct page *pend_page;
9388- u64 val, tmp;
9389+ int err, cpu;
9390
9391- /* If we didn't allocate the pending table yet, do it now */
9392- pend_page = gic_data_rdist()->pend_page;
9393- if (!pend_page) {
9394- phys_addr_t paddr;
9395+ err = its_alloc_lpi_prop_table();
9396+ if (err)
9397+ return err;
9398+
9399+ /*
9400+ * We allocate all the pending tables anyway, as we may have a
9401+ * mix of RDs that have had LPIs enabled, and some that
9402+ * don't. We'll free the unused ones as each CPU comes online.
9403+ */
9404+ for_each_possible_cpu(cpu) {
9405+ struct page *pend_page;
9406
9407 pend_page = its_allocate_pending_table(GFP_NOWAIT);
9408 if (!pend_page) {
9409- pr_err("Failed to allocate PENDBASE for CPU%d\n",
9410- smp_processor_id());
9411- return;
9412+ pr_err("Failed to allocate PENDBASE for CPU%d\n", cpu);
9413+ return -ENOMEM;
9414 }
9415
9416- paddr = page_to_phys(pend_page);
9417- pr_info("CPU%d: using LPI pending table @%pa\n",
9418- smp_processor_id(), &paddr);
9419- gic_data_rdist()->pend_page = pend_page;
9420+ gic_data_rdist_cpu(cpu)->pend_page = pend_page;
9421 }
9422
9423+ return 0;
9424+}
9425+
9426+static void its_cpu_init_lpis(void)
9427+{
9428+ void __iomem *rbase = gic_data_rdist_rd_base();
9429+ struct page *pend_page;
9430+ phys_addr_t paddr;
9431+ u64 val, tmp;
9432+
9433+ if (gic_data_rdist()->lpi_enabled)
9434+ return;
9435+
9436+ pend_page = gic_data_rdist()->pend_page;
9437+ paddr = page_to_phys(pend_page);
9438+
9439 /* Disable LPIs */
9440 val = readl_relaxed(rbase + GICR_CTLR);
9441 val &= ~GICR_CTLR_ENABLE_LPIS;
9442@@ -1843,6 +1861,10 @@ static void its_cpu_init_lpis(void)
9443
9444 /* Make sure the GIC has seen the above */
9445 dsb(sy);
9446+ gic_data_rdist()->lpi_enabled = true;
9447+ pr_info("GICv3: CPU%d: using LPI pending table @%pa\n",
9448+ smp_processor_id(),
9449+ &paddr);
9450 }
9451
9452 static void its_cpu_init_collection(void)
9453@@ -1850,7 +1872,7 @@ static void its_cpu_init_collection(void)
9454 struct its_node *its;
9455 int cpu;
9456
9457- spin_lock(&its_lock);
9458+ raw_spin_lock(&its_lock);
9459 cpu = smp_processor_id();
9460
9461 list_for_each_entry(its, &its_nodes, entry) {
9462@@ -1892,7 +1914,7 @@ static void its_cpu_init_collection(void)
9463 its_send_invall(its, &its->collections[cpu]);
9464 }
9465
9466- spin_unlock(&its_lock);
9467+ raw_spin_unlock(&its_lock);
9468 }
9469
9470 static struct its_device *its_find_device(struct its_node *its, u32 dev_id)
9471@@ -3041,9 +3063,9 @@ static int __init its_probe_one(struct resource *res,
9472 if (err)
9473 goto out_free_tables;
9474
9475- spin_lock(&its_lock);
9476+ raw_spin_lock(&its_lock);
9477 list_add(&its->entry, &its_nodes);
9478- spin_unlock(&its_lock);
9479+ raw_spin_unlock(&its_lock);
9480
9481 return 0;
9482
9483@@ -3278,7 +3300,8 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists,
9484 }
9485
9486 gic_rdists = rdists;
9487- err = its_alloc_lpi_tables();
9488+
9489+ err = allocate_lpi_tables();
9490 if (err)
9491 return err;
9492
9493diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
9494index 3f9ddb9fafa7..09da5b6b44a1 100644
9495--- a/drivers/leds/trigger/Kconfig
9496+++ b/drivers/leds/trigger/Kconfig
9497@@ -69,7 +69,7 @@ config LEDS_TRIGGER_BACKLIGHT
9498
9499 config LEDS_TRIGGER_CPU
9500 bool "LED CPU Trigger"
9501- depends on LEDS_TRIGGERS
9502+ depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
9503 help
9504 This allows LEDs to be controlled by active CPUs. This shows
9505 the active CPUs across an array of LEDs so you can see which
9506diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
9507index 4d200883c505..98b64ed5cb81 100644
9508--- a/drivers/md/bcache/Kconfig
9509+++ b/drivers/md/bcache/Kconfig
9510@@ -1,6 +1,7 @@
9511
9512 config BCACHE
9513 tristate "Block device as cache"
9514+ depends on !PREEMPT_RT_FULL
9515 ---help---
9516 Allows a block device to be used as cache for other devices; uses
9517 a btree for indexing and the layout is optimized for SSDs.
9518diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
9519index eadfcfd106ff..8824aeda85cf 100644
9520--- a/drivers/md/dm-rq.c
9521+++ b/drivers/md/dm-rq.c
9522@@ -671,7 +671,7 @@ static void dm_old_request_fn(struct request_queue *q)
9523 /* Establish tio->ti before queuing work (map_tio_request) */
9524 tio->ti = ti;
9525 kthread_queue_work(&md->kworker, &tio->work);
9526- BUG_ON(!irqs_disabled());
9527+ BUG_ON_NONRT(!irqs_disabled());
9528 }
9529 }
9530
9531diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
9532index dbf51b4c21b3..5cfccaf87687 100644
9533--- a/drivers/md/raid5.c
9534+++ b/drivers/md/raid5.c
9535@@ -410,7 +410,7 @@ void raid5_release_stripe(struct stripe_head *sh)
9536 md_wakeup_thread(conf->mddev->thread);
9537 return;
9538 slow_path:
9539- local_irq_save(flags);
9540+ local_irq_save_nort(flags);
9541 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
9542 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
9543 INIT_LIST_HEAD(&list);
9544@@ -419,7 +419,7 @@ void raid5_release_stripe(struct stripe_head *sh)
9545 spin_unlock(&conf->device_lock);
9546 release_inactive_stripe_list(conf, &list, hash);
9547 }
9548- local_irq_restore(flags);
9549+ local_irq_restore_nort(flags);
9550 }
9551
9552 static inline void remove_hash(struct stripe_head *sh)
9553@@ -2067,8 +2067,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9554 struct raid5_percpu *percpu;
9555 unsigned long cpu;
9556
9557- cpu = get_cpu();
9558+ cpu = get_cpu_light();
9559 percpu = per_cpu_ptr(conf->percpu, cpu);
9560+ spin_lock(&percpu->lock);
9561 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
9562 ops_run_biofill(sh);
9563 overlap_clear++;
9564@@ -2127,7 +2128,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9565 if (test_and_clear_bit(R5_Overlap, &dev->flags))
9566 wake_up(&sh->raid_conf->wait_for_overlap);
9567 }
9568- put_cpu();
9569+ spin_unlock(&percpu->lock);
9570+ put_cpu_light();
9571 }
9572
9573 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
9574@@ -6781,6 +6783,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
9575 __func__, cpu);
9576 return -ENOMEM;
9577 }
9578+ spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
9579 return 0;
9580 }
9581
9582@@ -6791,7 +6794,6 @@ static int raid5_alloc_percpu(struct r5conf *conf)
9583 conf->percpu = alloc_percpu(struct raid5_percpu);
9584 if (!conf->percpu)
9585 return -ENOMEM;
9586-
9587 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
9588 if (!err) {
9589 conf->scribble_disks = max(conf->raid_disks,
9590diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
9591index 2e6123825095..37a6021418a2 100644
9592--- a/drivers/md/raid5.h
9593+++ b/drivers/md/raid5.h
9594@@ -624,6 +624,7 @@ struct r5conf {
9595 int recovery_disabled;
9596 /* per cpu variables */
9597 struct raid5_percpu {
9598+ spinlock_t lock; /* Protection for -RT */
9599 struct page *spare_page; /* Used when checking P/Q in raid6 */
9600 struct flex_array *scribble; /* space for constructing buffer
9601 * lists and performing address
9602diff --git a/drivers/mfd/atmel-smc.c b/drivers/mfd/atmel-smc.c
9603index 7d77948567d7..0adbd2e796fe 100644
9604--- a/drivers/mfd/atmel-smc.c
9605+++ b/drivers/mfd/atmel-smc.c
9606@@ -12,6 +12,7 @@
9607 */
9608
9609 #include <linux/mfd/syscon/atmel-smc.h>
9610+#include <linux/string.h>
9611
9612 /**
9613 * atmel_smc_cs_conf_init - initialize a SMC CS conf
9614diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
9615index 8136dc7e863d..86e83b9629d7 100644
9616--- a/drivers/misc/Kconfig
9617+++ b/drivers/misc/Kconfig
9618@@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
9619 config ATMEL_TCLIB
9620 bool "Atmel AT32/AT91 Timer/Counter Library"
9621 depends on (AVR32 || ARCH_AT91)
9622+ default y if PREEMPT_RT_FULL
9623 help
9624 Select this if you want a library to allocate the Timer/Counter
9625 blocks found on many Atmel processors. This facilitates using
9626@@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
9627 are combined to make a single 32-bit timer.
9628
9629 When GENERIC_CLOCKEVENTS is defined, the third timer channel
9630- may be used as a clock event device supporting oneshot mode
9631- (delays of up to two seconds) based on the 32 KiHz clock.
9632+ may be used as a clock event device supporting oneshot mode.
9633
9634 config ATMEL_TCB_CLKSRC_BLOCK
9635 int
9636@@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
9637 TC can be used for other purposes, such as PWM generation and
9638 interval timing.
9639
9640+config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
9641+ bool "TC Block use 32 KiHz clock"
9642+ depends on ATMEL_TCB_CLKSRC
9643+ default y if !PREEMPT_RT_FULL
9644+ help
9645+ Select this to use 32 KiHz base clock rate as TC block clock
9646+ source for clock events.
9647+
9648+
9649 config DUMMY_IRQ
9650 tristate "Dummy IRQ handler"
9651 default n
9652diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
9653index f1f54a818489..ce102378df02 100644
9654--- a/drivers/mmc/host/mmci.c
9655+++ b/drivers/mmc/host/mmci.c
9656@@ -1200,15 +1200,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
9657 struct sg_mapping_iter *sg_miter = &host->sg_miter;
9658 struct variant_data *variant = host->variant;
9659 void __iomem *base = host->base;
9660- unsigned long flags;
9661 u32 status;
9662
9663 status = readl(base + MMCISTATUS);
9664
9665 dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
9666
9667- local_irq_save(flags);
9668-
9669 do {
9670 unsigned int remain, len;
9671 char *buffer;
9672@@ -1248,8 +1245,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
9673
9674 sg_miter_stop(sg_miter);
9675
9676- local_irq_restore(flags);
9677-
9678 /*
9679 * If we have less than the fifo 'half-full' threshold to transfer,
9680 * trigger a PIO interrupt as soon as any data is available.
9681diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
9682index 402d9090ad29..9bc02563b853 100644
9683--- a/drivers/net/ethernet/3com/3c59x.c
9684+++ b/drivers/net/ethernet/3com/3c59x.c
9685@@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
9686 {
9687 struct vortex_private *vp = netdev_priv(dev);
9688 unsigned long flags;
9689- local_irq_save(flags);
9690+ local_irq_save_nort(flags);
9691 (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
9692- local_irq_restore(flags);
9693+ local_irq_restore_nort(flags);
9694 }
9695 #endif
9696
9697@@ -1908,12 +1908,12 @@ static void vortex_tx_timeout(struct net_device *dev)
9698 * Block interrupts because vortex_interrupt does a bare spin_lock()
9699 */
9700 unsigned long flags;
9701- local_irq_save(flags);
9702+ local_irq_save_nort(flags);
9703 if (vp->full_bus_master_tx)
9704 boomerang_interrupt(dev->irq, dev);
9705 else
9706 vortex_interrupt(dev->irq, dev);
9707- local_irq_restore(flags);
9708+ local_irq_restore_nort(flags);
9709 }
9710 }
9711
9712diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
9713index 00e6f1d155a6..9c69ab2c5b07 100644
9714--- a/drivers/net/ethernet/marvell/mvpp2.c
9715+++ b/drivers/net/ethernet/marvell/mvpp2.c
9716@@ -831,9 +831,8 @@ struct mvpp2_pcpu_stats {
9717 /* Per-CPU port control */
9718 struct mvpp2_port_pcpu {
9719 struct hrtimer tx_done_timer;
9720+ struct net_device *dev;
9721 bool timer_scheduled;
9722- /* Tasklet for egress finalization */
9723- struct tasklet_struct tx_done_tasklet;
9724 };
9725
9726 struct mvpp2_queue_vector {
9727@@ -5955,46 +5954,34 @@ static void mvpp2_link_event(struct net_device *dev)
9728 }
9729 }
9730
9731-static void mvpp2_timer_set(struct mvpp2_port_pcpu *port_pcpu)
9732-{
9733- ktime_t interval;
9734-
9735- if (!port_pcpu->timer_scheduled) {
9736- port_pcpu->timer_scheduled = true;
9737- interval = MVPP2_TXDONE_HRTIMER_PERIOD_NS;
9738- hrtimer_start(&port_pcpu->tx_done_timer, interval,
9739- HRTIMER_MODE_REL_PINNED);
9740- }
9741-}
9742-
9743-static void mvpp2_tx_proc_cb(unsigned long data)
9744+static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9745 {
9746- struct net_device *dev = (struct net_device *)data;
9747- struct mvpp2_port *port = netdev_priv(dev);
9748- struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9749+ struct net_device *dev;
9750+ struct mvpp2_port *port;
9751+ struct mvpp2_port_pcpu *port_pcpu;
9752 unsigned int tx_todo, cause;
9753
9754+ port_pcpu = container_of(timer, struct mvpp2_port_pcpu, tx_done_timer);
9755+ dev = port_pcpu->dev;
9756+
9757 if (!netif_running(dev))
9758- return;
9759+ return HRTIMER_NORESTART;
9760+
9761 port_pcpu->timer_scheduled = false;
9762+ port = netdev_priv(dev);
9763
9764 /* Process all the Tx queues */
9765 cause = (1 << port->ntxqs) - 1;
9766 tx_todo = mvpp2_tx_done(port, cause, smp_processor_id());
9767
9768 /* Set the timer in case not all the packets were processed */
9769- if (tx_todo)
9770- mvpp2_timer_set(port_pcpu);
9771-}
9772-
9773-static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
9774-{
9775- struct mvpp2_port_pcpu *port_pcpu = container_of(timer,
9776- struct mvpp2_port_pcpu,
9777- tx_done_timer);
9778-
9779- tasklet_schedule(&port_pcpu->tx_done_tasklet);
9780+ if (tx_todo && !port_pcpu->timer_scheduled) {
9781+ port_pcpu->timer_scheduled = true;
9782+ hrtimer_forward_now(&port_pcpu->tx_done_timer,
9783+ MVPP2_TXDONE_HRTIMER_PERIOD_NS);
9784
9785+ return HRTIMER_RESTART;
9786+ }
9787 return HRTIMER_NORESTART;
9788 }
9789
9790@@ -6484,7 +6471,12 @@ static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev)
9791 txq_pcpu->count > 0) {
9792 struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
9793
9794- mvpp2_timer_set(port_pcpu);
9795+ if (!port_pcpu->timer_scheduled) {
9796+ port_pcpu->timer_scheduled = true;
9797+ hrtimer_start(&port_pcpu->tx_done_timer,
9798+ MVPP2_TXDONE_HRTIMER_PERIOD_NS,
9799+ HRTIMER_MODE_REL_PINNED_SOFT);
9800+ }
9801 }
9802
9803 return NETDEV_TX_OK;
9804@@ -6875,7 +6867,6 @@ static int mvpp2_stop(struct net_device *dev)
9805
9806 hrtimer_cancel(&port_pcpu->tx_done_timer);
9807 port_pcpu->timer_scheduled = false;
9808- tasklet_kill(&port_pcpu->tx_done_tasklet);
9809 }
9810 }
9811 mvpp2_cleanup_rxqs(port);
9812@@ -7648,13 +7639,10 @@ static int mvpp2_port_probe(struct platform_device *pdev,
9813 port_pcpu = per_cpu_ptr(port->pcpu, cpu);
9814
9815 hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
9816- HRTIMER_MODE_REL_PINNED);
9817+ HRTIMER_MODE_REL_PINNED_SOFT);
9818 port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
9819 port_pcpu->timer_scheduled = false;
9820-
9821- tasklet_init(&port_pcpu->tx_done_tasklet,
9822- mvpp2_tx_proc_cb,
9823- (unsigned long)dev);
9824+ port_pcpu->dev = dev;
9825 }
9826 }
9827
9828diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9829index 56f6e3b71f48..a50350d01a80 100644
9830--- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9831+++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
9832@@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
9833 while (!ctx->done.done && msecs--)
9834 udelay(1000);
9835 } else {
9836- wait_event_interruptible(ctx->done.wait,
9837+ swait_event_interruptible(ctx->done.wait,
9838 ctx->done.done);
9839 }
9840 break;
9841diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
9842index aafa7aa18fbd..388f6d71ba71 100644
9843--- a/drivers/net/wireless/mac80211_hwsim.c
9844+++ b/drivers/net/wireless/mac80211_hwsim.c
9845@@ -537,7 +537,7 @@ struct mac80211_hwsim_data {
9846 unsigned int rx_filter;
9847 bool started, idle, scanning;
9848 struct mutex mutex;
9849- struct tasklet_hrtimer beacon_timer;
9850+ struct hrtimer beacon_timer;
9851 enum ps_mode {
9852 PS_DISABLED, PS_ENABLED, PS_AUTO_POLL, PS_MANUAL_POLL
9853 } ps;
9854@@ -1423,7 +1423,7 @@ static void mac80211_hwsim_stop(struct ieee80211_hw *hw)
9855 {
9856 struct mac80211_hwsim_data *data = hw->priv;
9857 data->started = false;
9858- tasklet_hrtimer_cancel(&data->beacon_timer);
9859+ hrtimer_cancel(&data->beacon_timer);
9860 wiphy_debug(hw->wiphy, "%s\n", __func__);
9861 }
9862
9863@@ -1546,14 +1546,12 @@ static enum hrtimer_restart
9864 mac80211_hwsim_beacon(struct hrtimer *timer)
9865 {
9866 struct mac80211_hwsim_data *data =
9867- container_of(timer, struct mac80211_hwsim_data,
9868- beacon_timer.timer);
9869+ container_of(timer, struct mac80211_hwsim_data, beacon_timer);
9870 struct ieee80211_hw *hw = data->hw;
9871 u64 bcn_int = data->beacon_int;
9872- ktime_t next_bcn;
9873
9874 if (!data->started)
9875- goto out;
9876+ return HRTIMER_NORESTART;
9877
9878 ieee80211_iterate_active_interfaces_atomic(
9879 hw, IEEE80211_IFACE_ITER_NORMAL,
9880@@ -1565,11 +1563,9 @@ mac80211_hwsim_beacon(struct hrtimer *timer)
9881 data->bcn_delta = 0;
9882 }
9883
9884- next_bcn = ktime_add(hrtimer_get_expires(timer),
9885- ns_to_ktime(bcn_int * 1000));
9886- tasklet_hrtimer_start(&data->beacon_timer, next_bcn, HRTIMER_MODE_ABS);
9887-out:
9888- return HRTIMER_NORESTART;
9889+ hrtimer_forward(&data->beacon_timer, hrtimer_get_expires(timer),
9890+ ns_to_ktime(bcn_int * NSEC_PER_USEC));
9891+ return HRTIMER_RESTART;
9892 }
9893
9894 static const char * const hwsim_chanwidths[] = {
9895@@ -1643,15 +1639,15 @@ static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed)
9896 mutex_unlock(&data->mutex);
9897
9898 if (!data->started || !data->beacon_int)
9899- tasklet_hrtimer_cancel(&data->beacon_timer);
9900- else if (!hrtimer_is_queued(&data->beacon_timer.timer)) {
9901+ hrtimer_cancel(&data->beacon_timer);
9902+ else if (!hrtimer_is_queued(&data->beacon_timer)) {
9903 u64 tsf = mac80211_hwsim_get_tsf(hw, NULL);
9904 u32 bcn_int = data->beacon_int;
9905 u64 until_tbtt = bcn_int - do_div(tsf, bcn_int);
9906
9907- tasklet_hrtimer_start(&data->beacon_timer,
9908- ns_to_ktime(until_tbtt * 1000),
9909- HRTIMER_MODE_REL);
9910+ hrtimer_start(&data->beacon_timer,
9911+ ns_to_ktime(until_tbtt * 1000),
9912+ HRTIMER_MODE_REL_SOFT);
9913 }
9914
9915 return 0;
9916@@ -1714,7 +1710,7 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
9917 info->enable_beacon, info->beacon_int);
9918 vp->bcn_en = info->enable_beacon;
9919 if (data->started &&
9920- !hrtimer_is_queued(&data->beacon_timer.timer) &&
9921+ !hrtimer_is_queued(&data->beacon_timer) &&
9922 info->enable_beacon) {
9923 u64 tsf, until_tbtt;
9924 u32 bcn_int;
9925@@ -1722,9 +1718,9 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
9926 tsf = mac80211_hwsim_get_tsf(hw, vif);
9927 bcn_int = data->beacon_int;
9928 until_tbtt = bcn_int - do_div(tsf, bcn_int);
9929- tasklet_hrtimer_start(&data->beacon_timer,
9930- ns_to_ktime(until_tbtt * 1000),
9931- HRTIMER_MODE_REL);
9932+ hrtimer_start(&data->beacon_timer,
9933+ ns_to_ktime(until_tbtt * 1000),
9934+ HRTIMER_MODE_REL_SOFT);
9935 } else if (!info->enable_beacon) {
9936 unsigned int count = 0;
9937 ieee80211_iterate_active_interfaces_atomic(
9938@@ -1733,7 +1729,7 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
9939 wiphy_debug(hw->wiphy, " beaconing vifs remaining: %u",
9940 count);
9941 if (count == 0) {
9942- tasklet_hrtimer_cancel(&data->beacon_timer);
9943+ hrtimer_cancel(&data->beacon_timer);
9944 data->beacon_int = 0;
9945 }
9946 }
9947@@ -2722,9 +2718,9 @@ static int mac80211_hwsim_new_radio(struct genl_info *info,
9948 data->debugfs,
9949 data, &hwsim_simulate_radar);
9950
9951- tasklet_hrtimer_init(&data->beacon_timer,
9952- mac80211_hwsim_beacon,
9953- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
9954+ hrtimer_init(&data->beacon_timer, CLOCK_MONOTONIC,
9955+ HRTIMER_MODE_ABS_SOFT);
9956+ data->beacon_timer.function = mac80211_hwsim_beacon;
9957
9958 spin_lock_bh(&hwsim_radio_lock);
9959 list_add_tail(&data->list, &hwsim_radios);
9960diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
9961index 620f5b995a12..7fd1548a2905 100644
9962--- a/drivers/pci/switch/switchtec.c
9963+++ b/drivers/pci/switch/switchtec.c
9964@@ -308,10 +308,11 @@ struct switchtec_user {
9965
9966 enum mrpc_state state;
9967
9968- struct completion comp;
9969+ wait_queue_head_t cmd_comp;
9970 struct kref kref;
9971 struct list_head list;
9972
9973+ bool cmd_done;
9974 u32 cmd;
9975 u32 status;
9976 u32 return_code;
9977@@ -333,7 +334,7 @@ static struct switchtec_user *stuser_create(struct switchtec_dev *stdev)
9978 stuser->stdev = stdev;
9979 kref_init(&stuser->kref);
9980 INIT_LIST_HEAD(&stuser->list);
9981- init_completion(&stuser->comp);
9982+ init_waitqueue_head(&stuser->cmd_comp);
9983 stuser->event_cnt = atomic_read(&stdev->event_cnt);
9984
9985 dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
9986@@ -416,7 +417,7 @@ static int mrpc_queue_cmd(struct switchtec_user *stuser)
9987 kref_get(&stuser->kref);
9988 stuser->read_len = sizeof(stuser->data);
9989 stuser_set_state(stuser, MRPC_QUEUED);
9990- init_completion(&stuser->comp);
9991+ stuser->cmd_done = false;
9992 list_add_tail(&stuser->list, &stdev->mrpc_queue);
9993
9994 mrpc_cmd_submit(stdev);
9995@@ -453,7 +454,8 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev)
9996 stuser->read_len);
9997
9998 out:
9999- complete_all(&stuser->comp);
10000+ stuser->cmd_done = true;
10001+ wake_up_interruptible(&stuser->cmd_comp);
10002 list_del_init(&stuser->list);
10003 stuser_put(stuser);
10004 stdev->mrpc_busy = 0;
10005@@ -723,10 +725,11 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
10006 mutex_unlock(&stdev->mrpc_mutex);
10007
10008 if (filp->f_flags & O_NONBLOCK) {
10009- if (!try_wait_for_completion(&stuser->comp))
10010+ if (!READ_ONCE(stuser->cmd_done))
10011 return -EAGAIN;
10012 } else {
10013- rc = wait_for_completion_interruptible(&stuser->comp);
10014+ rc = wait_event_interruptible(stuser->cmd_comp,
10015+ stuser->cmd_done);
10016 if (rc < 0)
10017 return rc;
10018 }
10019@@ -774,7 +777,7 @@ static unsigned int switchtec_dev_poll(struct file *filp, poll_table *wait)
10020 struct switchtec_dev *stdev = stuser->stdev;
10021 int ret = 0;
10022
10023- poll_wait(filp, &stuser->comp.wait, wait);
10024+ poll_wait(filp, &stuser->cmd_comp, wait);
10025 poll_wait(filp, &stdev->event_wq, wait);
10026
10027 if (lock_mutex_and_test_alive(stdev))
10028@@ -782,7 +785,7 @@ static unsigned int switchtec_dev_poll(struct file *filp, poll_table *wait)
10029
10030 mutex_unlock(&stdev->mrpc_mutex);
10031
10032- if (try_wait_for_completion(&stuser->comp))
10033+ if (READ_ONCE(stuser->cmd_done))
10034 ret |= POLLIN | POLLRDNORM;
10035
10036 if (stuser->event_cnt != atomic_read(&stdev->event_cnt))
10037@@ -1259,7 +1262,8 @@ static void stdev_kill(struct switchtec_dev *stdev)
10038
10039 /* Wake up and kill any users waiting on an MRPC request */
10040 list_for_each_entry_safe(stuser, tmpuser, &stdev->mrpc_queue, list) {
10041- complete_all(&stuser->comp);
10042+ stuser->cmd_done = true;
10043+ wake_up_interruptible(&stuser->cmd_comp);
10044 list_del_init(&stuser->list);
10045 stuser_put(stuser);
10046 }
10047diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
10048index 85f9a3eba387..08ea05ddcd82 100644
10049--- a/drivers/scsi/fcoe/fcoe.c
10050+++ b/drivers/scsi/fcoe/fcoe.c
10051@@ -1464,11 +1464,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev,
10052 static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
10053 {
10054 struct fcoe_percpu_s *fps;
10055- int rc;
10056+ int rc, cpu = get_cpu_light();
10057
10058- fps = &get_cpu_var(fcoe_percpu);
10059+ fps = &per_cpu(fcoe_percpu, cpu);
10060 rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
10061- put_cpu_var(fcoe_percpu);
10062+ put_cpu_light();
10063
10064 return rc;
10065 }
10066@@ -1655,11 +1655,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
10067 return 0;
10068 }
10069
10070- stats = per_cpu_ptr(lport->stats, get_cpu());
10071+ stats = per_cpu_ptr(lport->stats, get_cpu_light());
10072 stats->InvalidCRCCount++;
10073 if (stats->InvalidCRCCount < 5)
10074 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
10075- put_cpu();
10076+ put_cpu_light();
10077 return -EINVAL;
10078 }
10079
10080@@ -1702,7 +1702,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10081 */
10082 hp = (struct fcoe_hdr *) skb_network_header(skb);
10083
10084- stats = per_cpu_ptr(lport->stats, get_cpu());
10085+ stats = per_cpu_ptr(lport->stats, get_cpu_light());
10086 if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
10087 if (stats->ErrorFrames < 5)
10088 printk(KERN_WARNING "fcoe: FCoE version "
10089@@ -1734,13 +1734,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
10090 goto drop;
10091
10092 if (!fcoe_filter_frames(lport, fp)) {
10093- put_cpu();
10094+ put_cpu_light();
10095 fc_exch_recv(lport, fp);
10096 return;
10097 }
10098 drop:
10099 stats->ErrorFrames++;
10100- put_cpu();
10101+ put_cpu_light();
10102 kfree_skb(skb);
10103 }
10104
10105diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
10106index 03019e07abb9..9ec11316bfe6 100644
10107--- a/drivers/scsi/fcoe/fcoe_ctlr.c
10108+++ b/drivers/scsi/fcoe/fcoe_ctlr.c
10109@@ -835,7 +835,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10110
10111 INIT_LIST_HEAD(&del_list);
10112
10113- stats = per_cpu_ptr(fip->lp->stats, get_cpu());
10114+ stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
10115
10116 list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
10117 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
10118@@ -871,7 +871,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
10119 sel_time = fcf->time;
10120 }
10121 }
10122- put_cpu();
10123+ put_cpu_light();
10124
10125 list_for_each_entry_safe(fcf, next, &del_list, list) {
10126 /* Removes fcf from current list */
10127diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
10128index 42bcf7f3a0f9..2ce045d6860c 100644
10129--- a/drivers/scsi/libfc/fc_exch.c
10130+++ b/drivers/scsi/libfc/fc_exch.c
10131@@ -833,10 +833,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
10132 }
10133 memset(ep, 0, sizeof(*ep));
10134
10135- cpu = get_cpu();
10136+ cpu = get_cpu_light();
10137 pool = per_cpu_ptr(mp->pool, cpu);
10138 spin_lock_bh(&pool->lock);
10139- put_cpu();
10140+ put_cpu_light();
10141
10142 /* peek cache of free slot */
10143 if (pool->left != FC_XID_UNKNOWN) {
10144diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
10145index 70be4425ae0b..a23ef685deac 100644
10146--- a/drivers/scsi/libsas/sas_ata.c
10147+++ b/drivers/scsi/libsas/sas_ata.c
10148@@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10149 /* TODO: audit callers to ensure they are ready for qc_issue to
10150 * unconditionally re-enable interrupts
10151 */
10152- local_irq_save(flags);
10153+ local_irq_save_nort(flags);
10154 spin_unlock(ap->lock);
10155
10156 /* If the device fell off, no sense in issuing commands */
10157@@ -252,7 +252,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
10158
10159 out:
10160 spin_lock(ap->lock);
10161- local_irq_restore(flags);
10162+ local_irq_restore_nort(flags);
10163 return ret;
10164 }
10165
10166diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
10167index 3f5a0f0f8b62..c75783143dc1 100644
10168--- a/drivers/scsi/qla2xxx/qla_inline.h
10169+++ b/drivers/scsi/qla2xxx/qla_inline.h
10170@@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
10171 {
10172 unsigned long flags;
10173 struct qla_hw_data *ha = rsp->hw;
10174- local_irq_save(flags);
10175+ local_irq_save_nort(flags);
10176 if (IS_P3P_TYPE(ha))
10177 qla82xx_poll(0, rsp);
10178 else
10179 ha->isp_ops->intr_handler(0, rsp);
10180- local_irq_restore(flags);
10181+ local_irq_restore_nort(flags);
10182 }
10183
10184 static inline uint8_t *
10185diff --git a/drivers/staging/greybus/audio_manager.c b/drivers/staging/greybus/audio_manager.c
10186index aa6508b44fab..045696ce85c7 100644
10187--- a/drivers/staging/greybus/audio_manager.c
10188+++ b/drivers/staging/greybus/audio_manager.c
10189@@ -10,7 +10,7 @@
10190 #include <linux/sysfs.h>
10191 #include <linux/module.h>
10192 #include <linux/init.h>
10193-#include <linux/rwlock.h>
10194+#include <linux/spinlock.h>
10195 #include <linux/idr.h>
10196
10197 #include "audio_manager.h"
10198diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c
10199index 9c7bc1ca341a..3d35dad1de2c 100644
10200--- a/drivers/target/target_core_tmr.c
10201+++ b/drivers/target/target_core_tmr.c
10202@@ -114,8 +114,6 @@ static bool __target_check_io_state(struct se_cmd *se_cmd,
10203 {
10204 struct se_session *sess = se_cmd->se_sess;
10205
10206- assert_spin_locked(&sess->sess_cmd_lock);
10207- WARN_ON_ONCE(!irqs_disabled());
10208 /*
10209 * If command already reached CMD_T_COMPLETE state within
10210 * target_complete_cmd() or CMD_T_FABRIC_STOP due to shutdown,
10211diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
10212index 0d0be7d8b9d6..f652e58e2988 100644
10213--- a/drivers/target/target_core_transport.c
10214+++ b/drivers/target/target_core_transport.c
10215@@ -2967,9 +2967,6 @@ __transport_wait_for_tasks(struct se_cmd *cmd, bool fabric_stop,
10216 __acquires(&cmd->t_state_lock)
10217 {
10218
10219- assert_spin_locked(&cmd->t_state_lock);
10220- WARN_ON_ONCE(!irqs_disabled());
10221-
10222 if (fabric_stop)
10223 cmd->transport_state |= CMD_T_FABRIC_STOP;
10224
10225@@ -3239,9 +3236,6 @@ static int __transport_check_aborted_status(struct se_cmd *cmd, int send_status)
10226 {
10227 int ret;
10228
10229- assert_spin_locked(&cmd->t_state_lock);
10230- WARN_ON_ONCE(!irqs_disabled());
10231-
10232 if (!(cmd->transport_state & CMD_T_ABORTED))
10233 return 0;
10234 /*
10235diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
10236index d93eee2f101b..0287333b1f3c 100644
10237--- a/drivers/thermal/x86_pkg_temp_thermal.c
10238+++ b/drivers/thermal/x86_pkg_temp_thermal.c
10239@@ -29,6 +29,7 @@
10240 #include <linux/pm.h>
10241 #include <linux/thermal.h>
10242 #include <linux/debugfs.h>
10243+#include <linux/swork.h>
10244 #include <asm/cpu_device_id.h>
10245 #include <asm/mce.h>
10246
10247@@ -329,7 +330,7 @@ static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
10248 schedule_delayed_work_on(cpu, work, ms);
10249 }
10250
10251-static int pkg_thermal_notify(u64 msr_val)
10252+static void pkg_thermal_notify_work(struct swork_event *event)
10253 {
10254 int cpu = smp_processor_id();
10255 struct pkg_device *pkgdev;
10256@@ -348,9 +349,47 @@ static int pkg_thermal_notify(u64 msr_val)
10257 }
10258
10259 spin_unlock_irqrestore(&pkg_temp_lock, flags);
10260+}
10261+
10262+#ifdef CONFIG_PREEMPT_RT_FULL
10263+static struct swork_event notify_work;
10264+
10265+static int pkg_thermal_notify_work_init(void)
10266+{
10267+ int err;
10268+
10269+ err = swork_get();
10270+ if (err)
10271+ return err;
10272+
10273+ INIT_SWORK(&notify_work, pkg_thermal_notify_work);
10274 return 0;
10275 }
10276
10277+static void pkg_thermal_notify_work_cleanup(void)
10278+{
10279+ swork_put();
10280+}
10281+
10282+static int pkg_thermal_notify(u64 msr_val)
10283+{
10284+ swork_queue(&notify_work);
10285+ return 0;
10286+}
10287+
10288+#else /* !CONFIG_PREEMPT_RT_FULL */
10289+
10290+static int pkg_thermal_notify_work_init(void) { return 0; }
10291+
10292+static void pkg_thermal_notify_work_cleanup(void) { }
10293+
10294+static int pkg_thermal_notify(u64 msr_val)
10295+{
10296+ pkg_thermal_notify_work(NULL);
10297+ return 0;
10298+}
10299+#endif /* CONFIG_PREEMPT_RT_FULL */
10300+
10301 static int pkg_temp_thermal_device_add(unsigned int cpu)
10302 {
10303 int pkgid = topology_logical_package_id(cpu);
10304@@ -515,10 +554,15 @@ static int __init pkg_temp_thermal_init(void)
10305 if (!x86_match_cpu(pkg_temp_thermal_ids))
10306 return -ENODEV;
10307
10308+ if (!pkg_thermal_notify_work_init())
10309+ return -ENODEV;
10310+
10311 max_packages = topology_max_packages();
10312 packages = kzalloc(max_packages * sizeof(struct pkg_device *), GFP_KERNEL);
10313- if (!packages)
10314- return -ENOMEM;
10315+ if (!packages) {
10316+ ret = -ENOMEM;
10317+ goto err;
10318+ }
10319
10320 ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
10321 pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
10322@@ -536,6 +580,7 @@ static int __init pkg_temp_thermal_init(void)
10323 return 0;
10324
10325 err:
10326+ pkg_thermal_notify_work_cleanup();
10327 kfree(packages);
10328 return ret;
10329 }
10330@@ -549,6 +594,7 @@ static void __exit pkg_temp_thermal_exit(void)
10331 cpuhp_remove_state(pkg_thermal_hp_state);
10332 debugfs_remove_recursive(debugfs);
10333 kfree(packages);
10334+ pkg_thermal_notify_work_cleanup();
10335 }
10336 module_exit(pkg_temp_thermal_exit)
10337
10338diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
10339index d29b512a7d9f..bc8cbb995b29 100644
10340--- a/drivers/tty/serial/8250/8250_core.c
10341+++ b/drivers/tty/serial/8250/8250_core.c
10342@@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
10343
10344 static unsigned int skip_txen_test; /* force skip of txen test at init time */
10345
10346-#define PASS_LIMIT 512
10347+/*
10348+ * On -rt we can have a more delays, and legitimately
10349+ * so - so don't drop work spuriously and spam the
10350+ * syslog:
10351+ */
10352+#ifdef CONFIG_PREEMPT_RT_FULL
10353+# define PASS_LIMIT 1000000
10354+#else
10355+# define PASS_LIMIT 512
10356+#endif
10357
10358 #include <asm/serial.h>
10359 /*
10360diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
10361index ecf3d631bc09..6e029f34f37f 100644
10362--- a/drivers/tty/serial/8250/8250_port.c
10363+++ b/drivers/tty/serial/8250/8250_port.c
10364@@ -35,6 +35,7 @@
10365 #include <linux/nmi.h>
10366 #include <linux/mutex.h>
10367 #include <linux/slab.h>
10368+#include <linux/kdb.h>
10369 #include <linux/uaccess.h>
10370 #include <linux/pm_runtime.h>
10371 #include <linux/ktime.h>
10372@@ -3224,9 +3225,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
10373
10374 serial8250_rpm_get(up);
10375
10376- if (port->sysrq)
10377+ if (port->sysrq || oops_in_progress)
10378 locked = 0;
10379- else if (oops_in_progress)
10380+ else if (in_kdb_printk())
10381 locked = spin_trylock_irqsave(&port->lock, flags);
10382 else
10383 spin_lock_irqsave(&port->lock, flags);
10384diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
10385index c9f701aca677..81d6b15fb80a 100644
10386--- a/drivers/tty/serial/amba-pl011.c
10387+++ b/drivers/tty/serial/amba-pl011.c
10388@@ -2236,13 +2236,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
10389
10390 clk_enable(uap->clk);
10391
10392- local_irq_save(flags);
10393+ /*
10394+ * local_irq_save(flags);
10395+ *
10396+ * This local_irq_save() is nonsense. If we come in via sysrq
10397+ * handling then interrupts are already disabled. Aside of
10398+ * that the port.sysrq check is racy on SMP regardless.
10399+ */
10400 if (uap->port.sysrq)
10401 locked = 0;
10402 else if (oops_in_progress)
10403- locked = spin_trylock(&uap->port.lock);
10404+ locked = spin_trylock_irqsave(&uap->port.lock, flags);
10405 else
10406- spin_lock(&uap->port.lock);
10407+ spin_lock_irqsave(&uap->port.lock, flags);
10408
10409 /*
10410 * First save the CR then disable the interrupts
10411@@ -2268,8 +2274,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
10412 pl011_write(old_cr, uap, REG_CR);
10413
10414 if (locked)
10415- spin_unlock(&uap->port.lock);
10416- local_irq_restore(flags);
10417+ spin_unlock_irqrestore(&uap->port.lock, flags);
10418
10419 clk_disable(uap->clk);
10420 }
10421diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
10422index 26a22b100df1..69117e355bcd 100644
10423--- a/drivers/tty/serial/omap-serial.c
10424+++ b/drivers/tty/serial/omap-serial.c
10425@@ -1311,13 +1311,10 @@ serial_omap_console_write(struct console *co, const char *s,
10426
10427 pm_runtime_get_sync(up->dev);
10428
10429- local_irq_save(flags);
10430- if (up->port.sysrq)
10431- locked = 0;
10432- else if (oops_in_progress)
10433- locked = spin_trylock(&up->port.lock);
10434+ if (up->port.sysrq || oops_in_progress)
10435+ locked = spin_trylock_irqsave(&up->port.lock, flags);
10436 else
10437- spin_lock(&up->port.lock);
10438+ spin_lock_irqsave(&up->port.lock, flags);
10439
10440 /*
10441 * First save the IER then disable the interrupts
10442@@ -1346,8 +1343,7 @@ serial_omap_console_write(struct console *co, const char *s,
10443 pm_runtime_mark_last_busy(up->dev);
10444 pm_runtime_put_autosuspend(up->dev);
10445 if (locked)
10446- spin_unlock(&up->port.lock);
10447- local_irq_restore(flags);
10448+ spin_unlock_irqrestore(&up->port.lock, flags);
10449 }
10450
10451 static int __init
10452diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
10453index d0b2e0ed9bab..91f4f2bd55b0 100644
10454--- a/drivers/usb/core/hcd.c
10455+++ b/drivers/usb/core/hcd.c
10456@@ -1775,9 +1775,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
10457 * and no one may trigger the above deadlock situation when
10458 * running complete() in tasklet.
10459 */
10460- local_irq_save(flags);
10461+ local_irq_save_nort(flags);
10462 urb->complete(urb);
10463- local_irq_restore(flags);
10464+ local_irq_restore_nort(flags);
10465
10466 usb_anchor_resume_wakeups(anchor);
10467 atomic_dec(&urb->use_count);
10468diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
10469index 17467545391b..42ec6f2db6a9 100644
10470--- a/drivers/usb/gadget/function/f_fs.c
10471+++ b/drivers/usb/gadget/function/f_fs.c
10472@@ -1623,7 +1623,7 @@ static void ffs_data_put(struct ffs_data *ffs)
10473 pr_info("%s(): freeing\n", __func__);
10474 ffs_data_clear(ffs);
10475 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
10476- waitqueue_active(&ffs->ep0req_completion.wait) ||
10477+ swait_active(&ffs->ep0req_completion.wait) ||
10478 waitqueue_active(&ffs->wait));
10479 destroy_workqueue(ffs->io_completion_wq);
10480 kfree(ffs->dev_name);
10481diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c
10482index 45b334ceaf2e..5f24e6d3b6eb 100644
10483--- a/drivers/usb/gadget/function/f_ncm.c
10484+++ b/drivers/usb/gadget/function/f_ncm.c
10485@@ -77,9 +77,7 @@ struct f_ncm {
10486 struct sk_buff *skb_tx_ndp;
10487 u16 ndp_dgram_count;
10488 bool timer_force_tx;
10489- struct tasklet_struct tx_tasklet;
10490 struct hrtimer task_timer;
10491-
10492 bool timer_stopping;
10493 };
10494
10495@@ -1108,7 +1106,7 @@ static struct sk_buff *ncm_wrap_ntb(struct gether *port,
10496
10497 /* Delay the timer. */
10498 hrtimer_start(&ncm->task_timer, TX_TIMEOUT_NSECS,
10499- HRTIMER_MODE_REL);
10500+ HRTIMER_MODE_REL_SOFT);
10501
10502 /* Add the datagram position entries */
10503 ntb_ndp = skb_put_zero(ncm->skb_tx_ndp, dgram_idx_len);
10504@@ -1152,17 +1150,15 @@ static struct sk_buff *ncm_wrap_ntb(struct gether *port,
10505 }
10506
10507 /*
10508- * This transmits the NTB if there are frames waiting.
10509+ * The transmit should only be run if no skb data has been sent
10510+ * for a certain duration.
10511 */
10512-static void ncm_tx_tasklet(unsigned long data)
10513+static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10514 {
10515- struct f_ncm *ncm = (void *)data;
10516-
10517- if (ncm->timer_stopping)
10518- return;
10519+ struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10520
10521 /* Only send if data is available. */
10522- if (ncm->skb_tx_data) {
10523+ if (!ncm->timer_stopping && ncm->skb_tx_data) {
10524 ncm->timer_force_tx = true;
10525
10526 /* XXX This allowance of a NULL skb argument to ndo_start_xmit
10527@@ -1175,16 +1171,6 @@ static void ncm_tx_tasklet(unsigned long data)
10528
10529 ncm->timer_force_tx = false;
10530 }
10531-}
10532-
10533-/*
10534- * The transmit should only be run if no skb data has been sent
10535- * for a certain duration.
10536- */
10537-static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
10538-{
10539- struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
10540- tasklet_schedule(&ncm->tx_tasklet);
10541 return HRTIMER_NORESTART;
10542 }
10543
10544@@ -1517,8 +1503,7 @@ static int ncm_bind(struct usb_configuration *c, struct usb_function *f)
10545 ncm->port.open = ncm_open;
10546 ncm->port.close = ncm_close;
10547
10548- tasklet_init(&ncm->tx_tasklet, ncm_tx_tasklet, (unsigned long) ncm);
10549- hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
10550+ hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
10551 ncm->task_timer.function = ncm_tx_timeout;
10552
10553 DBG(cdev, "CDC Network: %s speed IN/%s OUT/%s NOTIFY/%s\n",
10554@@ -1627,7 +1612,6 @@ static void ncm_unbind(struct usb_configuration *c, struct usb_function *f)
10555 DBG(c->cdev, "ncm unbind\n");
10556
10557 hrtimer_cancel(&ncm->task_timer);
10558- tasklet_kill(&ncm->tx_tasklet);
10559
10560 ncm_string_defs[0].id = 0;
10561 usb_free_all_descriptors(f);
10562diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
10563index 5c28bee327e1..ed49dba4704d 100644
10564--- a/drivers/usb/gadget/legacy/inode.c
10565+++ b/drivers/usb/gadget/legacy/inode.c
10566@@ -347,7 +347,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
10567 spin_unlock_irq (&epdata->dev->lock);
10568
10569 if (likely (value == 0)) {
10570- value = wait_event_interruptible (done.wait, done.done);
10571+ value = swait_event_interruptible (done.wait, done.done);
10572 if (value != 0) {
10573 spin_lock_irq (&epdata->dev->lock);
10574 if (likely (epdata->ep != NULL)) {
10575@@ -356,7 +356,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
10576 usb_ep_dequeue (epdata->ep, epdata->req);
10577 spin_unlock_irq (&epdata->dev->lock);
10578
10579- wait_event (done.wait, done.done);
10580+ swait_event (done.wait, done.done);
10581 if (epdata->status == -ECONNRESET)
10582 epdata->status = -EINTR;
10583 } else {
10584diff --git a/fs/aio.c b/fs/aio.c
10585index 3a749c3a92e3..24c6ceadaae6 100644
10586--- a/fs/aio.c
10587+++ b/fs/aio.c
10588@@ -40,6 +40,7 @@
10589 #include <linux/ramfs.h>
10590 #include <linux/percpu-refcount.h>
10591 #include <linux/mount.h>
10592+#include <linux/swork.h>
10593
10594 #include <asm/kmap_types.h>
10595 #include <linux/uaccess.h>
10596@@ -117,6 +118,7 @@ struct kioctx {
10597
10598 struct rcu_head free_rcu;
10599 struct work_struct free_work; /* see free_ioctx() */
10600+ struct swork_event free_swork; /* see free_ioctx() */
10601
10602 /*
10603 * signals when all in-flight requests are done
10604@@ -259,6 +261,7 @@ static int __init aio_setup(void)
10605 .mount = aio_mount,
10606 .kill_sb = kill_anon_super,
10607 };
10608+ BUG_ON(swork_get());
10609 aio_mnt = kern_mount(&aio_fs);
10610 if (IS_ERR(aio_mnt))
10611 panic("Failed to create aio fs mount.");
10612@@ -633,9 +636,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
10613 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
10614 * now it's safe to cancel any that need to be.
10615 */
10616-static void free_ioctx_users(struct percpu_ref *ref)
10617+static void free_ioctx_users_work(struct swork_event *sev)
10618 {
10619- struct kioctx *ctx = container_of(ref, struct kioctx, users);
10620+ struct kioctx *ctx = container_of(sev, struct kioctx, free_swork);
10621 struct aio_kiocb *req;
10622
10623 spin_lock_irq(&ctx->ctx_lock);
10624@@ -653,6 +656,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
10625 percpu_ref_put(&ctx->reqs);
10626 }
10627
10628+static void free_ioctx_users(struct percpu_ref *ref)
10629+{
10630+ struct kioctx *ctx = container_of(ref, struct kioctx, users);
10631+
10632+ INIT_SWORK(&ctx->free_swork, free_ioctx_users_work);
10633+ swork_queue(&ctx->free_swork);
10634+}
10635+
10636 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
10637 {
10638 unsigned i, new_nr;
10639diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
10640index ce696d6c4641..b120fbd41483 100644
10641--- a/fs/autofs4/autofs_i.h
10642+++ b/fs/autofs4/autofs_i.h
10643@@ -20,6 +20,7 @@
10644 #include <linux/sched.h>
10645 #include <linux/mount.h>
10646 #include <linux/namei.h>
10647+#include <linux/delay.h>
10648 #include <linux/uaccess.h>
10649 #include <linux/mutex.h>
10650 #include <linux/spinlock.h>
10651diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
10652index 57725d4a8c59..62220508bace 100644
10653--- a/fs/autofs4/expire.c
10654+++ b/fs/autofs4/expire.c
10655@@ -148,7 +148,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
10656 parent = p->d_parent;
10657 if (!spin_trylock(&parent->d_lock)) {
10658 spin_unlock(&p->d_lock);
10659- cpu_relax();
10660+ cpu_chill();
10661 goto relock;
10662 }
10663 spin_unlock(&p->d_lock);
10664diff --git a/fs/buffer.c b/fs/buffer.c
10665index b96f3b98a6ef..4ca5f222537a 100644
10666--- a/fs/buffer.c
10667+++ b/fs/buffer.c
10668@@ -302,8 +302,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
10669 * decide that the page is now completely done.
10670 */
10671 first = page_buffers(page);
10672- local_irq_save(flags);
10673- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10674+ flags = bh_uptodate_lock_irqsave(first);
10675 clear_buffer_async_read(bh);
10676 unlock_buffer(bh);
10677 tmp = bh;
10678@@ -316,8 +315,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
10679 }
10680 tmp = tmp->b_this_page;
10681 } while (tmp != bh);
10682- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10683- local_irq_restore(flags);
10684+ bh_uptodate_unlock_irqrestore(first, flags);
10685
10686 /*
10687 * If none of the buffers had errors and they are all
10688@@ -329,9 +327,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
10689 return;
10690
10691 still_busy:
10692- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10693- local_irq_restore(flags);
10694- return;
10695+ bh_uptodate_unlock_irqrestore(first, flags);
10696 }
10697
10698 /*
10699@@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
10700 }
10701
10702 first = page_buffers(page);
10703- local_irq_save(flags);
10704- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
10705+ flags = bh_uptodate_lock_irqsave(first);
10706
10707 clear_buffer_async_write(bh);
10708 unlock_buffer(bh);
10709@@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
10710 }
10711 tmp = tmp->b_this_page;
10712 }
10713- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10714- local_irq_restore(flags);
10715+ bh_uptodate_unlock_irqrestore(first, flags);
10716 end_page_writeback(page);
10717 return;
10718
10719 still_busy:
10720- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
10721- local_irq_restore(flags);
10722- return;
10723+ bh_uptodate_unlock_irqrestore(first, flags);
10724 }
10725 EXPORT_SYMBOL(end_buffer_async_write);
10726
10727@@ -3417,6 +3409,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
10728 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
10729 if (ret) {
10730 INIT_LIST_HEAD(&ret->b_assoc_buffers);
10731+ buffer_head_init_locks(ret);
10732 preempt_disable();
10733 __this_cpu_inc(bh_accounting.nr);
10734 recalc_bh_state();
10735diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
10736index ef24b4527459..3ce6331a1101 100644
10737--- a/fs/cifs/readdir.c
10738+++ b/fs/cifs/readdir.c
10739@@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
10740 struct inode *inode;
10741 struct super_block *sb = parent->d_sb;
10742 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
10743- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10744+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10745
10746 cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
10747
10748diff --git a/fs/dcache.c b/fs/dcache.c
10749index 28b2e770bb69..b08506ef464a 100644
10750--- a/fs/dcache.c
10751+++ b/fs/dcache.c
10752@@ -19,6 +19,7 @@
10753 #include <linux/mm.h>
10754 #include <linux/fs.h>
10755 #include <linux/fsnotify.h>
10756+#include <linux/delay.h>
10757 #include <linux/slab.h>
10758 #include <linux/init.h>
10759 #include <linux/hash.h>
10760@@ -808,6 +809,8 @@ static inline bool fast_dput(struct dentry *dentry)
10761 */
10762 void dput(struct dentry *dentry)
10763 {
10764+ struct dentry *parent;
10765+
10766 if (unlikely(!dentry))
10767 return;
10768
10769@@ -844,9 +847,18 @@ void dput(struct dentry *dentry)
10770 return;
10771
10772 kill_it:
10773- dentry = dentry_kill(dentry);
10774- if (dentry) {
10775- cond_resched();
10776+ parent = dentry_kill(dentry);
10777+ if (parent) {
10778+ int r;
10779+
10780+ if (parent == dentry) {
10781+ /* the task with the highest priority won't schedule */
10782+ r = cond_resched();
10783+ if (!r)
10784+ cpu_chill();
10785+ } else {
10786+ dentry = parent;
10787+ }
10788 goto repeat;
10789 }
10790 }
10791@@ -2414,7 +2426,7 @@ void d_delete(struct dentry * dentry)
10792 if (dentry->d_lockref.count == 1) {
10793 if (!spin_trylock(&inode->i_lock)) {
10794 spin_unlock(&dentry->d_lock);
10795- cpu_relax();
10796+ cpu_chill();
10797 goto again;
10798 }
10799 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
10800@@ -2459,9 +2471,10 @@ EXPORT_SYMBOL(d_rehash);
10801 static inline unsigned start_dir_add(struct inode *dir)
10802 {
10803
10804+ preempt_disable_rt();
10805 for (;;) {
10806- unsigned n = dir->i_dir_seq;
10807- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
10808+ unsigned n = dir->__i_dir_seq;
10809+ if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n)
10810 return n;
10811 cpu_relax();
10812 }
10813@@ -2469,26 +2482,30 @@ static inline unsigned start_dir_add(struct inode *dir)
10814
10815 static inline void end_dir_add(struct inode *dir, unsigned n)
10816 {
10817- smp_store_release(&dir->i_dir_seq, n + 2);
10818+ smp_store_release(&dir->__i_dir_seq, n + 2);
10819+ preempt_enable_rt();
10820 }
10821
10822 static void d_wait_lookup(struct dentry *dentry)
10823 {
10824- if (d_in_lookup(dentry)) {
10825- DECLARE_WAITQUEUE(wait, current);
10826- add_wait_queue(dentry->d_wait, &wait);
10827- do {
10828- set_current_state(TASK_UNINTERRUPTIBLE);
10829- spin_unlock(&dentry->d_lock);
10830- schedule();
10831- spin_lock(&dentry->d_lock);
10832- } while (d_in_lookup(dentry));
10833- }
10834+ struct swait_queue __wait;
10835+
10836+ if (!d_in_lookup(dentry))
10837+ return;
10838+
10839+ INIT_LIST_HEAD(&__wait.task_list);
10840+ do {
10841+ prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
10842+ spin_unlock(&dentry->d_lock);
10843+ schedule();
10844+ spin_lock(&dentry->d_lock);
10845+ } while (d_in_lookup(dentry));
10846+ finish_swait(dentry->d_wait, &__wait);
10847 }
10848
10849 struct dentry *d_alloc_parallel(struct dentry *parent,
10850 const struct qstr *name,
10851- wait_queue_head_t *wq)
10852+ struct swait_queue_head *wq)
10853 {
10854 unsigned int hash = name->hash;
10855 struct hlist_bl_head *b = in_lookup_hash(parent, hash);
10856@@ -2502,7 +2519,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
10857
10858 retry:
10859 rcu_read_lock();
10860- seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
10861+ seq = smp_load_acquire(&parent->d_inode->__i_dir_seq);
10862 r_seq = read_seqbegin(&rename_lock);
10863 dentry = __d_lookup_rcu(parent, name, &d_seq);
10864 if (unlikely(dentry)) {
10865@@ -2530,7 +2547,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
10866 }
10867
10868 hlist_bl_lock(b);
10869- if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
10870+ if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) {
10871 hlist_bl_unlock(b);
10872 rcu_read_unlock();
10873 goto retry;
10874@@ -2603,7 +2620,7 @@ void __d_lookup_done(struct dentry *dentry)
10875 hlist_bl_lock(b);
10876 dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
10877 __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
10878- wake_up_all(dentry->d_wait);
10879+ swake_up_all(dentry->d_wait);
10880 dentry->d_wait = NULL;
10881 hlist_bl_unlock(b);
10882 INIT_HLIST_NODE(&dentry->d_u.d_alias);
10883@@ -3638,6 +3655,8 @@ __setup("dhash_entries=", set_dhash_entries);
10884
10885 static void __init dcache_init_early(void)
10886 {
10887+ unsigned int loop;
10888+
10889 /* If hashes are distributed across NUMA nodes, defer
10890 * hash allocation until vmalloc space is available.
10891 */
10892@@ -3654,10 +3673,14 @@ static void __init dcache_init_early(void)
10893 &d_hash_mask,
10894 0,
10895 0);
10896+
10897+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
10898+ INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10899 }
10900
10901 static void __init dcache_init(void)
10902 {
10903+ unsigned int loop;
10904 /*
10905 * A constructor could be added for stable state like the lists,
10906 * but it is probably not worth it because of the cache nature
10907@@ -3680,6 +3703,10 @@ static void __init dcache_init(void)
10908 &d_hash_mask,
10909 0,
10910 0);
10911+
10912+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
10913+ INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
10914+
10915 }
10916
10917 /* SLAB cache for __getname() consumers */
10918diff --git a/fs/eventpoll.c b/fs/eventpoll.c
10919index 2fabd19cdeea..b768c32631eb 100644
10920--- a/fs/eventpoll.c
10921+++ b/fs/eventpoll.c
10922@@ -587,12 +587,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
10923 */
10924 static void ep_poll_safewake(wait_queue_head_t *wq)
10925 {
10926- int this_cpu = get_cpu();
10927+ int this_cpu = get_cpu_light();
10928
10929 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
10930 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
10931
10932- put_cpu();
10933+ put_cpu_light();
10934 }
10935
10936 static void ep_remove_wait_queue(struct eppoll_entry *pwq)
10937diff --git a/fs/exec.c b/fs/exec.c
10938index 0da4d748b4e6..609aee4dbfa9 100644
10939--- a/fs/exec.c
10940+++ b/fs/exec.c
10941@@ -1024,12 +1024,14 @@ static int exec_mmap(struct mm_struct *mm)
10942 }
10943 }
10944 task_lock(tsk);
10945+ preempt_disable_rt();
10946 active_mm = tsk->active_mm;
10947 tsk->mm = mm;
10948 tsk->active_mm = mm;
10949 activate_mm(active_mm, mm);
10950 tsk->mm->vmacache_seqnum = 0;
10951 vmacache_flush(tsk);
10952+ preempt_enable_rt();
10953 task_unlock(tsk);
10954 if (old_mm) {
10955 up_read(&old_mm->mmap_sem);
10956diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
10957index db7590178dfc..d76364124443 100644
10958--- a/fs/ext4/page-io.c
10959+++ b/fs/ext4/page-io.c
10960@@ -95,8 +95,7 @@ static void ext4_finish_bio(struct bio *bio)
10961 * We check all buffers in the page under BH_Uptodate_Lock
10962 * to avoid races with other end io clearing async_write flags
10963 */
10964- local_irq_save(flags);
10965- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
10966+ flags = bh_uptodate_lock_irqsave(head);
10967 do {
10968 if (bh_offset(bh) < bio_start ||
10969 bh_offset(bh) + bh->b_size > bio_end) {
10970@@ -108,8 +107,7 @@ static void ext4_finish_bio(struct bio *bio)
10971 if (bio->bi_status)
10972 buffer_io_error(bh);
10973 } while ((bh = bh->b_this_page) != head);
10974- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
10975- local_irq_restore(flags);
10976+ bh_uptodate_unlock_irqrestore(head, flags);
10977 if (!under_io) {
10978 #ifdef CONFIG_EXT4_FS_ENCRYPTION
10979 if (data_page)
10980diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
10981index 29868c35c19a..76d354eee035 100644
10982--- a/fs/fuse/dir.c
10983+++ b/fs/fuse/dir.c
10984@@ -1188,7 +1188,7 @@ static int fuse_direntplus_link(struct file *file,
10985 struct inode *dir = d_inode(parent);
10986 struct fuse_conn *fc;
10987 struct inode *inode;
10988- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
10989+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
10990
10991 if (!o->nodeid) {
10992 /*
10993diff --git a/fs/inode.c b/fs/inode.c
10994index cfc36d11bcb3..b77ce179798a 100644
10995--- a/fs/inode.c
10996+++ b/fs/inode.c
10997@@ -154,7 +154,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
10998 inode->i_bdev = NULL;
10999 inode->i_cdev = NULL;
11000 inode->i_link = NULL;
11001- inode->i_dir_seq = 0;
11002+ inode->__i_dir_seq = 0;
11003 inode->i_rdev = 0;
11004 inode->dirtied_when = 0;
11005
11006diff --git a/fs/libfs.c b/fs/libfs.c
11007index 3aabe553fc45..b5d63bf1ad8e 100644
11008--- a/fs/libfs.c
11009+++ b/fs/libfs.c
11010@@ -90,7 +90,7 @@ static struct dentry *next_positive(struct dentry *parent,
11011 struct list_head *from,
11012 int count)
11013 {
11014- unsigned *seq = &parent->d_inode->i_dir_seq, n;
11015+ unsigned *seq = &parent->d_inode->__i_dir_seq, n;
11016 struct dentry *res;
11017 struct list_head *p;
11018 bool skipped;
11019@@ -123,8 +123,9 @@ static struct dentry *next_positive(struct dentry *parent,
11020 static void move_cursor(struct dentry *cursor, struct list_head *after)
11021 {
11022 struct dentry *parent = cursor->d_parent;
11023- unsigned n, *seq = &parent->d_inode->i_dir_seq;
11024+ unsigned n, *seq = &parent->d_inode->__i_dir_seq;
11025 spin_lock(&parent->d_lock);
11026+ preempt_disable_rt();
11027 for (;;) {
11028 n = *seq;
11029 if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
11030@@ -137,6 +138,7 @@ static void move_cursor(struct dentry *cursor, struct list_head *after)
11031 else
11032 list_add_tail(&cursor->d_child, &parent->d_subdirs);
11033 smp_store_release(seq, n + 2);
11034+ preempt_enable_rt();
11035 spin_unlock(&parent->d_lock);
11036 }
11037
11038diff --git a/fs/locks.c b/fs/locks.c
11039index 665e3ce9ab47..47b66bfc4fa3 100644
11040--- a/fs/locks.c
11041+++ b/fs/locks.c
11042@@ -945,7 +945,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
11043 return -ENOMEM;
11044 }
11045
11046- percpu_down_read_preempt_disable(&file_rwsem);
11047+ percpu_down_read(&file_rwsem);
11048 spin_lock(&ctx->flc_lock);
11049 if (request->fl_flags & FL_ACCESS)
11050 goto find_conflict;
11051@@ -986,7 +986,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
11052
11053 out:
11054 spin_unlock(&ctx->flc_lock);
11055- percpu_up_read_preempt_enable(&file_rwsem);
11056+ percpu_up_read(&file_rwsem);
11057 if (new_fl)
11058 locks_free_lock(new_fl);
11059 locks_dispose_list(&dispose);
11060@@ -1023,7 +1023,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
11061 new_fl2 = locks_alloc_lock();
11062 }
11063
11064- percpu_down_read_preempt_disable(&file_rwsem);
11065+ percpu_down_read(&file_rwsem);
11066 spin_lock(&ctx->flc_lock);
11067 /*
11068 * New lock request. Walk all POSIX locks and look for conflicts. If
11069@@ -1195,7 +1195,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
11070 }
11071 out:
11072 spin_unlock(&ctx->flc_lock);
11073- percpu_up_read_preempt_enable(&file_rwsem);
11074+ percpu_up_read(&file_rwsem);
11075 /*
11076 * Free any unused locks.
11077 */
11078@@ -1470,7 +1470,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
11079 return error;
11080 }
11081
11082- percpu_down_read_preempt_disable(&file_rwsem);
11083+ percpu_down_read(&file_rwsem);
11084 spin_lock(&ctx->flc_lock);
11085
11086 time_out_leases(inode, &dispose);
11087@@ -1522,13 +1522,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
11088 locks_insert_block(fl, new_fl);
11089 trace_break_lease_block(inode, new_fl);
11090 spin_unlock(&ctx->flc_lock);
11091- percpu_up_read_preempt_enable(&file_rwsem);
11092+ percpu_up_read(&file_rwsem);
11093
11094 locks_dispose_list(&dispose);
11095 error = wait_event_interruptible_timeout(new_fl->fl_wait,
11096 !new_fl->fl_next, break_time);
11097
11098- percpu_down_read_preempt_disable(&file_rwsem);
11099+ percpu_down_read(&file_rwsem);
11100 spin_lock(&ctx->flc_lock);
11101 trace_break_lease_unblock(inode, new_fl);
11102 locks_delete_block(new_fl);
11103@@ -1545,7 +1545,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
11104 }
11105 out:
11106 spin_unlock(&ctx->flc_lock);
11107- percpu_up_read_preempt_enable(&file_rwsem);
11108+ percpu_up_read(&file_rwsem);
11109 locks_dispose_list(&dispose);
11110 locks_free_lock(new_fl);
11111 return error;
11112@@ -1619,7 +1619,7 @@ int fcntl_getlease(struct file *filp)
11113
11114 ctx = smp_load_acquire(&inode->i_flctx);
11115 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
11116- percpu_down_read_preempt_disable(&file_rwsem);
11117+ percpu_down_read(&file_rwsem);
11118 spin_lock(&ctx->flc_lock);
11119 time_out_leases(inode, &dispose);
11120 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
11121@@ -1629,7 +1629,7 @@ int fcntl_getlease(struct file *filp)
11122 break;
11123 }
11124 spin_unlock(&ctx->flc_lock);
11125- percpu_up_read_preempt_enable(&file_rwsem);
11126+ percpu_up_read(&file_rwsem);
11127
11128 locks_dispose_list(&dispose);
11129 }
11130@@ -1704,7 +1704,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
11131 return -EINVAL;
11132 }
11133
11134- percpu_down_read_preempt_disable(&file_rwsem);
11135+ percpu_down_read(&file_rwsem);
11136 spin_lock(&ctx->flc_lock);
11137 time_out_leases(inode, &dispose);
11138 error = check_conflicting_open(dentry, arg, lease->fl_flags);
11139@@ -1775,7 +1775,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
11140 lease->fl_lmops->lm_setup(lease, priv);
11141 out:
11142 spin_unlock(&ctx->flc_lock);
11143- percpu_up_read_preempt_enable(&file_rwsem);
11144+ percpu_up_read(&file_rwsem);
11145 locks_dispose_list(&dispose);
11146 if (is_deleg)
11147 inode_unlock(inode);
11148@@ -1798,7 +1798,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
11149 return error;
11150 }
11151
11152- percpu_down_read_preempt_disable(&file_rwsem);
11153+ percpu_down_read(&file_rwsem);
11154 spin_lock(&ctx->flc_lock);
11155 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
11156 if (fl->fl_file == filp &&
11157@@ -1811,7 +1811,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
11158 if (victim)
11159 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
11160 spin_unlock(&ctx->flc_lock);
11161- percpu_up_read_preempt_enable(&file_rwsem);
11162+ percpu_up_read(&file_rwsem);
11163 locks_dispose_list(&dispose);
11164 return error;
11165 }
11166@@ -2542,13 +2542,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
11167 if (list_empty(&ctx->flc_lease))
11168 return;
11169
11170- percpu_down_read_preempt_disable(&file_rwsem);
11171+ percpu_down_read(&file_rwsem);
11172 spin_lock(&ctx->flc_lock);
11173 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
11174 if (filp == fl->fl_file)
11175 lease_modify(fl, F_UNLCK, &dispose);
11176 spin_unlock(&ctx->flc_lock);
11177- percpu_up_read_preempt_enable(&file_rwsem);
11178+ percpu_up_read(&file_rwsem);
11179
11180 locks_dispose_list(&dispose);
11181 }
11182diff --git a/fs/namei.c b/fs/namei.c
11183index 0b46b858cd42..f5c6c2ec44ce 100644
11184--- a/fs/namei.c
11185+++ b/fs/namei.c
11186@@ -1627,7 +1627,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
11187 {
11188 struct dentry *dentry = ERR_PTR(-ENOENT), *old;
11189 struct inode *inode = dir->d_inode;
11190- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11191+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11192
11193 inode_lock_shared(inode);
11194 /* Don't go there if it's already dead */
11195@@ -3100,7 +3100,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
11196 struct dentry *dentry;
11197 int error, create_error = 0;
11198 umode_t mode = op->mode;
11199- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11200+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11201
11202 if (unlikely(IS_DEADDIR(dir_inode)))
11203 return -ENOENT;
11204diff --git a/fs/namespace.c b/fs/namespace.c
11205index 9dc146e7b5e0..85bfe5e55adf 100644
11206--- a/fs/namespace.c
11207+++ b/fs/namespace.c
11208@@ -14,6 +14,7 @@
11209 #include <linux/mnt_namespace.h>
11210 #include <linux/user_namespace.h>
11211 #include <linux/namei.h>
11212+#include <linux/delay.h>
11213 #include <linux/security.h>
11214 #include <linux/cred.h>
11215 #include <linux/idr.h>
11216@@ -353,8 +354,11 @@ int __mnt_want_write(struct vfsmount *m)
11217 * incremented count after it has set MNT_WRITE_HOLD.
11218 */
11219 smp_mb();
11220- while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
11221- cpu_relax();
11222+ while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
11223+ preempt_enable();
11224+ cpu_chill();
11225+ preempt_disable();
11226+ }
11227 /*
11228 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
11229 * be set to match its requirements. So we must not load that until
11230diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
11231index 606dd3871f66..fa41eb75b4d8 100644
11232--- a/fs/nfs/delegation.c
11233+++ b/fs/nfs/delegation.c
11234@@ -150,11 +150,11 @@ static int nfs_delegation_claim_opens(struct inode *inode,
11235 sp = state->owner;
11236 /* Block nfs4_proc_unlck */
11237 mutex_lock(&sp->so_delegreturn_mutex);
11238- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
11239+ seq = read_seqbegin(&sp->so_reclaim_seqlock);
11240 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
11241 if (!err)
11242 err = nfs_delegation_claim_locks(ctx, state, stateid);
11243- if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
11244+ if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
11245 err = -EAGAIN;
11246 mutex_unlock(&sp->so_delegreturn_mutex);
11247 put_nfs_open_context(ctx);
11248diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
11249index bf2c43635062..f43f5da4a8c3 100644
11250--- a/fs/nfs/dir.c
11251+++ b/fs/nfs/dir.c
11252@@ -452,7 +452,7 @@ static
11253 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
11254 {
11255 struct qstr filename = QSTR_INIT(entry->name, entry->len);
11256- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11257+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11258 struct dentry *dentry;
11259 struct dentry *alias;
11260 struct inode *dir = d_inode(parent);
11261@@ -1443,7 +1443,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
11262 struct file *file, unsigned open_flags,
11263 umode_t mode, int *opened)
11264 {
11265- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11266+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11267 struct nfs_open_context *ctx;
11268 struct dentry *res;
11269 struct iattr attr = { .ia_valid = ATTR_OPEN };
11270@@ -1763,7 +1763,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
11271
11272 trace_nfs_rmdir_enter(dir, dentry);
11273 if (d_really_is_positive(dentry)) {
11274+#ifdef CONFIG_PREEMPT_RT_BASE
11275+ down(&NFS_I(d_inode(dentry))->rmdir_sem);
11276+#else
11277 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
11278+#endif
11279 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
11280 /* Ensure the VFS deletes this inode */
11281 switch (error) {
11282@@ -1773,7 +1777,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
11283 case -ENOENT:
11284 nfs_dentry_handle_enoent(dentry);
11285 }
11286+#ifdef CONFIG_PREEMPT_RT_BASE
11287+ up(&NFS_I(d_inode(dentry))->rmdir_sem);
11288+#else
11289 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
11290+#endif
11291 } else
11292 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
11293 trace_nfs_rmdir_exit(dir, dentry, error);
11294diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
11295index 134d9f560240..ff64167f9811 100644
11296--- a/fs/nfs/inode.c
11297+++ b/fs/nfs/inode.c
11298@@ -2014,7 +2014,11 @@ static void init_once(void *foo)
11299 atomic_long_set(&nfsi->nrequests, 0);
11300 atomic_long_set(&nfsi->commit_info.ncommit, 0);
11301 atomic_set(&nfsi->commit_info.rpcs_out, 0);
11302+#ifdef CONFIG_PREEMPT_RT_BASE
11303+ sema_init(&nfsi->rmdir_sem, 1);
11304+#else
11305 init_rwsem(&nfsi->rmdir_sem);
11306+#endif
11307 mutex_init(&nfsi->commit_mutex);
11308 nfs4_init_once(nfsi);
11309 }
11310diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
11311index a73144b3cb8c..0c403d280b96 100644
11312--- a/fs/nfs/nfs4_fs.h
11313+++ b/fs/nfs/nfs4_fs.h
11314@@ -112,7 +112,7 @@ struct nfs4_state_owner {
11315 unsigned long so_flags;
11316 struct list_head so_states;
11317 struct nfs_seqid_counter so_seqid;
11318- seqcount_t so_reclaim_seqcount;
11319+ seqlock_t so_reclaim_seqlock;
11320 struct mutex so_delegreturn_mutex;
11321 };
11322
11323diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
11324index a3b67d3b1dfb..4ce6ec109c2b 100644
11325--- a/fs/nfs/nfs4proc.c
11326+++ b/fs/nfs/nfs4proc.c
11327@@ -2700,7 +2700,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
11328 unsigned int seq;
11329 int ret;
11330
11331- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
11332+ seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
11333
11334 ret = _nfs4_proc_open(opendata);
11335 if (ret != 0)
11336@@ -2738,7 +2738,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
11337
11338 if (d_inode(dentry) == state->inode) {
11339 nfs_inode_attach_open_context(ctx);
11340- if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
11341+ if (read_seqretry(&sp->so_reclaim_seqlock, seq))
11342 nfs4_schedule_stateid_recovery(server, state);
11343 }
11344 out:
11345diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
11346index e1d88bca815e..c51bcc176026 100644
11347--- a/fs/nfs/nfs4state.c
11348+++ b/fs/nfs/nfs4state.c
11349@@ -494,7 +494,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
11350 nfs4_init_seqid_counter(&sp->so_seqid);
11351 atomic_set(&sp->so_count, 1);
11352 INIT_LIST_HEAD(&sp->so_lru);
11353- seqcount_init(&sp->so_reclaim_seqcount);
11354+ seqlock_init(&sp->so_reclaim_seqlock);
11355 mutex_init(&sp->so_delegreturn_mutex);
11356 return sp;
11357 }
11358@@ -1521,8 +1521,12 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
11359 * recovering after a network partition or a reboot from a
11360 * server that doesn't support a grace period.
11361 */
11362+#ifdef CONFIG_PREEMPT_RT_FULL
11363+ write_seqlock(&sp->so_reclaim_seqlock);
11364+#else
11365+ write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
11366+#endif
11367 spin_lock(&sp->so_lock);
11368- raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
11369 restart:
11370 list_for_each_entry(state, &sp->so_states, open_states) {
11371 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
11372@@ -1591,14 +1595,20 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
11373 spin_lock(&sp->so_lock);
11374 goto restart;
11375 }
11376- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
11377 spin_unlock(&sp->so_lock);
11378+#ifdef CONFIG_PREEMPT_RT_FULL
11379+ write_sequnlock(&sp->so_reclaim_seqlock);
11380+#else
11381+ write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
11382+#endif
11383 return 0;
11384 out_err:
11385 nfs4_put_open_state(state);
11386- spin_lock(&sp->so_lock);
11387- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
11388- spin_unlock(&sp->so_lock);
11389+#ifdef CONFIG_PREEMPT_RT_FULL
11390+ write_sequnlock(&sp->so_reclaim_seqlock);
11391+#else
11392+ write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
11393+#endif
11394 return status;
11395 }
11396
11397diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
11398index 630b4a3c1a93..0dc1d3e6a62f 100644
11399--- a/fs/nfs/unlink.c
11400+++ b/fs/nfs/unlink.c
11401@@ -13,7 +13,7 @@
11402 #include <linux/sunrpc/clnt.h>
11403 #include <linux/nfs_fs.h>
11404 #include <linux/sched.h>
11405-#include <linux/wait.h>
11406+#include <linux/swait.h>
11407 #include <linux/namei.h>
11408 #include <linux/fsnotify.h>
11409
11410@@ -52,6 +52,29 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
11411 rpc_restart_call_prepare(task);
11412 }
11413
11414+#ifdef CONFIG_PREEMPT_RT_BASE
11415+static void nfs_down_anon(struct semaphore *sema)
11416+{
11417+ down(sema);
11418+}
11419+
11420+static void nfs_up_anon(struct semaphore *sema)
11421+{
11422+ up(sema);
11423+}
11424+
11425+#else
11426+static void nfs_down_anon(struct rw_semaphore *rwsem)
11427+{
11428+ down_read_non_owner(rwsem);
11429+}
11430+
11431+static void nfs_up_anon(struct rw_semaphore *rwsem)
11432+{
11433+ up_read_non_owner(rwsem);
11434+}
11435+#endif
11436+
11437 /**
11438 * nfs_async_unlink_release - Release the sillydelete data.
11439 * @task: rpc_task of the sillydelete
11440@@ -65,7 +88,7 @@ static void nfs_async_unlink_release(void *calldata)
11441 struct dentry *dentry = data->dentry;
11442 struct super_block *sb = dentry->d_sb;
11443
11444- up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
11445+ nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
11446 d_lookup_done(dentry);
11447 nfs_free_unlinkdata(data);
11448 dput(dentry);
11449@@ -118,10 +141,10 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
11450 struct inode *dir = d_inode(dentry->d_parent);
11451 struct dentry *alias;
11452
11453- down_read_non_owner(&NFS_I(dir)->rmdir_sem);
11454+ nfs_down_anon(&NFS_I(dir)->rmdir_sem);
11455 alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
11456 if (IS_ERR(alias)) {
11457- up_read_non_owner(&NFS_I(dir)->rmdir_sem);
11458+ nfs_up_anon(&NFS_I(dir)->rmdir_sem);
11459 return 0;
11460 }
11461 if (!d_in_lookup(alias)) {
b3bbd485 11462@@ -143,7 +166,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
e4b2b4a8
JK
11463 ret = 0;
11464 spin_unlock(&alias->d_lock);
11465 dput(alias);
11466- up_read_non_owner(&NFS_I(dir)->rmdir_sem);
11467+ nfs_up_anon(&NFS_I(dir)->rmdir_sem);
11468 /*
11469 * If we'd displaced old cached devname, free it. At that
11470 * point dentry is definitely not a root, so we won't need
b3bbd485 11471@@ -183,7 +206,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
e4b2b4a8
JK
11472 goto out_free_name;
11473 }
11474 data->res.dir_attr = &data->dir_attr;
11475- init_waitqueue_head(&data->wq);
11476+ init_swait_queue_head(&data->wq);
11477
11478 status = -EBUSY;
11479 spin_lock(&dentry->d_lock);
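nfs_down_anon()/nfs_up_anon() above exist because down_read_non_owner()/up_read_non_owner() have no good rwsem equivalent on RT (RT rwsems are PI-aware and track an owner), so the RT build falls back to a plain counting semaphore. For that to compile, the rmdir_sem field must change type under the same config switch; a hedged sketch of the matching declaration (the container struct is a stand-in, not the real nfs_inode definition from this patch):

#include <linux/semaphore.h>
#include <linux/rwsem.h>

struct demo_nfs_inode {
#ifdef CONFIG_PREEMPT_RT_BASE
	struct semaphore	rmdir_sem;	/* anonymous: no owner tracking */
#else
	struct rw_semaphore	rmdir_sem;	/* taken via *_non_owner() helpers */
#endif
};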
b3bbd485
JK
11480diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
11481index cc91856b5e2d..a982d7c3ad91 100644
11482--- a/fs/ntfs/aops.c
11483+++ b/fs/ntfs/aops.c
11484@@ -93,13 +93,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
e4b2b4a8
JK
11485 ofs = 0;
11486 if (file_ofs < init_size)
11487 ofs = init_size - file_ofs;
11488- local_irq_save(flags);
11489+ local_irq_save_nort(flags);
11490 kaddr = kmap_atomic(page);
11491 memset(kaddr + bh_offset(bh) + ofs, 0,
11492 bh->b_size - ofs);
11493 flush_dcache_page(page);
11494 kunmap_atomic(kaddr);
11495- local_irq_restore(flags);
11496+ local_irq_restore_nort(flags);
11497 }
11498 } else {
11499 clear_buffer_uptodate(bh);
b3bbd485 11500@@ -108,8 +108,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
e4b2b4a8
JK
11501 "0x%llx.", (unsigned long long)bh->b_blocknr);
11502 }
11503 first = page_buffers(page);
11504- local_irq_save(flags);
11505- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11506+ flags = bh_uptodate_lock_irqsave(first);
11507 clear_buffer_async_read(bh);
11508 unlock_buffer(bh);
11509 tmp = bh;
b3bbd485 11510@@ -124,8 +123,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
e4b2b4a8
JK
11511 }
11512 tmp = tmp->b_this_page;
11513 } while (tmp != bh);
11514- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11515- local_irq_restore(flags);
11516+ bh_uptodate_unlock_irqrestore(first, flags);
11517 /*
11518 * If none of the buffers had errors then we can set the page uptodate,
11519 * but we first have to perform the post read mst fixups, if the
b3bbd485 11520@@ -146,13 +144,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
e4b2b4a8
JK
11521 recs = PAGE_SIZE / rec_size;
11522 /* Should have been verified before we got here... */
11523 BUG_ON(!recs);
11524- local_irq_save(flags);
11525+ local_irq_save_nort(flags);
11526 kaddr = kmap_atomic(page);
11527 for (i = 0; i < recs; i++)
11528 post_read_mst_fixup((NTFS_RECORD*)(kaddr +
11529 i * rec_size), rec_size);
11530 kunmap_atomic(kaddr);
11531- local_irq_restore(flags);
11532+ local_irq_restore_nort(flags);
11533 flush_dcache_page(page);
11534 if (likely(page_uptodate && !PageError(page)))
11535 SetPageUptodate(page);
b3bbd485 11536@@ -160,9 +158,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
e4b2b4a8
JK
11537 unlock_page(page);
11538 return;
11539 still_busy:
11540- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11541- local_irq_restore(flags);
11542- return;
11543+ bh_uptodate_unlock_irqrestore(first, flags);
11544 }
11545
11546 /**
b3bbd485
JK
11547diff --git a/fs/proc/array.c b/fs/proc/array.c
11548index 4ac811e1a26c..9dcb40690cde 100644
11549--- a/fs/proc/array.c
11550+++ b/fs/proc/array.c
11551@@ -386,9 +386,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
e4b2b4a8
JK
11552 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
11553 {
11554 seq_printf(m, "Cpus_allowed:\t%*pb\n",
11555- cpumask_pr_args(&task->cpus_allowed));
11556+ cpumask_pr_args(task->cpus_ptr));
11557 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
11558- cpumask_pr_args(&task->cpus_allowed));
11559+ cpumask_pr_args(task->cpus_ptr));
11560 }
11561
11562 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
b3bbd485 11563diff --git a/fs/proc/base.c b/fs/proc/base.c
5dd41b01 11564index 9063738ff1f0..4085e56e261c 100644
b3bbd485
JK
11565--- a/fs/proc/base.c
11566+++ b/fs/proc/base.c
5dd41b01 11567@@ -1900,7 +1900,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
e4b2b4a8
JK
11568
11569 child = d_hash_and_lookup(dir, &qname);
11570 if (!child) {
11571- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11572+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11573 child = d_alloc_parallel(dir, &qname, &wq);
11574 if (IS_ERR(child))
11575 goto end_instantiate;
b3bbd485
JK
11576diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
11577index 82ac5f682b73..c35714621a38 100644
11578--- a/fs/proc/proc_sysctl.c
11579+++ b/fs/proc/proc_sysctl.c
11580@@ -679,7 +679,7 @@ static bool proc_sys_fill_cache(struct file *file,
e4b2b4a8
JK
11581
11582 child = d_lookup(dir, &qname);
11583 if (!child) {
11584- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
11585+ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
11586 child = d_alloc_parallel(dir, &qname, &wq);
11587 if (IS_ERR(child))
11588 return false;
b3bbd485
JK
11589diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
11590index 23a9c28ad8ea..6a73c4fa88e7 100644
11591--- a/fs/squashfs/decompressor_multi_percpu.c
11592+++ b/fs/squashfs/decompressor_multi_percpu.c
11593@@ -10,6 +10,7 @@
11594 #include <linux/slab.h>
11595 #include <linux/percpu.h>
11596 #include <linux/buffer_head.h>
11597+#include <linux/locallock.h>
11598
11599 #include "squashfs_fs.h"
11600 #include "squashfs_fs_sb.h"
11601@@ -25,6 +26,8 @@ struct squashfs_stream {
11602 void *stream;
11603 };
11604
11605+static DEFINE_LOCAL_IRQ_LOCK(stream_lock);
11606+
11607 void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
11608 void *comp_opts)
11609 {
11610@@ -79,10 +82,15 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
11611 {
11612 struct squashfs_stream __percpu *percpu =
11613 (struct squashfs_stream __percpu *) msblk->stream;
11614- struct squashfs_stream *stream = get_cpu_ptr(percpu);
11615- int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
11616- offset, length, output);
11617- put_cpu_ptr(stream);
11618+ struct squashfs_stream *stream;
11619+ int res;
11620+
11621+ stream = get_locked_ptr(stream_lock, percpu);
11622+
11623+ res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
11624+ offset, length, output);
11625+
11626+ put_locked_ptr(stream_lock, stream);
11627
11628 if (res < 0)
11629 ERROR("%s decompression failed, data probably corrupt\n",
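get_locked_ptr()/put_locked_ptr() are provided by the new include/linux/locallock.h added further down in this patch: on !RT they behave like get_cpu_ptr()/put_cpu_ptr(), on RT they take a per-CPU spinlock_t so the (possibly sleeping) decompressor keeps exclusive access to its per-CPU stream without disabling preemption. A hedged sketch of the same pattern around a made-up per-CPU scratch buffer:

#include <linux/locallock.h>
#include <linux/percpu.h>
#include <linux/string.h>

struct demo_scratch { char buf[64]; };

static DEFINE_PER_CPU(struct demo_scratch, demo_scratch_area);
static DEFINE_LOCAL_IRQ_LOCK(demo_scratch_lock);

static void demo_use_scratch(void)
{
	struct demo_scratch *s;

	/* !RT: like get_cpu_ptr(); RT: per-CPU sleeping lock, stays preemptible */
	s = get_locked_ptr(demo_scratch_lock, &demo_scratch_area);
	memset(s->buf, 0, sizeof(s->buf));
	put_locked_ptr(demo_scratch_lock, s);
}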
11630diff --git a/fs/timerfd.c b/fs/timerfd.c
11631index 040612ec9598..b3d9d435926c 100644
11632--- a/fs/timerfd.c
11633+++ b/fs/timerfd.c
11634@@ -471,7 +471,10 @@ static int do_timerfd_settime(int ufd, int flags,
e4b2b4a8
JK
11635 break;
11636 }
11637 spin_unlock_irq(&ctx->wqh.lock);
11638- cpu_relax();
11639+ if (isalarm(ctx))
11640+ hrtimer_wait_for_timer(&ctx->t.alarm.timer);
11641+ else
11642+ hrtimer_wait_for_timer(&ctx->t.tmr);
11643 }
11644
11645 /*
b3bbd485
JK
11646diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
11647index b0cccf8a81a8..eaa4383defec 100644
11648--- a/fs/xfs/xfs_aops.c
11649+++ b/fs/xfs/xfs_aops.c
11650@@ -120,8 +120,7 @@ xfs_finish_page_writeback(
e4b2b4a8
JK
11651 ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
11652 ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
11653
11654- local_irq_save(flags);
11655- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
11656+ flags = bh_uptodate_lock_irqsave(head);
11657 do {
11658 if (off >= bvec->bv_offset &&
11659 off < bvec->bv_offset + bvec->bv_len) {
b3bbd485 11660@@ -143,8 +142,7 @@ xfs_finish_page_writeback(
e4b2b4a8
JK
11661 }
11662 off += bh->b_size;
11663 } while ((bh = bh->b_this_page) != head);
11664- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
11665- local_irq_restore(flags);
11666+ bh_uptodate_unlock_irqrestore(head, flags);
11667
11668 if (!busy)
11669 end_page_writeback(bvec->bv_page);
b3bbd485
JK
11670diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
11671index 1b473efd9eb6..89ee5e1dac48 100644
11672--- a/include/acpi/platform/aclinux.h
11673+++ b/include/acpi/platform/aclinux.h
e4b2b4a8
JK
11674@@ -134,6 +134,7 @@
11675
11676 #define acpi_cache_t struct kmem_cache
11677 #define acpi_spinlock spinlock_t *
11678+#define acpi_raw_spinlock raw_spinlock_t *
11679 #define acpi_cpu_flags unsigned long
11680
11681 /* Use native linux version of acpi_os_allocate_zeroed */
11682@@ -152,6 +153,20 @@
11683 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
11684 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
11685
11686+#define acpi_os_create_raw_lock(__handle) \
11687+({ \
11688+ raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \
11689+ \
11690+ if (lock) { \
11691+ *(__handle) = lock; \
11692+ raw_spin_lock_init(*(__handle)); \
11693+ } \
11694+ lock ? AE_OK : AE_NO_MEMORY; \
11695+ })
11696+
11697+#define acpi_os_delete_raw_lock(__handle) kfree(__handle)
11698+
11699+
11700 /*
11701 * OSL interfaces used by debugger/disassembler
11702 */
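acpi_os_create_raw_lock() hands ACPICA a raw_spinlock_t, which remains a真 spinning lock on RT and is therefore usable on the low-level hardware paths that may run with interrupts disabled. A hedged sketch of how the create/use/delete sequence fits together (lock name and critical section are illustrative, not taken from this patch):

static acpi_raw_spinlock demo_acpi_lock;

static void demo_acpi_raw_lock_use(void)
{
	unsigned long flags;

	if (ACPI_FAILURE(acpi_os_create_raw_lock(&demo_acpi_lock)))
		return;

	raw_spin_lock_irqsave(demo_acpi_lock, flags);
	/* ... touch ACPI hardware state ... */
	raw_spin_unlock_irqrestore(demo_acpi_lock, flags);

	acpi_os_delete_raw_lock(demo_acpi_lock);
}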
b3bbd485
JK
11703diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
11704index ae1a33aa8955..c6d04eca8345 100644
11705--- a/include/asm-generic/bug.h
11706+++ b/include/asm-generic/bug.h
11707@@ -234,6 +234,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
e4b2b4a8
JK
11708 # define WARN_ON_SMP(x) ({0;})
11709 #endif
11710
11711+#ifdef CONFIG_PREEMPT_RT_BASE
11712+# define BUG_ON_RT(c) BUG_ON(c)
11713+# define BUG_ON_NONRT(c) do { } while (0)
11714+# define WARN_ON_RT(condition) WARN_ON(condition)
11715+# define WARN_ON_NONRT(condition) do { } while (0)
11716+# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
11717+#else
11718+# define BUG_ON_RT(c) do { } while (0)
11719+# define BUG_ON_NONRT(c) BUG_ON(c)
11720+# define WARN_ON_RT(condition) do { } while (0)
11721+# define WARN_ON_NONRT(condition) WARN_ON(condition)
11722+# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
11723+#endif
11724+
11725 #endif /* __ASSEMBLY__ */
11726
11727 #endif
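These helpers make an assertion apply to only one of the two configurations: BUG_ON_RT()/WARN_ON_RT() fire only with PREEMPT_RT_BASE enabled, the *_NONRT variants only without it. The typical use is a check such as irqs_disabled() that is a hard invariant on mainline but no longer holds on RT because the protecting lock became sleepable; a hedged sketch (function and check are illustrative):

#include <linux/bug.h>
#include <linux/irqflags.h>

static void demo_touch_percpu_state(void)
{
	/* Mainline requires hard IRQ-off here; on RT the data is protected
	 * by a sleeping lock, so interrupts may legitimately be enabled. */
	WARN_ON_NONRT(!irqs_disabled());

	/* ... update the per-CPU state ... */
}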
b3bbd485
JK
11728diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
11729index 994cbb0f7ffc..0d4b7e3489a9 100644
11730--- a/include/linux/blk-mq.h
11731+++ b/include/linux/blk-mq.h
11732@@ -226,7 +226,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
11733 return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
11734 }
11735
11736-
11737+void __blk_mq_complete_request_remote_work(struct work_struct *work);
11738 int blk_mq_request_started(struct request *rq);
11739 void blk_mq_start_request(struct request *rq);
11740 void blk_mq_end_request(struct request *rq, blk_status_t error);
11741diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
11742index 4d4af0e94059..cbf9d5730dd3 100644
11743--- a/include/linux/blkdev.h
11744+++ b/include/linux/blkdev.h
e4b2b4a8
JK
11745@@ -27,6 +27,7 @@
11746 #include <linux/percpu-refcount.h>
11747 #include <linux/scatterlist.h>
11748 #include <linux/blkzoned.h>
11749+#include <linux/swork.h>
11750
11751 struct module;
11752 struct scsi_ioctl_command;
b3bbd485 11753@@ -134,6 +135,9 @@ typedef __u32 __bitwise req_flags_t;
e4b2b4a8
JK
11754 */
11755 struct request {
11756 struct list_head queuelist;
11757+#ifdef CONFIG_PREEMPT_RT_FULL
11758+ struct work_struct work;
11759+#endif
11760 union {
11761 struct __call_single_data csd;
11762 u64 fifo_time;
b3bbd485 11763@@ -596,6 +600,7 @@ struct request_queue {
e4b2b4a8
JK
11764 #endif
11765 struct rcu_head rcu_head;
11766 wait_queue_head_t mq_freeze_wq;
11767+ struct swork_event mq_pcpu_wake;
11768 struct percpu_ref q_usage_counter;
11769 struct list_head all_q_node;
11770
b3bbd485
JK
11771diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
11772index a19519f4241d..40dd5ef9c154 100644
11773--- a/include/linux/bottom_half.h
11774+++ b/include/linux/bottom_half.h
e4b2b4a8
JK
11775@@ -4,6 +4,39 @@
11776
11777 #include <linux/preempt.h>
11778
11779+#ifdef CONFIG_PREEMPT_RT_FULL
11780+
11781+extern void __local_bh_disable(void);
11782+extern void _local_bh_enable(void);
11783+extern void __local_bh_enable(void);
11784+
11785+static inline void local_bh_disable(void)
11786+{
11787+ __local_bh_disable();
11788+}
11789+
11790+static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
11791+{
11792+ __local_bh_disable();
11793+}
11794+
11795+static inline void local_bh_enable(void)
11796+{
11797+ __local_bh_enable();
11798+}
11799+
11800+static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
11801+{
11802+ __local_bh_enable();
11803+}
11804+
11805+static inline void local_bh_enable_ip(unsigned long ip)
11806+{
11807+ __local_bh_enable();
11808+}
11809+
11810+#else
11811+
11812 #ifdef CONFIG_TRACE_IRQFLAGS
11813 extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
11814 #else
b3bbd485 11815@@ -31,5 +64,6 @@ static inline void local_bh_enable(void)
e4b2b4a8
JK
11816 {
11817 __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
11818 }
11819+#endif
11820
11821 #endif /* _LINUX_BH_H */
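On RT all of the entry points above collapse into __local_bh_disable()/__local_bh_enable(), which the RT softirq code implements as a per-CPU softirq lock rather than a preempt-count bump; call sites keep exactly the same API. A trivial sketch just to show that callers do not change:

#include <linux/bottom_half.h>

static void demo_bh_protected_update(void)
{
	local_bh_disable();	/* RT: __local_bh_disable(); !RT: preempt count */
	/* ... modify data also touched from softirq context ... */
	local_bh_enable();
}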
b3bbd485
JK
11822diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
11823index afa37f807f12..48505fade7e1 100644
11824--- a/include/linux/buffer_head.h
11825+++ b/include/linux/buffer_head.h
11826@@ -76,8 +76,50 @@ struct buffer_head {
e4b2b4a8
JK
11827 struct address_space *b_assoc_map; /* mapping this buffer is
11828 associated with */
11829 atomic_t b_count; /* users using this buffer_head */
11830+#ifdef CONFIG_PREEMPT_RT_BASE
11831+ spinlock_t b_uptodate_lock;
11832+#if IS_ENABLED(CONFIG_JBD2)
11833+ spinlock_t b_state_lock;
11834+ spinlock_t b_journal_head_lock;
11835+#endif
11836+#endif
11837 };
11838
11839+static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
11840+{
11841+ unsigned long flags;
11842+
11843+#ifndef CONFIG_PREEMPT_RT_BASE
11844+ local_irq_save(flags);
11845+ bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
11846+#else
11847+ spin_lock_irqsave(&bh->b_uptodate_lock, flags);
11848+#endif
11849+ return flags;
11850+}
11851+
11852+static inline void
11853+bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
11854+{
11855+#ifndef CONFIG_PREEMPT_RT_BASE
11856+ bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
11857+ local_irq_restore(flags);
11858+#else
11859+ spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
11860+#endif
11861+}
11862+
11863+static inline void buffer_head_init_locks(struct buffer_head *bh)
11864+{
11865+#ifdef CONFIG_PREEMPT_RT_BASE
11866+ spin_lock_init(&bh->b_uptodate_lock);
11867+#if IS_ENABLED(CONFIG_JBD2)
11868+ spin_lock_init(&bh->b_state_lock);
11869+ spin_lock_init(&bh->b_journal_head_lock);
11870+#endif
11871+#endif
11872+}
11873+
11874 /*
11875 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
11876 * and buffer_foo() functions.
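bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore() wrap what the ntfs and xfs hunks above previously open-coded as local_irq_save() plus bit_spin_lock(BH_Uptodate_Lock, ...); on RT the bit-spinlock is replaced by the new per-buffer b_uptodate_lock spinlock. A short sketch of an end_io-style critical section using the helpers:

#include <linux/buffer_head.h>

static void demo_end_read(struct buffer_head *first, struct buffer_head *bh)
{
	unsigned long flags;

	flags = bh_uptodate_lock_irqsave(first);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	bh_uptodate_unlock_irqrestore(first, flags);
}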
b3bbd485 11877diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
5dd41b01 11878index e7905d9353e8..4ecf7875e04f 100644
b3bbd485
JK
11879--- a/include/linux/cgroup-defs.h
11880+++ b/include/linux/cgroup-defs.h
e4b2b4a8
JK
11881@@ -19,6 +19,7 @@
11882 #include <linux/percpu-rwsem.h>
11883 #include <linux/workqueue.h>
11884 #include <linux/bpf-cgroup.h>
11885+#include <linux/swork.h>
11886
11887 #ifdef CONFIG_CGROUPS
11888
b3bbd485 11889@@ -152,6 +153,7 @@ struct cgroup_subsys_state {
e4b2b4a8
JK
11890 /* percpu_ref killing and RCU release */
11891 struct rcu_head rcu_head;
11892 struct work_struct destroy_work;
11893+ struct swork_event destroy_swork;
11894
11895 /*
11896 * PI: the parent css. Placed here for cache proximity to following
b3bbd485
JK
11897diff --git a/include/linux/completion.h b/include/linux/completion.h
11898index 7828451e161a..f5838b10cf84 100644
11899--- a/include/linux/completion.h
11900+++ b/include/linux/completion.h
e4b2b4a8
JK
11901@@ -9,7 +9,7 @@
11902 * See kernel/sched/completion.c for details.
11903 */
11904
11905-#include <linux/wait.h>
11906+#include <linux/swait.h>
11907 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11908 #include <linux/lockdep.h>
11909 #endif
11910@@ -28,7 +28,7 @@
11911 */
11912 struct completion {
11913 unsigned int done;
11914- wait_queue_head_t wait;
11915+ struct swait_queue_head wait;
11916 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11917 struct lockdep_map_cross map;
11918 #endif
b3bbd485 11919@@ -67,11 +67,11 @@ static inline void complete_release_commit(struct completion *x) {}
e4b2b4a8
JK
11920
11921 #ifdef CONFIG_LOCKDEP_COMPLETIONS
11922 #define COMPLETION_INITIALIZER(work) \
11923- { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
11924+ { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
11925 STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
11926 #else
11927 #define COMPLETION_INITIALIZER(work) \
11928- { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11929+ { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
11930 #endif
11931
11932 #define COMPLETION_INITIALIZER_ONSTACK(work) \
b3bbd485 11933@@ -117,7 +117,7 @@ static inline void complete_release_commit(struct completion *x) {}
e4b2b4a8
JK
11934 static inline void __init_completion(struct completion *x)
11935 {
11936 x->done = 0;
11937- init_waitqueue_head(&x->wait);
11938+ init_swait_queue_head(&x->wait);
11939 }
11940
11941 /**
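Completions now sleep on a struct swait_queue_head, whose internal lock is a raw spinlock, so complete() stays usable from contexts that must not take sleeping locks on RT; the completion API itself is unchanged. A minimal sketch of the unchanged caller pattern:

#include <linux/completion.h>

static DECLARE_COMPLETION(demo_setup_done);

static int demo_waiter(void *unused)
{
	wait_for_completion(&demo_setup_done);	/* blocks on the swait queue */
	return 0;
}

static void demo_finish_setup(void)
{
	complete(&demo_setup_done);		/* wakes one swait waiter */
}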
b3bbd485
JK
11942diff --git a/include/linux/cpu.h b/include/linux/cpu.h
11943index 2a378d261914..b418d3c5159d 100644
11944--- a/include/linux/cpu.h
11945+++ b/include/linux/cpu.h
11946@@ -120,6 +120,8 @@ extern void cpu_hotplug_disable(void);
e4b2b4a8
JK
11947 extern void cpu_hotplug_enable(void);
11948 void clear_tasks_mm_cpumask(int cpu);
11949 int cpu_down(unsigned int cpu);
11950+extern void pin_current_cpu(void);
11951+extern void unpin_current_cpu(void);
11952
11953 #else /* CONFIG_HOTPLUG_CPU */
11954
b3bbd485 11955@@ -130,6 +132,9 @@ static inline void cpus_read_unlock(void) { }
e4b2b4a8
JK
11956 static inline void lockdep_assert_cpus_held(void) { }
11957 static inline void cpu_hotplug_disable(void) { }
11958 static inline void cpu_hotplug_enable(void) { }
11959+static inline void pin_current_cpu(void) { }
11960+static inline void unpin_current_cpu(void) { }
11961+
11962 #endif /* !CONFIG_HOTPLUG_CPU */
11963
11964 /* Wrappers which go away once all code is converted */
b3bbd485
JK
11965diff --git a/include/linux/dcache.h b/include/linux/dcache.h
11966index 006f4ccda5f5..d413993f7f17 100644
11967--- a/include/linux/dcache.h
11968+++ b/include/linux/dcache.h
11969@@ -107,7 +107,7 @@ struct dentry {
e4b2b4a8
JK
11970
11971 union {
11972 struct list_head d_lru; /* LRU list */
11973- wait_queue_head_t *d_wait; /* in-lookup ones only */
11974+ struct swait_queue_head *d_wait; /* in-lookup ones only */
11975 };
11976 struct list_head d_child; /* child of parent list */
11977 struct list_head d_subdirs; /* our children */
b3bbd485 11978@@ -238,7 +238,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
e4b2b4a8
JK
11979 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
11980 extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
11981 extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
11982- wait_queue_head_t *);
11983+ struct swait_queue_head *);
11984 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
11985 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
11986 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
b3bbd485
JK
11987diff --git a/include/linux/delay.h b/include/linux/delay.h
11988index b78bab4395d8..7c4bc414a504 100644
11989--- a/include/linux/delay.h
11990+++ b/include/linux/delay.h
11991@@ -64,4 +64,10 @@ static inline void ssleep(unsigned int seconds)
e4b2b4a8
JK
11992 msleep(seconds * 1000);
11993 }
11994
11995+#ifdef CONFIG_PREEMPT_RT_FULL
11996+extern void cpu_chill(void);
11997+#else
11998+# define cpu_chill() cpu_relax()
11999+#endif
12000+
12001 #endif /* defined(_LINUX_DELAY_H) */
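cpu_chill() is the RT replacement for cpu_relax() in retry loops that wait for another task (which may itself be preempted) to make progress: on !RT it is exactly cpu_relax(), on RT it sleeps briefly so the other task can run, as in the timerfd hunk above. A minimal sketch:

#include <linux/atomic.h>
#include <linux/delay.h>

static void demo_wait_for_release(atomic_t *busy)
{
	while (atomic_read(busy))
		cpu_chill();	/* spin hint on !RT, short sleep on RT */
}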
b3bbd485
JK
12002diff --git a/include/linux/fs.h b/include/linux/fs.h
12003index cc613f20e5a6..b806e2116f5c 100644
12004--- a/include/linux/fs.h
12005+++ b/include/linux/fs.h
12006@@ -655,7 +655,7 @@ struct inode {
e4b2b4a8
JK
12007 struct block_device *i_bdev;
12008 struct cdev *i_cdev;
12009 char *i_link;
12010- unsigned i_dir_seq;
12011+ unsigned __i_dir_seq;
12012 };
12013
12014 __u32 i_generation;
b3bbd485
JK
12015diff --git a/include/linux/highmem.h b/include/linux/highmem.h
12016index 776f90f3a1cd..5f0bd7a3e6a7 100644
12017--- a/include/linux/highmem.h
12018+++ b/include/linux/highmem.h
e4b2b4a8
JK
12019@@ -8,6 +8,7 @@
12020 #include <linux/mm.h>
12021 #include <linux/uaccess.h>
12022 #include <linux/hardirq.h>
12023+#include <linux/sched.h>
12024
12025 #include <asm/cacheflush.h>
12026
b3bbd485 12027@@ -66,7 +67,7 @@ static inline void kunmap(struct page *page)
e4b2b4a8
JK
12028
12029 static inline void *kmap_atomic(struct page *page)
12030 {
12031- preempt_disable();
12032+ preempt_disable_nort();
12033 pagefault_disable();
12034 return page_address(page);
12035 }
b3bbd485 12036@@ -75,7 +76,7 @@ static inline void *kmap_atomic(struct page *page)
e4b2b4a8
JK
12037 static inline void __kunmap_atomic(void *addr)
12038 {
12039 pagefault_enable();
12040- preempt_enable();
12041+ preempt_enable_nort();
12042 }
12043
12044 #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn))
b3bbd485 12045@@ -87,32 +88,51 @@ static inline void __kunmap_atomic(void *addr)
e4b2b4a8
JK
12046
12047 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
12048
12049+#ifndef CONFIG_PREEMPT_RT_FULL
12050 DECLARE_PER_CPU(int, __kmap_atomic_idx);
12051+#endif
12052
12053 static inline int kmap_atomic_idx_push(void)
12054 {
12055+#ifndef CONFIG_PREEMPT_RT_FULL
12056 int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
12057
12058-#ifdef CONFIG_DEBUG_HIGHMEM
12059+# ifdef CONFIG_DEBUG_HIGHMEM
12060 WARN_ON_ONCE(in_irq() && !irqs_disabled());
12061 BUG_ON(idx >= KM_TYPE_NR);
12062-#endif
12063+# endif
12064 return idx;
12065+#else
12066+ current->kmap_idx++;
12067+ BUG_ON(current->kmap_idx > KM_TYPE_NR);
12068+ return current->kmap_idx - 1;
12069+#endif
12070 }
12071
12072 static inline int kmap_atomic_idx(void)
12073 {
12074+#ifndef CONFIG_PREEMPT_RT_FULL
12075 return __this_cpu_read(__kmap_atomic_idx) - 1;
12076+#else
12077+ return current->kmap_idx - 1;
12078+#endif
12079 }
12080
12081 static inline void kmap_atomic_idx_pop(void)
12082 {
12083-#ifdef CONFIG_DEBUG_HIGHMEM
12084+#ifndef CONFIG_PREEMPT_RT_FULL
12085+# ifdef CONFIG_DEBUG_HIGHMEM
12086 int idx = __this_cpu_dec_return(__kmap_atomic_idx);
12087
12088 BUG_ON(idx < 0);
12089-#else
12090+# else
12091 __this_cpu_dec(__kmap_atomic_idx);
12092+# endif
12093+#else
12094+ current->kmap_idx--;
12095+# ifdef CONFIG_DEBUG_HIGHMEM
12096+ BUG_ON(current->kmap_idx < 0);
12097+# endif
12098 #endif
12099 }
12100
b3bbd485
JK
12101diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
12102index 012c37fdb688..3bd606859b0a 100644
12103--- a/include/linux/hrtimer.h
12104+++ b/include/linux/hrtimer.h
e4b2b4a8
JK
12105@@ -22,19 +22,42 @@
12106 #include <linux/percpu.h>
12107 #include <linux/timer.h>
12108 #include <linux/timerqueue.h>
12109+#include <linux/wait.h>
12110
12111 struct hrtimer_clock_base;
12112 struct hrtimer_cpu_base;
12113
12114 /*
12115 * Mode arguments of xxx_hrtimer functions:
12116+ *
12117+ * HRTIMER_MODE_ABS - Time value is absolute
12118+ * HRTIMER_MODE_REL - Time value is relative to now
12119+ * HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered
12120+ * when starting the timer)
12121+ * HRTIMER_MODE_SOFT - Timer callback function will be executed in
12122+ * soft irq context
12123 */
12124 enum hrtimer_mode {
12125- HRTIMER_MODE_ABS = 0x0, /* Time value is absolute */
12126- HRTIMER_MODE_REL = 0x1, /* Time value is relative to now */
12127- HRTIMER_MODE_PINNED = 0x02, /* Timer is bound to CPU */
12128- HRTIMER_MODE_ABS_PINNED = 0x02,
12129- HRTIMER_MODE_REL_PINNED = 0x03,
12130+ HRTIMER_MODE_ABS = 0x00,
12131+ HRTIMER_MODE_REL = 0x01,
12132+ HRTIMER_MODE_PINNED = 0x02,
12133+ HRTIMER_MODE_SOFT = 0x04,
12134+ HRTIMER_MODE_HARD = 0x08,
12135+
12136+ HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
12137+ HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
12138+
12139+ HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT,
12140+ HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT,
12141+
12142+ HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,
12143+ HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,
12144+
12145+ HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD,
12146+ HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD,
12147+
12148+ HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD,
12149+ HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
12150 };
12151
12152 /*
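With the enum above the mode arguments are genuine flag bits, so ABS/REL combine with PINNED, SOFT and HARD by OR-ing; the named values are just the common combinations spelled out (and, per the hrtimer_start() comment later in this file, passing a SOFT combination directly is meant for debugging only). A hedged sketch arming a relative, CPU-pinned timer (timer and callback names are illustrative):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	return HRTIMER_NORESTART;
}

static void demo_arm_timer(void)
{
	/* HRTIMER_MODE_REL_PINNED == HRTIMER_MODE_REL | HRTIMER_MODE_PINNED */
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	demo_timer.function = demo_timer_fn;
	hrtimer_start(&demo_timer, ms_to_ktime(10), HRTIMER_MODE_REL_PINNED);
}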
b3bbd485 12153@@ -87,6 +110,7 @@ enum hrtimer_restart {
e4b2b4a8
JK
12154 * @base: pointer to the timer base (per cpu and per clock)
12155 * @state: state information (See bit values above)
12156 * @is_rel: Set if the timer was armed relative
12157+ * @is_soft: Set if hrtimer will be expired in soft interrupt context.
12158 *
12159 * The hrtimer structure must be initialized by hrtimer_init()
12160 */
b3bbd485 12161@@ -97,6 +121,7 @@ struct hrtimer {
e4b2b4a8
JK
12162 struct hrtimer_clock_base *base;
12163 u8 state;
12164 u8 is_rel;
12165+ u8 is_soft;
12166 };
12167
12168 /**
b3bbd485 12169@@ -112,9 +137,9 @@ struct hrtimer_sleeper {
e4b2b4a8
JK
12170 };
12171
12172 #ifdef CONFIG_64BIT
12173-# define HRTIMER_CLOCK_BASE_ALIGN 64
12174+# define __hrtimer_clock_base_align ____cacheline_aligned
12175 #else
12176-# define HRTIMER_CLOCK_BASE_ALIGN 32
12177+# define __hrtimer_clock_base_align
12178 #endif
12179
12180 /**
b3bbd485 12181@@ -123,48 +148,57 @@ struct hrtimer_sleeper {
e4b2b4a8
JK
12182 * @index: clock type index for per_cpu support when moving a
12183 * timer to a base on another cpu.
12184 * @clockid: clock id for per_cpu support
12185+ * @seq: seqcount around __run_hrtimer
12186+ * @running: pointer to the currently running hrtimer
12187 * @active: red black tree root node for the active timers
12188 * @get_time: function to retrieve the current time of the clock
12189 * @offset: offset of this clock to the monotonic base
12190 */
12191 struct hrtimer_clock_base {
12192 struct hrtimer_cpu_base *cpu_base;
12193- int index;
12194+ unsigned int index;
12195 clockid_t clockid;
12196+ seqcount_t seq;
12197+ struct hrtimer *running;
12198 struct timerqueue_head active;
12199 ktime_t (*get_time)(void);
12200 ktime_t offset;
12201-} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
12202+} __hrtimer_clock_base_align;
12203
12204 enum hrtimer_base_type {
12205 HRTIMER_BASE_MONOTONIC,
12206 HRTIMER_BASE_REALTIME,
12207 HRTIMER_BASE_BOOTTIME,
12208 HRTIMER_BASE_TAI,
12209+ HRTIMER_BASE_MONOTONIC_SOFT,
12210+ HRTIMER_BASE_REALTIME_SOFT,
12211+ HRTIMER_BASE_BOOTTIME_SOFT,
12212+ HRTIMER_BASE_TAI_SOFT,
12213 HRTIMER_MAX_CLOCK_BASES,
12214 };
12215
12216-/*
12217+/**
12218 * struct hrtimer_cpu_base - the per cpu clock bases
12219 * @lock: lock protecting the base and associated clock bases
12220 * and timers
12221- * @seq: seqcount around __run_hrtimer
12222- * @running: pointer to the currently running hrtimer
12223 * @cpu: cpu number
12224 * @active_bases: Bitfield to mark bases with active timers
12225 * @clock_was_set_seq: Sequence counter of clock was set events
12226- * @migration_enabled: The migration of hrtimers to other cpus is enabled
12227- * @nohz_active: The nohz functionality is enabled
12228- * @expires_next: absolute time of the next event which was scheduled
12229- * via clock_set_next_event()
12230- * @next_timer: Pointer to the first expiring timer
12231- * @in_hrtirq: hrtimer_interrupt() is currently executing
12232 * @hres_active: State of high resolution mode
12233+ * @in_hrtirq: hrtimer_interrupt() is currently executing
12234 * @hang_detected: The last hrtimer interrupt detected a hang
12235+ * @softirq_activated: displays, if the softirq is raised - update of softirq
12236+ * related settings is not required then.
12237 * @nr_events: Total number of hrtimer interrupt events
12238 * @nr_retries: Total number of hrtimer interrupt retries
12239 * @nr_hangs: Total number of hrtimer interrupt hangs
12240 * @max_hang_time: Maximum time spent in hrtimer_interrupt
12241+ * @expires_next: absolute time of the next event, is required for remote
12242+ * hrtimer enqueue; it is the total first expiry time (hard
12243+ * and soft hrtimer are taken into account)
12244+ * @next_timer: Pointer to the first expiring timer
12245+ * @softirq_expires_next: Time to check, if soft queues needs also to be expired
12246+ * @softirq_next_timer: Pointer to the first expiring softirq based timer
12247 * @clock_base: array of clock bases for this cpu
12248 *
12249 * Note: next_timer is just an optimization for __remove_hrtimer().
b3bbd485 12250@@ -173,31 +207,31 @@ enum hrtimer_base_type {
e4b2b4a8
JK
12251 */
12252 struct hrtimer_cpu_base {
12253 raw_spinlock_t lock;
12254- seqcount_t seq;
12255- struct hrtimer *running;
12256 unsigned int cpu;
12257 unsigned int active_bases;
12258 unsigned int clock_was_set_seq;
12259- bool migration_enabled;
12260- bool nohz_active;
12261+ unsigned int hres_active : 1,
12262+ in_hrtirq : 1,
12263+ hang_detected : 1,
12264+ softirq_activated : 1;
12265 #ifdef CONFIG_HIGH_RES_TIMERS
12266- unsigned int in_hrtirq : 1,
12267- hres_active : 1,
12268- hang_detected : 1;
12269- ktime_t expires_next;
12270- struct hrtimer *next_timer;
12271 unsigned int nr_events;
12272- unsigned int nr_retries;
12273- unsigned int nr_hangs;
12274+ unsigned short nr_retries;
12275+ unsigned short nr_hangs;
12276 unsigned int max_hang_time;
12277 #endif
12278+ ktime_t expires_next;
12279+ struct hrtimer *next_timer;
12280+ ktime_t softirq_expires_next;
12281+#ifdef CONFIG_PREEMPT_RT_BASE
12282+ wait_queue_head_t wait;
12283+#endif
12284+ struct hrtimer *softirq_next_timer;
12285 struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
12286 } ____cacheline_aligned;
12287
12288 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
12289 {
12290- BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN);
12291-
12292 timer->node.expires = time;
12293 timer->_softexpires = time;
12294 }
b3bbd485 12295@@ -266,16 +300,17 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
e4b2b4a8
JK
12296 return timer->base->get_time();
12297 }
12298
12299+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
12300+{
12301+ return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
12302+ timer->base->cpu_base->hres_active : 0;
12303+}
12304+
12305 #ifdef CONFIG_HIGH_RES_TIMERS
12306 struct clock_event_device;
12307
12308 extern void hrtimer_interrupt(struct clock_event_device *dev);
12309
12310-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
12311-{
12312- return timer->base->cpu_base->hres_active;
12313-}
12314-
12315 /*
12316 * The resolution of the clocks. The resolution value is returned in
12317 * the clock_getres() system call to give application programmers an
b3bbd485 12318@@ -298,11 +333,6 @@ extern unsigned int hrtimer_resolution;
e4b2b4a8
JK
12319
12320 #define hrtimer_resolution (unsigned int)LOW_RES_NSEC
12321
12322-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
12323-{
12324- return 0;
12325-}
12326-
12327 static inline void clock_was_set_delayed(void) { }
12328
12329 #endif
b3bbd485 12330@@ -344,10 +374,17 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
e4b2b4a8
JK
12331 /* Initialize timers: */
12332 extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
12333 enum hrtimer_mode mode);
12334+extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
12335+ enum hrtimer_mode mode,
12336+ struct task_struct *task);
12337
12338 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
12339 extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
12340 enum hrtimer_mode mode);
12341+extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
12342+ clockid_t clock_id,
12343+ enum hrtimer_mode mode,
12344+ struct task_struct *task);
12345
12346 extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
12347 #else
b3bbd485 12348@@ -357,6 +394,15 @@ static inline void hrtimer_init_on_stack(struct hrtimer *timer,
e4b2b4a8
JK
12349 {
12350 hrtimer_init(timer, which_clock, mode);
12351 }
12352+
12353+static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
12354+ clockid_t clock_id,
12355+ enum hrtimer_mode mode,
12356+ struct task_struct *task)
12357+{
12358+ hrtimer_init_sleeper(sl, clock_id, mode, task);
12359+}
12360+
12361 static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
12362 #endif
12363
b3bbd485 12364@@ -365,11 +411,12 @@ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
e4b2b4a8
JK
12365 u64 range_ns, const enum hrtimer_mode mode);
12366
12367 /**
12368- * hrtimer_start - (re)start an hrtimer on the current CPU
12369+ * hrtimer_start - (re)start an hrtimer
12370 * @timer: the timer to be added
12371 * @tim: expiry time
12372- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
12373- * relative (HRTIMER_MODE_REL)
12374+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
12375+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
12376+ * softirq based mode is considered for debug purpose only!
12377 */
12378 static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
12379 const enum hrtimer_mode mode)
b3bbd485 12380@@ -396,6 +443,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
e4b2b4a8
JK
12381 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
12382 }
12383
12384+/* Softirq preemption could deadlock timer removal */
12385+#ifdef CONFIG_PREEMPT_RT_BASE
12386+ extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
12387+#else
12388+# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
12389+#endif
12390+
12391 /* Query timers: */
12392 extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
12393
b3bbd485 12394@@ -420,9 +474,9 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
e4b2b4a8
JK
12395 * Helper function to check, whether the timer is running the callback
12396 * function
12397 */
12398-static inline int hrtimer_callback_running(struct hrtimer *timer)
12399+static inline int hrtimer_callback_running(const struct hrtimer *timer)
12400 {
12401- return timer->base->cpu_base->running == timer;
12402+ return timer->base->running == timer;
12403 }
12404
12405 /* Forward a hrtimer so it expires after now: */
b3bbd485 12406@@ -458,15 +512,12 @@ extern long hrtimer_nanosleep(const struct timespec64 *rqtp,
e4b2b4a8
JK
12407 const enum hrtimer_mode mode,
12408 const clockid_t clockid);
12409
12410-extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
12411- struct task_struct *tsk);
12412-
12413 extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
12414 const enum hrtimer_mode mode);
12415 extern int schedule_hrtimeout_range_clock(ktime_t *expires,
12416 u64 delta,
12417 const enum hrtimer_mode mode,
12418- int clock);
12419+ clockid_t clock_id);
12420 extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
12421
12422 /* Soft interrupt function to run the hrtimer queues: */
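On RT a task that removes an hrtimer whose callback is running in the softirq thread must not spin-wait, or it can livelock against the preempted softirq thread; that is what hrtimer_wait_for_timer() above is for, and why the timerfd hunk earlier replaces cpu_relax() with it. A hedged sketch of the cancel-and-wait pattern (function name illustrative):

static void demo_stop_timer(struct hrtimer *timer)
{
	/* hrtimer_try_to_cancel() returns -1 while the callback runs;
	 * hrtimer_wait_for_timer() sleeps on RT instead of spinning. */
	while (hrtimer_try_to_cancel(timer) < 0)
		hrtimer_wait_for_timer(timer);
}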
b3bbd485
JK
12423diff --git a/include/linux/idr.h b/include/linux/idr.h
12424index 7c3a365f7e12..a922d984d9b6 100644
12425--- a/include/linux/idr.h
12426+++ b/include/linux/idr.h
12427@@ -167,10 +167,7 @@ static inline bool idr_is_empty(const struct idr *idr)
e4b2b4a8
JK
12428 * Each idr_preload() should be matched with an invocation of this
12429 * function. See idr_preload() for details.
12430 */
12431-static inline void idr_preload_end(void)
12432-{
12433- preempt_enable();
12434-}
12435+void idr_preload_end(void);
12436
12437 /**
12438 * idr_find - return pointer for given id
b3bbd485
JK
12439diff --git a/include/linux/init_task.h b/include/linux/init_task.h
12440index 8062e6cc607c..ee3ff961b84c 100644
12441--- a/include/linux/init_task.h
12442+++ b/include/linux/init_task.h
12443@@ -163,6 +163,12 @@ extern struct cred init_cred;
e4b2b4a8
JK
12444 # define INIT_PERF_EVENTS(tsk)
12445 #endif
12446
12447+#if defined(CONFIG_POSIX_TIMERS) && defined(CONFIG_PREEMPT_RT_BASE)
12448+# define INIT_TIMER_LIST .posix_timer_list = NULL,
12449+#else
12450+# define INIT_TIMER_LIST
12451+#endif
12452+
12453 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
12454 # define INIT_VTIME(tsk) \
12455 .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount), \
b3bbd485 12456@@ -234,7 +240,8 @@ extern struct cred init_cred;
e4b2b4a8
JK
12457 .static_prio = MAX_PRIO-20, \
12458 .normal_prio = MAX_PRIO-20, \
12459 .policy = SCHED_NORMAL, \
12460- .cpus_allowed = CPU_MASK_ALL, \
12461+ .cpus_ptr = &tsk.cpus_mask, \
12462+ .cpus_mask = CPU_MASK_ALL, \
12463 .nr_cpus_allowed= NR_CPUS, \
12464 .mm = NULL, \
12465 .active_mm = &init_mm, \
b3bbd485 12466@@ -276,6 +283,7 @@ extern struct cred init_cred;
e4b2b4a8
JK
12467 INIT_CPU_TIMERS(tsk) \
12468 .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
12469 .timer_slack_ns = 50000, /* 50 usec default slack */ \
12470+ INIT_TIMER_LIST \
12471 .pids = { \
12472 [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
12473 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
b3bbd485
JK
12474diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
12475index 69c238210325..0f25fa19b2d8 100644
12476--- a/include/linux/interrupt.h
12477+++ b/include/linux/interrupt.h
e4b2b4a8
JK
12478@@ -15,6 +15,7 @@
12479 #include <linux/hrtimer.h>
12480 #include <linux/kref.h>
12481 #include <linux/workqueue.h>
12482+#include <linux/swork.h>
12483
12484 #include <linux/atomic.h>
12485 #include <asm/ptrace.h>
12486@@ -63,6 +64,7 @@
12487 * interrupt handler after suspending interrupts. For system
12488 * wakeup devices users need to implement wakeup detection in
12489 * their interrupt handlers.
12490+ * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
12491 */
12492 #define IRQF_SHARED 0x00000080
12493 #define IRQF_PROBE_SHARED 0x00000100
12494@@ -76,6 +78,7 @@
12495 #define IRQF_NO_THREAD 0x00010000
12496 #define IRQF_EARLY_RESUME 0x00020000
12497 #define IRQF_COND_SUSPEND 0x00040000
12498+#define IRQF_NO_SOFTIRQ_CALL 0x00080000
12499
12500 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
12501
b3bbd485 12502@@ -207,7 +210,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
e4b2b4a8
JK
12503 #ifdef CONFIG_LOCKDEP
12504 # define local_irq_enable_in_hardirq() do { } while (0)
12505 #else
12506-# define local_irq_enable_in_hardirq() local_irq_enable()
12507+# define local_irq_enable_in_hardirq() local_irq_enable_nort()
12508 #endif
12509
12510 extern void disable_irq_nosync(unsigned int irq);
b3bbd485 12511@@ -227,6 +230,7 @@ extern void resume_device_irqs(void);
e4b2b4a8
JK
12512 * struct irq_affinity_notify - context for notification of IRQ affinity changes
12513 * @irq: Interrupt to which notification applies
12514 * @kref: Reference count, for internal use
12515+ * @swork: Swork item, for internal use
12516 * @work: Work item, for internal use
12517 * @notify: Function to be called on change. This will be
12518 * called in process context.
b3bbd485 12519@@ -238,7 +242,11 @@ extern void resume_device_irqs(void);
e4b2b4a8
JK
12520 struct irq_affinity_notify {
12521 unsigned int irq;
12522 struct kref kref;
12523+#ifdef CONFIG_PREEMPT_RT_BASE
12524+ struct swork_event swork;
12525+#else
12526 struct work_struct work;
12527+#endif
12528 void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
12529 void (*release)(struct kref *ref);
12530 };
b3bbd485 12531@@ -429,9 +437,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
e4b2b4a8
JK
12532 bool state);
12533
12534 #ifdef CONFIG_IRQ_FORCED_THREADING
12535+# ifndef CONFIG_PREEMPT_RT_BASE
12536 extern bool force_irqthreads;
12537+# else
12538+# define force_irqthreads (true)
12539+# endif
12540 #else
12541-#define force_irqthreads (0)
12542+#define force_irqthreads (false)
12543 #endif
12544
12545 #ifndef __ARCH_SET_SOFTIRQ_PENDING
b3bbd485 12546@@ -488,9 +500,10 @@ struct softirq_action
e4b2b4a8
JK
12547 void (*action)(struct softirq_action *);
12548 };
12549
12550+#ifndef CONFIG_PREEMPT_RT_FULL
12551 asmlinkage void do_softirq(void);
12552 asmlinkage void __do_softirq(void);
12553-
12554+static inline void thread_do_softirq(void) { do_softirq(); }
12555 #ifdef __ARCH_HAS_DO_SOFTIRQ
12556 void do_softirq_own_stack(void);
12557 #else
b3bbd485 12558@@ -499,13 +512,25 @@ static inline void do_softirq_own_stack(void)
e4b2b4a8
JK
12559 __do_softirq();
12560 }
12561 #endif
12562+#else
12563+extern void thread_do_softirq(void);
12564+#endif
12565
12566 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
12567 extern void softirq_init(void);
12568 extern void __raise_softirq_irqoff(unsigned int nr);
12569+#ifdef CONFIG_PREEMPT_RT_FULL
12570+extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
12571+#else
12572+static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
12573+{
12574+ __raise_softirq_irqoff(nr);
12575+}
12576+#endif
12577
12578 extern void raise_softirq_irqoff(unsigned int nr);
12579 extern void raise_softirq(unsigned int nr);
12580+extern void softirq_check_pending_idle(void);
12581
12582 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
12583
b3bbd485 12584@@ -527,8 +552,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
e4b2b4a8
JK
12585 to be executed on some cpu at least once after this.
12586 * If the tasklet is already scheduled, but its execution is still not
12587 started, it will be executed only once.
12588- * If this tasklet is already running on another CPU (or schedule is called
12589- from tasklet itself), it is rescheduled for later.
12590+ * If this tasklet is already running on another CPU, it is rescheduled
12591+ for later.
12592+ * Schedule must not be called from the tasklet itself (a lockup occurs)
12593 * Tasklet is strictly serialized wrt itself, but not
12594 wrt another tasklets. If client needs some intertask synchronization,
12595 he makes it with spinlocks.
b3bbd485 12596@@ -553,27 +579,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
e4b2b4a8
JK
12597 enum
12598 {
12599 TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */
12600- TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
12601+ TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */
12602+ TASKLET_STATE_PENDING /* Tasklet is pending */
12603 };
12604
12605-#ifdef CONFIG_SMP
12606+#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED)
12607+#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN)
12608+#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
12609+
12610+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
12611 static inline int tasklet_trylock(struct tasklet_struct *t)
12612 {
12613 return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
12614 }
12615
12616+static inline int tasklet_tryunlock(struct tasklet_struct *t)
12617+{
12618+ return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
12619+}
12620+
12621 static inline void tasklet_unlock(struct tasklet_struct *t)
12622 {
12623 smp_mb__before_atomic();
12624 clear_bit(TASKLET_STATE_RUN, &(t)->state);
12625 }
12626
12627-static inline void tasklet_unlock_wait(struct tasklet_struct *t)
12628-{
12629- while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
12630-}
12631+extern void tasklet_unlock_wait(struct tasklet_struct *t);
12632+
12633 #else
12634 #define tasklet_trylock(t) 1
12635+#define tasklet_tryunlock(t) 1
12636 #define tasklet_unlock_wait(t) do { } while (0)
12637 #define tasklet_unlock(t) do { } while (0)
12638 #endif
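tasklet_tryunlock() clears TASKLET_STATE_RUN only if no other state bit was set while the handler ran; if it fails, the tasklet was re-raised (or disabled) in the meantime and has to be looked at again, which is how the RT tasklet code implements the "rescheduled for later" rule from the comment above. A simplified, hedged sketch of the runner-side idiom (not the actual kernel/softirq.c code):

static bool demo_run_tasklet_once(struct tasklet_struct *t)
{
	if (!tasklet_trylock(t))
		return false;		/* running on another CPU */

	if (!atomic_read(&t->count) &&
	    test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
		t->func(t->data);

	/* Fails if e.g. TASKLET_STATE_SCHED was set again while the
	 * handler ran; the caller must then process the tasklet again. */
	return tasklet_tryunlock(t);
}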
b3bbd485 12639@@ -607,41 +642,17 @@ static inline void tasklet_disable(struct tasklet_struct *t)
e4b2b4a8
JK
12640 smp_mb();
12641 }
12642
12643-static inline void tasklet_enable(struct tasklet_struct *t)
12644-{
12645- smp_mb__before_atomic();
12646- atomic_dec(&t->count);
12647-}
12648-
12649+extern void tasklet_enable(struct tasklet_struct *t);
12650 extern void tasklet_kill(struct tasklet_struct *t);
12651 extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
12652 extern void tasklet_init(struct tasklet_struct *t,
12653 void (*func)(unsigned long), unsigned long data);
12654
12655-struct tasklet_hrtimer {
12656- struct hrtimer timer;
12657- struct tasklet_struct tasklet;
12658- enum hrtimer_restart (*function)(struct hrtimer *);
12659-};
12660-
12661-extern void
12662-tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
12663- enum hrtimer_restart (*function)(struct hrtimer *),
12664- clockid_t which_clock, enum hrtimer_mode mode);
12665-
12666-static inline
12667-void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
12668- const enum hrtimer_mode mode)
12669-{
12670- hrtimer_start(&ttimer->timer, time, mode);
12671-}
12672-
12673-static inline
12674-void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
12675-{
12676- hrtimer_cancel(&ttimer->timer);
12677- tasklet_kill(&ttimer->tasklet);
12678-}
12679+#ifdef CONFIG_PREEMPT_RT_FULL
12680+extern void softirq_early_init(void);
12681+#else
12682+static inline void softirq_early_init(void) { }
12683+#endif
12684
12685 /*
12686 * Autoprobing for irqs:
b3bbd485
JK
12687diff --git a/include/linux/irq.h b/include/linux/irq.h
12688index 0d53626405bf..ddd23c6e2e55 100644
12689--- a/include/linux/irq.h
12690+++ b/include/linux/irq.h
12691@@ -74,6 +74,7 @@ enum irqchip_irq_state;
12692 * IRQ_IS_POLLED - Always polled by another interrupt. Exclude
12693 * it from the spurious interrupt detection
12694 * mechanism and from core side polling.
12695+ * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT)
12696 * IRQ_DISABLE_UNLAZY - Disable lazy irq disable
12697 */
12698 enum {
12699@@ -101,13 +102,14 @@ enum {
12700 IRQ_PER_CPU_DEVID = (1 << 17),
12701 IRQ_IS_POLLED = (1 << 18),
12702 IRQ_DISABLE_UNLAZY = (1 << 19),
12703+ IRQ_NO_SOFTIRQ_CALL = (1 << 20),
12704 };
12705
12706 #define IRQF_MODIFY_MASK \
12707 (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
12708 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
12709 IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
12710- IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
12711+ IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
12712
12713 #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
12714
12715diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
12716index 9270d73ea682..1e66fac6f1d2 100644
12717--- a/include/linux/irq_work.h
12718+++ b/include/linux/irq_work.h
12719@@ -17,6 +17,7 @@
12720 #define IRQ_WORK_BUSY 2UL
12721 #define IRQ_WORK_FLAGS 3UL
12722 #define IRQ_WORK_LAZY 4UL /* Doesn't want IPI, wait for tick */
12723+#define IRQ_WORK_HARD_IRQ 8UL /* Run hard IRQ context, even on RT */
12724
12725 struct irq_work {
12726 unsigned long flags;
12727@@ -52,4 +53,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
12728 static inline void irq_work_run(void) { }
12729 #endif
12730
12731+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12732+void irq_work_tick_soft(void);
12733+#else
12734+static inline void irq_work_tick_soft(void) { }
12735+#endif
12736+
12737 #endif /* _LINUX_IRQ_WORK_H */
12738diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
12739index bacb499c512c..688f2565294c 100644
12740--- a/include/linux/irqchip/arm-gic-v3.h
12741+++ b/include/linux/irqchip/arm-gic-v3.h
12742@@ -568,6 +568,7 @@ struct rdists {
12743 void __iomem *rd_base;
12744 struct page *pend_page;
12745 phys_addr_t phys_base;
12746+ bool lpi_enabled;
12747 } __percpu *rdist;
12748 struct page *prop_page;
12749 int id_bits;
12750diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
12751index b6084898d330..d334476cdca6 100644
12752--- a/include/linux/irqdesc.h
12753+++ b/include/linux/irqdesc.h
12754@@ -70,6 +70,7 @@ struct irq_desc {
e4b2b4a8
JK
12755 unsigned int irqs_unhandled;
12756 atomic_t threads_handled;
12757 int threads_handled_last;
12758+ u64 random_ip;
12759 raw_spinlock_t lock;
12760 struct cpumask *percpu_enabled;
12761 const struct cpumask *percpu_affinity;
b3bbd485
JK
12762diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
12763index 46cb57d5eb13..2e023bfe45af 100644
12764--- a/include/linux/irqflags.h
12765+++ b/include/linux/irqflags.h
12766@@ -34,16 +34,6 @@ do { \
e4b2b4a8
JK
12767 current->hardirq_context--; \
12768 crossrelease_hist_end(XHLOCK_HARD); \
12769 } while (0)
12770-# define lockdep_softirq_enter() \
12771-do { \
12772- current->softirq_context++; \
12773- crossrelease_hist_start(XHLOCK_SOFT); \
12774-} while (0)
12775-# define lockdep_softirq_exit() \
12776-do { \
12777- current->softirq_context--; \
12778- crossrelease_hist_end(XHLOCK_SOFT); \
12779-} while (0)
12780 # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
12781 #else
12782 # define trace_hardirqs_on() do { } while (0)
b3bbd485 12783@@ -56,9 +46,23 @@ do { \
e4b2b4a8
JK
12784 # define trace_softirqs_enabled(p) 0
12785 # define trace_hardirq_enter() do { } while (0)
12786 # define trace_hardirq_exit() do { } while (0)
12787+# define INIT_TRACE_IRQFLAGS
12788+#endif
12789+
12790+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
12791+# define lockdep_softirq_enter() \
12792+do { \
12793+ current->softirq_context++; \
12794+ crossrelease_hist_start(XHLOCK_SOFT); \
12795+} while (0)
12796+# define lockdep_softirq_exit() \
12797+do { \
12798+ current->softirq_context--; \
12799+ crossrelease_hist_end(XHLOCK_SOFT); \
12800+} while (0)
12801+#else
12802 # define lockdep_softirq_enter() do { } while (0)
12803 # define lockdep_softirq_exit() do { } while (0)
12804-# define INIT_TRACE_IRQFLAGS
12805 #endif
12806
12807 #if defined(CONFIG_IRQSOFF_TRACER) || \
b3bbd485 12808@@ -165,4 +169,23 @@ do { \
e4b2b4a8
JK
12809
12810 #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
12811
12812+/*
12813+ * local_irq* variants depending on RT/!RT
12814+ */
12815+#ifdef CONFIG_PREEMPT_RT_FULL
12816+# define local_irq_disable_nort() do { } while (0)
12817+# define local_irq_enable_nort() do { } while (0)
12818+# define local_irq_save_nort(flags) local_save_flags(flags)
12819+# define local_irq_restore_nort(flags) (void)(flags)
12820+# define local_irq_disable_rt() local_irq_disable()
12821+# define local_irq_enable_rt() local_irq_enable()
12822+#else
12823+# define local_irq_disable_nort() local_irq_disable()
12824+# define local_irq_enable_nort() local_irq_enable()
12825+# define local_irq_save_nort(flags) local_irq_save(flags)
12826+# define local_irq_restore_nort(flags) local_irq_restore(flags)
12827+# define local_irq_disable_rt() do { } while (0)
12828+# define local_irq_enable_rt() do { } while (0)
12829+#endif
12830+
12831 #endif
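The *_nort() helpers keep the historical IRQ-off sections on mainline but reduce them to plain flag saves (or no-ops) on RT, for code that on RT is already serialized by a sleeping lock and must stay preemptible. This is what the ntfs hunk earlier uses around its kmap_atomic() sections; a minimal sketch:

#include <linux/highmem.h>
#include <linux/irqflags.h>
#include <linux/string.h>

static void demo_zero_page_tail(struct page *page, unsigned int ofs)
{
	unsigned long flags;
	void *kaddr;

	local_irq_save_nort(flags);	/* hard IRQ-off on !RT only */
	kaddr = kmap_atomic(page);
	memset(kaddr + ofs, 0, PAGE_SIZE - ofs);
	kunmap_atomic(kaddr);
	local_irq_restore_nort(flags);
}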
b3bbd485
JK
12832diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
12833index 29290bfb94a8..32379bfab9f0 100644
12834--- a/include/linux/jbd2.h
12835+++ b/include/linux/jbd2.h
12836@@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
e4b2b4a8
JK
12837
12838 static inline void jbd_lock_bh_state(struct buffer_head *bh)
12839 {
12840+#ifndef CONFIG_PREEMPT_RT_BASE
12841 bit_spin_lock(BH_State, &bh->b_state);
12842+#else
12843+ spin_lock(&bh->b_state_lock);
12844+#endif
12845 }
12846
12847 static inline int jbd_trylock_bh_state(struct buffer_head *bh)
12848 {
12849+#ifndef CONFIG_PREEMPT_RT_BASE
12850 return bit_spin_trylock(BH_State, &bh->b_state);
12851+#else
12852+ return spin_trylock(&bh->b_state_lock);
12853+#endif
12854 }
12855
12856 static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
12857 {
12858+#ifndef CONFIG_PREEMPT_RT_BASE
12859 return bit_spin_is_locked(BH_State, &bh->b_state);
12860+#else
12861+ return spin_is_locked(&bh->b_state_lock);
12862+#endif
12863 }
12864
12865 static inline void jbd_unlock_bh_state(struct buffer_head *bh)
12866 {
12867+#ifndef CONFIG_PREEMPT_RT_BASE
12868 bit_spin_unlock(BH_State, &bh->b_state);
12869+#else
12870+ spin_unlock(&bh->b_state_lock);
12871+#endif
12872 }
12873
12874 static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
12875 {
12876+#ifndef CONFIG_PREEMPT_RT_BASE
12877 bit_spin_lock(BH_JournalHead, &bh->b_state);
12878+#else
12879+ spin_lock(&bh->b_journal_head_lock);
12880+#endif
12881 }
12882
12883 static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
12884 {
12885+#ifndef CONFIG_PREEMPT_RT_BASE
12886 bit_spin_unlock(BH_JournalHead, &bh->b_state);
12887+#else
12888+ spin_unlock(&bh->b_journal_head_lock);
12889+#endif
12890 }
12891
12892 #define J_ASSERT(assert) BUG_ON(!(assert))
b3bbd485
JK
12893diff --git a/include/linux/kdb.h b/include/linux/kdb.h
12894index 68bd88223417..e033b25b0b72 100644
12895--- a/include/linux/kdb.h
12896+++ b/include/linux/kdb.h
12897@@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
e4b2b4a8
JK
12898 extern __printf(1, 2) int kdb_printf(const char *, ...);
12899 typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
12900
12901+#define in_kdb_printk() (kdb_trap_printk)
12902 extern void kdb_init(int level);
12903
12904 /* Access to kdb specific polling devices */
b3bbd485 12905@@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
e4b2b4a8
JK
12906 extern int kdb_unregister(char *);
12907 #else /* ! CONFIG_KGDB_KDB */
12908 static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
12909+#define in_kdb_printk() (0)
12910 static inline void kdb_init(int level) {}
12911 static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
12912 char *help, short minlen) { return 0; }
b3bbd485
JK
12913diff --git a/include/linux/kernel.h b/include/linux/kernel.h
12914index 4b484ab9e163..74feebf9d82c 100644
12915--- a/include/linux/kernel.h
12916+++ b/include/linux/kernel.h
12917@@ -225,6 +225,9 @@ extern int _cond_resched(void);
e4b2b4a8
JK
12918 */
12919 # define might_sleep() \
12920 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12921+
12922+# define might_sleep_no_state_check() \
12923+ do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12924 # define sched_annotate_sleep() (current->task_state_change = 0)
12925 #else
12926 static inline void ___might_sleep(const char *file, int line,
b3bbd485 12927@@ -232,6 +235,7 @@ extern int _cond_resched(void);
e4b2b4a8
JK
12928 static inline void __might_sleep(const char *file, int line,
12929 int preempt_offset) { }
12930 # define might_sleep() do { might_resched(); } while (0)
12931+# define might_sleep_no_state_check() do { might_resched(); } while (0)
12932 # define sched_annotate_sleep() do { } while (0)
12933 #endif
12934
b3bbd485 12935@@ -531,6 +535,7 @@ extern enum system_states {
e4b2b4a8
JK
12936 SYSTEM_HALT,
12937 SYSTEM_POWER_OFF,
12938 SYSTEM_RESTART,
12939+ SYSTEM_SUSPEND,
12940 } system_state;
12941
12942 #define TAINT_PROPRIETARY_MODULE 0
b3bbd485
JK
12943diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
12944index 3fc2cc57ba1b..0b5de7d9ffcf 100644
12945--- a/include/linux/list_bl.h
12946+++ b/include/linux/list_bl.h
e4b2b4a8
JK
12947@@ -3,6 +3,7 @@
12948 #define _LINUX_LIST_BL_H
12949
12950 #include <linux/list.h>
12951+#include <linux/spinlock.h>
12952 #include <linux/bit_spinlock.h>
12953
12954 /*
12955@@ -33,13 +34,24 @@
12956
12957 struct hlist_bl_head {
12958 struct hlist_bl_node *first;
12959+#ifdef CONFIG_PREEMPT_RT_BASE
12960+ raw_spinlock_t lock;
12961+#endif
12962 };
12963
12964 struct hlist_bl_node {
12965 struct hlist_bl_node *next, **pprev;
12966 };
12967-#define INIT_HLIST_BL_HEAD(ptr) \
12968- ((ptr)->first = NULL)
12969+
12970+#ifdef CONFIG_PREEMPT_RT_BASE
12971+#define INIT_HLIST_BL_HEAD(h) \
12972+do { \
12973+ (h)->first = NULL; \
12974+ raw_spin_lock_init(&(h)->lock); \
12975+} while (0)
12976+#else
12977+#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
12978+#endif
12979
12980 static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
12981 {
12982@@ -119,12 +131,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
12983
12984 static inline void hlist_bl_lock(struct hlist_bl_head *b)
12985 {
12986+#ifndef CONFIG_PREEMPT_RT_BASE
12987 bit_spin_lock(0, (unsigned long *)b);
12988+#else
12989+ raw_spin_lock(&b->lock);
12990+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
12991+ __set_bit(0, (unsigned long *)b);
12992+#endif
12993+#endif
12994 }
12995
12996 static inline void hlist_bl_unlock(struct hlist_bl_head *b)
12997 {
12998+#ifndef CONFIG_PREEMPT_RT_BASE
12999 __bit_spin_unlock(0, (unsigned long *)b);
13000+#else
13001+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
13002+ __clear_bit(0, (unsigned long *)b);
13003+#endif
13004+ raw_spin_unlock(&b->lock);
13005+#endif
13006 }
13007
13008 static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
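
Editor's note: the snippet below is an illustrative sketch, not part of the patch, of how a bit-locked hash bucket is used; the names example_bucket, example_node and example_bucket_insert() are hypothetical. The calls are identical on both kernels; only the lock implementation behind hlist_bl_lock() changes on PREEMPT_RT_BASE (raw_spinlock instead of bit 0 of the head pointer).

#include <linux/list_bl.h>

static struct hlist_bl_head example_bucket;	/* hypothetical bucket */
static struct hlist_bl_node example_node;	/* hypothetical entry */

static void example_bucket_insert(void)
{
	INIT_HLIST_BL_HEAD(&example_bucket);	/* also initializes ->lock on RT */
	INIT_HLIST_BL_NODE(&example_node);

	hlist_bl_lock(&example_bucket);		/* bit_spin_lock() or raw_spin_lock() */
	hlist_bl_add_head(&example_node, &example_bucket);
	hlist_bl_unlock(&example_bucket);
}
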
13009diff --git a/include/linux/locallock.h b/include/linux/locallock.h
13010new file mode 100644
13011index 000000000000..921eab83cd34
13012--- /dev/null
13013+++ b/include/linux/locallock.h
13014@@ -0,0 +1,281 @@
13015+#ifndef _LINUX_LOCALLOCK_H
13016+#define _LINUX_LOCALLOCK_H
13017+
13018+#include <linux/percpu.h>
13019+#include <linux/spinlock.h>
13020+
13021+#ifdef CONFIG_PREEMPT_RT_BASE
13022+
13023+#ifdef CONFIG_DEBUG_SPINLOCK
13024+# define LL_WARN(cond) WARN_ON(cond)
13025+#else
13026+# define LL_WARN(cond) do { } while (0)
13027+#endif
13028+
13029+/*
13030+ * per cpu lock based substitute for local_irq_*()
13031+ */
13032+struct local_irq_lock {
13033+ spinlock_t lock;
13034+ struct task_struct *owner;
13035+ int nestcnt;
13036+ unsigned long flags;
13037+};
13038+
13039+#define DEFINE_LOCAL_IRQ_LOCK(lvar) \
13040+ DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \
13041+ .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
13042+
13043+#define DECLARE_LOCAL_IRQ_LOCK(lvar) \
13044+ DECLARE_PER_CPU(struct local_irq_lock, lvar)
13045+
13046+#define local_irq_lock_init(lvar) \
13047+ do { \
13048+ int __cpu; \
13049+ for_each_possible_cpu(__cpu) \
13050+ spin_lock_init(&per_cpu(lvar, __cpu).lock); \
13051+ } while (0)
13052+
13053+static inline void __local_lock(struct local_irq_lock *lv)
13054+{
13055+ if (lv->owner != current) {
13056+ spin_lock(&lv->lock);
13057+ LL_WARN(lv->owner);
13058+ LL_WARN(lv->nestcnt);
13059+ lv->owner = current;
13060+ }
13061+ lv->nestcnt++;
13062+}
13063+
13064+#define local_lock(lvar) \
13065+ do { __local_lock(&get_local_var(lvar)); } while (0)
13066+
13067+#define local_lock_on(lvar, cpu) \
13068+ do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
13069+
13070+static inline int __local_trylock(struct local_irq_lock *lv)
13071+{
13072+ if (lv->owner != current && spin_trylock(&lv->lock)) {
13073+ LL_WARN(lv->owner);
13074+ LL_WARN(lv->nestcnt);
13075+ lv->owner = current;
13076+ lv->nestcnt = 1;
13077+ return 1;
13078+ } else if (lv->owner == current) {
13079+ lv->nestcnt++;
13080+ return 1;
13081+ }
13082+ return 0;
13083+}
13084+
13085+#define local_trylock(lvar) \
13086+ ({ \
13087+ int __locked; \
13088+ __locked = __local_trylock(&get_local_var(lvar)); \
13089+ if (!__locked) \
13090+ put_local_var(lvar); \
13091+ __locked; \
13092+ })
13093+
13094+static inline void __local_unlock(struct local_irq_lock *lv)
13095+{
13096+ LL_WARN(lv->nestcnt == 0);
13097+ LL_WARN(lv->owner != current);
13098+ if (--lv->nestcnt)
13099+ return;
13100+
13101+ lv->owner = NULL;
13102+ spin_unlock(&lv->lock);
13103+}
13104+
13105+#define local_unlock(lvar) \
13106+ do { \
13107+ __local_unlock(this_cpu_ptr(&lvar)); \
13108+ put_local_var(lvar); \
13109+ } while (0)
13110+
13111+#define local_unlock_on(lvar, cpu) \
13112+ do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
13113+
13114+static inline void __local_lock_irq(struct local_irq_lock *lv)
13115+{
13116+ spin_lock_irqsave(&lv->lock, lv->flags);
13117+ LL_WARN(lv->owner);
13118+ LL_WARN(lv->nestcnt);
13119+ lv->owner = current;
13120+ lv->nestcnt = 1;
13121+}
13122+
13123+#define local_lock_irq(lvar) \
13124+ do { __local_lock_irq(&get_local_var(lvar)); } while (0)
13125+
13126+#define local_lock_irq_on(lvar, cpu) \
13127+ do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
13128+
13129+static inline void __local_unlock_irq(struct local_irq_lock *lv)
13130+{
13131+ LL_WARN(!lv->nestcnt);
13132+ LL_WARN(lv->owner != current);
13133+ lv->owner = NULL;
13134+ lv->nestcnt = 0;
13135+ spin_unlock_irq(&lv->lock);
13136+}
13137+
13138+#define local_unlock_irq(lvar) \
13139+ do { \
13140+ __local_unlock_irq(this_cpu_ptr(&lvar)); \
13141+ put_local_var(lvar); \
13142+ } while (0)
13143+
13144+#define local_unlock_irq_on(lvar, cpu) \
13145+ do { \
13146+ __local_unlock_irq(&per_cpu(lvar, cpu)); \
13147+ } while (0)
13148+
13149+static inline int __local_lock_irqsave(struct local_irq_lock *lv)
13150+{
13151+ if (lv->owner != current) {
13152+ __local_lock_irq(lv);
13153+ return 0;
13154+ } else {
13155+ lv->nestcnt++;
13156+ return 1;
13157+ }
13158+}
13159+
13160+#define local_lock_irqsave(lvar, _flags) \
13161+ do { \
13162+ if (__local_lock_irqsave(&get_local_var(lvar))) \
13163+ put_local_var(lvar); \
13164+ _flags = __this_cpu_read(lvar.flags); \
13165+ } while (0)
13166+
13167+#define local_lock_irqsave_on(lvar, _flags, cpu) \
13168+ do { \
13169+ __local_lock_irqsave(&per_cpu(lvar, cpu)); \
13170+ _flags = per_cpu(lvar, cpu).flags; \
13171+ } while (0)
13172+
13173+static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
13174+ unsigned long flags)
13175+{
13176+ LL_WARN(!lv->nestcnt);
13177+ LL_WARN(lv->owner != current);
13178+ if (--lv->nestcnt)
13179+ return 0;
13180+
13181+ lv->owner = NULL;
13182+ spin_unlock_irqrestore(&lv->lock, lv->flags);
13183+ return 1;
13184+}
13185+
13186+#define local_unlock_irqrestore(lvar, flags) \
13187+ do { \
13188+ if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
13189+ put_local_var(lvar); \
13190+ } while (0)
13191+
13192+#define local_unlock_irqrestore_on(lvar, flags, cpu) \
13193+ do { \
13194+ __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \
13195+ } while (0)
13196+
13197+#define local_spin_trylock_irq(lvar, lock) \
13198+ ({ \
13199+ int __locked; \
13200+ local_lock_irq(lvar); \
13201+ __locked = spin_trylock(lock); \
13202+ if (!__locked) \
13203+ local_unlock_irq(lvar); \
13204+ __locked; \
13205+ })
13206+
13207+#define local_spin_lock_irq(lvar, lock) \
13208+ do { \
13209+ local_lock_irq(lvar); \
13210+ spin_lock(lock); \
13211+ } while (0)
13212+
13213+#define local_spin_unlock_irq(lvar, lock) \
13214+ do { \
13215+ spin_unlock(lock); \
13216+ local_unlock_irq(lvar); \
13217+ } while (0)
13218+
13219+#define local_spin_lock_irqsave(lvar, lock, flags) \
13220+ do { \
13221+ local_lock_irqsave(lvar, flags); \
13222+ spin_lock(lock); \
13223+ } while (0)
13224+
13225+#define local_spin_unlock_irqrestore(lvar, lock, flags) \
13226+ do { \
13227+ spin_unlock(lock); \
13228+ local_unlock_irqrestore(lvar, flags); \
13229+ } while (0)
13230+
13231+#define get_locked_var(lvar, var) \
13232+ (*({ \
13233+ local_lock(lvar); \
13234+ this_cpu_ptr(&var); \
13235+ }))
13236+
13237+#define put_locked_var(lvar, var) local_unlock(lvar);
13238+
13239+#define get_locked_ptr(lvar, var) \
13240+ ({ \
13241+ local_lock(lvar); \
13242+ this_cpu_ptr(var); \
13243+ })
13244+
13245+#define put_locked_ptr(lvar, var) local_unlock(lvar);
13246+
13247+#define local_lock_cpu(lvar) \
13248+ ({ \
13249+ local_lock(lvar); \
13250+ smp_processor_id(); \
13251+ })
13252+
13253+#define local_unlock_cpu(lvar) local_unlock(lvar)
13254+
13255+#else /* PREEMPT_RT_BASE */
13256+
13257+#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar
13258+#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar
13259+
13260+static inline void local_irq_lock_init(int lvar) { }
13261+
13262+#define local_trylock(lvar) \
13263+ ({ \
13264+ preempt_disable(); \
13265+ 1; \
13266+ })
13267+
13268+#define local_lock(lvar) preempt_disable()
13269+#define local_unlock(lvar) preempt_enable()
13270+#define local_lock_irq(lvar) local_irq_disable()
13271+#define local_lock_irq_on(lvar, cpu) local_irq_disable()
13272+#define local_unlock_irq(lvar) local_irq_enable()
13273+#define local_unlock_irq_on(lvar, cpu) local_irq_enable()
13274+#define local_lock_irqsave(lvar, flags) local_irq_save(flags)
13275+#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags)
13276+
13277+#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock)
13278+#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock)
13279+#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock)
13280+#define local_spin_lock_irqsave(lvar, lock, flags) \
13281+ spin_lock_irqsave(lock, flags)
13282+#define local_spin_unlock_irqrestore(lvar, lock, flags) \
13283+ spin_unlock_irqrestore(lock, flags)
13284+
13285+#define get_locked_var(lvar, var) get_cpu_var(var)
13286+#define put_locked_var(lvar, var) put_cpu_var(var)
13287+#define get_locked_ptr(lvar, var) get_cpu_ptr(var)
13288+#define put_locked_ptr(lvar, var) put_cpu_ptr(var)
13289+
13290+#define local_lock_cpu(lvar) get_cpu()
13291+#define local_unlock_cpu(lvar) put_cpu()
13292+
13293+#endif
13294+
13295+#endif
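
Editor's note: a minimal usage sketch of the local-lock API introduced above, assuming a hypothetical per-CPU counter; it is not part of the patch. On RT the lock is a per-CPU spinlock (so the section stays preemptible), on non-RT builds it degrades to local_irq_save().

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_LOCAL_IRQ_LOCK(example_lock);		/* hypothetical lock */
static DEFINE_PER_CPU(unsigned long, example_count);	/* hypothetical data */

static void example_count_inc(void)
{
	unsigned long flags;

	local_lock_irqsave(example_lock, flags);	/* per-CPU spinlock on RT, local_irq_save() otherwise */
	__this_cpu_inc(example_count);
	local_unlock_irqrestore(example_lock, flags);
}
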
13296diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
13297index e41ef532c4ce..63317710311e 100644
13298--- a/include/linux/mm_types.h
13299+++ b/include/linux/mm_types.h
13300@@ -12,6 +12,7 @@
13301 #include <linux/completion.h>
13302 #include <linux/cpumask.h>
13303 #include <linux/uprobes.h>
13304+#include <linux/rcupdate.h>
13305 #include <linux/page-flags-layout.h>
13306 #include <linux/workqueue.h>
13307
13308@@ -496,6 +497,9 @@ struct mm_struct {
13309 bool tlb_flush_batched;
13310 #endif
13311 struct uprobes_state uprobes_state;
13312+#ifdef CONFIG_PREEMPT_RT_BASE
13313+ struct rcu_head delayed_drop;
13314+#endif
13315 #ifdef CONFIG_HUGETLB_PAGE
13316 atomic_long_t hugetlb_usage;
13317 #endif
13318diff --git a/include/linux/mutex.h b/include/linux/mutex.h
13319index 153274f78402..dbb52857b25b 100644
13320--- a/include/linux/mutex.h
13321+++ b/include/linux/mutex.h
13322@@ -23,6 +23,17 @@
13323
13324 struct ww_acquire_ctx;
13325
13326+#ifdef CONFIG_DEBUG_LOCK_ALLOC
13327+# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13328+ , .dep_map = { .name = #lockname }
13329+#else
13330+# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13331+#endif
13332+
13333+#ifdef CONFIG_PREEMPT_RT_FULL
13334+# include <linux/mutex_rt.h>
13335+#else
13336+
13337 /*
13338 * Simple, straightforward mutexes with strict semantics:
13339 *
13340@@ -114,13 +125,6 @@ do { \
13341 __mutex_init((mutex), #mutex, &__key); \
13342 } while (0)
13343
13344-#ifdef CONFIG_DEBUG_LOCK_ALLOC
13345-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13346- , .dep_map = { .name = #lockname }
13347-#else
13348-# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13349-#endif
13350-
13351 #define __MUTEX_INITIALIZER(lockname) \
13352 { .owner = ATOMIC_LONG_INIT(0) \
13353 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
13354@@ -228,4 +232,6 @@ mutex_trylock_recursive(struct mutex *lock)
13355 return mutex_trylock(lock);
13356 }
13357
13358+#endif /* !PREEMPT_RT_FULL */
13359+
13360 #endif /* __LINUX_MUTEX_H */
13361diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
13362new file mode 100644
13363index 000000000000..3fcb5edb1d2b
13364--- /dev/null
13365+++ b/include/linux/mutex_rt.h
13366@@ -0,0 +1,130 @@
13367+#ifndef __LINUX_MUTEX_RT_H
13368+#define __LINUX_MUTEX_RT_H
13369+
13370+#ifndef __LINUX_MUTEX_H
13371+#error "Please include mutex.h"
13372+#endif
13373+
13374+#include <linux/rtmutex.h>
13375+
13376+/* FIXME: Just for __lockfunc */
13377+#include <linux/spinlock.h>
13378+
13379+struct mutex {
13380+ struct rt_mutex lock;
13381+#ifdef CONFIG_DEBUG_LOCK_ALLOC
13382+ struct lockdep_map dep_map;
13383+#endif
13384+};
13385+
13386+#define __MUTEX_INITIALIZER(mutexname) \
13387+ { \
13388+ .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \
13389+ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \
13390+ }
13391+
13392+#define DEFINE_MUTEX(mutexname) \
13393+ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
13394+
13395+extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
13396+extern void __lockfunc _mutex_lock(struct mutex *lock);
13397+extern void __lockfunc _mutex_lock_io(struct mutex *lock);
13398+extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass);
13399+extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
13400+extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
13401+extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
13402+extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
13403+extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
13404+extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
13405+extern int __lockfunc _mutex_trylock(struct mutex *lock);
13406+extern void __lockfunc _mutex_unlock(struct mutex *lock);
13407+
13408+#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock)
13409+#define mutex_lock(l) _mutex_lock(l)
13410+#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l)
13411+#define mutex_lock_killable(l) _mutex_lock_killable(l)
13412+#define mutex_trylock(l) _mutex_trylock(l)
13413+#define mutex_unlock(l) _mutex_unlock(l)
13414+#define mutex_lock_io(l) _mutex_lock_io(l);
13415+
13416+#define __mutex_owner(l) ((l)->lock.owner)
13417+
13418+#ifdef CONFIG_DEBUG_MUTEXES
13419+#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock)
13420+#else
13421+static inline void mutex_destroy(struct mutex *lock) {}
13422+#endif
13423+
13424+#ifdef CONFIG_DEBUG_LOCK_ALLOC
13425+# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s)
13426+# define mutex_lock_interruptible_nested(l, s) \
13427+ _mutex_lock_interruptible_nested(l, s)
13428+# define mutex_lock_killable_nested(l, s) \
13429+ _mutex_lock_killable_nested(l, s)
13430+# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s)
13431+
13432+# define mutex_lock_nest_lock(lock, nest_lock) \
13433+do { \
13434+ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
13435+ _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \
13436+} while (0)
13437+
13438+#else
13439+# define mutex_lock_nested(l, s) _mutex_lock(l)
13440+# define mutex_lock_interruptible_nested(l, s) \
13441+ _mutex_lock_interruptible(l)
13442+# define mutex_lock_killable_nested(l, s) \
13443+ _mutex_lock_killable(l)
13444+# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
13445+# define mutex_lock_io_nested(l, s) _mutex_lock_io(l)
13446+#endif
13447+
13448+# define mutex_init(mutex) \
13449+do { \
13450+ static struct lock_class_key __key; \
13451+ \
13452+ rt_mutex_init(&(mutex)->lock); \
13453+ __mutex_do_init((mutex), #mutex, &__key); \
13454+} while (0)
13455+
13456+# define __mutex_init(mutex, name, key) \
13457+do { \
13458+ rt_mutex_init(&(mutex)->lock); \
13459+ __mutex_do_init((mutex), name, key); \
13460+} while (0)
13461+
13462+/**
13463+ * These values are chosen such that FAIL and SUCCESS match the
13464+ * values of the regular mutex_trylock().
13465+ */
13466+enum mutex_trylock_recursive_enum {
13467+ MUTEX_TRYLOCK_FAILED = 0,
13468+ MUTEX_TRYLOCK_SUCCESS = 1,
13469+ MUTEX_TRYLOCK_RECURSIVE,
13470+};
13471+/**
13472+ * mutex_trylock_recursive - trylock variant that allows recursive locking
13473+ * @lock: mutex to be locked
13474+ *
13475+ * This function should not be used, _ever_. It is purely for hysterical GEM
13476+ * raisins, and once those are gone this will be removed.
13477+ *
13478+ * Returns:
13479+ * MUTEX_TRYLOCK_FAILED - trylock failed,
13480+ * MUTEX_TRYLOCK_SUCCESS - lock acquired,
13481+ * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock.
13482+ */
13483+int __rt_mutex_owner_current(struct rt_mutex *lock);
13484+
13485+static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum
13486+mutex_trylock_recursive(struct mutex *lock)
13487+{
13488+ if (unlikely(__rt_mutex_owner_current(&lock->lock)))
13489+ return MUTEX_TRYLOCK_RECURSIVE;
13490+
13491+ return mutex_trylock(lock);
13492+}
13493+
13494+extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
13495+
13496+#endif
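
Editor's note: callers of the mutex API are unchanged by this substitution; the sketch below (hypothetical names, not part of the patch) compiles against either mutex.h variant and simply ends up on an rt_mutex when CONFIG_PREEMPT_RT_FULL is set.

#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);	/* hypothetical lock */
static int example_value;		/* hypothetical shared state */

static void example_update(int v)
{
	mutex_lock(&example_mutex);	/* _mutex_lock() -> rt_mutex lock on RT */
	example_value = v;
	mutex_unlock(&example_mutex);
}
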
13497diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
13498index a516dbe5869f..3ceccf72757e 100644
13499--- a/include/linux/netdevice.h
13500+++ b/include/linux/netdevice.h
13501@@ -409,7 +409,19 @@ typedef enum rx_handler_result rx_handler_result_t;
13502 typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
13503
13504 void __napi_schedule(struct napi_struct *n);
13505+
13506+/*
13507+ * When PREEMPT_RT_FULL is defined, all device interrupt handlers
13508+ * run as threads, and they can also be preempted (without PREEMPT_RT
13509+ * interrupt threads can not be preempted). Which means that calling
13510+ * __napi_schedule_irqoff() from an interrupt handler can be preempted
13511+ * and can corrupt the napi->poll_list.
13512+ */
13513+#ifdef CONFIG_PREEMPT_RT_FULL
13514+#define __napi_schedule_irqoff(n) __napi_schedule(n)
13515+#else
13516 void __napi_schedule_irqoff(struct napi_struct *n);
13517+#endif
13518
13519 static inline bool napi_disable_pending(struct napi_struct *n)
13520 {
13521@@ -571,7 +583,11 @@ struct netdev_queue {
13522 * write-mostly part
13523 */
13524 spinlock_t _xmit_lock ____cacheline_aligned_in_smp;
13525+#ifdef CONFIG_PREEMPT_RT_FULL
13526+ struct task_struct *xmit_lock_owner;
13527+#else
13528 int xmit_lock_owner;
13529+#endif
13530 /*
13531 * Time (in jiffies) of last Tx
13532 */
13533@@ -2440,14 +2456,53 @@ void netdev_freemem(struct net_device *dev);
13534 void synchronize_net(void);
13535 int init_dummy_netdev(struct net_device *dev);
13536
13537-DECLARE_PER_CPU(int, xmit_recursion);
13538 #define XMIT_RECURSION_LIMIT 10
13539+#ifdef CONFIG_PREEMPT_RT_FULL
13540+static inline int dev_recursion_level(void)
13541+{
13542+ return current->xmit_recursion;
13543+}
13544+
13545+static inline int xmit_rec_read(void)
13546+{
13547+ return current->xmit_recursion;
13548+}
13549+
13550+static inline void xmit_rec_inc(void)
13551+{
13552+ current->xmit_recursion++;
13553+}
13554+
13555+static inline void xmit_rec_dec(void)
13556+{
13557+ current->xmit_recursion--;
13558+}
13559+
13560+#else
13561+
13562+DECLARE_PER_CPU(int, xmit_recursion);
13563
13564 static inline int dev_recursion_level(void)
13565 {
13566 return this_cpu_read(xmit_recursion);
13567 }
13568
13569+static inline int xmit_rec_read(void)
13570+{
13571+ return __this_cpu_read(xmit_recursion);
13572+}
13573+
13574+static inline void xmit_rec_inc(void)
13575+{
13576+ __this_cpu_inc(xmit_recursion);
13577+}
13578+
13579+static inline void xmit_rec_dec(void)
13580+{
13581+ __this_cpu_dec(xmit_recursion);
13582+}
13583+#endif
13584+
13585 struct net_device *dev_get_by_index(struct net *net, int ifindex);
13586 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
13587 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
13588@@ -2799,6 +2854,7 @@ struct softnet_data {
13589 unsigned int dropped;
13590 struct sk_buff_head input_pkt_queue;
13591 struct napi_struct backlog;
13592+ struct sk_buff_head tofree_queue;
13593
13594 };
13595
13596@@ -3522,10 +3578,48 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
13597 return (1 << debug_value) - 1;
13598 }
13599
e4b2b4a8
JK
13600+#ifdef CONFIG_PREEMPT_RT_FULL
13601+static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13602+{
13603+ txq->xmit_lock_owner = current;
13604+}
13605+
13606+static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13607+{
13608+ txq->xmit_lock_owner = NULL;
13609+}
13610+
13611+static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13612+{
13613+ if (txq->xmit_lock_owner != NULL)
13614+ return true;
13615+ return false;
13616+}
13617+
13618+#else
13619+
13620+static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu)
13621+{
13622+ txq->xmit_lock_owner = cpu;
13623+}
13624+
13625+static inline void netdev_queue_clear_owner(struct netdev_queue *txq)
13626+{
13627+ txq->xmit_lock_owner = -1;
13628+}
13629+
13630+static inline bool netdev_queue_has_owner(struct netdev_queue *txq)
13631+{
13632+ if (txq->xmit_lock_owner != -1)
13633+ return true;
13634+ return false;
13635+}
13636+#endif
13637+
13638 static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
13639 {
13640 spin_lock(&txq->_xmit_lock);
13641- txq->xmit_lock_owner = cpu;
13642+ netdev_queue_set_owner(txq, cpu);
13643 }
13644
13645 static inline bool __netif_tx_acquire(struct netdev_queue *txq)
13646@@ -3542,32 +3636,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq)
13647 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
13648 {
13649 spin_lock_bh(&txq->_xmit_lock);
13650- txq->xmit_lock_owner = smp_processor_id();
13651+ netdev_queue_set_owner(txq, smp_processor_id());
13652 }
13653
13654 static inline bool __netif_tx_trylock(struct netdev_queue *txq)
13655 {
13656 bool ok = spin_trylock(&txq->_xmit_lock);
13657 if (likely(ok))
13658- txq->xmit_lock_owner = smp_processor_id();
13659+ netdev_queue_set_owner(txq, smp_processor_id());
13660 return ok;
13661 }
13662
13663 static inline void __netif_tx_unlock(struct netdev_queue *txq)
13664 {
13665- txq->xmit_lock_owner = -1;
13666+ netdev_queue_clear_owner(txq);
13667 spin_unlock(&txq->_xmit_lock);
13668 }
13669
13670 static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
13671 {
13672- txq->xmit_lock_owner = -1;
13673+ netdev_queue_clear_owner(txq);
13674 spin_unlock_bh(&txq->_xmit_lock);
13675 }
13676
13677 static inline void txq_trans_update(struct netdev_queue *txq)
13678 {
13679- if (txq->xmit_lock_owner != -1)
13680+ if (netdev_queue_has_owner(txq))
13681 txq->trans_start = jiffies;
13682 }
13683
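
Editor's note: a hedged sketch of how the recursion accounting above is typically consumed; the real user is the transmit path in net/core/dev.c, and example_retransmit() plus its -ELOOP return are hypothetical, not part of the patch.

#include <linux/netdevice.h>
#include <linux/errno.h>

static int example_retransmit(struct sk_buff *skb)
{
	int ret;

	if (xmit_rec_read() > XMIT_RECURSION_LIMIT)
		return -ELOOP;		/* refuse to nest any deeper */

	xmit_rec_inc();			/* current->xmit_recursion on RT, per-CPU counter otherwise */
	ret = dev_queue_xmit(skb);
	xmit_rec_dec();
	return ret;
}
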
13684diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
13685index 54f346a45cd0..79723e76af66 100644
13686--- a/include/linux/netfilter/x_tables.h
13687+++ b/include/linux/netfilter/x_tables.h
13688@@ -6,6 +6,7 @@
13689 #include <linux/netdevice.h>
13690 #include <linux/static_key.h>
13691 #include <linux/netfilter.h>
13692+#include <linux/locallock.h>
13693 #include <uapi/linux/netfilter/x_tables.h>
13694
13695 /* Test a struct->invflags and a boolean for inequality */
13696@@ -341,6 +342,8 @@ void xt_free_table_info(struct xt_table_info *info);
13697 */
13698 DECLARE_PER_CPU(seqcount_t, xt_recseq);
13699
13700+DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
13701+
13702 /* xt_tee_enabled - true if x_tables needs to handle reentrancy
13703 *
13704 * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
13705@@ -361,6 +364,9 @@ static inline unsigned int xt_write_recseq_begin(void)
13706 {
13707 unsigned int addend;
13708
13709+ /* RT protection */
13710+ local_lock(xt_write_lock);
13711+
13712 /*
13713 * Low order bit of sequence is set if we already
13714 * called xt_write_recseq_begin().
13715@@ -391,6 +397,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
13716 /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
13717 smp_wmb();
13718 __this_cpu_add(xt_recseq.sequence, addend);
13719+ local_unlock(xt_write_lock);
13720 }
13721
13722 /*
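
Editor's note: an illustrative sketch, not part of the patch, of the sequence the new xt_write_lock serializes; the real callers are the ip(6)tables table walkers, and example_count_packet() is hypothetical.

#include <linux/netfilter/x_tables.h>

static void example_count_packet(void)
{
	unsigned int addend;

	local_bh_disable();
	addend = xt_write_recseq_begin();	/* takes xt_write_lock on RT */
	/* ... bump per-CPU rule counters here ... */
	xt_write_recseq_end(addend);		/* drops xt_write_lock on RT */
	local_bh_enable();
}
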
13723diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
13724index f0015f801a78..c38288622819 100644
13725--- a/include/linux/nfs_fs.h
13726+++ b/include/linux/nfs_fs.h
13727@@ -162,7 +162,11 @@ struct nfs_inode {
13728
13729 /* Readers: in-flight sillydelete RPC calls */
13730 /* Writers: rmdir */
13731+#ifdef CONFIG_PREEMPT_RT_BASE
13732+ struct semaphore rmdir_sem;
13733+#else
13734 struct rw_semaphore rmdir_sem;
13735+#endif
13736 struct mutex commit_mutex;
13737
13738 #if IS_ENABLED(CONFIG_NFS_V4)
13739diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
13740index 6959968dc36a..802e849b57ac 100644
13741--- a/include/linux/nfs_xdr.h
13742+++ b/include/linux/nfs_xdr.h
13743@@ -1530,7 +1530,7 @@ struct nfs_unlinkdata {
13744 struct nfs_removeargs args;
13745 struct nfs_removeres res;
13746 struct dentry *dentry;
13747- wait_queue_head_t wq;
13748+ struct swait_queue_head wq;
13749 struct rpc_cred *cred;
13750 struct nfs_fattr dir_attr;
13751 long timeout;
13752diff --git a/include/linux/notifier.h b/include/linux/notifier.h
13753index 6d731110e0db..e758627da14d 100644
13754--- a/include/linux/notifier.h
13755+++ b/include/linux/notifier.h
13756@@ -7,7 +7,7 @@
13757 *
13758 * Alan Cox <Alan.Cox@linux.org>
13759 */
13760-
13761+
13762 #ifndef _LINUX_NOTIFIER_H
13763 #define _LINUX_NOTIFIER_H
13764 #include <linux/errno.h>
13765@@ -43,9 +43,7 @@
13766 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
13767 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
13768 * SRCU notifier chains should be used when the chain will be called very
13769- * often but notifier_blocks will seldom be removed. Also, SRCU notifier
13770- * chains are slightly more difficult to use because they require special
13771- * runtime initialization.
13772+ * often but notifier_blocks will seldom be removed.
13773 */
13774
13775 struct notifier_block;
13776@@ -91,7 +89,7 @@ struct srcu_notifier_head {
13777 (name)->head = NULL; \
13778 } while (0)
13779
13780-/* srcu_notifier_heads must be initialized and cleaned up dynamically */
13781+/* srcu_notifier_heads must be cleaned up dynamically */
13782 extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13783 #define srcu_cleanup_notifier_head(name) \
13784 cleanup_srcu_struct(&(name)->srcu);
13785@@ -104,7 +102,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13786 .head = NULL }
13787 #define RAW_NOTIFIER_INIT(name) { \
13788 .head = NULL }
13789-/* srcu_notifier_heads cannot be initialized statically */
13790+
13791+#define SRCU_NOTIFIER_INIT(name, pcpu) \
13792+ { \
13793+ .mutex = __MUTEX_INITIALIZER(name.mutex), \
13794+ .head = NULL, \
13795+ .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \
13796+ }
13797
13798 #define ATOMIC_NOTIFIER_HEAD(name) \
13799 struct atomic_notifier_head name = \
13800@@ -116,6 +120,26 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13801 struct raw_notifier_head name = \
13802 RAW_NOTIFIER_INIT(name)
13803
13804+#ifdef CONFIG_TREE_SRCU
13805+#define _SRCU_NOTIFIER_HEAD(name, mod) \
13806+ static DEFINE_PER_CPU(struct srcu_data, \
13807+ name##_head_srcu_data); \
13808+ mod struct srcu_notifier_head name = \
13809+ SRCU_NOTIFIER_INIT(name, name##_head_srcu_data)
13810+
13811+#else
13812+#define _SRCU_NOTIFIER_HEAD(name, mod) \
13813+ mod struct srcu_notifier_head name = \
13814+ SRCU_NOTIFIER_INIT(name, name)
13815+
13816+#endif
13817+
13818+#define SRCU_NOTIFIER_HEAD(name) \
13819+ _SRCU_NOTIFIER_HEAD(name, )
13820+
13821+#define SRCU_NOTIFIER_HEAD_STATIC(name) \
13822+ _SRCU_NOTIFIER_HEAD(name, static)
13823+
13824 #ifdef __KERNEL__
13825
13826 extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
13827@@ -185,12 +209,12 @@ static inline int notifier_to_errno(int ret)
13828
13829 /*
13830 * Declared notifiers so far. I can imagine quite a few more chains
13831- * over time (eg laptop power reset chains, reboot chain (to clean
13832+ * over time (eg laptop power reset chains, reboot chain (to clean
13833 * device units up), device [un]mount chain, module load/unload chain,
13834- * low memory chain, screenblank chain (for plug in modular screenblankers)
13835+ * low memory chain, screenblank chain (for plug in modular screenblankers)
13836 * VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
13837 */
13838-
13839+
13840 /* CPU notfiers are defined in include/linux/cpu.h. */
13841
13842 /* netdevice notifiers are defined in include/linux/netdevice.h */
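
Editor's note: a minimal sketch of the static SRCU notifier initializer added above; the chain name, callback and event value are hypothetical and the code is not part of the patch.

#include <linux/notifier.h>

static int example_event(struct notifier_block *nb, unsigned long action, void *data)
{
	return NOTIFY_OK;	/* hypothetical callback: accept every event */
}

static struct notifier_block example_nb = {
	.notifier_call = example_event,
};

SRCU_NOTIFIER_HEAD_STATIC(example_chain);	/* no srcu_init_notifier_head() call needed */

static void example_notify(void)
{
	srcu_notifier_chain_register(&example_chain, &example_nb);
	srcu_notifier_call_chain(&example_chain, 0, NULL);
}
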
13843diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
13844index 79b99d653e03..fb44e237316d 100644
13845--- a/include/linux/percpu-rwsem.h
13846+++ b/include/linux/percpu-rwsem.h
13847@@ -29,7 +29,7 @@ static struct percpu_rw_semaphore name = { \
13848 extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
13849 extern void __percpu_up_read(struct percpu_rw_semaphore *);
13850
13851-static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
13852+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13853 {
13854 might_sleep();
13855
13856@@ -47,16 +47,10 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *
13857 __this_cpu_inc(*sem->read_count);
13858 if (unlikely(!rcu_sync_is_idle(&sem->rss)))
13859 __percpu_down_read(sem, false); /* Unconditional memory barrier */
13860- barrier();
13861 /*
13862- * The barrier() prevents the compiler from
13863+ * The preempt_enable() prevents the compiler from
13864 * bleeding the critical section out.
13865 */
13866-}
13867-
13868-static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
13869-{
13870- percpu_down_read_preempt_disable(sem);
13871 preempt_enable();
13872 }
13873
13874@@ -83,13 +77,9 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
13875 return ret;
13876 }
13877
13878-static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
13879+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
13880 {
13881- /*
13882- * The barrier() prevents the compiler from
13883- * bleeding the critical section out.
13884- */
13885- barrier();
13886+ preempt_disable();
13887 /*
13888 * Same as in percpu_down_read().
13889 */
13890@@ -102,12 +92,6 @@ static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem
13891 rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
13892 }
13893
13894-static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
13895-{
13896- preempt_disable();
13897- percpu_up_read_preempt_enable(sem);
13898-}
13899-
13900 extern void percpu_down_write(struct percpu_rw_semaphore *);
13901 extern void percpu_up_write(struct percpu_rw_semaphore *);
1f39f580 13902
b3bbd485
JK
13903diff --git a/include/linux/percpu.h b/include/linux/percpu.h
13904index 296bbe49d5d1..4414796e3941 100644
13905--- a/include/linux/percpu.h
13906+++ b/include/linux/percpu.h
13907@@ -19,6 +19,35 @@
13908 #define PERCPU_MODULE_RESERVE 0
13909 #endif
13910
13911+#ifdef CONFIG_PREEMPT_RT_FULL
13912+
13913+#define get_local_var(var) (*({ \
13914+ migrate_disable(); \
13915+ this_cpu_ptr(&var); }))
13916+
13917+#define put_local_var(var) do { \
13918+ (void)&(var); \
13919+ migrate_enable(); \
13920+} while (0)
13921+
13922+# define get_local_ptr(var) ({ \
13923+ migrate_disable(); \
13924+ this_cpu_ptr(var); })
13925+
13926+# define put_local_ptr(var) do { \
13927+ (void)(var); \
13928+ migrate_enable(); \
13929+} while (0)
13930+
13931+#else
13932+
13933+#define get_local_var(var) get_cpu_var(var)
13934+#define put_local_var(var) put_cpu_var(var)
13935+#define get_local_ptr(var) get_cpu_ptr(var)
13936+#define put_local_ptr(var) put_cpu_ptr(var)
13937+
13938+#endif
13939+
13940 /* minimum unit size, also is the maximum supported allocation size */
13941 #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
13942
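
Editor's note: a usage sketch for the get_local_var()/put_local_var() helpers above, with a hypothetical per-CPU counter; not part of the patch. On RT only migration is disabled, so the section remains preemptible.

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_counter);	/* hypothetical data */

static void example_add(int v)
{
	get_local_var(example_counter) += v;	/* migrate_disable() on RT, get_cpu_var() otherwise */
	put_local_var(example_counter);
}
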
13943diff --git a/include/linux/pid.h b/include/linux/pid.h
13944index dfd684ce0787..bc954a99aa70 100644
13945--- a/include/linux/pid.h
13946+++ b/include/linux/pid.h
13947@@ -3,6 +3,7 @@
13948 #define _LINUX_PID_H
13949
13950 #include <linux/rculist.h>
13951+#include <linux/atomic.h>
13952
13953 enum pid_type
13954 {
13955diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
13956index 437a539898ae..de5c49b0dccf 100644
13957--- a/include/linux/posix-timers.h
13958+++ b/include/linux/posix-timers.h
13959@@ -101,8 +101,8 @@ struct k_itimer {
13960 struct {
13961 struct alarm alarmtimer;
13962 } alarm;
13963- struct rcu_head rcu;
13964 } it;
13965+ struct rcu_head rcu;
13966 };
13967
13968 void run_posix_cpu_timers(struct task_struct *task);
13969diff --git a/include/linux/preempt.h b/include/linux/preempt.h
13970index 5bd3f151da78..6728662a81e8 100644
13971--- a/include/linux/preempt.h
13972+++ b/include/linux/preempt.h
13973@@ -51,7 +51,11 @@
13974 #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
13975 #define NMI_OFFSET (1UL << NMI_SHIFT)
13976
13977-#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13978+#ifndef CONFIG_PREEMPT_RT_FULL
13979+# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13980+#else
13981+# define SOFTIRQ_DISABLE_OFFSET (0)
13982+#endif
13983
13984 /* We use the MSB mostly because its available */
13985 #define PREEMPT_NEED_RESCHED 0x80000000
13986@@ -81,9 +85,15 @@
13987 #include <asm/preempt.h>
13988
13989 #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
13990-#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
13991 #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
13992 | NMI_MASK))
13993+#ifndef CONFIG_PREEMPT_RT_FULL
13994+# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
13995+# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
13996+#else
13997+# define softirq_count() (0UL)
13998+extern int in_serving_softirq(void);
13999+#endif
14000
14001 /*
14002 * Are we doing bottom half or hardware interrupt processing?
14003@@ -101,7 +111,6 @@
14004 #define in_irq() (hardirq_count())
14005 #define in_softirq() (softirq_count())
14006 #define in_interrupt() (irq_count())
14007-#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
14008 #define in_nmi() (preempt_count() & NMI_MASK)
14009 #define in_task() (!(preempt_count() & \
14010 (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
14011@@ -118,7 +127,11 @@
14012 /*
14013 * The preempt_count offset after spin_lock()
14014 */
14015+#if !defined(CONFIG_PREEMPT_RT_FULL)
14016 #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET
14017+#else
14018+#define PREEMPT_LOCK_OFFSET 0
14019+#endif
14020
14021 /*
14022 * The preempt_count offset needed for things like:
14023@@ -167,6 +180,20 @@ extern void preempt_count_sub(int val);
14024 #define preempt_count_inc() preempt_count_add(1)
14025 #define preempt_count_dec() preempt_count_sub(1)
14026
14027+#ifdef CONFIG_PREEMPT_LAZY
14028+#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
14029+#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
14030+#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
14031+#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
14032+#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
14033+#else
14034+#define add_preempt_lazy_count(val) do { } while (0)
14035+#define sub_preempt_lazy_count(val) do { } while (0)
14036+#define inc_preempt_lazy_count() do { } while (0)
14037+#define dec_preempt_lazy_count() do { } while (0)
14038+#define preempt_lazy_count() (0)
14039+#endif
14040+
14041 #ifdef CONFIG_PREEMPT_COUNT
14042
14043 #define preempt_disable() \
14044@@ -175,16 +202,53 @@ do { \
14045 barrier(); \
14046 } while (0)
14047
14048+#define preempt_lazy_disable() \
14049+do { \
14050+ inc_preempt_lazy_count(); \
14051+ barrier(); \
14052+} while (0)
14053+
14054 #define sched_preempt_enable_no_resched() \
14055 do { \
14056 barrier(); \
14057 preempt_count_dec(); \
14058 } while (0)
14059
14060-#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
14061+#ifdef CONFIG_PREEMPT_RT_BASE
14062+# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
14063+# define preempt_check_resched_rt() preempt_check_resched()
14064+#else
14065+# define preempt_enable_no_resched() preempt_enable()
14066+# define preempt_check_resched_rt() barrier();
14067+#endif
14068
14069 #define preemptible() (preempt_count() == 0 && !irqs_disabled())
14070
14071+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14072+
14073+extern void migrate_disable(void);
14074+extern void migrate_enable(void);
14075+
14076+int __migrate_disabled(struct task_struct *p);
14077+
14078+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
14079+
14080+extern void migrate_disable(void);
14081+extern void migrate_enable(void);
14082+static inline int __migrate_disabled(struct task_struct *p)
14083+{
14084+ return 0;
14085+}
14086+
14087+#else
14088+#define migrate_disable() preempt_disable()
14089+#define migrate_enable() preempt_enable()
14090+static inline int __migrate_disabled(struct task_struct *p)
14091+{
14092+ return 0;
14093+}
14094+#endif
14095+
14096 #ifdef CONFIG_PREEMPT
14097 #define preempt_enable() \
14098 do { \
14099@@ -206,6 +270,13 @@ do { \
14100 __preempt_schedule(); \
14101 } while (0)
14102
14103+#define preempt_lazy_enable() \
14104+do { \
14105+ dec_preempt_lazy_count(); \
14106+ barrier(); \
14107+ preempt_check_resched(); \
14108+} while (0)
14109+
14110 #else /* !CONFIG_PREEMPT */
14111 #define preempt_enable() \
14112 do { \
14113@@ -213,6 +284,12 @@ do { \
14114 preempt_count_dec(); \
14115 } while (0)
14116
14117+#define preempt_lazy_enable() \
14118+do { \
14119+ dec_preempt_lazy_count(); \
14120+ barrier(); \
14121+} while (0)
14122+
14123 #define preempt_enable_notrace() \
14124 do { \
14125 barrier(); \
14126@@ -251,8 +328,16 @@ do { \
14127 #define preempt_disable_notrace() barrier()
14128 #define preempt_enable_no_resched_notrace() barrier()
14129 #define preempt_enable_notrace() barrier()
14130+#define preempt_check_resched_rt() barrier()
14131 #define preemptible() 0
14132
14133+#define migrate_disable() barrier()
14134+#define migrate_enable() barrier()
14135+
14136+static inline int __migrate_disabled(struct task_struct *p)
14137+{
14138+ return 0;
14139+}
14140 #endif /* CONFIG_PREEMPT_COUNT */
14141
14142 #ifdef MODULE
14143@@ -271,10 +356,22 @@ do { \
14144 } while (0)
14145 #define preempt_fold_need_resched() \
14146 do { \
14147- if (tif_need_resched()) \
14148+ if (tif_need_resched_now()) \
14149 set_preempt_need_resched(); \
14150 } while (0)
14151
14152+#ifdef CONFIG_PREEMPT_RT_FULL
14153+# define preempt_disable_rt() preempt_disable()
14154+# define preempt_enable_rt() preempt_enable()
14155+# define preempt_disable_nort() barrier()
14156+# define preempt_enable_nort() barrier()
14157+#else
14158+# define preempt_disable_rt() barrier()
14159+# define preempt_enable_rt() barrier()
14160+# define preempt_disable_nort() preempt_disable()
14161+# define preempt_enable_nort() preempt_enable()
14162+#endif
14163+
14164 #ifdef CONFIG_PREEMPT_NOTIFIERS
14165
14166 struct preempt_notifier;
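
Editor's note: a short sketch of the RT annotations added above; the function is hypothetical and not part of the patch.

#include <linux/preempt.h>

static void example_touch_percpu_state(void)
{
	migrate_disable();		/* pins the task to its CPU but stays preemptible on RT */
	/* smp_processor_id() and per-CPU data are stable here */
	migrate_enable();

	preempt_disable_nort();		/* no-op on RT, preempt_disable() otherwise */
	/* short section that only needs protection on non-RT kernels */
	preempt_enable_nort();
}
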
14167diff --git a/include/linux/printk.h b/include/linux/printk.h
14168index 6106befed756..1dba9cb7b91b 100644
14169--- a/include/linux/printk.h
14170+++ b/include/linux/printk.h
14171@@ -142,9 +142,11 @@ struct va_format {
14172 #ifdef CONFIG_EARLY_PRINTK
14173 extern asmlinkage __printf(1, 2)
14174 void early_printk(const char *fmt, ...);
14175+extern void printk_kill(void);
14176 #else
14177 static inline __printf(1, 2) __cold
14178 void early_printk(const char *s, ...) { }
14179+static inline void printk_kill(void) { }
14180 #endif
14181
14182 #ifdef CONFIG_PRINTK_NMI
14183diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
14184index 567ebb5eaab0..9da7ea957399 100644
14185--- a/include/linux/radix-tree.h
14186+++ b/include/linux/radix-tree.h
14187@@ -328,6 +328,8 @@ unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *,
e4b2b4a8
JK
14188 int radix_tree_preload(gfp_t gfp_mask);
14189 int radix_tree_maybe_preload(gfp_t gfp_mask);
14190 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
14191+void radix_tree_preload_end(void);
14192+
14193 void radix_tree_init(void);
14194 void *radix_tree_tag_set(struct radix_tree_root *,
14195 unsigned long index, unsigned int tag);
14196@@ -347,11 +349,6 @@ unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
14197 unsigned int max_items, unsigned int tag);
14198 int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);
14199
14200-static inline void radix_tree_preload_end(void)
14201-{
14202- preempt_enable();
14203-}
14204-
14205 int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
14206 int radix_tree_split(struct radix_tree_root *, unsigned long index,
14207 unsigned new_order);
14208diff --git a/include/linux/random.h b/include/linux/random.h
14209index 4024f7d9c77d..462d752a739b 100644
14210--- a/include/linux/random.h
14211+++ b/include/linux/random.h
14212@@ -32,7 +32,7 @@ static inline void add_latent_entropy(void) {}
14213
14214 extern void add_input_randomness(unsigned int type, unsigned int code,
14215 unsigned int value) __latent_entropy;
14216-extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
14217+extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
14218
14219 extern void get_random_bytes(void *buf, int nbytes);
14220 extern int wait_for_random_bytes(void);
14221diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
14222index d574361943ea..0a9f442409b9 100644
14223--- a/include/linux/rbtree.h
14224+++ b/include/linux/rbtree.h
14225@@ -31,7 +31,7 @@
14226
14227 #include <linux/kernel.h>
14228 #include <linux/stddef.h>
14229-#include <linux/rcupdate.h>
14230+#include <linux/rcu_assign_pointer.h>
14231
14232 struct rb_node {
14233 unsigned long __rb_parent_color;
14234diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
14235index 6bfd2b581f75..af8a61be2d8d 100644
14236--- a/include/linux/rbtree_augmented.h
14237+++ b/include/linux/rbtree_augmented.h
14238@@ -26,6 +26,7 @@
14239
14240 #include <linux/compiler.h>
14241 #include <linux/rbtree.h>
14242+#include <linux/rcupdate.h>
14243
14244 /*
14245 * Please note - only struct rb_augment_callbacks and the prototypes for
14246diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h
14247index ece43e882b56..7d012faa509a 100644
14248--- a/include/linux/rbtree_latch.h
14249+++ b/include/linux/rbtree_latch.h
14250@@ -35,6 +35,7 @@
14251
14252 #include <linux/rbtree.h>
14253 #include <linux/seqlock.h>
14254+#include <linux/rcupdate.h>
14255
14256 struct latch_tree_node {
14257 struct rb_node node[2];
14258diff --git a/include/linux/rcu_assign_pointer.h b/include/linux/rcu_assign_pointer.h
14259new file mode 100644
14260index 000000000000..7066962a4379
14261--- /dev/null
14262+++ b/include/linux/rcu_assign_pointer.h
14263@@ -0,0 +1,54 @@
14264+#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
14265+#define __LINUX_RCU_ASSIGN_POINTER_H__
14266+#include <linux/compiler.h>
14267+#include <asm/barrier.h>
14268+
14269+/**
14270+ * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
14271+ * @v: The value to statically initialize with.
14272+ */
14273+#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
14274+
14275+/**
14276+ * rcu_assign_pointer() - assign to RCU-protected pointer
14277+ * @p: pointer to assign to
14278+ * @v: value to assign (publish)
14279+ *
14280+ * Assigns the specified value to the specified RCU-protected
14281+ * pointer, ensuring that any concurrent RCU readers will see
14282+ * any prior initialization.
14283+ *
14284+ * Inserts memory barriers on architectures that require them
14285+ * (which is most of them), and also prevents the compiler from
14286+ * reordering the code that initializes the structure after the pointer
14287+ * assignment. More importantly, this call documents which pointers
14288+ * will be dereferenced by RCU read-side code.
14289+ *
14290+ * In some special cases, you may use RCU_INIT_POINTER() instead
14291+ * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
14292+ * to the fact that it does not constrain either the CPU or the compiler.
14293+ * That said, using RCU_INIT_POINTER() when you should have used
14294+ * rcu_assign_pointer() is a very bad thing that results in
14295+ * impossible-to-diagnose memory corruption. So please be careful.
14296+ * See the RCU_INIT_POINTER() comment header for details.
14297+ *
14298+ * Note that rcu_assign_pointer() evaluates each of its arguments only
14299+ * once, appearances notwithstanding. One of the "extra" evaluations
14300+ * is in typeof() and the other visible only to sparse (__CHECKER__),
14301+ * neither of which actually execute the argument. As with most cpp
14302+ * macros, this execute-arguments-only-once property is important, so
14303+ * please be careful when making changes to rcu_assign_pointer() and the
14304+ * other macros that it invokes.
14305+ */
14306+#define rcu_assign_pointer(p, v) \
14307+({ \
14308+ uintptr_t _r_a_p__v = (uintptr_t)(v); \
14309+ \
14310+ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
14311+ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
14312+ else \
14313+ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
14314+ _r_a_p__v; \
14315+})
14316+
14317+#endif
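
Editor's note: the usual publish/subscribe pattern the relocated macro documents, shown with a hypothetical config object; freeing of the previously published object (e.g. via kfree_rcu()) is omitted for brevity. Not part of the patch.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_cfg {			/* hypothetical RCU-protected object */
	int threshold;
};

static struct example_cfg __rcu *example_cfg_p;

static int example_publish(int threshold)
{
	struct example_cfg *new = kmalloc(sizeof(*new), GFP_KERNEL);

	if (!new)
		return -ENOMEM;
	new->threshold = threshold;
	rcu_assign_pointer(example_cfg_p, new);	/* readers now see a fully initialized object */
	return 0;
}

static int example_read_threshold(void)
{
	struct example_cfg *cfg;
	int val = 0;

	rcu_read_lock();
	cfg = rcu_dereference(example_cfg_p);
	if (cfg)
		val = cfg->threshold;
	rcu_read_unlock();
	return val;
}
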
14318diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
14319index a6ddc42f87a5..70996e134818 100644
14320--- a/include/linux/rcupdate.h
14321+++ b/include/linux/rcupdate.h
14322@@ -42,6 +42,7 @@
14323 #include <linux/lockdep.h>
14324 #include <asm/processor.h>
14325 #include <linux/cpumask.h>
14326+#include <linux/rcu_assign_pointer.h>
14327
14328 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
14329 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
14330@@ -55,7 +56,11 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
14331 #define call_rcu call_rcu_sched
14332 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14333
14334+#ifdef CONFIG_PREEMPT_RT_FULL
14335+#define call_rcu_bh call_rcu
14336+#else
14337 void call_rcu_bh(struct rcu_head *head, rcu_callback_t func);
14338+#endif
14339 void call_rcu_sched(struct rcu_head *head, rcu_callback_t func);
14340 void synchronize_sched(void);
14341 void rcu_barrier_tasks(void);
14342@@ -74,6 +79,11 @@ void synchronize_rcu(void);
14343 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
14344 */
14345 #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
14346+#ifndef CONFIG_PREEMPT_RT_FULL
14347+#define sched_rcu_preempt_depth() rcu_preempt_depth()
14348+#else
14349+static inline int sched_rcu_preempt_depth(void) { return 0; }
14350+#endif
14351
14352 #else /* #ifdef CONFIG_PREEMPT_RCU */
14353
14354@@ -99,6 +109,8 @@ static inline int rcu_preempt_depth(void)
14355 return 0;
14356 }
14357
14358+#define sched_rcu_preempt_depth() rcu_preempt_depth()
14359+
14360 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14361
14362 /* Internal to kernel */
14363@@ -255,7 +267,14 @@ extern struct lockdep_map rcu_sched_lock_map;
14364 extern struct lockdep_map rcu_callback_map;
14365 int debug_lockdep_rcu_enabled(void);
14366 int rcu_read_lock_held(void);
14367+#ifdef CONFIG_PREEMPT_RT_FULL
14368+static inline int rcu_read_lock_bh_held(void)
14369+{
14370+ return rcu_read_lock_held();
14371+}
14372+#else
14373 int rcu_read_lock_bh_held(void);
14374+#endif
14375 int rcu_read_lock_sched_held(void);
14376
14377 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
14378@@ -364,54 +383,6 @@ static inline void rcu_preempt_sleep_check(void) { }
14379 ((typeof(*p) __force __kernel *)(________p1)); \
14380 })
14381
14382-/**
14383- * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
14384- * @v: The value to statically initialize with.
14385- */
14386-#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
14387-
14388-/**
14389- * rcu_assign_pointer() - assign to RCU-protected pointer
14390- * @p: pointer to assign to
14391- * @v: value to assign (publish)
14392- *
14393- * Assigns the specified value to the specified RCU-protected
14394- * pointer, ensuring that any concurrent RCU readers will see
14395- * any prior initialization.
14396- *
14397- * Inserts memory barriers on architectures that require them
14398- * (which is most of them), and also prevents the compiler from
14399- * reordering the code that initializes the structure after the pointer
14400- * assignment. More importantly, this call documents which pointers
14401- * will be dereferenced by RCU read-side code.
14402- *
14403- * In some special cases, you may use RCU_INIT_POINTER() instead
14404- * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
14405- * to the fact that it does not constrain either the CPU or the compiler.
14406- * That said, using RCU_INIT_POINTER() when you should have used
14407- * rcu_assign_pointer() is a very bad thing that results in
14408- * impossible-to-diagnose memory corruption. So please be careful.
14409- * See the RCU_INIT_POINTER() comment header for details.
14410- *
14411- * Note that rcu_assign_pointer() evaluates each of its arguments only
14412- * once, appearances notwithstanding. One of the "extra" evaluations
14413- * is in typeof() and the other visible only to sparse (__CHECKER__),
14414- * neither of which actually execute the argument. As with most cpp
14415- * macros, this execute-arguments-only-once property is important, so
14416- * please be careful when making changes to rcu_assign_pointer() and the
14417- * other macros that it invokes.
14418- */
14419-#define rcu_assign_pointer(p, v) \
14420-({ \
14421- uintptr_t _r_a_p__v = (uintptr_t)(v); \
14422- \
14423- if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
14424- WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
14425- else \
14426- smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
14427- _r_a_p__v; \
14428-})
14429-
14430 /**
14431 * rcu_swap_protected() - swap an RCU and a regular pointer
14432 * @rcu_ptr: RCU pointer
14433@@ -707,10 +678,14 @@ static inline void rcu_read_unlock(void)
14434 static inline void rcu_read_lock_bh(void)
14435 {
14436 local_bh_disable();
14437+#ifdef CONFIG_PREEMPT_RT_FULL
14438+ rcu_read_lock();
14439+#else
14440 __acquire(RCU_BH);
14441 rcu_lock_acquire(&rcu_bh_lock_map);
14442 RCU_LOCKDEP_WARN(!rcu_is_watching(),
14443 "rcu_read_lock_bh() used illegally while idle");
14444+#endif
14445 }
14446
14447 /*
14448@@ -720,10 +695,14 @@ static inline void rcu_read_unlock_bh(void)
14449 */
14450 static inline void rcu_read_unlock_bh(void)
14451 {
14452+#ifdef CONFIG_PREEMPT_RT_FULL
14453+ rcu_read_unlock();
14454+#else
14455 RCU_LOCKDEP_WARN(!rcu_is_watching(),
14456 "rcu_read_unlock_bh() used illegally while idle");
14457 rcu_lock_release(&rcu_bh_lock_map);
14458 __release(RCU_BH);
14459+#endif
14460 local_bh_enable();
14461 }
14462
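
Editor's note: BH read-side callers keep their existing form; under PREEMPT_RT_FULL the hunk above simply maps them onto the normal RCU read lock plus local_bh_disable(). Hypothetical sketch, not part of the patch.

#include <linux/rcupdate.h>

static void example_bh_reader(void)
{
	rcu_read_lock_bh();	/* local_bh_disable() + rcu_read_lock() on RT */
	/* ... walk an RCU-protected list also updated from softirq context ... */
	rcu_read_unlock_bh();
}
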
14463diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
14464index 37d6fd3b7ff8..a082fde7d6bc 100644
14465--- a/include/linux/rcutree.h
14466+++ b/include/linux/rcutree.h
14467@@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
14468 rcu_note_context_switch(false);
14469 }
14470
14471+#ifdef CONFIG_PREEMPT_RT_FULL
14472+# define synchronize_rcu_bh synchronize_rcu
14473+#else
14474 void synchronize_rcu_bh(void);
14475+#endif
14476 void synchronize_sched_expedited(void);
14477 void synchronize_rcu_expedited(void);
14478
14479@@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
14480 }
14481
14482 void rcu_barrier(void);
14483+#ifdef CONFIG_PREEMPT_RT_FULL
14484+# define rcu_barrier_bh rcu_barrier
14485+#else
14486 void rcu_barrier_bh(void);
14487+#endif
14488 void rcu_barrier_sched(void);
14489 unsigned long get_state_synchronize_rcu(void);
14490 void cond_synchronize_rcu(unsigned long oldstate);
14491diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
14492index 5caa062a02b2..abce5f5325e1 100644
14493--- a/include/linux/ring_buffer.h
14494+++ b/include/linux/ring_buffer.h
14495@@ -34,10 +34,12 @@ struct ring_buffer_event {
14496 * array[0] = time delta (28 .. 59)
14497 * size = 8 bytes
14498 *
14499- * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock
14500- * array[0] = tv_nsec
14501- * array[1..2] = tv_sec
14502- * size = 16 bytes
14503+ * @RINGBUF_TYPE_TIME_STAMP: Absolute timestamp
14504+ * Same format as TIME_EXTEND except that the
14505+ * value is an absolute timestamp, not a delta
14506+ * event.time_delta contains bottom 27 bits
14507+ * array[0] = top (28 .. 59) bits
14508+ * size = 8 bytes
14509 *
14510 * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
14511 * Data record
14512@@ -54,12 +56,12 @@ enum ring_buffer_type {
14513 RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
14514 RINGBUF_TYPE_PADDING,
14515 RINGBUF_TYPE_TIME_EXTEND,
14516- /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
14517 RINGBUF_TYPE_TIME_STAMP,
14518 };
14519
14520 unsigned ring_buffer_event_length(struct ring_buffer_event *event);
14521 void *ring_buffer_event_data(struct ring_buffer_event *event);
14522+u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event);
14523
14524 /*
14525 * ring_buffer_discard_commit will remove an event that has not
14526@@ -115,6 +117,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
14527 int ring_buffer_write(struct ring_buffer *buffer,
14528 unsigned long length, void *data);
14529
14530+void ring_buffer_nest_start(struct ring_buffer *buffer);
14531+void ring_buffer_nest_end(struct ring_buffer *buffer);
14532+
14533 struct ring_buffer_event *
14534 ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
14535 unsigned long *lost_events);
14536@@ -179,6 +184,8 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
14537 int cpu, u64 *ts);
14538 void ring_buffer_set_clock(struct ring_buffer *buffer,
14539 u64 (*clock)(void));
14540+void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs);
14541+bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer);
14542
14543 size_t ring_buffer_page_len(void *page);
14544
b3bbd485
JK
14545diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
14546index 6fd615a0eea9..138bd1e183e0 100644
14547--- a/include/linux/rtmutex.h
14548+++ b/include/linux/rtmutex.h
e4b2b4a8
JK
14549@@ -14,11 +14,15 @@
14550 #define __LINUX_RT_MUTEX_H
1a6e0f06 14551
e4b2b4a8
JK
14552 #include <linux/linkage.h>
14553+#include <linux/spinlock_types_raw.h>
14554 #include <linux/rbtree.h>
14555-#include <linux/spinlock_types.h>
1a6e0f06 14556
e4b2b4a8 14557 extern int max_lock_depth; /* for sysctl */
1a6e0f06 14558
e4b2b4a8
JK
14559+#ifdef CONFIG_DEBUG_MUTEXES
14560+#include <linux/debug_locks.h>
14561+#endif
14562+
14563 /**
14564 * The rt_mutex structure
14565 *
b3bbd485 14566@@ -31,8 +35,8 @@ struct rt_mutex {
e4b2b4a8
JK
14567 raw_spinlock_t wait_lock;
14568 struct rb_root_cached waiters;
14569 struct task_struct *owner;
14570-#ifdef CONFIG_DEBUG_RT_MUTEXES
14571 int save_state;
14572+#ifdef CONFIG_DEBUG_RT_MUTEXES
14573 const char *name, *file;
14574 int line;
14575 void *magic;
b3bbd485 14576@@ -82,16 +86,23 @@ do { \
e4b2b4a8
JK
14577 #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14578 #endif
1a6e0f06 14579
e4b2b4a8
JK
14580-#define __RT_MUTEX_INITIALIZER(mutexname) \
14581- { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14582+#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14583+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14584 , .waiters = RB_ROOT_CACHED \
14585 , .owner = NULL \
14586 __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
14587- __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
14588+ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
14589+
14590+#define __RT_MUTEX_INITIALIZER(mutexname) \
14591+ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
1a6e0f06 14592
e4b2b4a8
JK
14593 #define DEFINE_RT_MUTEX(mutexname) \
14594 struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
1a6e0f06 14595
e4b2b4a8
JK
14596+#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
14597+ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14598+ , .save_state = 1 }
14599+
14600 /**
14601 * rt_mutex_is_locked - is the mutex locked
14602 * @lock: the mutex to be queried
b3bbd485
JK
14603@@ -115,6 +126,7 @@ extern void rt_mutex_lock(struct rt_mutex *lock);
14604 #endif
1a6e0f06 14605
e4b2b4a8
JK
14606 extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
14607+extern int rt_mutex_lock_killable(struct rt_mutex *lock);
14608 extern int rt_mutex_timed_lock(struct rt_mutex *lock,
14609 struct hrtimer_sleeper *timeout);
1a6e0f06 14610
b3bbd485
JK
14611diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
14612new file mode 100644
14613index 000000000000..a9c4c2ac4d1f
14614--- /dev/null
14615+++ b/include/linux/rwlock_rt.h
e4b2b4a8
JK
14616@@ -0,0 +1,119 @@
14617+#ifndef __LINUX_RWLOCK_RT_H
14618+#define __LINUX_RWLOCK_RT_H
14619+
14620+#ifndef __LINUX_SPINLOCK_H
14621+#error Do not include directly. Use spinlock.h
1a6e0f06 14622+#endif
e4b2b4a8
JK
14623+
14624+extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
14625+extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
14626+extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
14627+extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
14628+extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
14629+extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
14630+extern int __lockfunc rt_read_can_lock(rwlock_t *rwlock);
14631+extern int __lockfunc rt_write_can_lock(rwlock_t *rwlock);
14632+extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
14633+
14634+#define read_can_lock(rwlock) rt_read_can_lock(rwlock)
14635+#define write_can_lock(rwlock) rt_write_can_lock(rwlock)
14636+
14637+#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock))
14638+#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock))
14639+
14640+static inline int __write_trylock_rt_irqsave(rwlock_t *lock, unsigned long *flags)
14641+{
14642+ /* XXX ARCH_IRQ_ENABLED */
14643+ *flags = 0;
14644+ return rt_write_trylock(lock);
1a6e0f06
JK
14645+}
14646+
e4b2b4a8
JK
14647+#define write_trylock_irqsave(lock, flags) \
14648+ __cond_lock(lock, __write_trylock_rt_irqsave(lock, &(flags)))
14649+
14650+#define read_lock_irqsave(lock, flags) \
14651+ do { \
14652+ typecheck(unsigned long, flags); \
14653+ rt_read_lock(lock); \
14654+ flags = 0; \
14655+ } while (0)
14656+
14657+#define write_lock_irqsave(lock, flags) \
14658+ do { \
14659+ typecheck(unsigned long, flags); \
14660+ rt_write_lock(lock); \
14661+ flags = 0; \
14662+ } while (0)
1a6e0f06 14663+
e4b2b4a8 14664+#define read_lock(lock) rt_read_lock(lock)
1a6e0f06 14665+
e4b2b4a8
JK
14666+#define read_lock_bh(lock) \
14667+ do { \
14668+ local_bh_disable(); \
14669+ rt_read_lock(lock); \
14670+ } while (0)
1a6e0f06 14671+
e4b2b4a8 14672+#define read_lock_irq(lock) read_lock(lock)
1a6e0f06 14673+
e4b2b4a8 14674+#define write_lock(lock) rt_write_lock(lock)
1a6e0f06 14675+
e4b2b4a8
JK
14676+#define write_lock_bh(lock) \
14677+ do { \
14678+ local_bh_disable(); \
14679+ rt_write_lock(lock); \
14680+ } while (0)
1a6e0f06 14681+
e4b2b4a8 14682+#define write_lock_irq(lock) write_lock(lock)
1a6e0f06 14683+
e4b2b4a8 14684+#define read_unlock(lock) rt_read_unlock(lock)
1a6e0f06 14685+
e4b2b4a8
JK
14686+#define read_unlock_bh(lock) \
14687+ do { \
14688+ rt_read_unlock(lock); \
14689+ local_bh_enable(); \
14690+ } while (0)
1a6e0f06 14691+
e4b2b4a8 14692+#define read_unlock_irq(lock) read_unlock(lock)
1a6e0f06 14693+
e4b2b4a8
JK
14694+#define write_unlock(lock) rt_write_unlock(lock)
14695+
14696+#define write_unlock_bh(lock) \
14697+ do { \
14698+ rt_write_unlock(lock); \
14699+ local_bh_enable(); \
14700+ } while (0)
14701+
14702+#define write_unlock_irq(lock) write_unlock(lock)
14703+
14704+#define read_unlock_irqrestore(lock, flags) \
14705+ do { \
14706+ typecheck(unsigned long, flags); \
14707+ (void) flags; \
14708+ rt_read_unlock(lock); \
14709+ } while (0)
14710+
14711+#define write_unlock_irqrestore(lock, flags) \
14712+ do { \
14713+ typecheck(unsigned long, flags); \
14714+ (void) flags; \
14715+ rt_write_unlock(lock); \
14716+ } while (0)
14717+
14718+#define rwlock_init(rwl) \
14719+do { \
14720+ static struct lock_class_key __key; \
14721+ \
14722+ __rt_rwlock_init(rwl, #rwl, &__key); \
14723+} while (0)
1a6e0f06 14724+
1a6e0f06 14725+/*
e4b2b4a8 14726+ * Internal functions made global for CPU pinning
1a6e0f06 14727+ */
e4b2b4a8
JK
14728+void __read_rt_lock(struct rt_rw_lock *lock);
14729+int __read_rt_trylock(struct rt_rw_lock *lock);
14730+void __write_rt_lock(struct rt_rw_lock *lock);
14731+int __write_rt_trylock(struct rt_rw_lock *lock);
14732+void __read_rt_unlock(struct rt_rw_lock *lock);
14733+void __write_rt_unlock(struct rt_rw_lock *lock);
14734+
1a6e0f06 14735+#endif
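
For orientation, a minimal usage sketch follows; it is illustration only, not part of the patch. Under CONFIG_PREEMPT_RT_FULL the rwlock_rt.h mapping above turns the generic rwlock calls into rt_read_lock()/rt_write_lock(), and the _irqsave variants no longer disable interrupts, they merely zero the caller's flags. The stats_lock and stats_hits names are hypothetical.

    #include <linux/spinlock.h>

    /*
     * Minimal sketch, assuming CONFIG_PREEMPT_RT_FULL: hypothetical caller
     * code that stays source-compatible with the RT mapping above.  These
     * locks are rt_mutex based and may sleep; "flags" is just set to 0.
     */
    static DEFINE_RWLOCK(stats_lock);        /* hypothetical lock */
    static unsigned long stats_hits;         /* hypothetical counter */

    static void stats_inc(void)
    {
            unsigned long flags;

            write_lock_irqsave(&stats_lock, flags);  /* rt_write_lock(), flags = 0 */
            stats_hits++;
            write_unlock_irqrestore(&stats_lock, flags);
    }

    static unsigned long stats_read(void)
    {
            unsigned long hits;

            read_lock(&stats_lock);                  /* rt_read_lock() */
            hits = stats_hits;
            read_unlock(&stats_lock);
            return hits;
    }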
b3bbd485
JK
14736diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
14737index cc0072e93e36..5317cd957292 100644
14738--- a/include/linux/rwlock_types.h
14739+++ b/include/linux/rwlock_types.h
e4b2b4a8
JK
14740@@ -1,6 +1,10 @@
14741 #ifndef __LINUX_RWLOCK_TYPES_H
14742 #define __LINUX_RWLOCK_TYPES_H
1a6e0f06 14743
e4b2b4a8
JK
14744+#if !defined(__LINUX_SPINLOCK_TYPES_H)
14745+# error "Do not include directly, include spinlock_types.h"
14746+#endif
14747+
1a6e0f06 14748 /*
e4b2b4a8
JK
14749 * include/linux/rwlock_types.h - generic rwlock type definitions
14750 * and initializers
b3bbd485
JK
14751diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
14752new file mode 100644
14753index 000000000000..546a1f8f1274
14754--- /dev/null
14755+++ b/include/linux/rwlock_types_rt.h
e4b2b4a8
JK
14756@@ -0,0 +1,55 @@
14757+#ifndef __LINUX_RWLOCK_TYPES_RT_H
14758+#define __LINUX_RWLOCK_TYPES_RT_H
14759+
14760+#ifndef __LINUX_SPINLOCK_TYPES_H
14761+#error "Do not include directly. Include spinlock_types.h instead"
14762+#endif
14763+
14764+#ifdef CONFIG_DEBUG_LOCK_ALLOC
14765+# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
14766+#else
14767+# define RW_DEP_MAP_INIT(lockname)
14768+#endif
14769+
14770+typedef struct rt_rw_lock rwlock_t;
14771+
14772+#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name)
14773+
14774+#define DEFINE_RWLOCK(name) \
14775+ rwlock_t name = __RW_LOCK_UNLOCKED(name)
14776+
14777+/*
14778+ * A reader biased implementation primarily for CPU pinning.
14779+ *
14780+ * Can be selected as a general replacement for the single reader RT rwlock
14781+ * variant.
14782+ */
14783+struct rt_rw_lock {
14784+ struct rt_mutex rtmutex;
14785+ atomic_t readers;
14786+#ifdef CONFIG_DEBUG_LOCK_ALLOC
14787+ struct lockdep_map dep_map;
14788+#endif
14789+};
14790+
14791+#define READER_BIAS (1U << 31)
14792+#define WRITER_BIAS (1U << 30)
14793+
14794+#define __RWLOCK_RT_INITIALIZER(name) \
14795+{ \
14796+ .readers = ATOMIC_INIT(READER_BIAS), \
14797+ .rtmutex = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.rtmutex), \
14798+ RW_DEP_MAP_INIT(name) \
14799+}
14800+
14801+void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
14802+ struct lock_class_key *key);
14803+
14804+#define rwlock_biased_rt_init(rwlock) \
14805+ do { \
14806+ static struct lock_class_key __key; \
14807+ \
14808+ __rwlock_biased_rt_init((rwlock), #rwlock, &__key); \
14809+ } while (0)
14810+
14811+#endif
b3bbd485
JK
14812diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
14813index c427ffaa4904..513df11a364e 100644
14814--- a/include/linux/rwsem.h
14815+++ b/include/linux/rwsem.h
e4b2b4a8
JK
14816@@ -20,6 +20,10 @@
14817 #include <linux/osq_lock.h>
14818 #endif
1a6e0f06 14819
e4b2b4a8
JK
14820+#ifdef CONFIG_PREEMPT_RT_FULL
14821+#include <linux/rwsem_rt.h>
14822+#else /* PREEMPT_RT_FULL */
14823+
14824 struct rw_semaphore;
1a6e0f06 14825
e4b2b4a8 14826 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
b3bbd485 14827@@ -114,6 +118,13 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
e4b2b4a8 14828 return !list_empty(&sem->wait_list);
1a6e0f06
JK
14829 }
14830
e4b2b4a8
JK
14831+#endif /* !PREEMPT_RT_FULL */
14832+
14833+/*
14834+ * The functions below are the same for all rwsem implementations including
14835+ * the RT specific variant.
14836+ */
14837+
1a6e0f06 14838 /*
e4b2b4a8 14839 * lock for reading
1a6e0f06 14840 */
b3bbd485
JK
14841diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
14842new file mode 100644
14843index 000000000000..2ffbf093ae92
14844--- /dev/null
14845+++ b/include/linux/rwsem_rt.h
e4b2b4a8
JK
14846@@ -0,0 +1,67 @@
14847+#ifndef _LINUX_RWSEM_RT_H
14848+#define _LINUX_RWSEM_RT_H
14849+
14850+#ifndef _LINUX_RWSEM_H
14851+#error "Include rwsem.h"
14852+#endif
14853+
14854+#include <linux/rtmutex.h>
14855+#include <linux/swait.h>
14856+
14857+#define READER_BIAS (1U << 31)
14858+#define WRITER_BIAS (1U << 30)
14859+
14860+struct rw_semaphore {
14861+ atomic_t readers;
14862+ struct rt_mutex rtmutex;
14863+#ifdef CONFIG_DEBUG_LOCK_ALLOC
14864+ struct lockdep_map dep_map;
14865+#endif
14866+};
14867+
14868+#define __RWSEM_INITIALIZER(name) \
14869+{ \
14870+ .readers = ATOMIC_INIT(READER_BIAS), \
14871+ .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex), \
14872+ RW_DEP_MAP_INIT(name) \
14873+}
14874+
14875+#define DECLARE_RWSEM(lockname) \
14876+ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
14877+
14878+extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name,
14879+ struct lock_class_key *key);
14880+
14881+#define __init_rwsem(sem, name, key) \
14882+do { \
14883+ rt_mutex_init(&(sem)->rtmutex); \
14884+ __rwsem_init((sem), (name), (key)); \
14885+} while (0)
14886+
14887+#define init_rwsem(sem) \
14888+do { \
14889+ static struct lock_class_key __key; \
14890+ \
14891+ __init_rwsem((sem), #sem, &__key); \
14892+} while (0)
14893+
14894+static inline int rwsem_is_locked(struct rw_semaphore *sem)
1a6e0f06 14895+{
e4b2b4a8
JK
14896+ return atomic_read(&sem->readers) != READER_BIAS;
14897+}
1a6e0f06 14898+
e4b2b4a8
JK
14899+static inline int rwsem_is_contended(struct rw_semaphore *sem)
14900+{
14901+ return atomic_read(&sem->readers) > 0;
1a6e0f06
JK
14902+}
14903+
e4b2b4a8
JK
14904+extern void __down_read(struct rw_semaphore *sem);
14905+extern int __down_read_trylock(struct rw_semaphore *sem);
14906+extern void __down_write(struct rw_semaphore *sem);
14907+extern int __must_check __down_write_killable(struct rw_semaphore *sem);
14908+extern int __down_write_trylock(struct rw_semaphore *sem);
14909+extern void __up_read(struct rw_semaphore *sem);
14910+extern void __up_write(struct rw_semaphore *sem);
14911+extern void __downgrade_write(struct rw_semaphore *sem);
14912+
14913+#endif
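
As a quick illustration (not part of the patch): with the reader-biased accounting above, rwsem_is_locked() is simply "readers != READER_BIAS". A hedged sketch of ordinary rwsem usage that, on RT, ends up in the __down_read()/__down_write() primitives declared here; cfg_sem and cfg_value are hypothetical names.

    #include <linux/rwsem.h>

    /*
     * Minimal sketch, not from the patch itself: generic rwsem usage which,
     * with CONFIG_PREEMPT_RT_FULL, resolves to the rt_mutex-backed
     * implementation declared above.
     */
    static DECLARE_RWSEM(cfg_sem);          /* hypothetical semaphore */
    static int cfg_value;                   /* hypothetical protected data */

    static int cfg_get(void)
    {
            int val;

            down_read(&cfg_sem);            /* ends up in __down_read() */
            val = cfg_value;
            up_read(&cfg_sem);
            return val;
    }

    static void cfg_set(int val)
    {
            down_write(&cfg_sem);           /* ends up in __down_write() */
            cfg_value = val;
            up_write(&cfg_sem);
    }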
b3bbd485
JK
14914diff --git a/include/linux/sched.h b/include/linux/sched.h
14915index e04919aa8201..a6ffb552be01 100644
14916--- a/include/linux/sched.h
14917+++ b/include/linux/sched.h
e4b2b4a8
JK
14918@@ -27,6 +27,7 @@
14919 #include <linux/signal_types.h>
14920 #include <linux/mm_types_task.h>
14921 #include <linux/task_io_accounting.h>
14922+#include <asm/kmap_types.h>
1a6e0f06 14923
e4b2b4a8
JK
14924 /* task_struct member predeclarations (sorted alphabetically): */
14925 struct audit_context;
b3bbd485 14926@@ -93,7 +94,6 @@ struct task_group;
1a6e0f06 14927
e4b2b4a8
JK
14928 /* Convenience macros for the sake of wake_up(): */
14929 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
14930-#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
1a6e0f06 14931
e4b2b4a8
JK
14932 /* get_task_state(): */
14933 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
b3bbd485 14934@@ -101,12 +101,8 @@ struct task_group;
e4b2b4a8
JK
14935 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
14936 TASK_PARKED)
1a6e0f06 14937
e4b2b4a8
JK
14938-#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
14939-
14940 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
c7c16703 14941
e4b2b4a8
JK
14942-#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
14943-
14944 #define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
14945 (task->flags & PF_FROZEN) == 0 && \
14946 (task->state & TASK_NOLOAD) == 0)
b3bbd485 14947@@ -134,6 +130,11 @@ struct task_group;
e4b2b4a8
JK
14948 smp_store_mb(current->state, (state_value)); \
14949 } while (0)
c7c16703 14950
e4b2b4a8
JK
14951+#define __set_current_state_no_track(state_value) \
14952+ current->state = (state_value);
14953+#define set_current_state_no_track(state_value) \
14954+ smp_store_mb(current->state, (state_value));
14955+
14956 #define set_special_state(state_value) \
14957 do { \
14958 unsigned long flags; /* may shadow */ \
b3bbd485 14959@@ -187,6 +188,9 @@ struct task_group;
e4b2b4a8
JK
14960 #define set_current_state(state_value) \
14961 smp_store_mb(current->state, (state_value))
14962
14963+#define __set_current_state_no_track(state_value) __set_current_state(state_value)
14964+#define set_current_state_no_track(state_value) set_current_state(state_value)
14965+
14966 /*
14967 * set_special_state() should be used for those states when the blocking task
14968 * can not use the regular condition based wait-loop. In that case we must
b3bbd485 14969@@ -566,6 +570,8 @@ struct task_struct {
e4b2b4a8
JK
14970 #endif
14971 /* -1 unrunnable, 0 runnable, >0 stopped: */
14972 volatile long state;
14973+ /* saved state for "spinlock sleepers" */
14974+ volatile long saved_state;
c7c16703 14975
c7c16703 14976 /*
e4b2b4a8 14977 * This begins the randomizable portion of task_struct. Only
b3bbd485 14978@@ -618,7 +624,25 @@ struct task_struct {
e4b2b4a8
JK
14979
14980 unsigned int policy;
14981 int nr_cpus_allowed;
14982- cpumask_t cpus_allowed;
14983+ const cpumask_t *cpus_ptr;
14984+ cpumask_t cpus_mask;
b3bbd485 14985+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
e4b2b4a8
JK
14986+ int migrate_disable;
14987+ int migrate_disable_update;
14988+ int pinned_on_cpu;
14989+# ifdef CONFIG_SCHED_DEBUG
14990+ int migrate_disable_atomic;
14991+# endif
14992+
14993+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
e4b2b4a8 14994+# ifdef CONFIG_SCHED_DEBUG
b3bbd485 14995+ int migrate_disable;
e4b2b4a8
JK
14996+ int migrate_disable_atomic;
14997+# endif
14998+#endif
14999+#ifdef CONFIG_PREEMPT_RT_FULL
15000+ int sleeping_lock;
15001+#endif
c7c16703 15002
e4b2b4a8
JK
15003 #ifdef CONFIG_PREEMPT_RCU
15004 int rcu_read_lock_nesting;
b3bbd485 15005@@ -777,6 +801,9 @@ struct task_struct {
e4b2b4a8
JK
15006 #ifdef CONFIG_POSIX_TIMERS
15007 struct task_cputime cputime_expires;
15008 struct list_head cpu_timers[3];
15009+#ifdef CONFIG_PREEMPT_RT_BASE
15010+ struct task_struct *posix_timer_list;
15011+#endif
15012 #endif
c7c16703 15013
e4b2b4a8 15014 /* Process credentials: */
b3bbd485 15015@@ -820,11 +847,17 @@ struct task_struct {
e4b2b4a8
JK
15016 /* Signal handlers: */
15017 struct signal_struct *signal;
15018 struct sighand_struct *sighand;
15019+ struct sigqueue *sigqueue_cache;
15020+
15021 sigset_t blocked;
15022 sigset_t real_blocked;
15023 /* Restored if set_restore_sigmask() was used: */
15024 sigset_t saved_sigmask;
15025 struct sigpending pending;
15026+#ifdef CONFIG_PREEMPT_RT_FULL
15027+ /* TODO: move me into ->restart_block ? */
15028+ struct siginfo forced_info;
15029+#endif
15030 unsigned long sas_ss_sp;
15031 size_t sas_ss_size;
15032 unsigned int sas_ss_flags;
b3bbd485 15033@@ -849,6 +882,7 @@ struct task_struct {
e4b2b4a8
JK
15034 raw_spinlock_t pi_lock;
15035
15036 struct wake_q_node wake_q;
15037+ struct wake_q_node wake_q_sleeper;
15038
15039 #ifdef CONFIG_RT_MUTEXES
15040 /* PI waiters blocked on a rt_mutex held by this task: */
b3bbd485 15041@@ -1116,8 +1150,22 @@ struct task_struct {
e4b2b4a8
JK
15042 unsigned int sequential_io;
15043 unsigned int sequential_io_avg;
15044 #endif
15045+#ifdef CONFIG_PREEMPT_RT_BASE
15046+ struct rcu_head put_rcu;
15047+ int softirq_nestcnt;
15048+ unsigned int softirqs_raised;
15049+#endif
15050+#ifdef CONFIG_PREEMPT_RT_FULL
15051+# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
15052+ int kmap_idx;
15053+ pte_t kmap_pte[KM_TYPE_NR];
15054+# endif
15055+#endif
15056 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
15057 unsigned long task_state_change;
b3bbd485 15058+#endif
e4b2b4a8
JK
15059+#ifdef CONFIG_PREEMPT_RT_FULL
15060+ int xmit_recursion;
b3bbd485 15061 #endif
e4b2b4a8
JK
15062 int pagefault_disabled;
15063 #ifdef CONFIG_MMU
b3bbd485 15064@@ -1332,6 +1380,7 @@ extern struct pid *cad_pid;
e4b2b4a8
JK
15065 /*
15066 * Per process flags
15067 */
15068+#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */
15069 #define PF_IDLE 0x00000002 /* I am an IDLE thread */
15070 #define PF_EXITING 0x00000004 /* Getting shut down */
15071 #define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */
b3bbd485 15072@@ -1355,7 +1404,7 @@ extern struct pid *cad_pid;
e4b2b4a8
JK
15073 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
15074 #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
15075 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
15076-#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
15077+#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
15078 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
15079 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
15080 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
b3bbd485 15081@@ -1535,6 +1584,7 @@ extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *n
e4b2b4a8
JK
15082
15083 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
15084 extern int wake_up_process(struct task_struct *tsk);
15085+extern int wake_up_lock_sleeper(struct task_struct *tsk);
15086 extern void wake_up_new_task(struct task_struct *tsk);
15087
15088 #ifdef CONFIG_SMP
b3bbd485 15089@@ -1611,6 +1661,89 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
e4b2b4a8
JK
15090 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
15091 }
15092
15093+#ifdef CONFIG_PREEMPT_LAZY
15094+static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
15095+{
15096+ set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
15097+}
15098+
15099+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
15100+{
15101+ clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
15102+}
15103+
15104+static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
15105+{
15106+ return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
15107+}
15108+
15109+static inline int need_resched_lazy(void)
15110+{
15111+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
15112+}
15113+
15114+static inline int need_resched_now(void)
15115+{
15116+ return test_thread_flag(TIF_NEED_RESCHED);
15117+}
15118+
15119+#else
15120+static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
15121+static inline int need_resched_lazy(void) { return 0; }
15122+
15123+static inline int need_resched_now(void)
15124+{
15125+ return test_thread_flag(TIF_NEED_RESCHED);
15126+}
15127+
15128+#endif
15129+
15130+
15131+static inline bool __task_is_stopped_or_traced(struct task_struct *task)
15132+{
15133+ if (task->state & (__TASK_STOPPED | __TASK_TRACED))
15134+ return true;
15135+#ifdef CONFIG_PREEMPT_RT_FULL
15136+ if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
15137+ return true;
15138+#endif
15139+ return false;
15140+}
15141+
15142+static inline bool task_is_stopped_or_traced(struct task_struct *task)
15143+{
15144+ bool traced_stopped;
15145+
15146+#ifdef CONFIG_PREEMPT_RT_FULL
15147+ unsigned long flags;
15148+
15149+ raw_spin_lock_irqsave(&task->pi_lock, flags);
15150+ traced_stopped = __task_is_stopped_or_traced(task);
15151+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15152+#else
15153+ traced_stopped = __task_is_stopped_or_traced(task);
15154+#endif
15155+ return traced_stopped;
15156+}
15157+
15158+static inline bool task_is_traced(struct task_struct *task)
15159+{
15160+ bool traced = false;
15161+
15162+ if (task->state & __TASK_TRACED)
15163+ return true;
15164+#ifdef CONFIG_PREEMPT_RT_FULL
15165+ /* in case the task is sleeping on tasklist_lock */
15166+ raw_spin_lock_irq(&task->pi_lock);
15167+ if (task->state & __TASK_TRACED)
15168+ traced = true;
15169+ else if (task->saved_state & __TASK_TRACED)
15170+ traced = true;
15171+ raw_spin_unlock_irq(&task->pi_lock);
15172+#endif
15173+ return traced;
15174+}
15175+
15176 /*
15177 * cond_resched() and cond_resched_lock(): latency reduction via
15178 * explicit rescheduling in places that are safe. The return
b3bbd485 15179@@ -1636,12 +1769,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
e4b2b4a8
JK
15180 __cond_resched_lock(lock); \
15181 })
15182
15183+#ifndef CONFIG_PREEMPT_RT_FULL
15184 extern int __cond_resched_softirq(void);
15185
15186 #define cond_resched_softirq() ({ \
15187 ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
15188 __cond_resched_softirq(); \
15189 })
15190+#else
15191+# define cond_resched_softirq() cond_resched()
15192+#endif
c7c16703 15193
e4b2b4a8
JK
15194 static inline void cond_resched_rcu(void)
15195 {
b3bbd485 15196@@ -1671,6 +1808,23 @@ static __always_inline bool need_resched(void)
e4b2b4a8
JK
15197 return unlikely(tif_need_resched());
15198 }
c7c16703 15199
e4b2b4a8
JK
15200+#ifdef CONFIG_PREEMPT_RT_FULL
15201+static inline void sleeping_lock_inc(void)
15202+{
15203+ current->sleeping_lock++;
15204+}
15205+
15206+static inline void sleeping_lock_dec(void)
15207+{
15208+ current->sleeping_lock--;
15209+}
15210+
15211+#else
15212+
15213+static inline void sleeping_lock_inc(void) { }
15214+static inline void sleeping_lock_dec(void) { }
15215+#endif
15216+
15217 /*
15218 * Wrappers for p->thread_info->cpu access. No-op on UP.
15219 */
b3bbd485
JK
15220diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
15221index 3d49b91b674d..d8f2fa8f500c 100644
15222--- a/include/linux/sched/mm.h
15223+++ b/include/linux/sched/mm.h
15224@@ -43,6 +43,17 @@ static inline void mmdrop(struct mm_struct *mm)
15225 __mmdrop(mm);
15226 }
15227
15228+#ifdef CONFIG_PREEMPT_RT_BASE
15229+extern void __mmdrop_delayed(struct rcu_head *rhp);
15230+static inline void mmdrop_delayed(struct mm_struct *mm)
15231+{
15232+ if (atomic_dec_and_test(&mm->mm_count))
15233+ call_rcu(&mm->delayed_drop, __mmdrop_delayed);
15234+}
15235+#else
15236+# define mmdrop_delayed(mm) mmdrop(mm)
15237+#endif
15238+
15239 static inline void mmdrop_async_fn(struct work_struct *work)
15240 {
15241 struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
15242diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
15243index a74ec619ac51..8e7f741370c5 100644
15244--- a/include/linux/sched/task.h
15245+++ b/include/linux/sched/task.h
15246@@ -88,6 +88,15 @@ extern void sched_exec(void);
15247
15248 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
15249
15250+#ifdef CONFIG_PREEMPT_RT_BASE
15251+extern void __put_task_struct_cb(struct rcu_head *rhp);
15252+
15253+static inline void put_task_struct(struct task_struct *t)
15254+{
15255+ if (atomic_dec_and_test(&t->usage))
15256+ call_rcu(&t->put_rcu, __put_task_struct_cb);
15257+}
15258+#else
15259 extern void __put_task_struct(struct task_struct *t);
15260
15261 static inline void put_task_struct(struct task_struct *t)
15262@@ -95,7 +104,7 @@ static inline void put_task_struct(struct task_struct *t)
15263 if (atomic_dec_and_test(&t->usage))
15264 __put_task_struct(t);
15265 }
15266-
15267+#endif
15268 struct task_struct *task_rcu_dereference(struct task_struct **ptask);
15269
15270 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
15271diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
15272index 10b19a192b2d..ce3ccff3d9d8 100644
15273--- a/include/linux/sched/wake_q.h
15274+++ b/include/linux/sched/wake_q.h
15275@@ -47,8 +47,29 @@ static inline void wake_q_init(struct wake_q_head *head)
15276 head->lastp = &head->first;
15277 }
15278
15279-extern void wake_q_add(struct wake_q_head *head,
15280- struct task_struct *task);
15281-extern void wake_up_q(struct wake_q_head *head);
15282+extern void __wake_q_add(struct wake_q_head *head,
15283+ struct task_struct *task, bool sleeper);
15284+static inline void wake_q_add(struct wake_q_head *head,
15285+ struct task_struct *task)
15286+{
15287+ __wake_q_add(head, task, false);
15288+}
15289+
15290+static inline void wake_q_add_sleeper(struct wake_q_head *head,
15291+ struct task_struct *task)
15292+{
15293+ __wake_q_add(head, task, true);
15294+}
15295+
15296+extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
15297+static inline void wake_up_q(struct wake_q_head *head)
15298+{
15299+ __wake_up_q(head, false);
15300+}
15301+
15302+static inline void wake_up_q_sleeper(struct wake_q_head *head)
15303+{
15304+ __wake_up_q(head, true);
15305+}
15306
15307 #endif /* _LINUX_SCHED_WAKE_Q_H */
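
The wake_q rework above folds wake_q_add()/wake_up_q() into __wake_q_add()/__wake_up_q() with a "sleeper" flag, so the rtmutex code can queue wakeups of sleeping-spinlock waiters separately. Regular users keep the old interface. A hedged sketch, not part of the patch; struct my_waiter and wake_all_waiters() are hypothetical, while DEFINE_WAKE_Q() comes from the unmodified part of this header.

    #include <linux/list.h>
    #include <linux/sched/wake_q.h>

    /*
     * Minimal sketch: the usual wake_q pattern, which now lands in
     * __wake_q_add(head, task, false) and __wake_up_q(head, false).
     */
    struct my_waiter {                      /* hypothetical waiter record */
            struct list_head list;
            struct task_struct *task;
    };

    static void wake_all_waiters(struct list_head *waiters)
    {
            DEFINE_WAKE_Q(wake_q);
            struct my_waiter *w;

            list_for_each_entry(w, waiters, list)
                    wake_q_add(&wake_q, w->task);

            wake_up_q(&wake_q);
    }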
15308diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
15309index f189a8a3bbb8..107079a2d7ed 100644
15310--- a/include/linux/seqlock.h
15311+++ b/include/linux/seqlock.h
15312@@ -221,20 +221,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
e4b2b4a8
JK
15313 return __read_seqcount_retry(s, start);
15314 }
c7c16703 15315
e4b2b4a8
JK
15316-
15317-
15318-static inline void raw_write_seqcount_begin(seqcount_t *s)
15319+static inline void __raw_write_seqcount_begin(seqcount_t *s)
15320 {
15321 s->sequence++;
15322 smp_wmb();
15323 }
c7c16703 15324
e4b2b4a8
JK
15325-static inline void raw_write_seqcount_end(seqcount_t *s)
15326+static inline void raw_write_seqcount_begin(seqcount_t *s)
15327+{
15328+ preempt_disable_rt();
15329+ __raw_write_seqcount_begin(s);
15330+}
15331+
15332+static inline void __raw_write_seqcount_end(seqcount_t *s)
15333 {
15334 smp_wmb();
15335 s->sequence++;
15336 }
c7c16703 15337
e4b2b4a8
JK
15338+static inline void raw_write_seqcount_end(seqcount_t *s)
15339+{
15340+ __raw_write_seqcount_end(s);
15341+ preempt_enable_rt();
15342+}
15343+
15344 /**
15345 * raw_write_seqcount_barrier - do a seq write barrier
15346 * @s: pointer to seqcount_t
b3bbd485 15347@@ -429,10 +439,33 @@ typedef struct {
e4b2b4a8
JK
15348 /*
15349 * Read side functions for starting and finalizing a read side section.
15350 */
15351+#ifndef CONFIG_PREEMPT_RT_FULL
15352 static inline unsigned read_seqbegin(const seqlock_t *sl)
15353 {
15354 return read_seqcount_begin(&sl->seqcount);
15355 }
15356+#else
15357+/*
15358+ * Starvation safe read side for RT
15359+ */
15360+static inline unsigned read_seqbegin(seqlock_t *sl)
15361+{
15362+ unsigned ret;
15363+
15364+repeat:
15365+ ret = ACCESS_ONCE(sl->seqcount.sequence);
15366+ if (unlikely(ret & 1)) {
15367+ /*
15368+ * Take the lock and let the writer proceed (i.e. possibly
15369+ * boost it), otherwise we could loop here forever.
15370+ */
15371+ spin_unlock_wait(&sl->lock);
15372+ goto repeat;
15373+ }
b3bbd485 15374+ smp_rmb();
e4b2b4a8
JK
15375+ return ret;
15376+}
15377+#endif
c7c16703 15378
e4b2b4a8
JK
15379 static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
15380 {
b3bbd485 15381@@ -447,36 +480,45 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
e4b2b4a8
JK
15382 static inline void write_seqlock(seqlock_t *sl)
15383 {
15384 spin_lock(&sl->lock);
15385- write_seqcount_begin(&sl->seqcount);
15386+ __raw_write_seqcount_begin(&sl->seqcount);
15387+}
15388+
15389+static inline int try_write_seqlock(seqlock_t *sl)
15390+{
15391+ if (spin_trylock(&sl->lock)) {
15392+ __raw_write_seqcount_begin(&sl->seqcount);
15393+ return 1;
15394+ }
15395+ return 0;
c7c16703 15396 }
c7c16703 15397
e4b2b4a8
JK
15398 static inline void write_sequnlock(seqlock_t *sl)
15399 {
15400- write_seqcount_end(&sl->seqcount);
15401+ __raw_write_seqcount_end(&sl->seqcount);
15402 spin_unlock(&sl->lock);
15403 }
c7c16703 15404
e4b2b4a8
JK
15405 static inline void write_seqlock_bh(seqlock_t *sl)
15406 {
15407 spin_lock_bh(&sl->lock);
15408- write_seqcount_begin(&sl->seqcount);
15409+ __raw_write_seqcount_begin(&sl->seqcount);
c7c16703 15410 }
e4b2b4a8
JK
15411
15412 static inline void write_sequnlock_bh(seqlock_t *sl)
1a6e0f06 15413 {
e4b2b4a8
JK
15414- write_seqcount_end(&sl->seqcount);
15415+ __raw_write_seqcount_end(&sl->seqcount);
15416 spin_unlock_bh(&sl->lock);
15417 }
1a6e0f06 15418
e4b2b4a8
JK
15419 static inline void write_seqlock_irq(seqlock_t *sl)
15420 {
15421 spin_lock_irq(&sl->lock);
15422- write_seqcount_begin(&sl->seqcount);
15423+ __raw_write_seqcount_begin(&sl->seqcount);
15424 }
1a6e0f06 15425
e4b2b4a8 15426 static inline void write_sequnlock_irq(seqlock_t *sl)
1a6e0f06 15427 {
e4b2b4a8
JK
15428- write_seqcount_end(&sl->seqcount);
15429+ __raw_write_seqcount_end(&sl->seqcount);
15430 spin_unlock_irq(&sl->lock);
15431 }
15432
b3bbd485 15433@@ -485,7 +527,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
e4b2b4a8
JK
15434 unsigned long flags;
15435
15436 spin_lock_irqsave(&sl->lock, flags);
15437- write_seqcount_begin(&sl->seqcount);
15438+ __raw_write_seqcount_begin(&sl->seqcount);
15439 return flags;
15440 }
15441
b3bbd485 15442@@ -495,7 +537,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
e4b2b4a8
JK
15443 static inline void
15444 write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
1a6e0f06 15445 {
e4b2b4a8
JK
15446- write_seqcount_end(&sl->seqcount);
15447+ __raw_write_seqcount_end(&sl->seqcount);
15448 spin_unlock_irqrestore(&sl->lock, flags);
15449 }
1a6e0f06 15450
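
To keep the context clear: the write side above now uses __raw_write_seqcount_begin()/__raw_write_seqcount_end() under the (on RT, sleeping) spinlock, and the RT read_seqbegin() waits on that lock instead of spinning on an odd sequence count, so a preempted writer can be priority-boosted. The reader/writer pattern itself is unchanged; a hedged sketch follows, with hypothetical ts_lock/ts_cached names, not part of the patch.

    #include <linux/seqlock.h>

    /*
     * Minimal sketch: a standard seqlock reader/writer pair that works
     * unmodified on both !RT and RT.
     */
    static DEFINE_SEQLOCK(ts_lock);         /* hypothetical seqlock */
    static u64 ts_cached;                   /* hypothetical protected data */

    static void ts_update(u64 now)
    {
            write_seqlock(&ts_lock);        /* __raw_write_seqcount_begin() inside */
            ts_cached = now;
            write_sequnlock(&ts_lock);
    }

    static u64 ts_read(void)
    {
            unsigned int seq;
            u64 val;

            do {
                    seq = read_seqbegin(&ts_lock);
                    val = ts_cached;
            } while (read_seqretry(&ts_lock, seq));

            return val;
    }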
b3bbd485
JK
15451diff --git a/include/linux/signal.h b/include/linux/signal.h
15452index 042968dd98f0..a7d20f85cc0e 100644
15453--- a/include/linux/signal.h
15454+++ b/include/linux/signal.h
15455@@ -243,6 +243,7 @@ static inline void init_sigpending(struct sigpending *sig)
1a6e0f06
JK
15456 }
15457
e4b2b4a8
JK
15458 extern void flush_sigqueue(struct sigpending *queue);
15459+extern void flush_task_sigqueue(struct task_struct *tsk);
15460
15461 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
15462 static inline int valid_signal(unsigned long sig)
b3bbd485
JK
15463diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
15464index f64e88444082..07576a062ac0 100644
15465--- a/include/linux/skbuff.h
15466+++ b/include/linux/skbuff.h
15467@@ -287,6 +287,7 @@ struct sk_buff_head {
e4b2b4a8
JK
15468
15469 __u32 qlen;
15470 spinlock_t lock;
15471+ raw_spinlock_t raw_lock;
1a6e0f06
JK
15472 };
15473
e4b2b4a8 15474 struct sk_buff;
b3bbd485 15475@@ -1672,6 +1673,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
e4b2b4a8
JK
15476 __skb_queue_head_init(list);
15477 }
1a6e0f06 15478
e4b2b4a8
JK
15479+static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
15480+{
15481+ raw_spin_lock_init(&list->raw_lock);
15482+ __skb_queue_head_init(list);
15483+}
15484+
15485 static inline void skb_queue_head_init_class(struct sk_buff_head *list,
15486 struct lock_class_key *class)
15487 {
b3bbd485
JK
15488diff --git a/include/linux/smp.h b/include/linux/smp.h
15489index 9fb239e12b82..5801e516ba63 100644
15490--- a/include/linux/smp.h
15491+++ b/include/linux/smp.h
15492@@ -202,6 +202,9 @@ static inline int get_boot_cpu_id(void)
e4b2b4a8
JK
15493 #define get_cpu() ({ preempt_disable(); smp_processor_id(); })
15494 #define put_cpu() preempt_enable()
1a6e0f06 15495
e4b2b4a8
JK
15496+#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); })
15497+#define put_cpu_light() migrate_enable()
15498+
15499 /*
15500 * Callback to arch code if there's nosmp or maxcpus=0 on the
15501 * boot command line:
b3bbd485
JK
15502diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
15503index 341e1a12bfc7..7c8f0a985b9e 100644
15504--- a/include/linux/spinlock.h
15505+++ b/include/linux/spinlock.h
15506@@ -286,7 +286,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
e4b2b4a8
JK
15507 #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock))
15508
15509 /* Include rwlock functions */
15510-#include <linux/rwlock.h>
1a6e0f06 15511+#ifdef CONFIG_PREEMPT_RT_FULL
e4b2b4a8 15512+# include <linux/rwlock_rt.h>
1a6e0f06 15513+#else
e4b2b4a8 15514+# include <linux/rwlock.h>
1a6e0f06 15515+#endif
1a6e0f06 15516
e4b2b4a8
JK
15517 /*
15518 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
b3bbd485 15519@@ -297,6 +301,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
e4b2b4a8
JK
15520 # include <linux/spinlock_api_up.h>
15521 #endif
1a6e0f06 15522
e4b2b4a8
JK
15523+#ifdef CONFIG_PREEMPT_RT_FULL
15524+# include <linux/spinlock_rt.h>
15525+#else /* PREEMPT_RT_FULL */
15526+
15527 /*
15528 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
15529 */
b3bbd485 15530@@ -421,4 +429,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
e4b2b4a8
JK
15531 #define atomic_dec_and_lock(atomic, lock) \
15532 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
1a6e0f06 15533
e4b2b4a8
JK
15534+#endif /* !PREEMPT_RT_FULL */
15535+
15536 #endif /* __LINUX_SPINLOCK_H */
b3bbd485
JK
15537diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
15538index 42dfab89e740..29d99ae5a8ab 100644
15539--- a/include/linux/spinlock_api_smp.h
15540+++ b/include/linux/spinlock_api_smp.h
15541@@ -187,6 +187,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
15542 return 0;
15543 }
15544
15545-#include <linux/rwlock_api_smp.h>
15546+#ifndef CONFIG_PREEMPT_RT_FULL
15547+# include <linux/rwlock_api_smp.h>
15548+#endif
15549
15550 #endif /* __LINUX_SPINLOCK_API_SMP_H */
15551diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
15552new file mode 100644
15553index 000000000000..c95e1f5145ac
15554--- /dev/null
15555+++ b/include/linux/spinlock_rt.h
e4b2b4a8
JK
15556@@ -0,0 +1,159 @@
15557+#ifndef __LINUX_SPINLOCK_RT_H
15558+#define __LINUX_SPINLOCK_RT_H
15559+
15560+#ifndef __LINUX_SPINLOCK_H
15561+#error Do not include directly. Use spinlock.h
15562+#endif
15563+
15564+#include <linux/bug.h>
15565+
15566+extern void
15567+__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key);
15568+
15569+#define spin_lock_init(slock) \
15570+do { \
15571+ static struct lock_class_key __key; \
15572+ \
15573+ rt_mutex_init(&(slock)->lock); \
15574+ __rt_spin_lock_init(slock, #slock, &__key); \
15575+} while (0)
15576+
15577+extern void __lockfunc rt_spin_lock(spinlock_t *lock);
15578+extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
15579+extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
15580+extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
15581+extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
15582+extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
15583+extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
15584+extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
15585+extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
15586+
15587+/*
15588+ * lockdep-less calls, for derived types like rwlock:
15589+ * (for trylock they can use rt_mutex_trylock() directly).
15590+ * Migrate disable handling must be done at the call site.
15591+ */
15592+extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
15593+extern void __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
15594+extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
15595+
15596+#define spin_lock(lock) rt_spin_lock(lock)
15597+
15598+#define spin_lock_bh(lock) \
15599+ do { \
15600+ local_bh_disable(); \
15601+ rt_spin_lock(lock); \
15602+ } while (0)
15603+
15604+#define spin_lock_irq(lock) spin_lock(lock)
15605+
15606+#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock))
15607+
15608+#define spin_trylock(lock) \
15609+({ \
15610+ int __locked; \
15611+ __locked = spin_do_trylock(lock); \
15612+ __locked; \
15613+})
15614+
15615+#ifdef CONFIG_LOCKDEP
15616+# define spin_lock_nested(lock, subclass) \
15617+ do { \
15618+ rt_spin_lock_nested(lock, subclass); \
15619+ } while (0)
15620+
15621+#define spin_lock_bh_nested(lock, subclass) \
15622+ do { \
15623+ local_bh_disable(); \
15624+ rt_spin_lock_nested(lock, subclass); \
15625+ } while (0)
15626+
15627+# define spin_lock_irqsave_nested(lock, flags, subclass) \
15628+ do { \
15629+ typecheck(unsigned long, flags); \
15630+ flags = 0; \
15631+ rt_spin_lock_nested(lock, subclass); \
15632+ } while (0)
15633+#else
15634+# define spin_lock_nested(lock, subclass) spin_lock(lock)
15635+# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(lock)
15636+
15637+# define spin_lock_irqsave_nested(lock, flags, subclass) \
15638+ do { \
15639+ typecheck(unsigned long, flags); \
15640+ flags = 0; \
15641+ spin_lock(lock); \
15642+ } while (0)
15643+#endif
15644+
15645+#define spin_lock_irqsave(lock, flags) \
15646+ do { \
15647+ typecheck(unsigned long, flags); \
15648+ flags = 0; \
15649+ spin_lock(lock); \
15650+ } while (0)
15651+
15652+static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
1a6e0f06 15653+{
e4b2b4a8
JK
15654+ unsigned long flags = 0;
15655+#ifdef CONFIG_TRACE_IRQFLAGS
15656+ flags = rt_spin_lock_trace_flags(lock);
15657+#else
15658+ spin_lock(lock); /* lock_local */
15659+#endif
15660+ return flags;
1a6e0f06
JK
15661+}
15662+
e4b2b4a8
JK
15663+/* FIXME: we need rt_spin_lock_nest_lock */
15664+#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
15665+
15666+#define spin_unlock(lock) rt_spin_unlock(lock)
15667+
15668+#define spin_unlock_bh(lock) \
15669+ do { \
15670+ rt_spin_unlock(lock); \
15671+ local_bh_enable(); \
15672+ } while (0)
15673+
15674+#define spin_unlock_irq(lock) spin_unlock(lock)
15675+
15676+#define spin_unlock_irqrestore(lock, flags) \
15677+ do { \
15678+ typecheck(unsigned long, flags); \
15679+ (void) flags; \
15680+ spin_unlock(lock); \
15681+ } while (0)
15682+
15683+#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock))
15684+#define spin_trylock_irq(lock) spin_trylock(lock)
15685+
15686+#define spin_trylock_irqsave(lock, flags) \
15687+ rt_spin_trylock_irqsave(lock, &(flags))
15688+
15689+#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock)
15690+
15691+#ifdef CONFIG_GENERIC_LOCKBREAK
15692+# define spin_is_contended(lock) ((lock)->break_lock)
15693+#else
15694+# define spin_is_contended(lock) (((void)(lock), 0))
15695+#endif
15696+
15697+static inline int spin_can_lock(spinlock_t *lock)
1a6e0f06 15698+{
e4b2b4a8 15699+ return !rt_mutex_is_locked(&lock->lock);
1a6e0f06
JK
15700+}
15701+
e4b2b4a8 15702+static inline int spin_is_locked(spinlock_t *lock)
1a6e0f06 15703+{
e4b2b4a8 15704+ return rt_mutex_is_locked(&lock->lock);
1a6e0f06
JK
15705+}
15706+
e4b2b4a8 15707+static inline void assert_spin_locked(spinlock_t *lock)
1a6e0f06 15708+{
e4b2b4a8 15709+ BUG_ON(!spin_is_locked(lock));
1a6e0f06 15710+}
1a6e0f06 15711+
e4b2b4a8
JK
15712+#define atomic_dec_and_lock(atomic, lock) \
15713+ atomic_dec_and_spin_lock(atomic, lock)
15714+
15715+#endif
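
Summing up the header above (illustration only, not part of the patch): on PREEMPT_RT_FULL a spinlock_t wraps an rt_mutex, so spin_lock_irqsave() disables neither interrupts nor preemption; "flags" is zeroed purely for API compatibility and the critical section may sleep. The ev_lock/ev_count names below are hypothetical.

    #include <linux/spinlock.h>

    /*
     * Minimal sketch, assuming CONFIG_PREEMPT_RT_FULL: spinlock_t users
     * stay source-compatible, but the section is preemptible and must not
     * rely on interrupts actually being disabled.
     */
    static DEFINE_SPINLOCK(ev_lock);        /* hypothetical lock */
    static int ev_count;                    /* hypothetical counter */

    static void ev_add(void)
    {
            unsigned long flags;

            spin_lock_irqsave(&ev_lock, flags);     /* rt_spin_lock(), flags = 0 */
            ev_count++;
            spin_unlock_irqrestore(&ev_lock, flags);
    }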
b3bbd485
JK
15716diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
15717index 73548eb13a5d..10bac715ea96 100644
15718--- a/include/linux/spinlock_types.h
15719+++ b/include/linux/spinlock_types.h
e4b2b4a8
JK
15720@@ -9,80 +9,15 @@
15721 * Released under the General Public License (GPL).
15722 */
1a6e0f06 15723
e4b2b4a8
JK
15724-#if defined(CONFIG_SMP)
15725-# include <asm/spinlock_types.h>
15726-#else
15727-# include <linux/spinlock_types_up.h>
15728-#endif
15729-
15730-#include <linux/lockdep.h>
15731-
15732-typedef struct raw_spinlock {
15733- arch_spinlock_t raw_lock;
15734-#ifdef CONFIG_GENERIC_LOCKBREAK
15735- unsigned int break_lock;
15736-#endif
15737-#ifdef CONFIG_DEBUG_SPINLOCK
15738- unsigned int magic, owner_cpu;
15739- void *owner;
15740-#endif
15741-#ifdef CONFIG_DEBUG_LOCK_ALLOC
15742- struct lockdep_map dep_map;
15743-#endif
15744-} raw_spinlock_t;
15745-
15746-#define SPINLOCK_MAGIC 0xdead4ead
15747-
15748-#define SPINLOCK_OWNER_INIT ((void *)-1L)
15749-
15750-#ifdef CONFIG_DEBUG_LOCK_ALLOC
15751-# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
15752-#else
15753-# define SPIN_DEP_MAP_INIT(lockname)
15754-#endif
15755+#include <linux/spinlock_types_raw.h>
1a6e0f06 15756
e4b2b4a8
JK
15757-#ifdef CONFIG_DEBUG_SPINLOCK
15758-# define SPIN_DEBUG_INIT(lockname) \
15759- .magic = SPINLOCK_MAGIC, \
15760- .owner_cpu = -1, \
15761- .owner = SPINLOCK_OWNER_INIT,
15762+#ifndef CONFIG_PREEMPT_RT_FULL
15763+# include <linux/spinlock_types_nort.h>
15764+# include <linux/rwlock_types.h>
15765 #else
15766-# define SPIN_DEBUG_INIT(lockname)
b3bbd485
JK
15767+# include <linux/rtmutex.h>
15768+# include <linux/spinlock_types_rt.h>
15769+# include <linux/rwlock_types_rt.h>
15770 #endif
15771
e4b2b4a8
JK
15772-#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
15773- { \
15774- .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
15775- SPIN_DEBUG_INIT(lockname) \
15776- SPIN_DEP_MAP_INIT(lockname) }
15777-
15778-#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
15779- (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15780-
15781-#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15782-
15783-typedef struct spinlock {
15784- union {
15785- struct raw_spinlock rlock;
15786-
15787-#ifdef CONFIG_DEBUG_LOCK_ALLOC
15788-# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15789- struct {
15790- u8 __padding[LOCK_PADSIZE];
15791- struct lockdep_map dep_map;
15792- };
b3bbd485 15793-#endif
e4b2b4a8
JK
15794- };
15795-} spinlock_t;
15796-
15797-#define __SPIN_LOCK_INITIALIZER(lockname) \
15798- { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15799-
15800-#define __SPIN_LOCK_UNLOCKED(lockname) \
15801- (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15802-
15803-#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15804-
15805-#include <linux/rwlock_types.h>
b3bbd485 15806-
e4b2b4a8 15807 #endif /* __LINUX_SPINLOCK_TYPES_H */
b3bbd485
JK
15808diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
15809new file mode 100644
15810index 000000000000..f1dac1fb1d6a
15811--- /dev/null
15812+++ b/include/linux/spinlock_types_nort.h
e4b2b4a8
JK
15813@@ -0,0 +1,33 @@
15814+#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
15815+#define __LINUX_SPINLOCK_TYPES_NORT_H
1a6e0f06 15816+
e4b2b4a8
JK
15817+#ifndef __LINUX_SPINLOCK_TYPES_H
15818+#error "Do not include directly. Include spinlock_types.h instead"
15819+#endif
1a6e0f06 15820+
e4b2b4a8
JK
15821+/*
15822+ * The non RT version maps spinlocks to raw_spinlocks
15823+ */
15824+typedef struct spinlock {
15825+ union {
15826+ struct raw_spinlock rlock;
1a6e0f06 15827+
e4b2b4a8
JK
15828+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15829+# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15830+ struct {
15831+ u8 __padding[LOCK_PADSIZE];
15832+ struct lockdep_map dep_map;
15833+ };
1a6e0f06 15834+#endif
e4b2b4a8
JK
15835+ };
15836+} spinlock_t;
1a6e0f06 15837+
e4b2b4a8
JK
15838+#define __SPIN_LOCK_INITIALIZER(lockname) \
15839+ { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
1a6e0f06 15840+
e4b2b4a8
JK
15841+#define __SPIN_LOCK_UNLOCKED(lockname) \
15842+ (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
1a6e0f06 15843+
e4b2b4a8 15844+#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
1a6e0f06 15845+
e4b2b4a8 15846+#endif
b3bbd485
JK
15847diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
15848new file mode 100644
15849index 000000000000..03235b475b77
15850--- /dev/null
15851+++ b/include/linux/spinlock_types_raw.h
e4b2b4a8
JK
15852@@ -0,0 +1,58 @@
15853+#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
15854+#define __LINUX_SPINLOCK_TYPES_RAW_H
1a6e0f06 15855+
e4b2b4a8 15856+#include <linux/types.h>
1a6e0f06 15857+
e4b2b4a8
JK
15858+#if defined(CONFIG_SMP)
15859+# include <asm/spinlock_types.h>
15860+#else
15861+# include <linux/spinlock_types_up.h>
15862+#endif
1a6e0f06 15863+
e4b2b4a8
JK
15864+#include <linux/lockdep.h>
15865+
15866+typedef struct raw_spinlock {
15867+ arch_spinlock_t raw_lock;
15868+#ifdef CONFIG_GENERIC_LOCKBREAK
15869+ unsigned int break_lock;
15870+#endif
15871+#ifdef CONFIG_DEBUG_SPINLOCK
15872+ unsigned int magic, owner_cpu;
15873+ void *owner;
15874+#endif
15875+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15876+ struct lockdep_map dep_map;
15877+#endif
15878+} raw_spinlock_t;
15879+
15880+#define SPINLOCK_MAGIC 0xdead4ead
15881+
15882+#define SPINLOCK_OWNER_INIT ((void *)-1L)
1a6e0f06 15883+
e4b2b4a8
JK
15884+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15885+# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
1a6e0f06 15886+#else
e4b2b4a8
JK
15887+# define SPIN_DEP_MAP_INIT(lockname)
15888+#endif
1a6e0f06 15889+
e4b2b4a8
JK
15890+#ifdef CONFIG_DEBUG_SPINLOCK
15891+# define SPIN_DEBUG_INIT(lockname) \
15892+ .magic = SPINLOCK_MAGIC, \
15893+ .owner_cpu = -1, \
15894+ .owner = SPINLOCK_OWNER_INIT,
15895+#else
15896+# define SPIN_DEBUG_INIT(lockname)
1a6e0f06 15897+#endif
e4b2b4a8
JK
15898+
15899+#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
15900+ { \
15901+ .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
15902+ SPIN_DEBUG_INIT(lockname) \
15903+ SPIN_DEP_MAP_INIT(lockname) }
15904+
15905+#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
15906+ (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15907+
15908+#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15909+
1a6e0f06 15910+#endif
b3bbd485
JK
15911diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
15912new file mode 100644
15913index 000000000000..3e3d8c5f7a9a
15914--- /dev/null
15915+++ b/include/linux/spinlock_types_rt.h
e4b2b4a8
JK
15916@@ -0,0 +1,48 @@
15917+#ifndef __LINUX_SPINLOCK_TYPES_RT_H
15918+#define __LINUX_SPINLOCK_TYPES_RT_H
15919+
15920+#ifndef __LINUX_SPINLOCK_TYPES_H
15921+#error "Do not include directly. Include spinlock_types.h instead"
1a6e0f06 15922+#endif
1a6e0f06 15923+
e4b2b4a8
JK
15924+#include <linux/cache.h>
15925+
15926+/*
15927+ * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
15928+ */
15929+typedef struct spinlock {
15930+ struct rt_mutex lock;
15931+ unsigned int break_lock;
15932+#ifdef CONFIG_DEBUG_LOCK_ALLOC
15933+ struct lockdep_map dep_map;
1a6e0f06 15934+#endif
e4b2b4a8 15935+} spinlock_t;
1a6e0f06 15936+
e4b2b4a8
JK
15937+#ifdef CONFIG_DEBUG_RT_MUTEXES
15938+# define __RT_SPIN_INITIALIZER(name) \
15939+ { \
15940+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15941+ .save_state = 1, \
15942+ .file = __FILE__, \
15943+ .line = __LINE__ , \
15944+ }
1a6e0f06 15945+#else
e4b2b4a8
JK
15946+# define __RT_SPIN_INITIALIZER(name) \
15947+ { \
15948+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15949+ .save_state = 1, \
15950+ }
1a6e0f06 15951+#endif
1a6e0f06 15952+
e4b2b4a8
JK
15953+/*
15954+.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
15955+*/
15956+
15957+#define __SPIN_LOCK_UNLOCKED(name) \
15958+ { .lock = __RT_SPIN_INITIALIZER(name.lock), \
15959+ SPIN_DEP_MAP_INIT(name) }
15960+
15961+#define DEFINE_SPINLOCK(name) \
15962+ spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
15963+
1a6e0f06 15964+#endif
b3bbd485
JK
15965diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h
15966index c09b6407ae1b..b0243ba07fb7 100644
15967--- a/include/linux/spinlock_types_up.h
15968+++ b/include/linux/spinlock_types_up.h
e4b2b4a8
JK
15969@@ -1,10 +1,6 @@
15970 #ifndef __LINUX_SPINLOCK_TYPES_UP_H
15971 #define __LINUX_SPINLOCK_TYPES_UP_H
1a6e0f06 15972
e4b2b4a8
JK
15973-#ifndef __LINUX_SPINLOCK_TYPES_H
15974-# error "please don't include this file directly"
15975-#endif
15976-
15977 /*
15978 * include/linux/spinlock_types_up.h - spinlock type definitions for UP
15979 *
b3bbd485
JK
15980diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
15981index 261471f407a5..f41d2fb09f87 100644
15982--- a/include/linux/srcutiny.h
15983+++ b/include/linux/srcutiny.h
15984@@ -43,7 +43,7 @@ struct srcu_struct {
1a6e0f06 15985
e4b2b4a8 15986 void srcu_drive_gp(struct work_struct *wp);
1a6e0f06 15987
e4b2b4a8
JK
15988-#define __SRCU_STRUCT_INIT(name) \
15989+#define __SRCU_STRUCT_INIT(name, __ignored) \
15990 { \
15991 .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \
15992 .srcu_cb_tail = &name.srcu_cb_head, \
b3bbd485 15993@@ -56,9 +56,9 @@ void srcu_drive_gp(struct work_struct *wp);
e4b2b4a8 15994 * Tree SRCU, which needs some per-CPU data.
1a6e0f06 15995 */
e4b2b4a8
JK
15996 #define DEFINE_SRCU(name) \
15997- struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15998+ struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
15999 #define DEFINE_STATIC_SRCU(name) \
16000- static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
16001+ static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name)
16002
16003 void synchronize_srcu(struct srcu_struct *sp);
16004
b3bbd485
JK
16005diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
16006index a949f4f9e4d7..745d4ca4dd50 100644
16007--- a/include/linux/srcutree.h
16008+++ b/include/linux/srcutree.h
16009@@ -40,7 +40,7 @@ struct srcu_data {
e4b2b4a8
JK
16010 unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */
16011
16012 /* Update-side state. */
16013- raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp;
16014+ spinlock_t __private lock ____cacheline_internodealigned_in_smp;
16015 struct rcu_segcblist srcu_cblist; /* List of callbacks.*/
16016 unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */
16017 unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */
b3bbd485 16018@@ -58,7 +58,7 @@ struct srcu_data {
e4b2b4a8
JK
16019 * Node in SRCU combining tree, similar in function to rcu_data.
16020 */
16021 struct srcu_node {
16022- raw_spinlock_t __private lock;
16023+ spinlock_t __private lock;
16024 unsigned long srcu_have_cbs[4]; /* GP seq for children */
16025 /* having CBs, but only */
16026 /* is > ->srcu_gq_seq. */
b3bbd485 16027@@ -78,7 +78,7 @@ struct srcu_struct {
e4b2b4a8
JK
16028 struct srcu_node *level[RCU_NUM_LVLS + 1];
16029 /* First node at each level. */
16030 struct mutex srcu_cb_mutex; /* Serialize CB preparation. */
16031- raw_spinlock_t __private lock; /* Protect counters */
16032+ spinlock_t __private lock; /* Protect counters */
16033 struct mutex srcu_gp_mutex; /* Serialize GP work. */
16034 unsigned int srcu_idx; /* Current rdr array element. */
16035 unsigned long srcu_gp_seq; /* Grace-period seq #. */
b3bbd485 16036@@ -104,10 +104,10 @@ struct srcu_struct {
e4b2b4a8
JK
16037 #define SRCU_STATE_SCAN1 1
16038 #define SRCU_STATE_SCAN2 2
1a6e0f06 16039
e4b2b4a8
JK
16040-#define __SRCU_STRUCT_INIT(name) \
16041+#define __SRCU_STRUCT_INIT(name, pcpu_name) \
16042 { \
16043- .sda = &name##_srcu_data, \
16044- .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
16045+ .sda = &pcpu_name, \
16046+ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
16047 .srcu_gp_seq_needed = 0 - 1, \
16048 __SRCU_DEP_MAP_INIT(name) \
16049 }
b3bbd485 16050@@ -133,7 +133,7 @@ struct srcu_struct {
1a6e0f06 16051 */
e4b2b4a8
JK
16052 #define __DEFINE_SRCU(name, is_static) \
16053 static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\
16054- is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
16055+ is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_data)
16056 #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
16057 #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
16058
b3bbd485
JK
16059diff --git a/include/linux/suspend.h b/include/linux/suspend.h
16060index 8544357d92d0..616ea66cd283 100644
16061--- a/include/linux/suspend.h
16062+++ b/include/linux/suspend.h
16063@@ -196,6 +196,12 @@ struct platform_s2idle_ops {
e4b2b4a8 16064 void (*end)(void);
1a6e0f06
JK
16065 };
16066
e4b2b4a8
JK
16067+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
16068+extern bool pm_in_action;
16069+#else
16070+# define pm_in_action false
16071+#endif
16072+
16073 #ifdef CONFIG_SUSPEND
16074 extern suspend_state_t mem_sleep_current;
16075 extern suspend_state_t mem_sleep_default;
b3bbd485
JK
16076diff --git a/include/linux/swait.h b/include/linux/swait.h
16077index c98aaf677466..853f3e61a9f4 100644
16078--- a/include/linux/swait.h
16079+++ b/include/linux/swait.h
e4b2b4a8
JK
16080@@ -5,6 +5,7 @@
16081 #include <linux/list.h>
16082 #include <linux/stddef.h>
16083 #include <linux/spinlock.h>
16084+#include <linux/wait.h>
16085 #include <asm/current.h>
1a6e0f06 16086
e4b2b4a8 16087 /*
b3bbd485 16088@@ -147,6 +148,7 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
e4b2b4a8
JK
16089 extern void swake_up(struct swait_queue_head *q);
16090 extern void swake_up_all(struct swait_queue_head *q);
16091 extern void swake_up_locked(struct swait_queue_head *q);
16092+extern void swake_up_all_locked(struct swait_queue_head *q);
1a6e0f06 16093
e4b2b4a8
JK
16094 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
16095 extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
b3bbd485
JK
16096diff --git a/include/linux/swap.h b/include/linux/swap.h
16097index f02fb5db8914..6c775168df67 100644
16098--- a/include/linux/swap.h
16099+++ b/include/linux/swap.h
e4b2b4a8
JK
16100@@ -12,6 +12,7 @@
16101 #include <linux/fs.h>
16102 #include <linux/atomic.h>
16103 #include <linux/page-flags.h>
16104+#include <linux/locallock.h>
16105 #include <asm/page.h>
16106
16107 struct notifier_block;
b3bbd485 16108@@ -297,7 +298,8 @@ struct vma_swap_readahead {
e4b2b4a8
JK
16109 void *workingset_eviction(struct address_space *mapping, struct page *page);
16110 bool workingset_refault(void *shadow);
16111 void workingset_activation(struct page *page);
16112-void workingset_update_node(struct radix_tree_node *node, void *private);
16113+void __workingset_update_node(struct radix_tree_node *node, void *private);
16114+DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
1a6e0f06 16115
e4b2b4a8
JK
16116 /* linux/mm/page_alloc.c */
16117 extern unsigned long totalram_pages;
b3bbd485 16118@@ -310,6 +312,7 @@ extern unsigned long nr_free_pagecache_pages(void);
1a6e0f06 16119
1a6e0f06 16120
e4b2b4a8
JK
16121 /* linux/mm/swap.c */
16122+DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
16123 extern void lru_cache_add(struct page *);
16124 extern void lru_cache_add_anon(struct page *page);
16125 extern void lru_cache_add_file(struct page *page);
b3bbd485
JK
16126diff --git a/include/linux/swork.h b/include/linux/swork.h
16127new file mode 100644
16128index 000000000000..f175fa9a6016
16129--- /dev/null
16130+++ b/include/linux/swork.h
e4b2b4a8
JK
16131@@ -0,0 +1,24 @@
16132+#ifndef _LINUX_SWORK_H
16133+#define _LINUX_SWORK_H
16134+
16135+#include <linux/list.h>
16136+
16137+struct swork_event {
16138+ struct list_head item;
16139+ unsigned long flags;
16140+ void (*func)(struct swork_event *);
16141+};
16142+
16143+static inline void INIT_SWORK(struct swork_event *event,
16144+ void (*func)(struct swork_event *))
16145+{
16146+ event->flags = 0;
16147+ event->func = func;
16148+}
16149+
16150+bool swork_queue(struct swork_event *sev);
16151+
16152+int swork_get(void);
16153+void swork_put(void);
16154+
16155+#endif /* _LINUX_SWORK_H */
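
The new header declares a deliberately small "simple work" API backed by a helper kthread whose implementation is added elsewhere in this patch; the cgroup and IRQ-affinity hunks further down use it to defer work that may sleep. A hypothetical user (all names below are invented) follows the get/init/queue/put pattern:

	#include <linux/swork.h>

	static struct swork_event flush_event;

	static void flush_handler(struct swork_event *sev)
	{
		/* runs in the swork helper thread, fully preemptible */
	}

	static int flush_setup(void)
	{
		int err = swork_get();		/* bring the helper thread up */

		if (err)
			return err;
		INIT_SWORK(&flush_event, flush_handler);
		return 0;
	}

	static void flush_trigger(void)
	{
		swork_queue(&flush_event);	/* defer flush_handler() */
	}

	static void flush_teardown(void)
	{
		swork_put();			/* drop the helper thread reference */
	}
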
b3bbd485
JK
16156diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
16157index cf2862bd134a..fd05d83740df 100644
16158--- a/include/linux/thread_info.h
16159+++ b/include/linux/thread_info.h
16160@@ -86,7 +86,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
e4b2b4a8
JK
16161 #define test_thread_flag(flag) \
16162 test_ti_thread_flag(current_thread_info(), flag)
16163
16164-#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
16165+#ifdef CONFIG_PREEMPT_LAZY
16166+#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \
16167+ test_thread_flag(TIF_NEED_RESCHED_LAZY))
16168+#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
16169+#define tif_need_resched_lazy() (test_thread_flag(TIF_NEED_RESCHED_LAZY))
16170+
1a6e0f06 16171+#else
e4b2b4a8
JK
16172+#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
16173+#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
16174+#define tif_need_resched_lazy() 0
1a6e0f06 16175+#endif
1a6e0f06 16176
e4b2b4a8
JK
16177 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
16178 static inline int arch_within_stack_frames(const void * const stack,
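
With CONFIG_PREEMPT_LAZY, tif_need_resched() now reports either flag, while the _now()/_lazy() variants let callers distinguish an immediate preemption request from one that may be deferred to the next preemption point. A rough, illustrative sketch (not taken from this patch) of how a scheduling boundary might consult them:

	static void example_preemption_point(bool returning_to_user)
	{
		if (tif_need_resched_now())
			schedule();		/* hard request: reschedule right away */
		else if (returning_to_user && tif_need_resched_lazy())
			schedule();		/* lazy request: honoured at this boundary */
	}
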
b3bbd485
JK
16179diff --git a/include/linux/timer.h b/include/linux/timer.h
16180index e0ea1fe87572..df3085ddf662 100644
16181--- a/include/linux/timer.h
16182+++ b/include/linux/timer.h
16183@@ -213,7 +213,7 @@ extern void add_timer(struct timer_list *timer);
1a6e0f06 16184
e4b2b4a8 16185 extern int try_to_del_timer_sync(struct timer_list *timer);
1a6e0f06 16186
e4b2b4a8
JK
16187-#ifdef CONFIG_SMP
16188+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
16189 extern int del_timer_sync(struct timer_list *timer);
16190 #else
16191 # define del_timer_sync(t) del_timer(t)
b3bbd485
JK
16192diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
16193index 2bcb4dc6df1a..edd1e42e8a2f 100644
16194--- a/include/linux/trace_events.h
16195+++ b/include/linux/trace_events.h
16196@@ -62,6 +62,9 @@ struct trace_entry {
e4b2b4a8
JK
16197 unsigned char flags;
16198 unsigned char preempt_count;
16199 int pid;
16200+ unsigned short migrate_disable;
16201+ unsigned short padding;
16202+ unsigned char preempt_lazy_count;
16203 };
1a6e0f06 16204
e4b2b4a8 16205 #define TRACE_EVENT_TYPE_MAX \
b3bbd485 16206@@ -402,11 +405,13 @@ enum event_trigger_type {
e4b2b4a8
JK
16207
16208 extern int filter_match_preds(struct event_filter *filter, void *rec);
16209
16210-extern enum event_trigger_type event_triggers_call(struct trace_event_file *file,
16211- void *rec);
16212-extern void event_triggers_post_call(struct trace_event_file *file,
16213- enum event_trigger_type tt,
16214- void *rec);
16215+extern enum event_trigger_type
16216+event_triggers_call(struct trace_event_file *file, void *rec,
16217+ struct ring_buffer_event *event);
16218+extern void
16219+event_triggers_post_call(struct trace_event_file *file,
16220+ enum event_trigger_type tt,
16221+ void *rec, struct ring_buffer_event *event);
1a6e0f06 16222
e4b2b4a8 16223 bool trace_event_ignore_this_pid(struct trace_event_file *trace_file);
1a6e0f06 16224
b3bbd485 16225@@ -426,7 +431,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
1a6e0f06 16226
e4b2b4a8
JK
16227 if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
16228 if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
16229- event_triggers_call(file, NULL);
16230+ event_triggers_call(file, NULL, NULL);
16231 if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
16232 return true;
16233 if (eflags & EVENT_FILE_FL_PID_FILTER)
b3bbd485
JK
16234diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
16235index 251e655d407f..57e8e32ef2b0 100644
16236--- a/include/linux/uaccess.h
16237+++ b/include/linux/uaccess.h
16238@@ -185,6 +185,7 @@ static __always_inline void pagefault_disabled_dec(void)
e4b2b4a8
JK
16239 */
16240 static inline void pagefault_disable(void)
1a6e0f06 16241 {
e4b2b4a8
JK
16242+ migrate_disable();
16243 pagefault_disabled_inc();
16244 /*
16245 * make sure to have issued the store before a pagefault
b3bbd485 16246@@ -201,6 +202,7 @@ static inline void pagefault_enable(void)
e4b2b4a8
JK
16247 */
16248 barrier();
16249 pagefault_disabled_dec();
16250+ migrate_enable();
16251 }
1a6e0f06 16252
e4b2b4a8 16253 /*
b3bbd485
JK
16254diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
16255index 1e0cb72e0598..87ab0996a9b0 100644
16256--- a/include/linux/vmstat.h
16257+++ b/include/linux/vmstat.h
16258@@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
e4b2b4a8
JK
16259 */
16260 static inline void __count_vm_event(enum vm_event_item item)
16261 {
16262+ preempt_disable_rt();
16263 raw_cpu_inc(vm_event_states.event[item]);
16264+ preempt_enable_rt();
1a6e0f06
JK
16265 }
16266
e4b2b4a8 16267 static inline void count_vm_event(enum vm_event_item item)
b3bbd485 16268@@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
e4b2b4a8
JK
16269
16270 static inline void __count_vm_events(enum vm_event_item item, long delta)
1a6e0f06 16271 {
e4b2b4a8
JK
16272+ preempt_disable_rt();
16273 raw_cpu_add(vm_event_states.event[item], delta);
16274+ preempt_enable_rt();
1a6e0f06
JK
16275 }
16276
e4b2b4a8 16277 static inline void count_vm_events(enum vm_event_item item, long delta)
b3bbd485
JK
16278diff --git a/include/linux/wait.h b/include/linux/wait.h
16279index 158715445ffb..3451706a3074 100644
16280--- a/include/linux/wait.h
16281+++ b/include/linux/wait.h
e4b2b4a8
JK
16282@@ -10,6 +10,7 @@
16283
16284 #include <asm/current.h>
16285 #include <uapi/linux/wait.h>
16286+#include <linux/atomic.h>
16287
16288 typedef struct wait_queue_entry wait_queue_entry_t;
16289
b3bbd485 16290@@ -486,8 +487,8 @@ do { \
e4b2b4a8
JK
16291 int __ret = 0; \
16292 struct hrtimer_sleeper __t; \
16293 \
16294- hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \
16295- hrtimer_init_sleeper(&__t, current); \
16296+ hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, HRTIMER_MODE_REL, \
16297+ current); \
16298 if ((timeout) != KTIME_MAX) \
16299 hrtimer_start_range_ns(&__t.timer, timeout, \
16300 current->timer_slack_ns, \
b3bbd485
JK
16301diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
16302index 304f7aa9cc01..00d3813cef26 100644
16303--- a/include/net/gen_stats.h
16304+++ b/include/net/gen_stats.h
e4b2b4a8
JK
16305@@ -6,6 +6,7 @@
16306 #include <linux/socket.h>
16307 #include <linux/rtnetlink.h>
16308 #include <linux/pkt_sched.h>
16309+#include <net/net_seq_lock.h>
16310
16311 struct gnet_stats_basic_cpu {
16312 struct gnet_stats_basic_packed bstats;
b3bbd485 16313@@ -36,11 +37,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
e4b2b4a8
JK
16314 spinlock_t *lock, struct gnet_dump *d,
16315 int padattr);
1a6e0f06 16316
e4b2b4a8
JK
16317-int gnet_stats_copy_basic(const seqcount_t *running,
16318+int gnet_stats_copy_basic(net_seqlock_t *running,
16319 struct gnet_dump *d,
16320 struct gnet_stats_basic_cpu __percpu *cpu,
16321 struct gnet_stats_basic_packed *b);
16322-void __gnet_stats_copy_basic(const seqcount_t *running,
16323+void __gnet_stats_copy_basic(net_seqlock_t *running,
16324 struct gnet_stats_basic_packed *bstats,
16325 struct gnet_stats_basic_cpu __percpu *cpu,
16326 struct gnet_stats_basic_packed *b);
b3bbd485 16327@@ -57,13 +58,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
e4b2b4a8
JK
16328 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
16329 struct net_rate_estimator __rcu **rate_est,
16330 spinlock_t *stats_lock,
16331- seqcount_t *running, struct nlattr *opt);
16332+ net_seqlock_t *running, struct nlattr *opt);
16333 void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
16334 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
16335 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
16336 struct net_rate_estimator __rcu **ptr,
16337 spinlock_t *stats_lock,
16338- seqcount_t *running, struct nlattr *opt);
16339+ net_seqlock_t *running, struct nlattr *opt);
16340 bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
16341 bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
16342 struct gnet_stats_rate_est64 *sample);
b3bbd485
JK
16343diff --git a/include/net/neighbour.h b/include/net/neighbour.h
16344index a964366a7ef5..51c854583987 100644
16345--- a/include/net/neighbour.h
16346+++ b/include/net/neighbour.h
16347@@ -450,7 +450,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
1a6e0f06 16348 }
e4b2b4a8 16349 #endif
1a6e0f06 16350
e4b2b4a8
JK
16351-static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
16352+static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
16353 {
16354 unsigned int seq;
16355 unsigned int hh_len;
b3bbd485 16356@@ -474,7 +474,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb
1a6e0f06 16357
e4b2b4a8
JK
16358 static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
16359 {
16360- const struct hh_cache *hh = &n->hh;
16361+ struct hh_cache *hh = &n->hh;
1a6e0f06 16362
e4b2b4a8
JK
16363 if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
16364 return neigh_hh_output(hh, skb);
b3bbd485 16365@@ -515,7 +515,7 @@ struct neighbour_cb {
1a6e0f06 16366
e4b2b4a8 16367 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb)
1a6e0f06 16368
e4b2b4a8
JK
16369-static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
16370+static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
16371 const struct net_device *dev)
16372 {
16373 unsigned int seq;
b3bbd485
JK
16374diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
16375new file mode 100644
16376index 000000000000..a7034298a82a
16377--- /dev/null
16378+++ b/include/net/net_seq_lock.h
e4b2b4a8
JK
16379@@ -0,0 +1,15 @@
16380+#ifndef __NET_NET_SEQ_LOCK_H__
16381+#define __NET_NET_SEQ_LOCK_H__
16382+
1a6e0f06 16383+#ifdef CONFIG_PREEMPT_RT_BASE
e4b2b4a8
JK
16384+# define net_seqlock_t seqlock_t
16385+# define net_seq_begin(__r) read_seqbegin(__r)
16386+# define net_seq_retry(__r, __s) read_seqretry(__r, __s)
16387+
1a6e0f06 16388+#else
e4b2b4a8
JK
16389+# define net_seqlock_t seqcount_t
16390+# define net_seq_begin(__r) read_seqcount_begin(__r)
16391+# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s)
1a6e0f06
JK
16392+#endif
16393+
e4b2b4a8 16394+#endif
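
The qdisc "running" member is switched to net_seqlock_t in the sch_generic.h hunks just below, so on RT the writer holds a real (sleepable) seqlock while !RT keeps the plain seqcount. Readers use the wrappers above and compile identically either way; a reader-side sketch (function invented):

	static u64 read_qdisc_bytes(struct Qdisc *q)
	{
		unsigned int seq;
		u64 bytes;

		do {
			seq = net_seq_begin(&q->running);
			bytes = q->bstats.bytes;
		} while (net_seq_retry(&q->running, seq));

		return bytes;
	}
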
b3bbd485
JK
16395diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
16396index f59acacaa265..6ac7c3659973 100644
16397--- a/include/net/sch_generic.h
16398+++ b/include/net/sch_generic.h
e4b2b4a8
JK
16399@@ -10,6 +10,7 @@
16400 #include <linux/percpu.h>
16401 #include <linux/dynamic_queue_limits.h>
16402 #include <linux/list.h>
16403+#include <net/net_seq_lock.h>
16404 #include <linux/refcount.h>
16405 #include <linux/workqueue.h>
16406 #include <net/gen_stats.h>
b3bbd485 16407@@ -90,7 +91,7 @@ struct Qdisc {
e4b2b4a8
JK
16408 struct sk_buff *gso_skb ____cacheline_aligned_in_smp;
16409 struct qdisc_skb_head q;
16410 struct gnet_stats_basic_packed bstats;
16411- seqcount_t running;
16412+ net_seqlock_t running;
16413 struct gnet_stats_queue qstats;
16414 unsigned long state;
16415 struct Qdisc *next_sched;
b3bbd485 16416@@ -109,13 +110,22 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
e4b2b4a8
JK
16417 refcount_inc(&qdisc->refcnt);
16418 }
1a6e0f06 16419
e4b2b4a8
JK
16420-static inline bool qdisc_is_running(const struct Qdisc *qdisc)
16421+static inline bool qdisc_is_running(struct Qdisc *qdisc)
1a6e0f06 16422 {
e4b2b4a8
JK
16423+#ifdef CONFIG_PREEMPT_RT_BASE
16424+ return spin_is_locked(&qdisc->running.lock) ? true : false;
1a6e0f06 16425+#else
e4b2b4a8 16426 return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
1a6e0f06 16427+#endif
e4b2b4a8 16428 }
1a6e0f06 16429
e4b2b4a8
JK
16430 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
16431 {
1a6e0f06 16432+#ifdef CONFIG_PREEMPT_RT_BASE
e4b2b4a8
JK
16433+ if (try_write_seqlock(&qdisc->running))
16434+ return true;
16435+ return false;
1a6e0f06 16436+#else
e4b2b4a8
JK
16437 if (qdisc_is_running(qdisc))
16438 return false;
16439 /* Variant of write_seqcount_begin() telling lockdep a trylock
b3bbd485 16440@@ -124,11 +134,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
e4b2b4a8
JK
16441 raw_write_seqcount_begin(&qdisc->running);
16442 seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
16443 return true;
1a6e0f06 16444+#endif
e4b2b4a8 16445 }
1a6e0f06 16446
e4b2b4a8
JK
16447 static inline void qdisc_run_end(struct Qdisc *qdisc)
16448 {
1a6e0f06 16449+#ifdef CONFIG_PREEMPT_RT_BASE
e4b2b4a8 16450+ write_sequnlock(&qdisc->running);
1a6e0f06 16451+#else
e4b2b4a8 16452 write_seqcount_end(&qdisc->running);
1a6e0f06 16453+#endif
e4b2b4a8 16454 }
1a6e0f06 16455
e4b2b4a8 16456 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
b3bbd485 16457@@ -337,7 +352,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
e4b2b4a8
JK
16458 return qdisc_lock(root);
16459 }
1a6e0f06 16460
e4b2b4a8
JK
16461-static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
16462+static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
16463 {
16464 struct Qdisc *root = qdisc_root_sleeping(qdisc);
1a6e0f06 16465
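
The writer-side helpers keep their calling convention: qdisc_run_begin() either claims the qdisc or reports that another CPU already runs it, and qdisc_run_end() releases it; only the implementation differs on RT (try_write_seqlock()/write_sequnlock() instead of the raw seqcount operations). Caller sketch (function invented):

	static void example_try_transmit(struct Qdisc *q)
	{
		if (!qdisc_run_begin(q))
			return;			/* someone else is already running it */
		/* ... dequeue packets and hand them to the driver ... */
		qdisc_run_end(q);
	}
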
b3bbd485
JK
16466diff --git a/include/net/xfrm.h b/include/net/xfrm.h
16467index db99efb2d1d0..a7b95ffbbf8b 100644
16468--- a/include/net/xfrm.h
16469+++ b/include/net/xfrm.h
16470@@ -217,7 +217,7 @@ struct xfrm_state {
e4b2b4a8
JK
16471 struct xfrm_stats stats;
16472
16473 struct xfrm_lifetime_cur curlft;
16474- struct tasklet_hrtimer mtimer;
16475+ struct hrtimer mtimer;
16476
16477 struct xfrm_state_offload xso;
16478
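
With the tasklet_hrtimer gone, the xfrm state timer becomes a plain hrtimer; the corresponding xfrm_state.c hunk elsewhere in this patch is expected to arm it in a softirq-expiry mode, roughly along these lines (handler name assumed from that hunk, the *_SOFT mode constants are the ones visible in the trace hunk below):

	static void example_init_state_timer(struct xfrm_state *x)
	{
		hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT);
		x->mtimer.function = xfrm_timer_handler;	/* existing expiry handler */
	}
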
b3bbd485
JK
16479diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
16480index c6f728037c53..a57e4ee989d6 100644
16481--- a/include/trace/events/timer.h
16482+++ b/include/trace/events/timer.h
16483@@ -148,7 +148,11 @@ DEFINE_EVENT(timer_class, timer_cancel,
e4b2b4a8
JK
16484 { HRTIMER_MODE_ABS, "ABS" }, \
16485 { HRTIMER_MODE_REL, "REL" }, \
16486 { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \
16487- { HRTIMER_MODE_REL_PINNED, "REL|PINNED" })
16488+ { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \
16489+ { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \
16490+ { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \
16491+ { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \
16492+ { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" })
1a6e0f06 16493
e4b2b4a8
JK
16494 /**
16495 * hrtimer_init - called when the hrtimer is initialized
b3bbd485 16496@@ -186,15 +190,16 @@ TRACE_EVENT(hrtimer_init,
e4b2b4a8
JK
16497 */
16498 TRACE_EVENT(hrtimer_start,
16499
16500- TP_PROTO(struct hrtimer *hrtimer),
16501+ TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),
16502
16503- TP_ARGS(hrtimer),
16504+ TP_ARGS(hrtimer, mode),
16505
16506 TP_STRUCT__entry(
16507 __field( void *, hrtimer )
16508 __field( void *, function )
16509 __field( s64, expires )
16510 __field( s64, softexpires )
16511+ __field( enum hrtimer_mode, mode )
16512 ),
16513
16514 TP_fast_assign(
b3bbd485 16515@@ -202,12 +207,14 @@ TRACE_EVENT(hrtimer_start,
e4b2b4a8
JK
16516 __entry->function = hrtimer->function;
16517 __entry->expires = hrtimer_get_expires(hrtimer);
16518 __entry->softexpires = hrtimer_get_softexpires(hrtimer);
16519+ __entry->mode = mode;
16520 ),
16521
16522- TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu",
16523- __entry->hrtimer, __entry->function,
16524+ TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu "
16525+ "mode=%s", __entry->hrtimer, __entry->function,
16526 (unsigned long long) __entry->expires,
16527- (unsigned long long) __entry->softexpires)
16528+ (unsigned long long) __entry->softexpires,
16529+ decode_hrtimer_mode(__entry->mode))
16530 );
1a6e0f06 16531
e4b2b4a8 16532 /**
b3bbd485
JK
16533diff --git a/init/Kconfig b/init/Kconfig
16534index 46075327c165..a7aff2c1a203 100644
16535--- a/init/Kconfig
16536+++ b/init/Kconfig
16537@@ -744,6 +744,7 @@ config CFS_BANDWIDTH
e4b2b4a8
JK
16538 config RT_GROUP_SCHED
16539 bool "Group scheduling for SCHED_RR/FIFO"
16540 depends on CGROUP_SCHED
16541+ depends on !PREEMPT_RT_FULL
16542 default n
16543 help
16544 This feature lets you explicitly allocate real CPU bandwidth
b3bbd485 16545@@ -1533,6 +1534,7 @@ choice
1a6e0f06 16546
e4b2b4a8
JK
16547 config SLAB
16548 bool "SLAB"
16549+ depends on !PREEMPT_RT_FULL
16550 select HAVE_HARDENED_USERCOPY_ALLOCATOR
16551 help
16552 The regular slab allocator that is established and known to work
b3bbd485 16553@@ -1553,6 +1555,7 @@ config SLUB
e4b2b4a8
JK
16554 config SLOB
16555 depends on EXPERT
16556 bool "SLOB (Simple Allocator)"
16557+ depends on !PREEMPT_RT_FULL
16558 help
16559 SLOB replaces the stock allocator with a drastically simpler
16560 allocator. SLOB is generally more space efficient but
b3bbd485 16561@@ -1594,7 +1597,7 @@ config SLAB_FREELIST_HARDENED
1a6e0f06 16562
e4b2b4a8
JK
16563 config SLUB_CPU_PARTIAL
16564 default y
16565- depends on SLUB && SMP
16566+ depends on SLUB && SMP && !PREEMPT_RT_FULL
16567 bool "SLUB per cpu partial cache"
16568 help
16569 Per cpu partial caches accellerate objects allocation and freeing
b3bbd485
JK
16570diff --git a/init/Makefile b/init/Makefile
16571index 1dbb23787290..eabf3f1b14be 100644
16572--- a/init/Makefile
16573+++ b/init/Makefile
16574@@ -36,4 +36,4 @@ silent_chk_compile.h = :
16575 include/generated/compile.h: FORCE
16576 @$($(quiet)chk_compile.h)
16577 $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
16578- "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
16579+ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
16580diff --git a/init/main.c b/init/main.c
16581index c4a45145e102..c86f3d3b9a72 100644
16582--- a/init/main.c
16583+++ b/init/main.c
16584@@ -543,6 +543,7 @@ asmlinkage __visible void __init start_kernel(void)
e4b2b4a8
JK
16585 setup_command_line(command_line);
16586 setup_nr_cpu_ids();
16587 setup_per_cpu_areas();
16588+ softirq_early_init();
16589 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
16590 boot_cpu_hotplug_init();
1a6e0f06 16591
b3bbd485
JK
16592diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
16593index 84d882f3e299..af27c4000812 100644
16594--- a/kernel/Kconfig.locks
16595+++ b/kernel/Kconfig.locks
16596@@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
16597
16598 config MUTEX_SPIN_ON_OWNER
16599 def_bool y
16600- depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
16601+ depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16602
16603 config RWSEM_SPIN_ON_OWNER
16604 def_bool y
16605- depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
16606+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16607
16608 config LOCK_SPIN_ON_OWNER
16609 def_bool y
16610diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
16611index 3f9c97419f02..11dbe26a8279 100644
16612--- a/kernel/Kconfig.preempt
16613+++ b/kernel/Kconfig.preempt
16614@@ -1,3 +1,16 @@
16615+config PREEMPT
16616+ bool
16617+ select PREEMPT_COUNT
16618+
16619+config PREEMPT_RT_BASE
16620+ bool
16621+ select PREEMPT
16622+
16623+config HAVE_PREEMPT_LAZY
16624+ bool
16625+
16626+config PREEMPT_LAZY
16627+ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
16628
16629 choice
16630 prompt "Preemption Model"
16631@@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
16632
16633 Select this if you are building a kernel for a desktop system.
16634
16635-config PREEMPT
16636+config PREEMPT__LL
16637 bool "Preemptible Kernel (Low-Latency Desktop)"
16638- select PREEMPT_COUNT
16639+ select PREEMPT
16640 select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
16641 help
16642 This option reduces the latency of the kernel by making
16643@@ -52,6 +65,22 @@ config PREEMPT
16644 embedded system with latency requirements in the milliseconds
16645 range.
16646
16647+config PREEMPT_RTB
16648+ bool "Preemptible Kernel (Basic RT)"
16649+ select PREEMPT_RT_BASE
16650+ help
16651+ This option is basically the same as (Low-Latency Desktop) but
16652+ enables changes which are preliminary for the full preemptible
16653+ RT kernel.
16654+
16655+config PREEMPT_RT_FULL
16656+ bool "Fully Preemptible Kernel (RT)"
16657+ depends on IRQ_FORCED_THREADING
16658+ select PREEMPT_RT_BASE
16659+ select PREEMPT_RCU
16660+ help
16661+ All and everything
16662+
16663 endchoice
16664
16665 config PREEMPT_COUNT
16666diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
5dd41b01 16667index 3fc11b8851ac..a04c3aded76b 100644
b3bbd485
JK
16668--- a/kernel/cgroup/cgroup.c
16669+++ b/kernel/cgroup/cgroup.c
5dd41b01 16670@@ -4515,10 +4515,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
e4b2b4a8 16671 queue_work(cgroup_destroy_wq, &css->destroy_work);
1a6e0f06
JK
16672 }
16673
e4b2b4a8
JK
16674-static void css_release_work_fn(struct work_struct *work)
16675+static void css_release_work_fn(struct swork_event *sev)
1a6e0f06 16676 {
e4b2b4a8
JK
16677 struct cgroup_subsys_state *css =
16678- container_of(work, struct cgroup_subsys_state, destroy_work);
16679+ container_of(sev, struct cgroup_subsys_state, destroy_swork);
16680 struct cgroup_subsys *ss = css->ss;
16681 struct cgroup *cgrp = css->cgroup;
1a6e0f06 16682
5dd41b01 16683@@ -4569,8 +4569,8 @@ static void css_release(struct percpu_ref *ref)
e4b2b4a8
JK
16684 struct cgroup_subsys_state *css =
16685 container_of(ref, struct cgroup_subsys_state, refcnt);
1a6e0f06 16686
e4b2b4a8
JK
16687- INIT_WORK(&css->destroy_work, css_release_work_fn);
16688- queue_work(cgroup_destroy_wq, &css->destroy_work);
16689+ INIT_SWORK(&css->destroy_swork, css_release_work_fn);
16690+ swork_queue(&css->destroy_swork);
1a6e0f06
JK
16691 }
16692
e4b2b4a8 16693 static void init_and_link_css(struct cgroup_subsys_state *css,
5dd41b01 16694@@ -5276,6 +5276,7 @@ static int __init cgroup_wq_init(void)
e4b2b4a8
JK
16695 */
16696 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
16697 BUG_ON(!cgroup_destroy_wq);
16698+ BUG_ON(swork_get());
16699 return 0;
16700 }
16701 core_initcall(cgroup_wq_init);
b3bbd485
JK
16702diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
16703index 4657e2924ecb..bda2af78277a 100644
16704--- a/kernel/cgroup/cpuset.c
16705+++ b/kernel/cgroup/cpuset.c
16706@@ -288,7 +288,7 @@ static struct cpuset top_cpuset = {
1a6e0f06 16707 */
1a6e0f06 16708
e4b2b4a8
JK
16709 static DEFINE_MUTEX(cpuset_mutex);
16710-static DEFINE_SPINLOCK(callback_lock);
16711+static DEFINE_RAW_SPINLOCK(callback_lock);
1a6e0f06 16712
e4b2b4a8 16713 static struct workqueue_struct *cpuset_migrate_mm_wq;
1a6e0f06 16714
b3bbd485 16715@@ -926,9 +926,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
e4b2b4a8
JK
16716 continue;
16717 rcu_read_unlock();
1a6e0f06 16718
e4b2b4a8
JK
16719- spin_lock_irq(&callback_lock);
16720+ raw_spin_lock_irq(&callback_lock);
16721 cpumask_copy(cp->effective_cpus, new_cpus);
16722- spin_unlock_irq(&callback_lock);
16723+ raw_spin_unlock_irq(&callback_lock);
1a6e0f06 16724
e4b2b4a8
JK
16725 WARN_ON(!is_in_v2_mode() &&
16726 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
b3bbd485 16727@@ -993,9 +993,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
e4b2b4a8
JK
16728 if (retval < 0)
16729 return retval;
1a6e0f06 16730
e4b2b4a8
JK
16731- spin_lock_irq(&callback_lock);
16732+ raw_spin_lock_irq(&callback_lock);
16733 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
16734- spin_unlock_irq(&callback_lock);
16735+ raw_spin_unlock_irq(&callback_lock);
16736
16737 /* use trialcs->cpus_allowed as a temp variable */
16738 update_cpumasks_hier(cs, trialcs->cpus_allowed);
b3bbd485 16739@@ -1179,9 +1179,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
e4b2b4a8
JK
16740 continue;
16741 rcu_read_unlock();
16742
16743- spin_lock_irq(&callback_lock);
16744+ raw_spin_lock_irq(&callback_lock);
16745 cp->effective_mems = *new_mems;
16746- spin_unlock_irq(&callback_lock);
16747+ raw_spin_unlock_irq(&callback_lock);
16748
16749 WARN_ON(!is_in_v2_mode() &&
16750 !nodes_equal(cp->mems_allowed, cp->effective_mems));
b3bbd485 16751@@ -1249,9 +1249,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
e4b2b4a8
JK
16752 if (retval < 0)
16753 goto done;
16754
16755- spin_lock_irq(&callback_lock);
16756+ raw_spin_lock_irq(&callback_lock);
16757 cs->mems_allowed = trialcs->mems_allowed;
16758- spin_unlock_irq(&callback_lock);
16759+ raw_spin_unlock_irq(&callback_lock);
16760
16761 /* use trialcs->mems_allowed as a temp variable */
16762 update_nodemasks_hier(cs, &trialcs->mems_allowed);
b3bbd485 16763@@ -1342,9 +1342,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
e4b2b4a8
JK
16764 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
16765 || (is_spread_page(cs) != is_spread_page(trialcs)));
16766
16767- spin_lock_irq(&callback_lock);
16768+ raw_spin_lock_irq(&callback_lock);
16769 cs->flags = trialcs->flags;
16770- spin_unlock_irq(&callback_lock);
16771+ raw_spin_unlock_irq(&callback_lock);
16772
16773 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
16774 rebuild_sched_domains_locked();
b3bbd485 16775@@ -1759,7 +1759,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
e4b2b4a8
JK
16776 cpuset_filetype_t type = seq_cft(sf)->private;
16777 int ret = 0;
1a6e0f06 16778
e4b2b4a8
JK
16779- spin_lock_irq(&callback_lock);
16780+ raw_spin_lock_irq(&callback_lock);
1a6e0f06 16781
e4b2b4a8
JK
16782 switch (type) {
16783 case FILE_CPULIST:
b3bbd485 16784@@ -1778,7 +1778,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
e4b2b4a8
JK
16785 ret = -EINVAL;
16786 }
1a6e0f06 16787
e4b2b4a8
JK
16788- spin_unlock_irq(&callback_lock);
16789+ raw_spin_unlock_irq(&callback_lock);
16790 return ret;
1a6e0f06
JK
16791 }
16792
b3bbd485 16793@@ -1993,12 +1993,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1a6e0f06 16794
e4b2b4a8 16795 cpuset_inc();
1a6e0f06 16796
e4b2b4a8
JK
16797- spin_lock_irq(&callback_lock);
16798+ raw_spin_lock_irq(&callback_lock);
16799 if (is_in_v2_mode()) {
16800 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
16801 cs->effective_mems = parent->effective_mems;
16802 }
16803- spin_unlock_irq(&callback_lock);
16804+ raw_spin_unlock_irq(&callback_lock);
16805
16806 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
16807 goto out_unlock;
b3bbd485 16808@@ -2025,12 +2025,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
e4b2b4a8
JK
16809 }
16810 rcu_read_unlock();
16811
16812- spin_lock_irq(&callback_lock);
16813+ raw_spin_lock_irq(&callback_lock);
16814 cs->mems_allowed = parent->mems_allowed;
16815 cs->effective_mems = parent->mems_allowed;
16816 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
16817 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
16818- spin_unlock_irq(&callback_lock);
16819+ raw_spin_unlock_irq(&callback_lock);
16820 out_unlock:
16821 mutex_unlock(&cpuset_mutex);
16822 return 0;
b3bbd485 16823@@ -2069,7 +2069,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
e4b2b4a8 16824 static void cpuset_bind(struct cgroup_subsys_state *root_css)
1a6e0f06 16825 {
e4b2b4a8
JK
16826 mutex_lock(&cpuset_mutex);
16827- spin_lock_irq(&callback_lock);
16828+ raw_spin_lock_irq(&callback_lock);
16829
16830 if (is_in_v2_mode()) {
16831 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
b3bbd485 16832@@ -2080,7 +2080,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
e4b2b4a8
JK
16833 top_cpuset.mems_allowed = top_cpuset.effective_mems;
16834 }
16835
16836- spin_unlock_irq(&callback_lock);
16837+ raw_spin_unlock_irq(&callback_lock);
16838 mutex_unlock(&cpuset_mutex);
1a6e0f06
JK
16839 }
16840
b3bbd485 16841@@ -2094,7 +2094,7 @@ static void cpuset_fork(struct task_struct *task)
e4b2b4a8
JK
16842 if (task_css_is_root(task, cpuset_cgrp_id))
16843 return;
16844
16845- set_cpus_allowed_ptr(task, &current->cpus_allowed);
16846+ set_cpus_allowed_ptr(task, current->cpus_ptr);
16847 task->mems_allowed = current->mems_allowed;
1a6e0f06
JK
16848 }
16849
b3bbd485 16850@@ -2178,12 +2178,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
1a6e0f06 16851 {
e4b2b4a8 16852 bool is_empty;
1a6e0f06 16853
e4b2b4a8
JK
16854- spin_lock_irq(&callback_lock);
16855+ raw_spin_lock_irq(&callback_lock);
16856 cpumask_copy(cs->cpus_allowed, new_cpus);
16857 cpumask_copy(cs->effective_cpus, new_cpus);
16858 cs->mems_allowed = *new_mems;
16859 cs->effective_mems = *new_mems;
16860- spin_unlock_irq(&callback_lock);
16861+ raw_spin_unlock_irq(&callback_lock);
1a6e0f06 16862
e4b2b4a8
JK
16863 /*
16864 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
b3bbd485 16865@@ -2220,10 +2220,10 @@ hotplug_update_tasks(struct cpuset *cs,
e4b2b4a8
JK
16866 if (nodes_empty(*new_mems))
16867 *new_mems = parent_cs(cs)->effective_mems;
1a6e0f06 16868
e4b2b4a8
JK
16869- spin_lock_irq(&callback_lock);
16870+ raw_spin_lock_irq(&callback_lock);
16871 cpumask_copy(cs->effective_cpus, new_cpus);
16872 cs->effective_mems = *new_mems;
16873- spin_unlock_irq(&callback_lock);
16874+ raw_spin_unlock_irq(&callback_lock);
1a6e0f06 16875
e4b2b4a8
JK
16876 if (cpus_updated)
16877 update_tasks_cpumask(cs);
b3bbd485 16878@@ -2316,21 +2316,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
1a6e0f06 16879
e4b2b4a8
JK
16880 /* synchronize cpus_allowed to cpu_active_mask */
16881 if (cpus_updated) {
16882- spin_lock_irq(&callback_lock);
16883+ raw_spin_lock_irq(&callback_lock);
16884 if (!on_dfl)
16885 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
16886 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
16887- spin_unlock_irq(&callback_lock);
16888+ raw_spin_unlock_irq(&callback_lock);
16889 /* we don't mess with cpumasks of tasks in top_cpuset */
16890 }
1a6e0f06 16891
e4b2b4a8
JK
16892 /* synchronize mems_allowed to N_MEMORY */
16893 if (mems_updated) {
16894- spin_lock_irq(&callback_lock);
16895+ raw_spin_lock_irq(&callback_lock);
16896 if (!on_dfl)
16897 top_cpuset.mems_allowed = new_mems;
16898 top_cpuset.effective_mems = new_mems;
16899- spin_unlock_irq(&callback_lock);
16900+ raw_spin_unlock_irq(&callback_lock);
16901 update_tasks_nodemask(&top_cpuset);
16902 }
1a6e0f06 16903
b3bbd485 16904@@ -2429,11 +2429,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
e4b2b4a8
JK
16905 {
16906 unsigned long flags;
1a6e0f06 16907
e4b2b4a8
JK
16908- spin_lock_irqsave(&callback_lock, flags);
16909+ raw_spin_lock_irqsave(&callback_lock, flags);
16910 rcu_read_lock();
16911 guarantee_online_cpus(task_cs(tsk), pmask);
16912 rcu_read_unlock();
16913- spin_unlock_irqrestore(&callback_lock, flags);
16914+ raw_spin_unlock_irqrestore(&callback_lock, flags);
16915 }
1a6e0f06 16916
e4b2b4a8 16917 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
b3bbd485 16918@@ -2481,11 +2481,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
e4b2b4a8
JK
16919 nodemask_t mask;
16920 unsigned long flags;
1a6e0f06 16921
e4b2b4a8
JK
16922- spin_lock_irqsave(&callback_lock, flags);
16923+ raw_spin_lock_irqsave(&callback_lock, flags);
16924 rcu_read_lock();
16925 guarantee_online_mems(task_cs(tsk), &mask);
16926 rcu_read_unlock();
16927- spin_unlock_irqrestore(&callback_lock, flags);
16928+ raw_spin_unlock_irqrestore(&callback_lock, flags);
1a6e0f06 16929
e4b2b4a8 16930 return mask;
1a6e0f06 16931 }
b3bbd485 16932@@ -2577,14 +2577,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
e4b2b4a8
JK
16933 return true;
16934
16935 /* Not hardwall and node outside mems_allowed: scan up cpusets */
16936- spin_lock_irqsave(&callback_lock, flags);
16937+ raw_spin_lock_irqsave(&callback_lock, flags);
16938
16939 rcu_read_lock();
16940 cs = nearest_hardwall_ancestor(task_cs(current));
16941 allowed = node_isset(node, cs->mems_allowed);
16942 rcu_read_unlock();
1a6e0f06 16943
e4b2b4a8
JK
16944- spin_unlock_irqrestore(&callback_lock, flags);
16945+ raw_spin_unlock_irqrestore(&callback_lock, flags);
16946 return allowed;
1a6e0f06
JK
16947 }
16948
b3bbd485
JK
16949diff --git a/kernel/cpu.c b/kernel/cpu.c
16950index f3f389e33343..7d777b62e4eb 100644
16951--- a/kernel/cpu.c
16952+++ b/kernel/cpu.c
16953@@ -74,6 +74,11 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
e4b2b4a8
JK
16954 .fail = CPUHP_INVALID,
16955 };
16956
16957+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PREEMPT_RT_FULL)
16958+static DEFINE_PER_CPU(struct rt_rw_lock, cpuhp_pin_lock) = \
16959+ __RWLOCK_RT_INITIALIZER(cpuhp_pin_lock);
1a6e0f06
JK
16960+#endif
16961+
e4b2b4a8
JK
16962 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
16963 static struct lockdep_map cpuhp_state_up_map =
16964 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
b3bbd485 16965@@ -287,6 +292,55 @@ static int cpu_hotplug_disabled;
e4b2b4a8
JK
16966
16967 #ifdef CONFIG_HOTPLUG_CPU
16968
16969+/**
16970+ * pin_current_cpu - Prevent the current cpu from being unplugged
1a6e0f06 16971+ */
e4b2b4a8 16972+void pin_current_cpu(void)
1a6e0f06 16973+{
e4b2b4a8
JK
16974+#ifdef CONFIG_PREEMPT_RT_FULL
16975+ struct rt_rw_lock *cpuhp_pin;
16976+ unsigned int cpu;
16977+ int ret;
1a6e0f06 16978+
e4b2b4a8
JK
16979+again:
16980+ cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
16981+ ret = __read_rt_trylock(cpuhp_pin);
16982+ if (ret) {
16983+ current->pinned_on_cpu = smp_processor_id();
16984+ return;
16985+ }
16986+ cpu = smp_processor_id();
16987+ preempt_lazy_enable();
16988+ preempt_enable();
1a6e0f06 16989+
e4b2b4a8 16990+ __read_rt_lock(cpuhp_pin);
1a6e0f06 16991+
e4b2b4a8
JK
16992+ preempt_disable();
16993+ preempt_lazy_disable();
16994+ if (cpu != smp_processor_id()) {
16995+ __read_rt_unlock(cpuhp_pin);
16996+ goto again;
16997+ }
16998+ current->pinned_on_cpu = cpu;
16999+#endif
17000+}
1a6e0f06 17001+
e4b2b4a8
JK
17002+/**
17003+ * unpin_current_cpu - Allow unplug of current cpu
17004+ */
17005+void unpin_current_cpu(void)
17006+{
17007+#ifdef CONFIG_PREEMPT_RT_FULL
17008+ struct rt_rw_lock *cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock);
1a6e0f06 17009+
e4b2b4a8
JK
17010+ if (WARN_ON(current->pinned_on_cpu != smp_processor_id()))
17011+ cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, current->pinned_on_cpu);
1a6e0f06 17012+
e4b2b4a8
JK
17013+ current->pinned_on_cpu = -1;
17014+ __read_rt_unlock(cpuhp_pin);
17015+#endif
17016+}
1a6e0f06 17017+
e4b2b4a8
JK
17018 DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
17019
17020 void cpus_read_lock(void)
b3bbd485 17021@@ -843,6 +897,9 @@ static int take_cpu_down(void *_param)
e4b2b4a8
JK
17022
17023 static int takedown_cpu(unsigned int cpu)
17024 {
17025+#ifdef CONFIG_PREEMPT_RT_FULL
17026+ struct rt_rw_lock *cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, cpu);
17027+#endif
17028 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
17029 int err;
17030
b3bbd485 17031@@ -855,11 +912,18 @@ static int takedown_cpu(unsigned int cpu)
e4b2b4a8
JK
17032 */
17033 irq_lock_sparse();
17034
17035+#ifdef CONFIG_PREEMPT_RT_FULL
17036+ __write_rt_lock(cpuhp_pin);
1a6e0f06
JK
17037+#endif
17038+
e4b2b4a8
JK
17039 /*
17040 * So now all preempt/rcu users must observe !cpu_active().
17041 */
17042 err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
17043 if (err) {
17044+#ifdef CONFIG_PREEMPT_RT_FULL
17045+ __write_rt_unlock(cpuhp_pin);
1a6e0f06 17046+#endif
e4b2b4a8
JK
17047 /* CPU refused to die */
17048 irq_unlock_sparse();
17049 /* Unpark the hotplug thread so we can rollback there */
b3bbd485 17050@@ -878,6 +942,9 @@ static int takedown_cpu(unsigned int cpu)
e4b2b4a8
JK
17051 wait_for_ap_thread(st, false);
17052 BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
17053
17054+#ifdef CONFIG_PREEMPT_RT_FULL
17055+ __write_rt_unlock(cpuhp_pin);
1a6e0f06 17056+#endif
e4b2b4a8
JK
17057 /* Interrupts are moved away from the dying cpu, reenable alloc/free */
17058 irq_unlock_sparse();
1a6e0f06 17059
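
pin_current_cpu()/unpin_current_cpu() give a task a per-CPU reader lock against CPU unplug: takedown_cpu() now takes the same rt_rw_lock for writing, so a pinned task holds off the hotplug path instead of being migrated in the middle of per-CPU work. Hedged usage sketch (the function is invented; in this patch the expected caller is the migrate_disable()/migrate_enable() machinery):

	static void touch_strictly_percpu_state(void)
	{
		pin_current_cpu();	/* hold off takedown_cpu() on this CPU */
		/* ... access per-CPU state that must not see the CPU go away ... */
		unpin_current_cpu();
	}
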
b3bbd485
JK
17060diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
17061index ed5d34925ad0..c0d4c24fc241 100644
17062--- a/kernel/debug/kdb/kdb_io.c
17063+++ b/kernel/debug/kdb/kdb_io.c
17064@@ -854,9 +854,11 @@ int kdb_printf(const char *fmt, ...)
e4b2b4a8
JK
17065 va_list ap;
17066 int r;
17067
17068+ kdb_trap_printk++;
17069 va_start(ap, fmt);
17070 r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
17071 va_end(ap);
17072+ kdb_trap_printk--;
17073
17074 return r;
17075 }
b3bbd485 17076diff --git a/kernel/events/core.c b/kernel/events/core.c
5dd41b01 17077index 4dbce29a9313..de3d23bae9bf 100644
b3bbd485
JK
17078--- a/kernel/events/core.c
17079+++ b/kernel/events/core.c
17080@@ -1065,7 +1065,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
e4b2b4a8
JK
17081 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
17082
17083 raw_spin_lock_init(&cpuctx->hrtimer_lock);
17084- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
17085+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
17086 timer->function = perf_mux_hrtimer_handler;
17087 }
17088
5dd41b01 17089@@ -8760,7 +8760,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
e4b2b4a8
JK
17090 if (!is_sampling_event(event))
17091 return;
17092
17093- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17094+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
17095 hwc->hrtimer.function = perf_swevent_hrtimer;
17096
17097 /*
b3bbd485
JK
17098diff --git a/kernel/exit.c b/kernel/exit.c
17099index e3a08761eb40..26f3b352b37a 100644
17100--- a/kernel/exit.c
17101+++ b/kernel/exit.c
17102@@ -159,7 +159,7 @@ static void __exit_signal(struct task_struct *tsk)
e4b2b4a8
JK
17103 * Do this under ->siglock, we can race with another thread
17104 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
17105 */
17106- flush_sigqueue(&tsk->pending);
17107+ flush_task_sigqueue(tsk);
17108 tsk->sighand = NULL;
17109 spin_unlock(&sighand->siglock);
17110
b3bbd485
JK
17111diff --git a/kernel/fork.c b/kernel/fork.c
17112index 6a219fea4926..bc849ac60aa6 100644
17113--- a/kernel/fork.c
17114+++ b/kernel/fork.c
e4b2b4a8
JK
17115@@ -40,6 +40,7 @@
17116 #include <linux/hmm.h>
17117 #include <linux/fs.h>
17118 #include <linux/mm.h>
17119+#include <linux/kprobes.h>
17120 #include <linux/vmacache.h>
17121 #include <linux/nsproxy.h>
17122 #include <linux/capability.h>
b3bbd485 17123@@ -407,13 +408,24 @@ static inline void put_signal_struct(struct signal_struct *sig)
e4b2b4a8
JK
17124 if (atomic_dec_and_test(&sig->sigcnt))
17125 free_signal_struct(sig);
17126 }
17127-
17128+#ifdef CONFIG_PREEMPT_RT_BASE
17129+static
1a6e0f06 17130+#endif
e4b2b4a8
JK
17131 void __put_task_struct(struct task_struct *tsk)
17132 {
17133 WARN_ON(!tsk->exit_state);
17134 WARN_ON(atomic_read(&tsk->usage));
17135 WARN_ON(tsk == current);
17136
17137+ /*
17138+ * Remove function-return probe instances associated with this
17139+ * task and put them back on the free list.
17140+ */
17141+ kprobe_flush_task(tsk);
1a6e0f06 17142+
e4b2b4a8
JK
17143+ /* Task is done with its stack. */
17144+ put_task_stack(tsk);
17145+
17146 cgroup_free(tsk);
17147 task_numa_free(tsk);
17148 security_task_free(tsk);
b3bbd485 17149@@ -424,7 +436,18 @@ void __put_task_struct(struct task_struct *tsk)
e4b2b4a8
JK
17150 if (!profile_handoff_task(tsk))
17151 free_task(tsk);
17152 }
17153+#ifndef CONFIG_PREEMPT_RT_BASE
17154 EXPORT_SYMBOL_GPL(__put_task_struct);
1a6e0f06 17155+#else
e4b2b4a8
JK
17156+void __put_task_struct_cb(struct rcu_head *rhp)
17157+{
17158+ struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
1a6e0f06 17159+
e4b2b4a8
JK
17160+ __put_task_struct(tsk);
17161+
17162+}
17163+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
17164+#endif
17165
17166 void __init __weak arch_task_cache_init(void) { }
17167
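
On PREEMPT_RT_BASE the final task_struct release is pushed through RCU so it never runs from a preempt-disabled context: __put_task_struct() becomes static and the exported callback above is what the matching put_task_struct() change elsewhere in this patch is expected to hook up, roughly like this sketch:

	static inline void put_task_struct_sketch(struct task_struct *t)
	{
		if (atomic_dec_and_test(&t->usage))
			call_rcu(&t->put_rcu, __put_task_struct_cb);
	}
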
b3bbd485 17168@@ -563,7 +586,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
e4b2b4a8
JK
17169 #ifdef CONFIG_CC_STACKPROTECTOR
17170 tsk->stack_canary = get_random_canary();
1a6e0f06 17171 #endif
e4b2b4a8
JK
17172-
17173+ if (orig->cpus_ptr == &orig->cpus_mask)
17174+ tsk->cpus_ptr = &tsk->cpus_mask;
17175 /*
17176 * One for us, one for whoever does the "release_task()" (usually
17177 * parent)
b3bbd485 17178@@ -575,6 +599,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
e4b2b4a8
JK
17179 tsk->splice_pipe = NULL;
17180 tsk->task_frag.page = NULL;
17181 tsk->wake_q.next = NULL;
17182+ tsk->wake_q_sleeper.next = NULL;
1a6e0f06 17183
e4b2b4a8
JK
17184 account_kernel_stack(tsk, 1);
17185
b3bbd485 17186@@ -915,6 +940,19 @@ void __mmdrop(struct mm_struct *mm)
e4b2b4a8
JK
17187 }
17188 EXPORT_SYMBOL_GPL(__mmdrop);
17189
17190+#ifdef CONFIG_PREEMPT_RT_BASE
17191+/*
17192+ * RCU callback for delayed mm drop. Not strictly rcu, but we don't
17193+ * want another facility to make this work.
17194+ */
17195+void __mmdrop_delayed(struct rcu_head *rhp)
17196+{
17197+ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
17198+
17199+ __mmdrop(mm);
17200+}
17201+#endif
17202+
17203 static inline void __mmput(struct mm_struct *mm)
17204 {
17205 VM_BUG_ON(atomic_read(&mm->mm_users));
b3bbd485 17206@@ -1496,6 +1534,9 @@ static void rt_mutex_init_task(struct task_struct *p)
e4b2b4a8
JK
17207 */
17208 static void posix_cpu_timers_init(struct task_struct *tsk)
17209 {
17210+#ifdef CONFIG_PREEMPT_RT_BASE
17211+ tsk->posix_timer_list = NULL;
17212+#endif
17213 tsk->cputime_expires.prof_exp = 0;
17214 tsk->cputime_expires.virt_exp = 0;
17215 tsk->cputime_expires.sched_exp = 0;
b3bbd485 17216@@ -1648,6 +1689,7 @@ static __latent_entropy struct task_struct *copy_process(
e4b2b4a8
JK
17217 spin_lock_init(&p->alloc_lock);
17218
17219 init_sigpending(&p->pending);
17220+ p->sigqueue_cache = NULL;
17221
17222 p->utime = p->stime = p->gtime = 0;
17223 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
b3bbd485
JK
17224diff --git a/kernel/futex.c b/kernel/futex.c
17225index 046cd780d057..2ba7fb04a107 100644
17226--- a/kernel/futex.c
17227+++ b/kernel/futex.c
17228@@ -936,7 +936,9 @@ void exit_pi_state_list(struct task_struct *curr)
e4b2b4a8
JK
17229 if (head->next != next) {
17230 /* retain curr->pi_lock for the loop invariant */
17231 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
17232+ raw_spin_unlock_irq(&curr->pi_lock);
17233 spin_unlock(&hb->lock);
17234+ raw_spin_lock_irq(&curr->pi_lock);
17235 put_pi_state(pi_state);
17236 continue;
17237 }
b3bbd485 17238@@ -1430,6 +1432,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
e4b2b4a8
JK
17239 struct task_struct *new_owner;
17240 bool postunlock = false;
17241 DEFINE_WAKE_Q(wake_q);
17242+ DEFINE_WAKE_Q(wake_sleeper_q);
17243 int ret = 0;
17244
17245 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
b3bbd485 17246@@ -1491,13 +1494,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
e4b2b4a8
JK
17247 pi_state->owner = new_owner;
17248 raw_spin_unlock(&new_owner->pi_lock);
17249
17250- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
1a6e0f06 17251-
e4b2b4a8
JK
17252+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
17253+ &wake_sleeper_q);
17254 out_unlock:
17255 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
17256
17257 if (postunlock)
17258- rt_mutex_postunlock(&wake_q);
17259+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
17260
17261 return ret;
17262 }
b3bbd485 17263@@ -2104,6 +2107,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
e4b2b4a8
JK
17264 requeue_pi_wake_futex(this, &key2, hb2);
17265 drop_count++;
17266 continue;
17267+ } else if (ret == -EAGAIN) {
17268+ /*
17269+ * Waiter was woken by timeout or
17270+ * signal and has set pi_blocked_on to
17271+ * PI_WAKEUP_INPROGRESS before we
17272+ * tried to enqueue it on the rtmutex.
17273+ */
17274+ this->pi_state = NULL;
17275+ put_pi_state(pi_state);
17276+ continue;
17277 } else if (ret) {
17278 /*
17279 * rt_mutex_start_proxy_lock() detected a
b3bbd485 17280@@ -2642,10 +2655,9 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
e4b2b4a8
JK
17281 if (abs_time) {
17282 to = &timeout;
17283
17284- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
17285- CLOCK_REALTIME : CLOCK_MONOTONIC,
17286- HRTIMER_MODE_ABS);
17287- hrtimer_init_sleeper(to, current);
17288+ hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
17289+ CLOCK_REALTIME : CLOCK_MONOTONIC,
17290+ HRTIMER_MODE_ABS, current);
17291 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
17292 current->timer_slack_ns);
17293 }
b3bbd485 17294@@ -2744,9 +2756,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
e4b2b4a8
JK
17295
17296 if (time) {
17297 to = &timeout;
17298- hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
17299- HRTIMER_MODE_ABS);
17300- hrtimer_init_sleeper(to, current);
17301+ hrtimer_init_sleeper_on_stack(to, CLOCK_REALTIME,
17302+ HRTIMER_MODE_ABS, current);
17303 hrtimer_set_expires(&to->timer, *time);
17304 }
17305
b3bbd485 17306@@ -2801,7 +2812,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
e4b2b4a8
JK
17307 goto no_block;
17308 }
17309
17310- rt_mutex_init_waiter(&rt_waiter);
17311+ rt_mutex_init_waiter(&rt_waiter, false);
17312
17313 /*
17314 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
b3bbd485 17315@@ -2816,9 +2827,18 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
e4b2b4a8
JK
17316 * lock handoff sequence.
17317 */
17318 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
17319+ /*
17320+ * the migrate_disable() here disables migration in the in_atomic() fast
17321+ * path which is enabled again in the following spin_unlock(). We have
17322+ * one migrate_disable() pending in the slow-path which is reversed
17323+ * after the raw_spin_unlock_irq() where we leave the atomic context.
17324+ */
17325+ migrate_disable();
17326+
17327 spin_unlock(q.lock_ptr);
17328 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
17329 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
17330+ migrate_enable();
17331
17332 if (ret) {
17333 if (ret == 1)
b3bbd485 17334@@ -2965,11 +2985,21 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
e4b2b4a8
JK
17335 * observed.
17336 */
17337 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
17338+ /*
17339+ * Magic trickery for now to make the RT migrate disable
17340+ * logic happy. The following spin_unlock() happens with
17341+ * interrupts disabled so the internal migrate_enable()
17342+ * won't undo the migrate_disable() which was issued when
17343+ * locking hb->lock.
17344+ */
17345+ migrate_disable();
17346 spin_unlock(&hb->lock);
17347
17348 /* drops pi_state->pi_mutex.wait_lock */
17349 ret = wake_futex_pi(uaddr, uval, pi_state);
17350
17351+ migrate_enable();
17352+
17353 put_pi_state(pi_state);
17354
17355 /*
b3bbd485 17356@@ -3127,7 +3157,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
e4b2b4a8
JK
17357 struct hrtimer_sleeper timeout, *to = NULL;
17358 struct futex_pi_state *pi_state = NULL;
17359 struct rt_mutex_waiter rt_waiter;
17360- struct futex_hash_bucket *hb;
17361+ struct futex_hash_bucket *hb, *hb2;
17362 union futex_key key2 = FUTEX_KEY_INIT;
17363 struct futex_q q = futex_q_init;
17364 int res, ret;
b3bbd485 17365@@ -3143,10 +3173,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
e4b2b4a8
JK
17366
17367 if (abs_time) {
17368 to = &timeout;
17369- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
17370- CLOCK_REALTIME : CLOCK_MONOTONIC,
17371- HRTIMER_MODE_ABS);
17372- hrtimer_init_sleeper(to, current);
17373+ hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
17374+ CLOCK_REALTIME : CLOCK_MONOTONIC,
17375+ HRTIMER_MODE_ABS, current);
17376 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
17377 current->timer_slack_ns);
17378 }
b3bbd485 17379@@ -3155,7 +3184,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
e4b2b4a8
JK
17380 * The waiter is allocated on our stack, manipulated by the requeue
17381 * code while we sleep on uaddr.
17382 */
17383- rt_mutex_init_waiter(&rt_waiter);
17384+ rt_mutex_init_waiter(&rt_waiter, false);
1a6e0f06 17385
e4b2b4a8
JK
17386 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
17387 if (unlikely(ret != 0))
b3bbd485 17388@@ -3186,20 +3215,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
e4b2b4a8
JK
17389 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
17390 futex_wait_queue_me(hb, &q, to);
1a6e0f06 17391
e4b2b4a8
JK
17392- spin_lock(&hb->lock);
17393- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17394- spin_unlock(&hb->lock);
17395- if (ret)
17396- goto out_put_keys;
17397+ /*
17398+ * On RT we must avoid races with requeue and trying to block
17399+ * on two mutexes (hb->lock and uaddr2's rtmutex) by
17400+ * serializing access to pi_blocked_on with pi_lock.
17401+ */
17402+ raw_spin_lock_irq(&current->pi_lock);
17403+ if (current->pi_blocked_on) {
17404+ /*
17405+ * We have been requeued or are in the process of
17406+ * being requeued.
17407+ */
17408+ raw_spin_unlock_irq(&current->pi_lock);
17409+ } else {
17410+ /*
17411+ * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
17412+ * prevents a concurrent requeue from moving us to the
17413+ * uaddr2 rtmutex. After that we can safely acquire
17414+ * (and possibly block on) hb->lock.
17415+ */
17416+ current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
17417+ raw_spin_unlock_irq(&current->pi_lock);
1a6e0f06 17418+
e4b2b4a8 17419+ spin_lock(&hb->lock);
1a6e0f06 17420+
e4b2b4a8
JK
17421+ /*
17422+ * Clean up pi_blocked_on. We might leak it otherwise
17423+ * when we succeeded with the hb->lock in the fast
17424+ * path.
17425+ */
17426+ raw_spin_lock_irq(&current->pi_lock);
17427+ current->pi_blocked_on = NULL;
17428+ raw_spin_unlock_irq(&current->pi_lock);
1a6e0f06 17429+
e4b2b4a8
JK
17430+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17431+ spin_unlock(&hb->lock);
17432+ if (ret)
17433+ goto out_put_keys;
1a6e0f06 17434+ }
c7c16703 17435
e4b2b4a8
JK
17436 /*
17437- * In order for us to be here, we know our q.key == key2, and since
17438- * we took the hb->lock above, we also know that futex_requeue() has
17439- * completed and we no longer have to concern ourselves with a wakeup
17440- * race with the atomic proxy lock acquisition by the requeue code. The
17441- * futex_requeue dropped our key1 reference and incremented our key2
17442- * reference count.
17443+ * In order to be here, we have either been requeued, are in
17444+ * the process of being requeued, or requeue successfully
17445+ * acquired uaddr2 on our behalf. If pi_blocked_on was
17446+ * non-null above, we may be racing with a requeue. Do not
17447+ * rely on q->lock_ptr to be hb2->lock until after blocking on
17448+ * hb->lock or hb2->lock. The futex_requeue dropped our key1
17449+ * reference and incremented our key2 reference count.
17450 */
17451+ hb2 = hash_futex(&key2);
17452
17453 /* Check if the requeue code acquired the second futex for us. */
17454 if (!q.rt_waiter) {
b3bbd485 17455@@ -3208,7 +3272,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
e4b2b4a8
JK
17456 * did a lock-steal - fix up the PI-state in that case.
17457 */
17458 if (q.pi_state && (q.pi_state->owner != current)) {
17459- spin_lock(q.lock_ptr);
17460+ spin_lock(&hb2->lock);
17461+ BUG_ON(&hb2->lock != q.lock_ptr);
17462 ret = fixup_pi_state_owner(uaddr2, &q, current);
17463 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
17464 pi_state = q.pi_state;
b3bbd485 17465@@ -3219,7 +3284,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
e4b2b4a8
JK
17466 * the requeue_pi() code acquired for us.
17467 */
17468 put_pi_state(q.pi_state);
17469- spin_unlock(q.lock_ptr);
17470+ spin_unlock(&hb2->lock);
17471 }
17472 } else {
17473 struct rt_mutex *pi_mutex;
b3bbd485 17474@@ -3233,7 +3298,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
e4b2b4a8
JK
17475 pi_mutex = &q.pi_state->pi_mutex;
17476 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
17477
17478- spin_lock(q.lock_ptr);
17479+ spin_lock(&hb2->lock);
17480+ BUG_ON(&hb2->lock != q.lock_ptr);
17481 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
17482 ret = 0;
17483
b3bbd485
JK
17484diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
17485index 79f987b942b8..d1dbacc29941 100644
17486--- a/kernel/irq/handle.c
17487+++ b/kernel/irq/handle.c
17488@@ -183,10 +183,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
e4b2b4a8
JK
17489 {
17490 irqreturn_t retval;
17491 unsigned int flags = 0;
17492+ struct pt_regs *regs = get_irq_regs();
17493+ u64 ip = regs ? instruction_pointer(regs) : 0;
17494
17495 retval = __handle_irq_event_percpu(desc, &flags);
17496
17497- add_interrupt_randomness(desc->irq_data.irq, flags);
c7c16703 17498+#ifdef CONFIG_PREEMPT_RT_FULL
e4b2b4a8 17499+ desc->random_ip = ip;
c7c16703 17500+#else
e4b2b4a8 17501+ add_interrupt_randomness(desc->irq_data.irq, flags, ip);
c7c16703
JK
17502+#endif
17503
e4b2b4a8
JK
17504 if (!noirqdebug)
17505 note_interrupt(desc, retval);
b3bbd485
JK
17506diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
17507index 069311541577..f82dcca81712 100644
17508--- a/kernel/irq/manage.c
17509+++ b/kernel/irq/manage.c
e4b2b4a8
JK
17510@@ -24,6 +24,7 @@
17511 #include "internals.h"
1a6e0f06 17512
e4b2b4a8
JK
17513 #ifdef CONFIG_IRQ_FORCED_THREADING
17514+# ifndef CONFIG_PREEMPT_RT_BASE
17515 __read_mostly bool force_irqthreads;
17516
17517 static int __init setup_forced_irqthreads(char *arg)
b3bbd485 17518@@ -32,6 +33,7 @@ static int __init setup_forced_irqthreads(char *arg)
e4b2b4a8
JK
17519 return 0;
17520 }
17521 early_param("threadirqs", setup_forced_irqthreads);
17522+# endif
17523 #endif
17524
17525 static void __synchronize_hardirq(struct irq_desc *desc)
b3bbd485 17526@@ -224,7 +226,12 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
e4b2b4a8
JK
17527
17528 if (desc->affinity_notify) {
17529 kref_get(&desc->affinity_notify->kref);
1a6e0f06 17530+
e4b2b4a8
JK
17531+#ifdef CONFIG_PREEMPT_RT_BASE
17532+ swork_queue(&desc->affinity_notify->swork);
1a6e0f06 17533+#else
e4b2b4a8
JK
17534 schedule_work(&desc->affinity_notify->work);
17535+#endif
17536 }
17537 irqd_set(data, IRQD_AFFINITY_SET);
1a6e0f06 17538
b3bbd485 17539@@ -262,10 +269,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
e4b2b4a8
JK
17540 }
17541 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
17542
17543-static void irq_affinity_notify(struct work_struct *work)
17544+static void _irq_affinity_notify(struct irq_affinity_notify *notify)
1a6e0f06 17545 {
e4b2b4a8
JK
17546- struct irq_affinity_notify *notify =
17547- container_of(work, struct irq_affinity_notify, work);
17548 struct irq_desc *desc = irq_to_desc(notify->irq);
17549 cpumask_var_t cpumask;
17550 unsigned long flags;
b3bbd485 17551@@ -287,6 +292,35 @@ static void irq_affinity_notify(struct work_struct *work)
e4b2b4a8 17552 kref_put(&notify->kref, notify->release);
1a6e0f06
JK
17553 }
17554
e4b2b4a8
JK
17555+#ifdef CONFIG_PREEMPT_RT_BASE
17556+static void init_helper_thread(void)
1a6e0f06 17557+{
e4b2b4a8
JK
17558+ static int init_sworker_once;
17559+
17560+ if (init_sworker_once)
17561+ return;
17562+ if (WARN_ON(swork_get()))
17563+ return;
17564+ init_sworker_once = 1;
1a6e0f06
JK
17565+}
17566+
e4b2b4a8 17567+static void irq_affinity_notify(struct swork_event *swork)
1a6e0f06 17568+{
e4b2b4a8
JK
17569+ struct irq_affinity_notify *notify =
17570+ container_of(swork, struct irq_affinity_notify, swork);
17571+ _irq_affinity_notify(notify);
1a6e0f06
JK
17572+}
17573+
e4b2b4a8
JK
17574+#else
17575+
17576+static void irq_affinity_notify(struct work_struct *work)
1a6e0f06 17577+{
e4b2b4a8
JK
17578+ struct irq_affinity_notify *notify =
17579+ container_of(work, struct irq_affinity_notify, work);
17580+ _irq_affinity_notify(notify);
1a6e0f06
JK
17581+}
17582+#endif
17583+
e4b2b4a8
JK
17584 /**
17585 * irq_set_affinity_notifier - control notification of IRQ affinity changes
17586 * @irq: Interrupt for which to enable/disable notification
b3bbd485 17587@@ -315,7 +349,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
17588 if (notify) {
17589 notify->irq = irq;
17590 kref_init(&notify->kref);
17591+#ifdef CONFIG_PREEMPT_RT_BASE
17592+ INIT_SWORK(&notify->swork, irq_affinity_notify);
17593+ init_helper_thread();
17594+#else
17595 INIT_WORK(&notify->work, irq_affinity_notify);
17596+#endif
17597 }
1a6e0f06 17598
e4b2b4a8 17599 raw_spin_lock_irqsave(&desc->lock, flags);
b3bbd485 17600@@ -883,7 +922,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
17601 local_bh_disable();
17602 ret = action->thread_fn(action->irq, action->dev_id);
17603 irq_finalize_oneshot(desc, action);
17604- local_bh_enable();
17605+ /*
17606+ * Interrupts which have real time requirements can be set up
17607+ * to avoid softirq processing in the thread handler. This is
17608+ * safe as these interrupts do not raise soft interrupts.
17609+ */
17610+ if (irq_settings_no_softirq_call(desc))
17611+ _local_bh_enable();
17612+ else
17613+ local_bh_enable();
17614 return ret;
17615 }
1a6e0f06 17616
b3bbd485 17617@@ -980,6 +1027,12 @@ static int irq_thread(void *data)
17618 if (action_ret == IRQ_WAKE_THREAD)
17619 irq_wake_secondary(desc, action);
1a6e0f06 17620
17621+#ifdef CONFIG_PREEMPT_RT_FULL
17622+ migrate_disable();
17623+ add_interrupt_randomness(action->irq, 0,
17624+ desc->random_ip ^ (unsigned long) action);
17625+ migrate_enable();
17626+#endif
17627 wake_threads_waitq(desc);
17628 }
1a6e0f06 17629
b3bbd485 17630@@ -1378,6 +1431,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
17631 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
17632 }
1a6e0f06 17633
17634+ if (new->flags & IRQF_NO_SOFTIRQ_CALL)
17635+ irq_settings_set_no_softirq_call(desc);
1a6e0f06 17636+
17637 if (irq_settings_can_autoenable(desc)) {
17638 irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
17639 } else {
b3bbd485 17640@@ -2159,7 +2215,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
17641 * This call sets the internal irqchip state of an interrupt,
17642 * depending on the value of @which.
1a6e0f06 17643 *
17644- * This function should be called with preemption disabled if the
17645+ * This function should be called with migration disabled if the
17646 * interrupt controller has per-cpu registers.
17647 */
17648 int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
17649diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
17650index e43795cd2ccf..47e2f9e23586 100644
17651--- a/kernel/irq/settings.h
17652+++ b/kernel/irq/settings.h
17653@@ -17,6 +17,7 @@ enum {
17654 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17655 _IRQ_IS_POLLED = IRQ_IS_POLLED,
17656 _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
17657+ _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL,
17658 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17659 };
1a6e0f06 17660
b3bbd485 17661@@ -31,6 +32,7 @@ enum {
17662 #define IRQ_PER_CPU_DEVID GOT_YOU_MORON
17663 #define IRQ_IS_POLLED GOT_YOU_MORON
17664 #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
17665+#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON
17666 #undef IRQF_MODIFY_MASK
17667 #define IRQF_MODIFY_MASK GOT_YOU_MORON
1a6e0f06 17668
b3bbd485 17669@@ -41,6 +43,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
17670 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
17671 }
1a6e0f06 17672
17673+static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
17674+{
17675+ return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
17676+}
1a6e0f06 17677+
17678+static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
17679+{
17680+ desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
17681+}
1a6e0f06 17682+
17683 static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
17684 {
17685 return desc->status_use_accessors & _IRQ_PER_CPU;
17686diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
17687index 987d7bca4864..75347fb1dfea 100644
17688--- a/kernel/irq/spurious.c
17689+++ b/kernel/irq/spurious.c
17690@@ -445,6 +445,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
1a6e0f06 17691
17692 static int __init irqfixup_setup(char *str)
17693 {
17694+#ifdef CONFIG_PREEMPT_RT_BASE
17695+ pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17696+ return 1;
17697+#endif
17698 irqfixup = 1;
17699 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
17700 printk(KERN_WARNING "This may impact system performance.\n");
b3bbd485 17701@@ -457,6 +461,10 @@ module_param(irqfixup, int, 0644);
1a6e0f06 17702
17703 static int __init irqpoll_setup(char *str)
17704 {
17705+#ifdef CONFIG_PREEMPT_RT_BASE
17706+ pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
17707+ return 1;
17708+#endif
17709 irqfixup = 2;
17710 printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
17711 "enabled\n");
17712diff --git a/kernel/irq_work.c b/kernel/irq_work.c
17713index bcf107ce0854..2899ba0d23d1 100644
17714--- a/kernel/irq_work.c
17715+++ b/kernel/irq_work.c
17716@@ -17,6 +17,7 @@
17717 #include <linux/cpu.h>
17718 #include <linux/notifier.h>
17719 #include <linux/smp.h>
17720+#include <linux/interrupt.h>
17721 #include <asm/processor.h>
1a6e0f06 17722
1a6e0f06 17723
b3bbd485 17724@@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
1a6e0f06 17725 */
17726 bool irq_work_queue_on(struct irq_work *work, int cpu)
17727 {
17728+ struct llist_head *list;
1a6e0f06 17729+
17730 /* All work should have been flushed before going offline */
17731 WARN_ON_ONCE(cpu_is_offline(cpu));
1a6e0f06 17732
b3bbd485 17733@@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
17734 if (!irq_work_claim(work))
17735 return false;
17736
17737- if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
17738+ if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
17739+ list = &per_cpu(lazy_list, cpu);
17740+ else
17741+ list = &per_cpu(raised_list, cpu);
17742+
17743+ if (llist_add(&work->llnode, list))
17744 arch_send_call_function_single_ipi(cpu);
c7c16703 17745
e4b2b4a8 17746 return true;
b3bbd485 17747@@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
17748 /* Enqueue the irq work @work on the current CPU */
17749 bool irq_work_queue(struct irq_work *work)
17750 {
17751+ struct llist_head *list;
17752+ bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17753+
17754 /* Only queue if not already pending */
17755 if (!irq_work_claim(work))
17756 return false;
b3bbd485 17757@@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
17758 /* Queue the entry and raise the IPI if needed. */
17759 preempt_disable();
c7c16703 17760
17761- /* If the work is "lazy", handle it from next tick if any */
17762- if (work->flags & IRQ_WORK_LAZY) {
17763- if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
17764- tick_nohz_tick_stopped())
17765- arch_irq_work_raise();
17766- } else {
17767- if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
17768+ lazy_work = work->flags & IRQ_WORK_LAZY;
17769+
17770+ if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
17771+ list = this_cpu_ptr(&lazy_list);
17772+ else
17773+ list = this_cpu_ptr(&raised_list);
17774+
17775+ if (llist_add(&work->llnode, list)) {
17776+ if (!lazy_work || tick_nohz_tick_stopped())
17777 arch_irq_work_raise();
17778 }
c7c16703 17779
b3bbd485 17780@@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
17781 raised = this_cpu_ptr(&raised_list);
17782 lazy = this_cpu_ptr(&lazy_list);
c7c16703 17783
17784- if (llist_empty(raised) || arch_irq_work_has_interrupt())
17785- if (llist_empty(lazy))
17786- return false;
17787+ if (llist_empty(raised) && llist_empty(lazy))
17788+ return false;
c7c16703 17789
17790 /* All work should have been flushed before going offline */
17791 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
b3bbd485 17792@@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
17793 struct irq_work *work;
17794 struct llist_node *llnode;
c7c16703 17795
17796- BUG_ON(!irqs_disabled());
17797+ BUG_ON_NONRT(!irqs_disabled());
c7c16703 17798
17799 if (llist_empty(list))
17800 return;
b3bbd485 17801@@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
e4b2b4a8 17802 void irq_work_run(void)
c7c16703 17803 {
17804 irq_work_run_list(this_cpu_ptr(&raised_list));
17805- irq_work_run_list(this_cpu_ptr(&lazy_list));
17806+ if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
17807+ /*
17808+ * NOTE: we raise softirq via IPI for safety,
17809+ * and execute in irq_work_tick() to move the
17810+ * overhead from hard to soft irq context.
17811+ */
17812+ if (!llist_empty(this_cpu_ptr(&lazy_list)))
17813+ raise_softirq(TIMER_SOFTIRQ);
17814+ } else
17815+ irq_work_run_list(this_cpu_ptr(&lazy_list));
c7c16703 17816 }
e4b2b4a8 17817 EXPORT_SYMBOL_GPL(irq_work_run);
c7c16703 17818
b3bbd485 17819@@ -179,8 +200,17 @@ void irq_work_tick(void)
1a6e0f06 17820
17821 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
17822 irq_work_run_list(raised);
1a6e0f06 17823+
17824+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
17825+ irq_work_run_list(this_cpu_ptr(&lazy_list));
17826+}
1a6e0f06 17827+
17828+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
17829+void irq_work_tick_soft(void)
17830+{
17831 irq_work_run_list(this_cpu_ptr(&lazy_list));
17832 }
1a6e0f06 17833+#endif
1a6e0f06 17834
17835 /*
17836 * Synchronize against the irq_work @entry, ensures the entry is not
17837diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
17838index 46ba853656f6..9a23632b6294 100644
17839--- a/kernel/ksysfs.c
17840+++ b/kernel/ksysfs.c
17841@@ -140,6 +140,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
1a6e0f06 17842
e4b2b4a8 17843 #endif /* CONFIG_CRASH_CORE */
1a6e0f06 17844
17845+#if defined(CONFIG_PREEMPT_RT_FULL)
17846+static ssize_t realtime_show(struct kobject *kobj,
17847+ struct kobj_attribute *attr, char *buf)
17848+{
17849+ return sprintf(buf, "%d\n", 1);
17850+}
17851+KERNEL_ATTR_RO(realtime);
17852+#endif
17853+
17854 /* whether file capabilities are enabled */
17855 static ssize_t fscaps_show(struct kobject *kobj,
17856 struct kobj_attribute *attr, char *buf)
17857@@ -230,6 +239,9 @@ static struct attribute * kernel_attrs[] = {
17858 #ifndef CONFIG_TINY_RCU
17859 &rcu_expedited_attr.attr,
17860 &rcu_normal_attr.attr,
b3bbd485 17861+#endif
17862+#ifdef CONFIG_PREEMPT_RT_FULL
17863+ &realtime_attr.attr,
b3bbd485 17864 #endif
17865 NULL
17866 };
17867diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
17868index 392c7f23af76..c0bf04b6b965 100644
17869--- a/kernel/locking/Makefile
17870+++ b/kernel/locking/Makefile
17871@@ -3,7 +3,7 @@
17872 # and is generally not a function of system call inputs.
17873 KCOV_INSTRUMENT := n
1a6e0f06 17874
17875-obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
17876+obj-y += semaphore.o percpu-rwsem.o
1a6e0f06 17877
17878 ifdef CONFIG_FUNCTION_TRACER
17879 CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
b3bbd485 17880@@ -12,7 +12,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
17881 CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
17882 endif
17883
17884+ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17885+obj-y += mutex.o
17886 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
17887+endif
17888+obj-y += rwsem.o
17889 obj-$(CONFIG_LOCKDEP) += lockdep.o
17890 ifeq ($(CONFIG_PROC_FS),y)
17891 obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
b3bbd485 17892@@ -25,8 +29,11 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
17893 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
17894 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
17895 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
17896+ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
17897 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
17898 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
17899+endif
17900+obj-$(CONFIG_PREEMPT_RT_FULL) += mutex-rt.o rwsem-rt.o rwlock-rt.o
17901 obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
17902 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
17903 obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
17904diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
17905index d7c155048ea9..def51a27f20f 100644
17906--- a/kernel/locking/lockdep.c
17907+++ b/kernel/locking/lockdep.c
17908@@ -3914,6 +3914,7 @@ static void check_flags(unsigned long flags)
17909 }
17910 }
17911
17912+#ifndef CONFIG_PREEMPT_RT_FULL
17913 /*
17914 * We dont accurately track softirq state in e.g.
17915 * hardirq contexts (such as on 4KSTACKS), so only
17916@@ -3928,6 +3929,7 @@ static void check_flags(unsigned long flags)
17917 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
17918 }
17919 }
17920+#endif
17921
17922 if (!debug_locks)
17923 print_irqtrace_events(current);
17924diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
17925index 6dca260eeccf..5d01ac590d4c 100644
17926--- a/kernel/locking/locktorture.c
17927+++ b/kernel/locking/locktorture.c
17928@@ -26,7 +26,6 @@
17929 #include <linux/kthread.h>
17930 #include <linux/sched/rt.h>
17931 #include <linux/spinlock.h>
17932-#include <linux/rwlock.h>
17933 #include <linux/mutex.h>
17934 #include <linux/rwsem.h>
17935 #include <linux/smp.h>
17936diff --git a/kernel/locking/mutex-rt.c b/kernel/locking/mutex-rt.c
17937new file mode 100644
17938index 000000000000..4f81595c0f52
17939--- /dev/null
17940+++ b/kernel/locking/mutex-rt.c
17941@@ -0,0 +1,223 @@
17942+/*
17943+ * kernel/rt.c
17944+ *
17945+ * Real-Time Preemption Support
17946+ *
17947+ * started by Ingo Molnar:
17948+ *
17949+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17950+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17951+ *
17952+ * historic credit for proving that Linux spinlocks can be implemented via
17953+ * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
17954+ * and others) who prototyped it on 2.4 and did lots of comparative
17955+ * research and analysis; TimeSys, for proving that you can implement a
17956+ * fully preemptible kernel via the use of IRQ threading and mutexes;
17957+ * Bill Huey for persuasively arguing on lkml that the mutex model is the
17958+ * right one; and to MontaVista, who ported pmutexes to 2.6.
17959+ *
17960+ * This code is a from-scratch implementation and is not based on pmutexes,
17961+ * but the idea of converting spinlocks to mutexes is used here too.
17962+ *
17963+ * lock debugging, locking tree, deadlock detection:
17964+ *
17965+ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
17966+ * Released under the General Public License (GPL).
17967+ *
17968+ * Includes portions of the generic R/W semaphore implementation from:
17969+ *
17970+ * Copyright (c) 2001 David Howells (dhowells@redhat.com).
17971+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
17972+ * - Derived also from comments by Linus
17973+ *
17974+ * Pending ownership of locks and ownership stealing:
17975+ *
17976+ * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
17977+ *
17978+ * (also by Steven Rostedt)
17979+ * - Converted single pi_lock to individual task locks.
17980+ *
17981+ * By Esben Nielsen:
17982+ * Doing priority inheritance with help of the scheduler.
17983+ *
17984+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
17985+ * - major rework based on Esben Nielsens initial patch
17986+ * - replaced thread_info references by task_struct refs
17987+ * - removed task->pending_owner dependency
17988+ * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
17989+ * in the scheduler return path as discussed with Steven Rostedt
17990+ *
17991+ * Copyright (C) 2006, Kihon Technologies Inc.
17992+ * Steven Rostedt <rostedt@goodmis.org>
17993+ * - debugged and patched Thomas Gleixner's rework.
17994+ * - added back the cmpxchg to the rework.
17995+ * - turned atomic require back on for SMP.
17996+ */
17997+
17998+#include <linux/spinlock.h>
17999+#include <linux/rtmutex.h>
18000+#include <linux/sched.h>
18001+#include <linux/delay.h>
18002+#include <linux/module.h>
18003+#include <linux/kallsyms.h>
18004+#include <linux/syscalls.h>
18005+#include <linux/interrupt.h>
18006+#include <linux/plist.h>
18007+#include <linux/fs.h>
18008+#include <linux/futex.h>
18009+#include <linux/hrtimer.h>
18010+
18011+#include "rtmutex_common.h"
18012+
18013+/*
18014+ * struct mutex functions
18015+ */
18016+void __mutex_do_init(struct mutex *mutex, const char *name,
18017+ struct lock_class_key *key)
18018+{
18019+#ifdef CONFIG_DEBUG_LOCK_ALLOC
18020+ /*
18021+ * Make sure we are not reinitializing a held lock:
18022+ */
18023+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
18024+ lockdep_init_map(&mutex->dep_map, name, key, 0);
1a6e0f06 18025+#endif
18026+ mutex->lock.save_state = 0;
18027+}
18028+EXPORT_SYMBOL(__mutex_do_init);
18029+
18030+void __lockfunc _mutex_lock(struct mutex *lock)
18031+{
18032+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18033+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
18034+}
18035+EXPORT_SYMBOL(_mutex_lock);
18036+
18037+void __lockfunc _mutex_lock_io(struct mutex *lock)
18038+{
18039+ int token;
18040+
18041+ token = io_schedule_prepare();
18042+ _mutex_lock(lock);
18043+ io_schedule_finish(token);
18044+}
18045+EXPORT_SYMBOL_GPL(_mutex_lock_io);
18046+
18047+int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
18048+{
18049+ int ret;
18050+
18051+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18052+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
18053+ if (ret)
18054+ mutex_release(&lock->dep_map, 1, _RET_IP_);
18055+ return ret;
18056+}
18057+EXPORT_SYMBOL(_mutex_lock_interruptible);
18058+
18059+int __lockfunc _mutex_lock_killable(struct mutex *lock)
18060+{
18061+ int ret;
18062+
18063+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18064+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
18065+ if (ret)
18066+ mutex_release(&lock->dep_map, 1, _RET_IP_);
18067+ return ret;
18068+}
18069+EXPORT_SYMBOL(_mutex_lock_killable);
18070+
18071+#ifdef CONFIG_DEBUG_LOCK_ALLOC
18072+void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
18073+{
18074+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18075+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
18076+}
18077+EXPORT_SYMBOL(_mutex_lock_nested);
1a6e0f06 18078+
18079+void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass)
18080+{
18081+ int token;
1f39f580 18082+
e4b2b4a8 18083+ token = io_schedule_prepare();
1a6e0f06 18084+
18085+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18086+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
1a6e0f06 18087+
18088+ io_schedule_finish(token);
18089+}
18090+EXPORT_SYMBOL_GPL(_mutex_lock_io_nested);
1a6e0f06 18091+
18092+void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
18093+{
18094+ mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
18095+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
18096+}
18097+EXPORT_SYMBOL(_mutex_lock_nest_lock);
1a6e0f06 18098+
e4b2b4a8 18099+int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
1a6e0f06 18100+{
18101+ int ret;
18102+
18103+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18104+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
18105+ if (ret)
18106+ mutex_release(&lock->dep_map, 1, _RET_IP_);
18107+ return ret;
1a6e0f06 18108+}
18109+EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
18110+
18111+int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
18112+{
18113+ int ret;
18114+
18115+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
18116+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
18117+ if (ret)
18118+ mutex_release(&lock->dep_map, 1, _RET_IP_);
18119+ return ret;
18120+}
18121+EXPORT_SYMBOL(_mutex_lock_killable_nested);
1a6e0f06 18122+#endif
18123+
18124+int __lockfunc _mutex_trylock(struct mutex *lock)
18125+{
18126+ int ret = __rt_mutex_trylock(&lock->lock);
18127+
18128+ if (ret)
18129+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18130+
18131+ return ret;
18132+}
18133+EXPORT_SYMBOL(_mutex_trylock);
18134+
18135+void __lockfunc _mutex_unlock(struct mutex *lock)
18136+{
18137+ mutex_release(&lock->dep_map, 1, _RET_IP_);
18138+ __rt_mutex_unlock(&lock->lock);
18139+}
18140+EXPORT_SYMBOL(_mutex_unlock);
18141+
18142+/**
18143+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
18144+ * @cnt: the atomic which we are to dec
18145+ * @lock: the mutex to return holding if we dec to 0
18146+ *
18147+ * return true and hold lock if we dec to 0, return false otherwise
18148+ */
18149+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
18150+{
18151+ /* dec if we can't possibly hit 0 */
18152+ if (atomic_add_unless(cnt, -1, 1))
18153+ return 0;
18154+ /* we might hit 0, so take the lock */
18155+ mutex_lock(lock);
18156+ if (!atomic_dec_and_test(cnt)) {
18157+ /* when we actually did the dec, we didn't hit 0 */
18158+ mutex_unlock(lock);
18159+ return 0;
18160+ }
18161+ /* we hit 0, and we hold the lock */
18162+ return 1;
18163+}
18164+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
18165diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
18166index 4ad35718f123..08e233b7dc21 100644
18167--- a/kernel/locking/rtmutex.c
18168+++ b/kernel/locking/rtmutex.c
18169@@ -7,6 +7,11 @@
18170 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18171 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
18172 * Copyright (C) 2006 Esben Nielsen
18173+ * Adaptive Spinlocks:
18174+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
18175+ * and Peter Morreale,
18176+ * Adaptive Spinlocks simplification:
18177+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
1a6e0f06 18178 *
18179 * See Documentation/locking/rt-mutex-design.txt for details.
18180 */
18181@@ -18,6 +23,8 @@
18182 #include <linux/sched/wake_q.h>
18183 #include <linux/sched/debug.h>
18184 #include <linux/timer.h>
18185+#include <linux/ww_mutex.h>
18186+#include <linux/blkdev.h>
18187
18188 #include "rtmutex_common.h"
18189
b3bbd485 18190@@ -135,6 +142,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
e4b2b4a8 18191 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
18192 }
18193
18194+static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
18195+{
18196+ return waiter && waiter != PI_WAKEUP_INPROGRESS &&
18197+ waiter != PI_REQUEUE_INPROGRESS;
18198+}
18199+
1a6e0f06 18200 /*
18201 * We can speed up the acquire/release, if there's no debugging state to be
18202 * set up.
b3bbd485 18203@@ -228,7 +241,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
e4b2b4a8 18204 * Only use with rt_mutex_waiter_{less,equal}()
1a6e0f06 18205 */
18206 #define task_to_waiter(p) \
18207- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
18208+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) }
1a6e0f06 18209
18210 static inline int
18211 rt_mutex_waiter_less(struct rt_mutex_waiter *left,
b3bbd485 18212@@ -268,6 +281,27 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
e4b2b4a8 18213 return 1;
18214 }
18215
18216+#define STEAL_NORMAL 0
18217+#define STEAL_LATERAL 1
18218+
18219+static inline int
18220+rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode)
18221+{
18222+ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
18223+
18224+ if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter))
18225+ return 1;
18226+
18227+ /*
18228+ * Note that RT tasks are excluded from lateral-steals
18229+ * to prevent the introduction of an unbounded latency.
18230+ */
18231+ if (mode == STEAL_NORMAL || rt_task(waiter->task))
18232+ return 0;
18233+
18234+ return rt_mutex_waiter_equal(waiter, top_waiter);
18235+}
18236+
18237 static void
18238 rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
18239 {
b3bbd485 18240@@ -372,6 +406,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
e4b2b4a8 18241 return debug_rt_mutex_detect_deadlock(waiter, chwalk);
18242 }
18243
18244+static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
18245+{
18246+ if (waiter->savestate)
18247+ wake_up_lock_sleeper(waiter->task);
18248+ else
18249+ wake_up_process(waiter->task);
18250+}
18251+
18252 /*
18253 * Max number of times we'll walk the boosting chain:
18254 */
b3bbd485 18255@@ -379,7 +421,8 @@ int max_lock_depth = 1024;
1a6e0f06 18256
18257 static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
18258 {
18259- return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
18260+ return rt_mutex_real_waiter(p->pi_blocked_on) ?
18261+ p->pi_blocked_on->lock : NULL;
18262 }
1a6e0f06 18263
e4b2b4a8 18264 /*
b3bbd485 18265@@ -515,7 +558,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18266 * reached or the state of the chain has changed while we
18267 * dropped the locks.
18268 */
18269- if (!waiter)
18270+ if (!rt_mutex_real_waiter(waiter))
18271 goto out_unlock_pi;
1a6e0f06 18272
e4b2b4a8 18273 /*
b3bbd485 18274@@ -696,13 +739,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18275 * follow here. This is the end of the chain we are walking.
18276 */
18277 if (!rt_mutex_owner(lock)) {
18278+ struct rt_mutex_waiter *lock_top_waiter;
1a6e0f06 18279+
18280 /*
18281 * If the requeue [7] above changed the top waiter,
18282 * then we need to wake the new top waiter up to try
18283 * to get the lock.
18284 */
18285- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
18286- wake_up_process(rt_mutex_top_waiter(lock)->task);
18287+ lock_top_waiter = rt_mutex_top_waiter(lock);
18288+ if (prerequeue_top_waiter != lock_top_waiter)
18289+ rt_mutex_wake_waiter(lock_top_waiter);
18290 raw_spin_unlock_irq(&lock->wait_lock);
18291 return 0;
18292 }
b3bbd485 18293@@ -804,9 +850,11 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
18294 * @task: The task which wants to acquire the lock
18295 * @waiter: The waiter that is queued to the lock's wait tree if the
18296 * callsite called task_blocked_on_lock(), otherwise NULL
18297+ * @mode: Lock steal mode (STEAL_NORMAL, STEAL_LATERAL)
18298 */
18299-static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18300- struct rt_mutex_waiter *waiter)
18301+static int __try_to_take_rt_mutex(struct rt_mutex *lock,
18302+ struct task_struct *task,
18303+ struct rt_mutex_waiter *waiter, int mode)
18304 {
18305 lockdep_assert_held(&lock->wait_lock);
1a6e0f06 18306
b3bbd485 18307@@ -842,12 +890,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18308 */
18309 if (waiter) {
18310 /*
18311- * If waiter is not the highest priority waiter of
18312- * @lock, give up.
18313+ * If waiter is not the highest priority waiter of @lock,
18314+ * or its peer when lateral steal is allowed, give up.
18315 */
18316- if (waiter != rt_mutex_top_waiter(lock))
18317+ if (!rt_mutex_steal(lock, waiter, mode))
18318 return 0;
18319-
18320 /*
18321 * We can acquire the lock. Remove the waiter from the
18322 * lock waiters tree.
b3bbd485 18323@@ -865,14 +912,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18324 */
18325 if (rt_mutex_has_waiters(lock)) {
18326 /*
18327- * If @task->prio is greater than or equal to
18328- * the top waiter priority (kernel view),
18329- * @task lost.
18330+ * If @task->prio is greater than the top waiter
18331+ * priority (kernel view), or equal to it when a
18332+ * lateral steal is forbidden, @task lost.
18333 */
18334- if (!rt_mutex_waiter_less(task_to_waiter(task),
18335- rt_mutex_top_waiter(lock)))
18336+ if (!rt_mutex_steal(lock, task_to_waiter(task), mode))
18337 return 0;
18338-
18339 /*
18340 * The current top waiter stays enqueued. We
18341 * don't have to change anything in the lock
b3bbd485 18342@@ -919,6 +964,351 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18343 return 1;
18344 }
1a6e0f06 18345
18346+#ifdef CONFIG_PREEMPT_RT_FULL
18347+/*
18348+ * preemptible spin_lock functions:
18349+ */
18350+static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
18351+ void (*slowfn)(struct rt_mutex *lock))
18352+{
18353+ might_sleep_no_state_check();
18354+
18355+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
18356+ return;
18357+ else
18358+ slowfn(lock);
18359+}
18360+
18361+static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
18362+ void (*slowfn)(struct rt_mutex *lock))
18363+{
18364+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
18365+ return;
18366+ else
18367+ slowfn(lock);
18368+}
18369+#ifdef CONFIG_SMP
18370+/*
18371+ * Note that owner is a speculative pointer and dereferencing relies
18372+ * on rcu_read_lock() and the check against the lock owner.
18373+ */
18374+static int adaptive_wait(struct rt_mutex *lock,
18375+ struct task_struct *owner)
18376+{
18377+ int res = 0;
18378+
18379+ rcu_read_lock();
18380+ for (;;) {
18381+ if (owner != rt_mutex_owner(lock))
18382+ break;
18383+ /*
18384+ * Ensure that owner->on_cpu is dereferenced _after_
18385+ * checking the above to be valid.
18386+ */
18387+ barrier();
18388+ if (!owner->on_cpu) {
18389+ res = 1;
18390+ break;
18391+ }
18392+ cpu_relax();
18393+ }
18394+ rcu_read_unlock();
18395+ return res;
18396+}
18397+#else
18398+static int adaptive_wait(struct rt_mutex *lock,
18399+ struct task_struct *orig_owner)
18400+{
18401+ return 1;
18402+}
18403+#endif
18404+
18405+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18406+ struct rt_mutex_waiter *waiter,
18407+ struct task_struct *task,
18408+ enum rtmutex_chainwalk chwalk);
18409+/*
18410+ * Slow path lock function spin_lock style: this variant is very
18411+ * careful not to miss any non-lock wakeups.
18412+ *
18413+ * We store the current state under p->pi_lock in p->saved_state and
18414+ * the try_to_wake_up() code handles this accordingly.
18415+ */
18416+void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
18417+ struct rt_mutex_waiter *waiter,
18418+ unsigned long flags)
18419+{
18420+ struct task_struct *lock_owner, *self = current;
18421+ struct rt_mutex_waiter *top_waiter;
18422+ int ret;
1a6e0f06 18423+
18424+ if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL))
18425+ return;
1a6e0f06 18426+
e4b2b4a8 18427+ BUG_ON(rt_mutex_owner(lock) == self);
1a6e0f06 18428+
18429+ /*
18430+ * We save whatever state the task is in and we'll restore it
18431+ * after acquiring the lock taking real wakeups into account
18432+ * as well. We are serialized via pi_lock against wakeups. See
18433+ * try_to_wake_up().
18434+ */
18435+ raw_spin_lock(&self->pi_lock);
18436+ self->saved_state = self->state;
18437+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
18438+ raw_spin_unlock(&self->pi_lock);
1a6e0f06 18439+
18440+ ret = task_blocks_on_rt_mutex(lock, waiter, self, RT_MUTEX_MIN_CHAINWALK);
18441+ BUG_ON(ret);
1a6e0f06 18442+
18443+ for (;;) {
18444+ /* Try to acquire the lock again. */
18445+ if (__try_to_take_rt_mutex(lock, self, waiter, STEAL_LATERAL))
18446+ break;
1a6e0f06 18447+
18448+ top_waiter = rt_mutex_top_waiter(lock);
18449+ lock_owner = rt_mutex_owner(lock);
1a6e0f06 18450+
e4b2b4a8 18451+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1a6e0f06 18452+
e4b2b4a8 18453+ debug_rt_mutex_print_deadlock(waiter);
1a6e0f06 18454+
e4b2b4a8
JK
18455+ if (top_waiter != waiter || adaptive_wait(lock, lock_owner))
18456+ schedule();
1a6e0f06 18457+
e4b2b4a8 18458+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
1a6e0f06 18459+
e4b2b4a8
JK
18460+ raw_spin_lock(&self->pi_lock);
18461+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
18462+ raw_spin_unlock(&self->pi_lock);
18463+ }
1a6e0f06 18464+
18465+ /*
18466+ * Restore the task state to current->saved_state. We set it
18467+ * to the original state above and the try_to_wake_up() code
18468+ * has possibly updated it when a real (non-rtmutex) wakeup
18469+ * happened while we were blocked. Clear saved_state so
18470+ * try_to_wakeup() does not get confused.
18471+ */
18472+ raw_spin_lock(&self->pi_lock);
18473+ __set_current_state_no_track(self->saved_state);
18474+ self->saved_state = TASK_RUNNING;
18475+ raw_spin_unlock(&self->pi_lock);
1a6e0f06 18476+
18477+ /*
18478+ * try_to_take_rt_mutex() sets the waiter bit
18479+ * unconditionally. We might have to fix that up:
18480+ */
18481+ fixup_rt_mutex_waiters(lock);
1a6e0f06 18482+
18483+ BUG_ON(rt_mutex_has_waiters(lock) && waiter == rt_mutex_top_waiter(lock));
18484+ BUG_ON(!RB_EMPTY_NODE(&waiter->tree_entry));
18485+}
1a6e0f06 18486+
18487+static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
18488+{
18489+ struct rt_mutex_waiter waiter;
18490+ unsigned long flags;
1a6e0f06 18491+
e4b2b4a8 18492+ rt_mutex_init_waiter(&waiter, true);
1a6e0f06 18493+
18494+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
18495+ rt_spin_lock_slowlock_locked(lock, &waiter, flags);
18496+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18497+ debug_rt_mutex_free_waiter(&waiter);
18498+}
1a6e0f06 18499+
18500+static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
18501+ struct wake_q_head *wake_q,
18502+ struct wake_q_head *wq_sleeper);
18503+/*
18504+ * Slow path to release a rt_mutex spin_lock style
18505+ */
18506+void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
18507+{
18508+ unsigned long flags;
18509+ DEFINE_WAKE_Q(wake_q);
18510+ DEFINE_WAKE_Q(wake_sleeper_q);
18511+ bool postunlock;
1a6e0f06 18512+
18513+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
18514+ postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q);
18515+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1a6e0f06 18516+
18517+ if (postunlock)
18518+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
18519+}
1a6e0f06 18520+
18521+void __lockfunc rt_spin_lock(spinlock_t *lock)
18522+{
18523+ sleeping_lock_inc();
18524+ migrate_disable();
18525+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18526+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
18527+}
18528+EXPORT_SYMBOL(rt_spin_lock);
1a6e0f06 18529+
18530+void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
18531+{
18532+ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
18533+}
1a6e0f06 18534+
18535+#ifdef CONFIG_DEBUG_LOCK_ALLOC
18536+void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
18537+{
18538+ sleeping_lock_inc();
18539+ migrate_disable();
18540+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
18541+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
18542+}
18543+EXPORT_SYMBOL(rt_spin_lock_nested);
18544+#endif
1a6e0f06 18545+
18546+void __lockfunc rt_spin_unlock(spinlock_t *lock)
18547+{
18548+ /* NOTE: we always pass in '1' for nested, for simplicity */
18549+ spin_release(&lock->dep_map, 1, _RET_IP_);
18550+ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
18551+ migrate_enable();
18552+ sleeping_lock_dec();
18553+}
18554+EXPORT_SYMBOL(rt_spin_unlock);
1a6e0f06 18555+
18556+void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
18557+{
18558+ rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
18559+}
18560+EXPORT_SYMBOL(__rt_spin_unlock);
18561+
18562+/*
18563+ * Wait for the lock to get unlocked: instead of polling for an unlock
18564+ * (like raw spinlocks do), we lock and unlock, to force the kernel to
18565+ * schedule if there's contention:
18566+ */
18567+void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
18568+{
18569+ spin_lock(lock);
18570+ spin_unlock(lock);
18571+}
18572+EXPORT_SYMBOL(rt_spin_unlock_wait);
18573+
18574+int __lockfunc rt_spin_trylock(spinlock_t *lock)
18575+{
18576+ int ret;
18577+
18578+ sleeping_lock_inc();
18579+ migrate_disable();
18580+ ret = __rt_mutex_trylock(&lock->lock);
18581+ if (ret) {
18582+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18583+ } else {
18584+ migrate_enable();
18585+ sleeping_lock_dec();
18586+ }
18587+ return ret;
18588+}
18589+EXPORT_SYMBOL(rt_spin_trylock);
18590+
18591+int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
18592+{
18593+ int ret;
18594+
18595+ local_bh_disable();
18596+ ret = __rt_mutex_trylock(&lock->lock);
18597+ if (ret) {
18598+ sleeping_lock_inc();
18599+ migrate_disable();
18600+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18601+ } else
18602+ local_bh_enable();
18603+ return ret;
18604+}
18605+EXPORT_SYMBOL(rt_spin_trylock_bh);
18606+
18607+int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
18608+{
18609+ int ret;
1a6e0f06 18610+
18611+ *flags = 0;
18612+ ret = __rt_mutex_trylock(&lock->lock);
18613+ if (ret) {
18614+ sleeping_lock_inc();
18615+ migrate_disable();
18616+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18617+ }
18618+ return ret;
18619+}
18620+EXPORT_SYMBOL(rt_spin_trylock_irqsave);
1a6e0f06 18621+
18622+int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
18623+{
18624+ /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
18625+ if (atomic_add_unless(atomic, -1, 1))
18626+ return 0;
18627+ rt_spin_lock(lock);
18628+ if (atomic_dec_and_test(atomic))
18629+ return 1;
18630+ rt_spin_unlock(lock);
18631+ return 0;
18632+}
18633+EXPORT_SYMBOL(atomic_dec_and_spin_lock);
1a6e0f06 18634+
18635+void
18636+__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key)
18637+{
1a6e0f06 18638+#ifdef CONFIG_DEBUG_LOCK_ALLOC
18639+ /*
18640+ * Make sure we are not reinitializing a held lock:
18641+ */
18642+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
18643+ lockdep_init_map(&lock->dep_map, name, key, 0);
1a6e0f06 18644+#endif
18645+}
18646+EXPORT_SYMBOL(__rt_spin_lock_init);
1a6e0f06 18647+
e4b2b4a8 18648+#endif /* PREEMPT_RT_FULL */
1a6e0f06 18649+
18650+#ifdef CONFIG_PREEMPT_RT_FULL
18651+ static inline int __sched
18652+__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18653+{
18654+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18655+ struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
1a6e0f06 18656+
18657+ if (!hold_ctx)
18658+ return 0;
1a6e0f06 18659+
e4b2b4a8
JK
18660+ if (unlikely(ctx == hold_ctx))
18661+ return -EALREADY;
1a6e0f06 18662+
e4b2b4a8
JK
18663+ if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
18664+ (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
18665+#ifdef CONFIG_DEBUG_MUTEXES
18666+ DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
18667+ ctx->contending_lock = ww;
18668+#endif
18669+ return -EDEADLK;
18670+ }
1a6e0f06 18671+
e4b2b4a8
JK
18672+ return 0;
18673+}
18674+#else
18675+ static inline int __sched
18676+__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
18677+{
18678+ BUG();
18679+ return 0;
18680+}
1a6e0f06 18681+
1a6e0f06
JK
18682+#endif
18683+
e4b2b4a8
JK
18684+static inline int
18685+try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
18686+ struct rt_mutex_waiter *waiter)
18687+{
18688+ return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
18689+}
1a6e0f06 18690+
18691 /*
18692 * Task blocks on lock.
18693 *
b3bbd485 18694@@ -951,6 +1341,22 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18695 return -EDEADLK;
18696
18697 raw_spin_lock(&task->pi_lock);
18698+ /*
18699+ * In the case of futex requeue PI, this will be a proxy
18700+ * lock. The task will wake unaware that it is enqueueed on
18701+ * this lock. Avoid blocking on two locks and corrupting
18702+ * pi_blocked_on via the PI_WAKEUP_INPROGRESS
18703+ * flag. futex_wait_requeue_pi() sets this when it wakes up
18704+ * before requeue (due to a signal or timeout). Do not enqueue
18705+ * the task if PI_WAKEUP_INPROGRESS is set.
18706+ */
18707+ if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
18708+ raw_spin_unlock(&task->pi_lock);
18709+ return -EAGAIN;
18710+ }
1a6e0f06 18711+
e4b2b4a8 18712+ BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
1a6e0f06 18713+
18714 waiter->task = task;
18715 waiter->lock = lock;
18716 waiter->prio = task->prio;
b3bbd485 18717@@ -974,7 +1380,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18718 rt_mutex_enqueue_pi(owner, waiter);
18719
18720 rt_mutex_adjust_prio(owner);
18721- if (owner->pi_blocked_on)
18722+ if (rt_mutex_real_waiter(owner->pi_blocked_on))
18723 chain_walk = 1;
18724 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
18725 chain_walk = 1;
b3bbd485 18726@@ -1016,6 +1422,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
18727 * Called with lock->wait_lock held and interrupts disabled.
18728 */
18729 static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
18730+ struct wake_q_head *wake_sleeper_q,
18731 struct rt_mutex *lock)
18732 {
18733 struct rt_mutex_waiter *waiter;
b3bbd485 18734@@ -1055,7 +1462,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
18735 * Pairs with preempt_enable() in rt_mutex_postunlock();
18736 */
18737 preempt_disable();
18738- wake_q_add(wake_q, waiter->task);
18739+ if (waiter->savestate)
18740+ wake_q_add_sleeper(wake_sleeper_q, waiter->task);
18741+ else
18742+ wake_q_add(wake_q, waiter->task);
18743 raw_spin_unlock(&current->pi_lock);
18744 }
18745
b3bbd485 18746@@ -1070,7 +1480,7 @@ static void remove_waiter(struct rt_mutex *lock,
18747 {
18748 bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
18749 struct task_struct *owner = rt_mutex_owner(lock);
18750- struct rt_mutex *next_lock;
18751+ struct rt_mutex *next_lock = NULL;
18752
18753 lockdep_assert_held(&lock->wait_lock);
18754
b3bbd485 18755@@ -1096,7 +1506,8 @@ static void remove_waiter(struct rt_mutex *lock,
18756 rt_mutex_adjust_prio(owner);
18757
18758 /* Store the lock on which owner is blocked or NULL */
18759- next_lock = task_blocked_on_lock(owner);
18760+ if (rt_mutex_real_waiter(owner->pi_blocked_on))
18761+ next_lock = task_blocked_on_lock(owner);
18762
18763 raw_spin_unlock(&owner->pi_lock);
18764
b3bbd485 18765@@ -1132,26 +1543,28 @@ void rt_mutex_adjust_pi(struct task_struct *task)
18766 raw_spin_lock_irqsave(&task->pi_lock, flags);
18767
18768 waiter = task->pi_blocked_on;
18769- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18770+ if (!rt_mutex_real_waiter(waiter) ||
18771+ rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
18772 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18773 return;
18774 }
18775 next_lock = waiter->lock;
18776- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18777
18778 /* gets dropped in rt_mutex_adjust_prio_chain()! */
18779 get_task_struct(task);
18780
18781+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
18782 rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
18783 next_lock, NULL, task);
18784 }
18785
18786-void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
18787+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
18788 {
18789 debug_rt_mutex_init_waiter(waiter);
18790 RB_CLEAR_NODE(&waiter->pi_tree_entry);
18791 RB_CLEAR_NODE(&waiter->tree_entry);
18792 waiter->task = NULL;
18793+ waiter->savestate = savestate;
18794 }
18795
18796 /**
b3bbd485 18797@@ -1167,7 +1580,8 @@ void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
18798 static int __sched
18799 __rt_mutex_slowlock(struct rt_mutex *lock, int state,
18800 struct hrtimer_sleeper *timeout,
18801- struct rt_mutex_waiter *waiter)
18802+ struct rt_mutex_waiter *waiter,
18803+ struct ww_acquire_ctx *ww_ctx)
18804 {
18805 int ret = 0;
18806
b3bbd485 18807@@ -1176,16 +1590,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
18808 if (try_to_take_rt_mutex(lock, current, waiter))
18809 break;
18810
18811- /*
18812- * TASK_INTERRUPTIBLE checks for signals and
18813- * timeout. Ignored otherwise.
18814- */
18815- if (likely(state == TASK_INTERRUPTIBLE)) {
18816- /* Signal pending? */
18817- if (signal_pending(current))
18818- ret = -EINTR;
18819- if (timeout && !timeout->task)
18820- ret = -ETIMEDOUT;
18821+ if (timeout && !timeout->task) {
18822+ ret = -ETIMEDOUT;
18823+ break;
18824+ }
18825+ if (signal_pending_state(state, current)) {
18826+ ret = -EINTR;
18827+ break;
18828+ }
1a6e0f06 18829+
18830+ if (ww_ctx && ww_ctx->acquired > 0) {
18831+ ret = __mutex_lock_check_stamp(lock, ww_ctx);
18832 if (ret)
18833 break;
18834 }
b3bbd485 18835@@ -1224,33 +1639,104 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
18836 }
18837 }
18838
18839-/*
18840- * Slow path lock function:
18841- */
18842-static int __sched
18843-rt_mutex_slowlock(struct rt_mutex *lock, int state,
18844- struct hrtimer_sleeper *timeout,
18845- enum rtmutex_chainwalk chwalk)
18846+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
18847+ struct ww_acquire_ctx *ww_ctx)
18848 {
18849- struct rt_mutex_waiter waiter;
18850- unsigned long flags;
18851- int ret = 0;
18852+#ifdef CONFIG_DEBUG_MUTEXES
18853+ /*
18854+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
18855+ * but released with a normal mutex_unlock in this call.
18856+ *
18857+ * This should never happen, always use ww_mutex_unlock.
18858+ */
18859+ DEBUG_LOCKS_WARN_ON(ww->ctx);
18860
18861- rt_mutex_init_waiter(&waiter);
18862+ /*
18863+ * Not quite done after calling ww_acquire_done() ?
18864+ */
18865+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
1a6e0f06 18866+
18867+ if (ww_ctx->contending_lock) {
18868+ /*
18869+ * After -EDEADLK you tried to
18870+ * acquire a different ww_mutex? Bad!
18871+ */
18872+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
1a6e0f06 18873+
e4b2b4a8
JK
18874+ /*
18875+ * You called ww_mutex_lock after receiving -EDEADLK,
18876+ * but 'forgot' to unlock everything else first?
18877+ */
18878+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
18879+ ww_ctx->contending_lock = NULL;
18880+ }
18881
18882 /*
18883- * Technically we could use raw_spin_[un]lock_irq() here, but this can
18884- * be called in early boot if the cmpxchg() fast path is disabled
18885- * (debug, no architecture support). In this case we will acquire the
18886- * rtmutex with lock->wait_lock held. But we cannot unconditionally
18887- * enable interrupts in that early boot case. So we need to use the
18888- * irqsave/restore variants.
18889+ * Naughty, using a different class will lead to undefined behavior!
18890 */
18891- raw_spin_lock_irqsave(&lock->wait_lock, flags);
18892+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
18893+#endif
18894+ ww_ctx->acquired++;
18895+}
1a6e0f06 18896+
e4b2b4a8
JK
18897+#ifdef CONFIG_PREEMPT_RT_FULL
18898+static void ww_mutex_account_lock(struct rt_mutex *lock,
18899+ struct ww_acquire_ctx *ww_ctx)
18900+{
18901+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
18902+ struct rt_mutex_waiter *waiter, *n;
1a6e0f06 18903+
18904+ /*
18905+ * This branch gets optimized out for the common case,
18906+ * and is only important for ww_mutex_lock.
18907+ */
18908+ ww_mutex_lock_acquired(ww, ww_ctx);
18909+ ww->ctx = ww_ctx;
1a6e0f06 18910+
18911+ /*
18912+ * Give any possible sleeping processes the chance to wake up,
18913+ * so they can recheck if they have to back off.
18914+ */
18915+ rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters.rb_root,
18916+ tree_entry) {
18917+ /* XXX debug rt mutex waiter wakeup */
1a6e0f06 18918+
e4b2b4a8
JK
18919+ BUG_ON(waiter->lock != lock);
18920+ rt_mutex_wake_waiter(waiter);
18921+ }
18922+}
18923+
e4b2b4a8 18924+#else
1a6e0f06 18925+
18926+static void ww_mutex_account_lock(struct rt_mutex *lock,
18927+ struct ww_acquire_ctx *ww_ctx)
1a6e0f06 18928+{
e4b2b4a8 18929+ BUG();
1a6e0f06 18930+}
e4b2b4a8 18931+#endif
1a6e0f06 18932+
18933+int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
18934+ struct hrtimer_sleeper *timeout,
18935+ enum rtmutex_chainwalk chwalk,
18936+ struct ww_acquire_ctx *ww_ctx,
18937+ struct rt_mutex_waiter *waiter)
1a6e0f06 18938+{
e4b2b4a8 18939+ int ret;
1a6e0f06 18940+
18941+#ifdef CONFIG_PREEMPT_RT_FULL
18942+ if (ww_ctx) {
18943+ struct ww_mutex *ww;
1a6e0f06 18944+
18945+ ww = container_of(lock, struct ww_mutex, base.lock);
18946+ if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
18947+ return -EALREADY;
18948+ }
18949+#endif
18950
18951 /* Try to acquire the lock again: */
18952 if (try_to_take_rt_mutex(lock, current, NULL)) {
18953- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
18954+ if (ww_ctx)
18955+ ww_mutex_account_lock(lock, ww_ctx);
18956 return 0;
18957 }
18958
b3bbd485 18959@@ -1260,17 +1746,27 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
18960 if (unlikely(timeout))
18961 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
18962
18963- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
18964+ ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
18965
18966- if (likely(!ret))
18967+ if (likely(!ret)) {
18968 /* sleep on the mutex */
18969- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
18970+ ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
18971+ ww_ctx);
18972+ } else if (ww_ctx) {
18973+ /* ww_mutex received EDEADLK, let it become EALREADY */
18974+ ret = __mutex_lock_check_stamp(lock, ww_ctx);
18975+ BUG_ON(!ret);
18976+ }
18977
18978 if (unlikely(ret)) {
18979 __set_current_state(TASK_RUNNING);
18980 if (rt_mutex_has_waiters(lock))
18981- remove_waiter(lock, &waiter);
18982- rt_mutex_handle_deadlock(ret, chwalk, &waiter);
18983+ remove_waiter(lock, waiter);
18984+ /* ww_mutex want to report EDEADLK/EALREADY, let them */
18985+ if (!ww_ctx)
18986+ rt_mutex_handle_deadlock(ret, chwalk, waiter);
18987+ } else if (ww_ctx) {
18988+ ww_mutex_account_lock(lock, ww_ctx);
18989 }
18990
18991 /*
b3bbd485 18992@@ -1278,6 +1774,36 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
e4b2b4a8
JK
18993 * unconditionally. We might have to fix that up.
18994 */
18995 fixup_rt_mutex_waiters(lock);
18996+ return ret;
1a6e0f06
JK
18997+}
18998+
e4b2b4a8
JK
18999+/*
19000+ * Slow path lock function:
19001+ */
19002+static int __sched
19003+rt_mutex_slowlock(struct rt_mutex *lock, int state,
19004+ struct hrtimer_sleeper *timeout,
19005+ enum rtmutex_chainwalk chwalk,
19006+ struct ww_acquire_ctx *ww_ctx)
1a6e0f06 19007+{
19008+ struct rt_mutex_waiter waiter;
19009+ unsigned long flags;
19010+ int ret = 0;
1a6e0f06 19011+
e4b2b4a8 19012+ rt_mutex_init_waiter(&waiter, false);
1a6e0f06 19013+
19014+ /*
19015+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
19016+ * be called in early boot if the cmpxchg() fast path is disabled
19017+ * (debug, no architecture support). In this case we will acquire the
19018+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
19019+ * enable interrupts in that early boot case. So we need to use the
19020+ * irqsave/restore variants.
19021+ */
19022+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
1a6e0f06 19023+
19024+ ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
19025+ &waiter);
19026
19027 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19028
b3bbd485 19029@@ -1338,7 +1864,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
19030 * Return whether the current task needs to call rt_mutex_postunlock().
19031 */
19032 static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19033- struct wake_q_head *wake_q)
19034+ struct wake_q_head *wake_q,
19035+ struct wake_q_head *wake_sleeper_q)
19036 {
19037 unsigned long flags;
19038
b3bbd485 19039@@ -1392,7 +1919,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19040 *
19041 * Queue the next waiter for wakeup once we release the wait_lock.
19042 */
19043- mark_wakeup_next_waiter(wake_q, lock);
19044+ mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
19045 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19046
19047 return true; /* call rt_mutex_postunlock() */
b3bbd485 19048@@ -1406,29 +1933,45 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
19049 */
19050 static inline int
19051 rt_mutex_fastlock(struct rt_mutex *lock, int state,
19052+ struct ww_acquire_ctx *ww_ctx,
19053 int (*slowfn)(struct rt_mutex *lock, int state,
19054 struct hrtimer_sleeper *timeout,
19055- enum rtmutex_chainwalk chwalk))
19056+ enum rtmutex_chainwalk chwalk,
19057+ struct ww_acquire_ctx *ww_ctx))
19058 {
19059 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
19060 return 0;
19061
19062- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
19063+ /*
19064+ * If rt_mutex blocks, the function sched_submit_work will not call
19065+ * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true).
19066+ * We must call blk_schedule_flush_plug here, if we don't call it,
19067+ * a deadlock in device mapper may happen.
19068+ */
19069+ if (unlikely(blk_needs_flush_plug(current)))
19070+ blk_schedule_flush_plug(current);
1a6e0f06 19071+
19072+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
19073 }
19074
19075 static inline int
19076 rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
19077 struct hrtimer_sleeper *timeout,
19078 enum rtmutex_chainwalk chwalk,
19079+ struct ww_acquire_ctx *ww_ctx,
19080 int (*slowfn)(struct rt_mutex *lock, int state,
19081 struct hrtimer_sleeper *timeout,
19082- enum rtmutex_chainwalk chwalk))
19083+ enum rtmutex_chainwalk chwalk,
19084+ struct ww_acquire_ctx *ww_ctx))
19085 {
19086 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
19087 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
19088 return 0;
19089
19090- return slowfn(lock, state, timeout, chwalk);
19091+ if (unlikely(blk_needs_flush_plug(current)))
19092+ blk_schedule_flush_plug(current);
1a6e0f06 19093+
19094+ return slowfn(lock, state, timeout, chwalk, ww_ctx);
19095 }
19096
19097 static inline int
b3bbd485 19098@@ -1444,9 +1987,11 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
19099 /*
19100 * Performs the wakeup of the the top-waiter and re-enables preemption.
19101 */
19102-void rt_mutex_postunlock(struct wake_q_head *wake_q)
19103+void rt_mutex_postunlock(struct wake_q_head *wake_q,
19104+ struct wake_q_head *wake_sleeper_q)
19105 {
19106 wake_up_q(wake_q);
19107+ wake_up_q_sleeper(wake_sleeper_q);
19108
19109 /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
19110 preempt_enable();
b3bbd485 19111@@ -1455,23 +2000,40 @@ void rt_mutex_postunlock(struct wake_q_head *wake_q)
19112 static inline void
19113 rt_mutex_fastunlock(struct rt_mutex *lock,
19114 bool (*slowfn)(struct rt_mutex *lock,
19115- struct wake_q_head *wqh))
19116+ struct wake_q_head *wqh,
19117+ struct wake_q_head *wq_sleeper))
19118 {
19119 DEFINE_WAKE_Q(wake_q);
19120+ DEFINE_WAKE_Q(wake_sleeper_q);
19121
19122 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
19123 return;
19124
19125- if (slowfn(lock, &wake_q))
19126- rt_mutex_postunlock(&wake_q);
19127+ if (slowfn(lock, &wake_q, &wake_sleeper_q))
19128+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
19129 }
19130
19131-static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass)
e4b2b4a8 19132+int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state)
19133 {
19134 might_sleep();
e4b2b4a8 19135+ return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
19136+}
19137+
19138+/**
19139+ * rt_mutex_lock_state - lock a rt_mutex with a given state
19140+ *
19141+ * @lock: The rt_mutex to be locked
19142+ * @state: The state to set when blocking on the rt_mutex
19143+ */
19144+static int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state, unsigned int subclass)
19145+{
19146+ int ret;
19147
19148 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
19149- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
19150+ ret = __rt_mutex_lock_state(lock, state);
19151+ if (ret)
19152+ mutex_release(&lock->dep_map, 1, _RET_IP_);
19153+ return ret;
19154 }
19155
19156 #ifdef CONFIG_DEBUG_LOCK_ALLOC
19157@@ -1483,7 +2045,7 @@ static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass)
19158 */
19159 void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
19160 {
19161- __rt_mutex_lock(lock, subclass);
19162+ rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE, subclass);
19163 }
19164 EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
19165 #endif
19166@@ -1496,7 +2058,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
19167 */
19168 void __sched rt_mutex_lock(struct rt_mutex *lock)
19169 {
19170- __rt_mutex_lock(lock, 0);
19171+ rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE, 0);
19172 }
19173 EXPORT_SYMBOL_GPL(rt_mutex_lock);
19174 #endif
19175@@ -1512,16 +2074,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
19176 */
19177 int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
19178 {
19179- int ret;
19180-
19181- might_sleep();
19182-
19183- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19184- ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
19185- if (ret)
19186- mutex_release(&lock->dep_map, 1, _RET_IP_);
19187-
19188- return ret;
b3bbd485 19189+ return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE, 0);
19190 }
19191 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
19192
19193@@ -1538,6 +2091,22 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
19194 return __rt_mutex_slowtrylock(lock);
19195 }
19196
b3bbd485 19197+/**
19198+ * rt_mutex_lock_killable - lock a rt_mutex killable
19199+ *
19200+ * @lock: the rt_mutex to be locked
19201+ * @detect_deadlock: deadlock detection on/off
19202+ *
19203+ * Returns:
19204+ * 0 on success
19205+ * -EINTR when interrupted by a signal
19206+ */
19207+int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
19208+{
19209+ return rt_mutex_lock_state(lock, TASK_KILLABLE, 0);
19210+}
19211+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
19212+
19213 /**
19214 * rt_mutex_timed_lock - lock a rt_mutex interruptible
19215 * the timeout structure is provided
b3bbd485 19216@@ -1561,6 +2130,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
19217 mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19218 ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
19219 RT_MUTEX_MIN_CHAINWALK,
19220+ NULL,
19221 rt_mutex_slowlock);
19222 if (ret)
19223 mutex_release(&lock->dep_map, 1, _RET_IP_);
b3bbd485 19224@@ -1569,6 +2139,18 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
19225 }
19226 EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
19227
19228+int __sched __rt_mutex_trylock(struct rt_mutex *lock)
1a6e0f06 19229+{
19230+#ifdef CONFIG_PREEMPT_RT_FULL
19231+ if (WARN_ON_ONCE(in_irq() || in_nmi()))
1a6e0f06 19232+#else
19233+ if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
19234+#endif
19235+ return 0;
19236+
19237+ return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
19238+}
19239+
19240 /**
19241 * rt_mutex_trylock - try to lock a rt_mutex
19242 *
b3bbd485 19243@@ -1584,10 +2166,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock)
19244 {
19245 int ret;
1a6e0f06 19246
19247- if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
19248- return 0;
19249-
19250- ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
19251+ ret = __rt_mutex_trylock(lock);
19252 if (ret)
19253 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
1a6e0f06 19254
b3bbd485 19255@@ -1595,6 +2174,11 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock)
19256 }
19257 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
1a6e0f06 19258
19259+void __sched __rt_mutex_unlock(struct rt_mutex *lock)
19260+{
19261+ rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
19262+}
1a6e0f06 19263+
19264 /**
19265 * rt_mutex_unlock - unlock a rt_mutex
19266 *
b3bbd485 19267@@ -1603,16 +2187,13 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock);
19268 void __sched rt_mutex_unlock(struct rt_mutex *lock)
19269 {
19270 mutex_release(&lock->dep_map, 1, _RET_IP_);
19271- rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
19272+ __rt_mutex_unlock(lock);
19273 }
19274 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
19275
19276-/**
19277- * Futex variant, that since futex variants do not use the fast-path, can be
19278- * simple and will not need to retry.
19279- */
19280-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
19281- struct wake_q_head *wake_q)
19282+static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
19283+ struct wake_q_head *wake_q,
19284+ struct wake_q_head *wq_sleeper)
19285 {
19286 lockdep_assert_held(&lock->wait_lock);
19287
b3bbd485 19288@@ -1629,22 +2210,35 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
19289 * avoid inversion prior to the wakeup. preempt_disable()
19290 * therein pairs with rt_mutex_postunlock().
19291 */
19292- mark_wakeup_next_waiter(wake_q, lock);
19293+ mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
1a6e0f06 19294
19295 return true; /* call postunlock() */
19296 }
1a6e0f06 19297
19298+/**
19299+ * Futex variant, that since futex variants do not use the fast-path, can be
19300+ * simple and will not need to retry.
19301+ */
19302+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
19303+ struct wake_q_head *wake_q,
19304+ struct wake_q_head *wq_sleeper)
1a6e0f06 19305+{
e4b2b4a8 19306+ return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper);
19307+}
19308+
19309 void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
19310 {
19311 DEFINE_WAKE_Q(wake_q);
19312+ DEFINE_WAKE_Q(wake_sleeper_q);
19313+ unsigned long flags;
19314 bool postunlock;
1a6e0f06 19315
19316- raw_spin_lock_irq(&lock->wait_lock);
19317- postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
19318- raw_spin_unlock_irq(&lock->wait_lock);
19319+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
19320+ postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
19321+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1a6e0f06 19322
19323 if (postunlock)
19324- rt_mutex_postunlock(&wake_q);
19325+ rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
19326 }
1a6e0f06 19327
e4b2b4a8 19328 /**
b3bbd485 19329@@ -1683,7 +2277,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name,
19330 if (name && key)
19331 debug_rt_mutex_init(lock, name, key);
19332 }
19333-EXPORT_SYMBOL_GPL(__rt_mutex_init);
19334+EXPORT_SYMBOL(__rt_mutex_init);
1a6e0f06 19335
19336 /**
19337 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
b3bbd485 19338@@ -1703,6 +2297,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
19339 struct task_struct *proxy_owner)
19340 {
19341 __rt_mutex_init(lock, NULL, NULL);
19342+#ifdef CONFIG_DEBUG_SPINLOCK
19343+ /*
19344+ * get another key class for the wait_lock. LOCK_PI and UNLOCK_PI is
19345+ * holding the ->wait_lock of the proxy_lock while unlocking a sleeping
19346+ * lock.
19347+ */
19348+ raw_spin_lock_init(&lock->wait_lock);
1a6e0f06 19349+#endif
19350 debug_rt_mutex_proxy_lock(lock, proxy_owner);
19351 rt_mutex_set_owner(lock, proxy_owner);
19352 }
b3bbd485 19353@@ -1735,6 +2337,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
19354 if (try_to_take_rt_mutex(lock, task, NULL))
19355 return 1;
1a6e0f06 19356
1a6e0f06 19357+#ifdef CONFIG_PREEMPT_RT_FULL
19358+ /*
19359+ * In PREEMPT_RT there's an added race.
19360+ * If the task, that we are about to requeue, times out,
19361+ * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
19362+ * to skip this task. But right after the task sets
19363+ * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
19364+ * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
19365+ * This will replace the PI_WAKEUP_INPROGRESS with the actual
19366+ * lock that it blocks on. We *must not* place this task
19367+ * on this proxy lock in that case.
19368+ *
19369+ * To prevent this race, we first take the task's pi_lock
19370+ * and check if it has updated its pi_blocked_on. If it has,
19371+ * we assume that it woke up and we return -EAGAIN.
19372+ * Otherwise, we set the task's pi_blocked_on to
19373+ * PI_REQUEUE_INPROGRESS, so that if the task is waking up
19374+ * it will know that we are in the process of requeuing it.
19375+ */
19376+ raw_spin_lock(&task->pi_lock);
19377+ if (task->pi_blocked_on) {
19378+ raw_spin_unlock(&task->pi_lock);
19379+ return -EAGAIN;
19380+ }
19381+ task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
19382+ raw_spin_unlock(&task->pi_lock);
1a6e0f06 19383+#endif
1a6e0f06 19384+
19385 /* We enforce deadlock detection for futexes */
19386 ret = task_blocks_on_rt_mutex(lock, waiter, task,
19387 RT_MUTEX_FULL_CHAINWALK);
b3bbd485 19388@@ -1749,7 +2379,7 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
19389 ret = 0;
19390 }
1a6e0f06 19391
19392- if (unlikely(ret))
19393+ if (ret && rt_mutex_has_waiters(lock))
19394 remove_waiter(lock, waiter);
1a6e0f06 19395
e4b2b4a8 19396 debug_rt_mutex_print_deadlock(waiter);
b3bbd485 19397@@ -1824,17 +2454,36 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
19398 struct hrtimer_sleeper *to,
19399 struct rt_mutex_waiter *waiter)
19400 {
19401+ struct task_struct *tsk = current;
19402 int ret;
19403
19404 raw_spin_lock_irq(&lock->wait_lock);
19405 /* sleep on the mutex */
19406 set_current_state(TASK_INTERRUPTIBLE);
19407- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
19408+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
19409 /*
19410 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
19411 * have to fix that up.
19412 */
19413 fixup_rt_mutex_waiters(lock);
19414+ /*
19415+ * RT has a problem here when the wait got interrupted by a timeout
19416+ * or a signal. task->pi_blocked_on is still set. The task must
19417+ * acquire the hash bucket lock when returning from this function.
19418+ *
19419+ * If the hash bucket lock is contended then the
19420+ * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in
19421+ * task_blocks_on_rt_mutex() will trigger. This can be avoided by
19422+ * clearing task->pi_blocked_on which removes the task from the
19423+ * boosting chain of the rtmutex. That's correct because the task
19424+ * is not longer blocked on it.
19425+ */
19426+ if (ret) {
19427+ raw_spin_lock(&tsk->pi_lock);
19428+ tsk->pi_blocked_on = NULL;
19429+ raw_spin_unlock(&tsk->pi_lock);
19430+ }
19431+
19432 raw_spin_unlock_irq(&lock->wait_lock);
19433
19434 return ret;
19435@@ -1895,3 +2544,99 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
19436
19437 return cleanup;
19438 }
19439+
19440+static inline int
19441+ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
1a6e0f06 19442+{
19443+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
19444+ unsigned tmp;
19445+
19446+ if (ctx->deadlock_inject_countdown-- == 0) {
19447+ tmp = ctx->deadlock_inject_interval;
19448+ if (tmp > UINT_MAX/4)
19449+ tmp = UINT_MAX;
19450+ else
19451+ tmp = tmp*2 + tmp + tmp/2;
19452+
19453+ ctx->deadlock_inject_interval = tmp;
19454+ ctx->deadlock_inject_countdown = tmp;
19455+ ctx->contending_lock = lock;
19456+
19457+ ww_mutex_unlock(lock);
19458+
19459+ return -EDEADLK;
19460+ }
19461+#endif
19462+
e4b2b4a8 19463+ return 0;
19464+}
19465+
19466+#ifdef CONFIG_PREEMPT_RT_FULL
19467+int __sched
19468+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
19469+{
19470+ int ret;
19471+
19472+ might_sleep();
19473+
19474+ mutex_acquire_nest(&lock->base.dep_map, 0, 0,
19475+ ctx ? &ctx->dep_map : NULL, _RET_IP_);
19476+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0,
19477+ ctx);
19478+ if (ret)
19479+ mutex_release(&lock->base.dep_map, 1, _RET_IP_);
19480+ else if (!ret && ctx && ctx->acquired > 1)
19481+ return ww_mutex_deadlock_injection(lock, ctx);
19482+
19483+ return ret;
19484+}
19485+EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
19486+
19487+int __sched
19488+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
19489+{
19490+ int ret;
19491+
19492+ might_sleep();
19493+
19494+ mutex_acquire_nest(&lock->base.dep_map, 0, 0,
19495+ ctx ? &ctx->dep_map : NULL, _RET_IP_);
19496+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0,
19497+ ctx);
19498+ if (ret)
19499+ mutex_release(&lock->base.dep_map, 1, _RET_IP_);
19500+ else if (!ret && ctx && ctx->acquired > 1)
19501+ return ww_mutex_deadlock_injection(lock, ctx);
19502+
19503+ return ret;
19504+}
19505+EXPORT_SYMBOL_GPL(ww_mutex_lock);
19506+
19507+void __sched ww_mutex_unlock(struct ww_mutex *lock)
19508+{
19509+ int nest = !!lock->ctx;
19510+
19511+ /*
19512+ * The unlocking fastpath is the 0->1 transition from 'locked'
19513+ * into 'unlocked' state:
19514+ */
19515+ if (nest) {
19516+#ifdef CONFIG_DEBUG_MUTEXES
19517+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
1a6e0f06 19518+#endif
19519+ if (lock->ctx->acquired > 0)
19520+ lock->ctx->acquired--;
19521+ lock->ctx = NULL;
19522+ }
19523+
19524+ mutex_release(&lock->base.dep_map, nest, _RET_IP_);
19525+ __rt_mutex_unlock(&lock->base.lock);
19526+}
19527+EXPORT_SYMBOL(ww_mutex_unlock);
19528+
19529+int __rt_mutex_owner_current(struct rt_mutex *lock)
19530+{
19531+ return rt_mutex_owner(lock) == current;
19532+}
19533+EXPORT_SYMBOL(__rt_mutex_owner_current);
19534+#endif
19535diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
19536index 68686b3ec3c1..2a157c78e18c 100644
19537--- a/kernel/locking/rtmutex_common.h
19538+++ b/kernel/locking/rtmutex_common.h
e4b2b4a8 19539@@ -15,6 +15,7 @@
1a6e0f06 19540
19541 #include <linux/rtmutex.h>
19542 #include <linux/sched/wake_q.h>
19543+#include <linux/sched/debug.h>
1a6e0f06 19544
19545 /*
19546 * This is the control structure for tasks blocked on a rt_mutex,
b3bbd485 19547@@ -29,6 +30,7 @@ struct rt_mutex_waiter {
19548 struct rb_node pi_tree_entry;
19549 struct task_struct *task;
19550 struct rt_mutex *lock;
19551+ bool savestate;
19552 #ifdef CONFIG_DEBUG_RT_MUTEXES
19553 unsigned long ip;
19554 struct pid *deadlock_task_pid;
b3bbd485 19555@@ -129,12 +131,15 @@ enum rtmutex_chainwalk {
19556 /*
19557 * PI-futex support (proxy locking functions, etc.):
19558 */
19559+#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)
19560+#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2)
19561+
19562 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
19563 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
19564 struct task_struct *proxy_owner);
19565 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
19566 struct task_struct *proxy_owner);
19567-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
19568+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savetate);
19569 extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
19570 struct rt_mutex_waiter *waiter,
19571 struct task_struct *task);
b3bbd485 19572@@ -152,9 +157,27 @@ extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
19573
19574 extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
19575 extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
19576- struct wake_q_head *wqh);
19577-
19578-extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
19579+ struct wake_q_head *wqh,
19580+ struct wake_q_head *wq_sleeper);
b3bbd485 19581+
19582+extern void rt_mutex_postunlock(struct wake_q_head *wake_q,
19583+ struct wake_q_head *wake_sleeper_q);
19584+
19585+/* RW semaphore special interface */
19586+struct ww_acquire_ctx;
19587+
19588+extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state);
19589+extern int __rt_mutex_trylock(struct rt_mutex *lock);
19590+extern void __rt_mutex_unlock(struct rt_mutex *lock);
19591+int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
19592+ struct hrtimer_sleeper *timeout,
19593+ enum rtmutex_chainwalk chwalk,
19594+ struct ww_acquire_ctx *ww_ctx,
19595+ struct rt_mutex_waiter *waiter);
19596+void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock,
19597+ struct rt_mutex_waiter *waiter,
19598+ unsigned long flags);
19599+void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock);
1a6e0f06 19600
19601 #ifdef CONFIG_DEBUG_RT_MUTEXES
19602 # include "rtmutex-debug.h"
19603diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c
19604new file mode 100644
19605index 000000000000..f2e155b2c4a8
19606--- /dev/null
19607+++ b/kernel/locking/rwlock-rt.c
19608@@ -0,0 +1,378 @@
19609+/*
19610+ */
19611+#include <linux/sched/debug.h>
19612+#include <linux/export.h>
19613+
19614+#include "rtmutex_common.h"
19615+#include <linux/rwlock_types_rt.h>
19616+
19617+/*
19618+ * RT-specific reader/writer locks
19619+ *
19620+ * write_lock()
19621+ * 1) Lock lock->rtmutex
19622+ * 2) Remove the reader BIAS to force readers into the slow path
19623+ * 3) Wait until all readers have left the critical region
19624+ * 4) Mark it write locked
19625+ *
19626+ * write_unlock()
19627+ * 1) Remove the write locked marker
19628+ * 2) Set the reader BIAS so readers can use the fast path again
19629+ * 3) Unlock lock->rtmutex to release blocked readers
19630+ *
19631+ * read_lock()
19632+ * 1) Try fast path acquisition (reader BIAS is set)
19633+ * 2) Take lock->rtmutex.wait_lock which protects the writelocked flag
19634+ * 3) If !writelocked, acquire it for read
19635+ * 4) If writelocked, block on lock->rtmutex
19636+ * 5) unlock lock->rtmutex, goto 1)
19637+ *
19638+ * read_unlock()
19639+ * 1) Try fast path release (reader count != 1)
19640+ * 2) Wake the writer waiting in write_lock()#3
19641+ *
19642+ * read_lock()#3 has the consequence, that rw locks on RT are not writer
19643+ * fair, but writers, which should be avoided in RT tasks (think tasklist
19644+ * lock), are subject to the rtmutex priority/DL inheritance mechanism.
19645+ *
19646+ * It's possible to make the rw locks writer fair by keeping a list of
19647+ * active readers. A blocked writer would force all newly incoming readers
19648+ * to block on the rtmutex, but the rtmutex would have to be proxy locked
19649+ * for one reader after the other. We can't use multi-reader inheritance
19650+ * because there is no way to support that with
19651+ * SCHED_DEADLINE. Implementing the one by one reader boosting/handover
19652+ * mechanism is a major surgery for a very dubious value.
19653+ *
19654+ * The risk of writer starvation is there, but the pathological use cases
19655+ * which trigger it are not necessarily the typical RT workloads.
19656+ */
19657+
19658+void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name,
19659+ struct lock_class_key *key)
1a6e0f06 19660+{
19661+#ifdef CONFIG_DEBUG_LOCK_ALLOC
19662+ /*
19663+ * Make sure we are not reinitializing a held semaphore:
19664+ */
19665+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
19666+ lockdep_init_map(&lock->dep_map, name, key, 0);
1a6e0f06 19667+#endif
19668+ atomic_set(&lock->readers, READER_BIAS);
19669+ rt_mutex_init(&lock->rtmutex);
19670+ lock->rtmutex.save_state = 1;
19671+}
19672+
19673+int __read_rt_trylock(struct rt_rw_lock *lock)
19674+{
19675+ int r, old;
19676+
19677+ /*
19678+ * Increment reader count, if lock->readers < 0, i.e. READER_BIAS is
19679+ * set.
19680+ */
19681+ for (r = atomic_read(&lock->readers); r < 0;) {
19682+ old = atomic_cmpxchg(&lock->readers, r, r + 1);
19683+ if (likely(old == r))
19684+ return 1;
19685+ r = old;
19686+ }
19687+ return 0;
19688+}
19689+
e4b2b4a8 19690+void __sched __read_rt_lock(struct rt_rw_lock *lock)
1a6e0f06 19691+{
19692+ struct rt_mutex *m = &lock->rtmutex;
19693+ struct rt_mutex_waiter waiter;
19694+ unsigned long flags;
1a6e0f06 19695+
19696+ if (__read_rt_trylock(lock))
19697+ return;
19698+
19699+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19700+ /*
19701+ * Allow readers as long as the writer has not completely
19702+ * acquired the semaphore for write.
19703+ */
19704+ if (atomic_read(&lock->readers) != WRITER_BIAS) {
19705+ atomic_inc(&lock->readers);
19706+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19707+ return;
19708+ }
19709+
19710+ /*
19711+ * Call into the slow lock path with the rtmutex->wait_lock
19712+ * held, so this can't result in the following race:
19713+ *
19714+ * Reader1 Reader2 Writer
19715+ * read_lock()
19716+ * write_lock()
19717+ * rtmutex_lock(m)
19718+ * swait()
19719+ * read_lock()
19720+ * unlock(m->wait_lock)
19721+ * read_unlock()
19722+ * swake()
19723+ * lock(m->wait_lock)
19724+ * lock->writelocked=true
19725+ * unlock(m->wait_lock)
19726+ *
19727+ * write_unlock()
19728+ * lock->writelocked=false
19729+ * rtmutex_unlock(m)
19730+ * read_lock()
19731+ * write_lock()
19732+ * rtmutex_lock(m)
19733+ * swait()
19734+ * rtmutex_lock(m)
19735+ *
19736+ * That would put Reader1 behind the writer waiting on
19737+ * Reader2 to call read_unlock() which might be unbound.
19738+ */
19739+ rt_mutex_init_waiter(&waiter, false);
19740+ rt_spin_lock_slowlock_locked(m, &waiter, flags);
19741+ /*
19742+ * The slowlock() above is guaranteed to return with the rtmutex is
19743+ * now held, so there can't be a writer active. Increment the reader
19744+ * count and immediately drop the rtmutex again.
19745+ */
19746+ atomic_inc(&lock->readers);
19747+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19748+ rt_spin_lock_slowunlock(m);
19749+
19750+ debug_rt_mutex_free_waiter(&waiter);
19751+}
19752+
e4b2b4a8 19753+void __read_rt_unlock(struct rt_rw_lock *lock)
1a6e0f06 19754+{
19755+ struct rt_mutex *m = &lock->rtmutex;
19756+ struct task_struct *tsk;
19757+
19758+ /*
19759+ * sem->readers can only hit 0 when a writer is waiting for the
19760+ * active readers to leave the critical region.
19761+ */
19762+ if (!atomic_dec_and_test(&lock->readers))
19763+ return;
19764+
19765+ raw_spin_lock_irq(&m->wait_lock);
19766+ /*
19767+ * Wake the writer, i.e. the rtmutex owner. It might release the
19768+ * rtmutex concurrently in the fast path, but to clean up the rw
19769+ * lock it needs to acquire m->wait_lock. The worst case which can
19770+ * happen is a spurious wakeup.
19771+ */
19772+ tsk = rt_mutex_owner(m);
19773+ if (tsk)
19774+ wake_up_process(tsk);
19775+
19776+ raw_spin_unlock_irq(&m->wait_lock);
19777+}
19778+
19779+static void __write_unlock_common(struct rt_rw_lock *lock, int bias,
19780+ unsigned long flags)
1a6e0f06 19781+{
e4b2b4a8 19782+ struct rt_mutex *m = &lock->rtmutex;
1a6e0f06 19783+
19784+ atomic_add(READER_BIAS - bias, &lock->readers);
19785+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19786+ rt_spin_lock_slowunlock(m);
19787+}
19788+
e4b2b4a8 19789+void __sched __write_rt_lock(struct rt_rw_lock *lock)
1a6e0f06 19790+{
19791+ struct rt_mutex *m = &lock->rtmutex;
19792+ struct task_struct *self = current;
19793+ unsigned long flags;
19794+
19795+ /* Take the rtmutex as a first step */
19796+ __rt_spin_lock(m);
19797+
19798+ /* Force readers into slow path */
19799+ atomic_sub(READER_BIAS, &lock->readers);
19800+
19801+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19802+
19803+ raw_spin_lock(&self->pi_lock);
19804+ self->saved_state = self->state;
19805+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19806+ raw_spin_unlock(&self->pi_lock);
19807+
19808+ for (;;) {
19809+ /* Have all readers left the critical region? */
19810+ if (!atomic_read(&lock->readers)) {
19811+ atomic_set(&lock->readers, WRITER_BIAS);
19812+ raw_spin_lock(&self->pi_lock);
19813+ __set_current_state_no_track(self->saved_state);
19814+ self->saved_state = TASK_RUNNING;
19815+ raw_spin_unlock(&self->pi_lock);
19816+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19817+ return;
19818+ }
19819+
19820+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19821+
19822+ if (atomic_read(&lock->readers) != 0)
19823+ schedule();
19824+
19825+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19826+
19827+ raw_spin_lock(&self->pi_lock);
19828+ __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19829+ raw_spin_unlock(&self->pi_lock);
19830+ }
19831+}
19832+
19833+int __write_rt_trylock(struct rt_rw_lock *lock)
1a6e0f06 19834+{
19835+ struct rt_mutex *m = &lock->rtmutex;
19836+ unsigned long flags;
19837+
19838+ if (!__rt_mutex_trylock(m))
19839+ return 0;
19840+
19841+ atomic_sub(READER_BIAS, &lock->readers);
19842+
19843+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19844+ if (!atomic_read(&lock->readers)) {
19845+ atomic_set(&lock->readers, WRITER_BIAS);
19846+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
19847+ return 1;
19848+ }
19849+ __write_unlock_common(lock, 0, flags);
19850+ return 0;
19851+}
19852+
19853+void __write_rt_unlock(struct rt_rw_lock *lock)
19854+{
19855+ struct rt_mutex *m = &lock->rtmutex;
19856+ unsigned long flags;
1a6e0f06 19857+
19858+ raw_spin_lock_irqsave(&m->wait_lock, flags);
19859+ __write_unlock_common(lock, WRITER_BIAS, flags);
19860+}
1a6e0f06 19861+
19862+/* Map the reader biased implementation */
19863+static inline int do_read_rt_trylock(rwlock_t *rwlock)
1a6e0f06 19864+{
e4b2b4a8 19865+ return __read_rt_trylock(rwlock);
19866+}
19867+
19868+static inline int do_write_rt_trylock(rwlock_t *rwlock)
19869+{
19870+ return __write_rt_trylock(rwlock);
19871+}
1a6e0f06 19872+
19873+static inline void do_read_rt_lock(rwlock_t *rwlock)
19874+{
19875+ __read_rt_lock(rwlock);
19876+}
1a6e0f06 19877+
19878+static inline void do_write_rt_lock(rwlock_t *rwlock)
19879+{
19880+ __write_rt_lock(rwlock);
19881+}
1a6e0f06 19882+
19883+static inline void do_read_rt_unlock(rwlock_t *rwlock)
19884+{
19885+ __read_rt_unlock(rwlock);
19886+}
1a6e0f06 19887+
19888+static inline void do_write_rt_unlock(rwlock_t *rwlock)
19889+{
19890+ __write_rt_unlock(rwlock);
19891+}
1a6e0f06 19892+
19893+static inline void do_rwlock_rt_init(rwlock_t *rwlock, const char *name,
19894+ struct lock_class_key *key)
19895+{
19896+ __rwlock_biased_rt_init(rwlock, name, key);
19897+}
1a6e0f06 19898+
19899+int __lockfunc rt_read_can_lock(rwlock_t *rwlock)
19900+{
19901+ return atomic_read(&rwlock->readers) < 0;
19902+}
1a6e0f06 19903+
19904+int __lockfunc rt_write_can_lock(rwlock_t *rwlock)
19905+{
19906+ return atomic_read(&rwlock->readers) == READER_BIAS;
19907+}
19908+
19909+/*
e4b2b4a8 19910+ * The common functions which get wrapped into the rwlock API.
1a6e0f06 19911+ */
19912+int __lockfunc rt_read_trylock(rwlock_t *rwlock)
19913+{
19914+ int ret;
1a6e0f06 19915+
19916+ sleeping_lock_inc();
19917+ migrate_disable();
19918+ ret = do_read_rt_trylock(rwlock);
19919+ if (ret) {
19920+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
19921+ } else {
19922+ migrate_enable();
19923+ sleeping_lock_dec();
19924+ }
19925+ return ret;
19926+}
19927+EXPORT_SYMBOL(rt_read_trylock);
1a6e0f06 19928+
19929+int __lockfunc rt_write_trylock(rwlock_t *rwlock)
19930+{
19931+ int ret;
1a6e0f06 19932+
19933+ sleeping_lock_inc();
19934+ migrate_disable();
19935+ ret = do_write_rt_trylock(rwlock);
19936+ if (ret) {
19937+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
19938+ } else {
19939+ migrate_enable();
19940+ sleeping_lock_dec();
19941+ }
19942+ return ret;
19943+}
19944+EXPORT_SYMBOL(rt_write_trylock);
1a6e0f06 19945+
19946+void __lockfunc rt_read_lock(rwlock_t *rwlock)
19947+{
19948+ sleeping_lock_inc();
19949+ migrate_disable();
19950+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
19951+ do_read_rt_lock(rwlock);
19952+}
19953+EXPORT_SYMBOL(rt_read_lock);
1a6e0f06 19954+
19955+void __lockfunc rt_write_lock(rwlock_t *rwlock)
19956+{
19957+ sleeping_lock_inc();
19958+ migrate_disable();
19959+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
19960+ do_write_rt_lock(rwlock);
19961+}
19962+EXPORT_SYMBOL(rt_write_lock);
1a6e0f06 19963+
19964+void __lockfunc rt_read_unlock(rwlock_t *rwlock)
19965+{
19966+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19967+ do_read_rt_unlock(rwlock);
19968+ migrate_enable();
19969+ sleeping_lock_dec();
19970+}
19971+EXPORT_SYMBOL(rt_read_unlock);
1a6e0f06 19972+
19973+void __lockfunc rt_write_unlock(rwlock_t *rwlock)
19974+{
19975+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19976+ do_write_rt_unlock(rwlock);
19977+ migrate_enable();
19978+ sleeping_lock_dec();
19979+}
19980+EXPORT_SYMBOL(rt_write_unlock);
1a6e0f06 19981+
19982+void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
19983+{
19984+ do_rwlock_rt_init(rwlock, name, key);
19985+}
19986+EXPORT_SYMBOL(__rt_rwlock_init);
19987diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c
19988new file mode 100644
19989index 000000000000..26991ddb6c5a
19990--- /dev/null
19991+++ b/kernel/locking/rwsem-rt.c
19992@@ -0,0 +1,269 @@
19993+/*
19994+ */
19995+#include <linux/rwsem.h>
19996+#include <linux/sched/debug.h>
19997+#include <linux/sched/signal.h>
19998+#include <linux/export.h>
1a6e0f06 19999+
20000+#include "rtmutex_common.h"
20001+
20002+/*
20003+ * RT-specific reader/writer semaphores
20004+ *
20005+ * down_write()
20006+ * 1) Lock sem->rtmutex
20007+ * 2) Remove the reader BIAS to force readers into the slow path
20008+ * 3) Wait until all readers have left the critical region
20009+ * 4) Mark it write locked
20010+ *
20011+ * up_write()
20012+ * 1) Remove the write locked marker
20013+ * 2) Set the reader BIAS so readers can use the fast path again
20014+ * 3) Unlock sem->rtmutex to release blocked readers
20015+ *
20016+ * down_read()
20017+ * 1) Try fast path acquisition (reader BIAS is set)
20018+ * 2) Take sem->rtmutex.wait_lock which protects the writelocked flag
20019+ * 3) If !writelocked, acquire it for read
20020+ * 4) If writelocked, block on sem->rtmutex
20021+ * 5) unlock sem->rtmutex, goto 1)
20022+ *
20023+ * up_read()
20024+ * 1) Try fast path release (reader count != 1)
20025+ * 2) Wake the writer waiting in down_write()#3
20026+ *
20027+ * down_read()#3 has the consequence, that rw semaphores on RT are not writer
20028+ * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
20029+ * are subject to the rtmutex priority/DL inheritance mechanism.
20030+ *
20031+ * It's possible to make the rw semaphores writer fair by keeping a list of
20032+ * active readers. A blocked writer would force all newly incoming readers to
20033+ * block on the rtmutex, but the rtmutex would have to be proxy locked for one
20034+ * reader after the other. We can't use multi-reader inheritance because there
20035+ * is no way to support that with SCHED_DEADLINE. Implementing the one by one
20036+ * reader boosting/handover mechanism is a major surgery for a very dubious
20037+ * value.
20038+ *
20039+ * The risk of writer starvation is there, but the pathological use cases
20040+ * which trigger it are not necessarily the typical RT workloads.
20041+ */
20042+
20043+void __rwsem_init(struct rw_semaphore *sem, const char *name,
20044+ struct lock_class_key *key)
20045+{
20046+#ifdef CONFIG_DEBUG_LOCK_ALLOC
20047+ /*
20048+ * Make sure we are not reinitializing a held semaphore:
20049+ */
20050+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
20051+ lockdep_init_map(&sem->dep_map, name, key, 0);
1a6e0f06 20052+#endif
20053+ atomic_set(&sem->readers, READER_BIAS);
20054+}
20055+EXPORT_SYMBOL(__rwsem_init);
20056+
20057+int __down_read_trylock(struct rw_semaphore *sem)
20058+{
20059+ int r, old;
20060+
20061+ /*
20062+ * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
20063+ * set.
20064+ */
20065+ for (r = atomic_read(&sem->readers); r < 0;) {
20066+ old = atomic_cmpxchg(&sem->readers, r, r + 1);
20067+ if (likely(old == r))
20068+ return 1;
20069+ r = old;
20070+ }
20071+ return 0;
20072+}
20073+
20074+void __sched __down_read(struct rw_semaphore *sem)
20075+{
20076+ struct rt_mutex *m = &sem->rtmutex;
20077+ struct rt_mutex_waiter waiter;
20078+
20079+ if (__down_read_trylock(sem))
20080+ return;
20081+
20082+ might_sleep();
20083+ raw_spin_lock_irq(&m->wait_lock);
20084+ /*
20085+ * Allow readers as long as the writer has not completely
20086+ * acquired the semaphore for write.
20087+ */
20088+ if (atomic_read(&sem->readers) != WRITER_BIAS) {
20089+ atomic_inc(&sem->readers);
20090+ raw_spin_unlock_irq(&m->wait_lock);
20091+ return;
20092+ }
1a6e0f06 20093+
20094+ /*
20095+ * Call into the slow lock path with the rtmutex->wait_lock
20096+ * held, so this can't result in the following race:
20097+ *
20098+ * Reader1 Reader2 Writer
20099+ * down_read()
20100+ * down_write()
20101+ * rtmutex_lock(m)
20102+ * swait()
20103+ * down_read()
20104+ * unlock(m->wait_lock)
20105+ * up_read()
20106+ * swake()
20107+ * lock(m->wait_lock)
20108+ * sem->writelocked=true
20109+ * unlock(m->wait_lock)
20110+ *
20111+ * up_write()
20112+ * sem->writelocked=false
20113+ * rtmutex_unlock(m)
20114+ * down_read()
20115+ * down_write()
20116+ * rtmutex_lock(m)
20117+ * swait()
20118+ * rtmutex_lock(m)
20119+ *
20120+ * That would put Reader1 behind the writer waiting on
20121+ * Reader2 to call up_read() which might be unbound.
20122+ */
20123+ rt_mutex_init_waiter(&waiter, false);
20124+ rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL,
20125+ RT_MUTEX_MIN_CHAINWALK, NULL,
20126+ &waiter);
20127+ /*
20128+ * The slowlock() above is guaranteed to return with the rtmutex is
20129+ * now held, so there can't be a writer active. Increment the reader
20130+ * count and immediately drop the rtmutex again.
20131+ */
20132+ atomic_inc(&sem->readers);
20133+ raw_spin_unlock_irq(&m->wait_lock);
20134+ __rt_mutex_unlock(m);
1a6e0f06 20135+
20136+ debug_rt_mutex_free_waiter(&waiter);
20137+}
20138+
20139+void __up_read(struct rw_semaphore *sem)
1a6e0f06 20140+{
20141+ struct rt_mutex *m = &sem->rtmutex;
20142+ struct task_struct *tsk;
20143+
20144+ /*
20145+ * sem->readers can only hit 0 when a writer is waiting for the
20146+ * active readers to leave the critical region.
20147+ */
20148+ if (!atomic_dec_and_test(&sem->readers))
20149+ return;
20150+
20151+ might_sleep();
20152+ raw_spin_lock_irq(&m->wait_lock);
20153+ /*
20154+ * Wake the writer, i.e. the rtmutex owner. It might release the
20155+ * rtmutex concurrently in the fast path (due to a signal), but to
20156+ * clean up the rwsem it needs to acquire m->wait_lock. The worst
20157+ * case which can happen is a spurious wakeup.
20158+ */
20159+ tsk = rt_mutex_owner(m);
20160+ if (tsk)
20161+ wake_up_process(tsk);
20162+
20163+ raw_spin_unlock_irq(&m->wait_lock);
20164+}
20165+
20166+static void __up_write_unlock(struct rw_semaphore *sem, int bias,
20167+ unsigned long flags)
20168+{
20169+ struct rt_mutex *m = &sem->rtmutex;
1a6e0f06 20170+
20171+ atomic_add(READER_BIAS - bias, &sem->readers);
20172+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
20173+ __rt_mutex_unlock(m);
20174+}
1a6e0f06 20175+
20176+static int __sched __down_write_common(struct rw_semaphore *sem, int state)
20177+{
20178+ struct rt_mutex *m = &sem->rtmutex;
20179+ unsigned long flags;
1a6e0f06 20180+
20181+ /* Take the rtmutex as a first step */
20182+ if (__rt_mutex_lock_state(m, state))
20183+ return -EINTR;
1a6e0f06 20184+
20185+ /* Force readers into slow path */
20186+ atomic_sub(READER_BIAS, &sem->readers);
20187+ might_sleep();
1a6e0f06 20188+
20189+ set_current_state(state);
20190+ for (;;) {
20191+ raw_spin_lock_irqsave(&m->wait_lock, flags);
20192+ /* Have all readers left the critical region? */
20193+ if (!atomic_read(&sem->readers)) {
20194+ atomic_set(&sem->readers, WRITER_BIAS);
20195+ __set_current_state(TASK_RUNNING);
20196+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
20197+ return 0;
20198+ }
1a6e0f06 20199+
20200+ if (signal_pending_state(state, current)) {
20201+ __set_current_state(TASK_RUNNING);
20202+ __up_write_unlock(sem, 0, flags);
20203+ return -EINTR;
20204+ }
20205+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
1a6e0f06 20206+
20207+ if (atomic_read(&sem->readers) != 0) {
20208+ schedule();
20209+ set_current_state(state);
20210+ }
20211+ }
20212+}
1a6e0f06 20213+
20214+void __sched __down_write(struct rw_semaphore *sem)
20215+{
20216+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
20217+}
20218+
20219+int __sched __down_write_killable(struct rw_semaphore *sem)
20220+{
20221+ return __down_write_common(sem, TASK_KILLABLE);
20222+}
20223+
e4b2b4a8 20224+int __down_write_trylock(struct rw_semaphore *sem)
1a6e0f06 20225+{
20226+ struct rt_mutex *m = &sem->rtmutex;
20227+ unsigned long flags;
20228+
20229+ if (!__rt_mutex_trylock(m))
20230+ return 0;
20231+
20232+ atomic_sub(READER_BIAS, &sem->readers);
20233+
20234+ raw_spin_lock_irqsave(&m->wait_lock, flags);
20235+ if (!atomic_read(&sem->readers)) {
20236+ atomic_set(&sem->readers, WRITER_BIAS);
20237+ raw_spin_unlock_irqrestore(&m->wait_lock, flags);
20238+ return 1;
20239+ }
20240+ __up_write_unlock(sem, 0, flags);
20241+ return 0;
20242+}
20243+
e4b2b4a8 20244+void __up_write(struct rw_semaphore *sem)
1a6e0f06 20245+{
20246+ struct rt_mutex *m = &sem->rtmutex;
20247+ unsigned long flags;
20248+
20249+ raw_spin_lock_irqsave(&m->wait_lock, flags);
20250+ __up_write_unlock(sem, WRITER_BIAS, flags);
20251+}
20252+
20253+void __downgrade_write(struct rw_semaphore *sem)
20254+{
20255+ struct rt_mutex *m = &sem->rtmutex;
20256+ unsigned long flags;
1a6e0f06 20257+
20258+ raw_spin_lock_irqsave(&m->wait_lock, flags);
20259+ /* Release it and account current as reader */
20260+ __up_write_unlock(sem, WRITER_BIAS - 1, flags);
20261+}
20262diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
20263index 6e40fdfba326..401bda23f786 100644
20264--- a/kernel/locking/spinlock.c
20265+++ b/kernel/locking/spinlock.c
20266@@ -125,8 +125,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
e4b2b4a8 20267 * __[spin|read|write]_lock_bh()
1a6e0f06 20268 */
e4b2b4a8 20269 BUILD_LOCK_OPS(spin, raw_spinlock);
1a6e0f06
JK
20270+
20271+#ifndef CONFIG_PREEMPT_RT_FULL
20272 BUILD_LOCK_OPS(read, rwlock);
20273 BUILD_LOCK_OPS(write, rwlock);
20274+#endif
20275
20276 #endif
20277
b3bbd485 20278@@ -210,6 +213,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
20279 EXPORT_SYMBOL(_raw_spin_unlock_bh);
20280 #endif
20281
20282+#ifndef CONFIG_PREEMPT_RT_FULL
1a6e0f06 20283+
20284 #ifndef CONFIG_INLINE_READ_TRYLOCK
20285 int __lockfunc _raw_read_trylock(rwlock_t *lock)
20286 {
b3bbd485 20287@@ -354,6 +359,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
20288 EXPORT_SYMBOL(_raw_write_unlock_bh);
20289 #endif
20290
20291+#endif /* !PREEMPT_RT_FULL */
1a6e0f06 20292+
20293 #ifdef CONFIG_DEBUG_LOCK_ALLOC
20294
20295 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
20296diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
20297index 9aa0fccd5d43..76d0b40d9193 100644
20298--- a/kernel/locking/spinlock_debug.c
20299+++ b/kernel/locking/spinlock_debug.c
20300@@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
e4b2b4a8
JK
20301
20302 EXPORT_SYMBOL(__raw_spin_lock_init);
20303
20304+#ifndef CONFIG_PREEMPT_RT_FULL
20305 void __rwlock_init(rwlock_t *lock, const char *name,
20306 struct lock_class_key *key)
20307 {
b3bbd485 20308@@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
20309 }
20310
20311 EXPORT_SYMBOL(__rwlock_init);
1a6e0f06 20312+#endif
20313
20314 static void spin_dump(raw_spinlock_t *lock, const char *msg)
20315 {
b3bbd485 20316@@ -135,6 +137,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
20317 arch_spin_unlock(&lock->raw_lock);
20318 }
20319
20320+#ifndef CONFIG_PREEMPT_RT_FULL
20321 static void rwlock_bug(rwlock_t *lock, const char *msg)
20322 {
20323 if (!debug_locks_off())
b3bbd485 20324@@ -224,3 +227,5 @@ void do_raw_write_unlock(rwlock_t *lock)
20325 debug_write_unlock(lock);
20326 arch_write_unlock(&lock->raw_lock);
20327 }
20328+
20329+#endif
20330diff --git a/kernel/panic.c b/kernel/panic.c
20331index bdd18afa19a4..5da649633795 100644
20332--- a/kernel/panic.c
20333+++ b/kernel/panic.c
20334@@ -482,9 +482,11 @@ static u64 oops_id;
20335
20336 static int init_oops_id(void)
20337 {
20338+#ifndef CONFIG_PREEMPT_RT_FULL
20339 if (!oops_id)
20340 get_random_bytes(&oops_id, sizeof(oops_id));
20341 else
1a6e0f06 20342+#endif
20343 oops_id++;
20344
20345 return 0;
20346diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
20347index a5c36e9c56a6..a4b83cb0c6e5 100644
20348--- a/kernel/power/hibernate.c
20349+++ b/kernel/power/hibernate.c
20350@@ -287,6 +287,8 @@ static int create_image(int platform_mode)
20351
20352 local_irq_disable();
20353
20354+ system_state = SYSTEM_SUSPEND;
1a6e0f06 20355+
20356 error = syscore_suspend();
20357 if (error) {
20358 pr_err("Some system devices failed to power down, aborting hibernation\n");
b3bbd485 20359@@ -317,6 +319,7 @@ static int create_image(int platform_mode)
20360 syscore_resume();
20361
20362 Enable_irqs:
20363+ system_state = SYSTEM_RUNNING;
20364 local_irq_enable();
20365
20366 Enable_cpus:
b3bbd485 20367@@ -445,6 +448,7 @@ static int resume_target_kernel(bool platform_mode)
20368 goto Enable_cpus;
20369
20370 local_irq_disable();
20371+ system_state = SYSTEM_SUSPEND;
20372
20373 error = syscore_suspend();
20374 if (error)
b3bbd485 20375@@ -478,6 +482,7 @@ static int resume_target_kernel(bool platform_mode)
20376 syscore_resume();
20377
20378 Enable_irqs:
20379+ system_state = SYSTEM_RUNNING;
20380 local_irq_enable();
20381
20382 Enable_cpus:
b3bbd485 20383@@ -563,6 +568,7 @@ int hibernation_platform_enter(void)
20384 goto Enable_cpus;
20385
20386 local_irq_disable();
20387+ system_state = SYSTEM_SUSPEND;
20388 syscore_suspend();
20389 if (pm_wakeup_pending()) {
20390 error = -EAGAIN;
b3bbd485 20391@@ -575,6 +581,7 @@ int hibernation_platform_enter(void)
20392
20393 Power_up:
20394 syscore_resume();
20395+ system_state = SYSTEM_RUNNING;
20396 local_irq_enable();
20397
20398 Enable_cpus:
b3bbd485 20399@@ -672,6 +679,10 @@ static int load_image_and_restore(void)
20400 return error;
20401 }
20402
20403+#ifndef CONFIG_SUSPEND
20404+bool pm_in_action;
1a6e0f06 20405+#endif
1a6e0f06 20406+
20407 /**
20408 * hibernate - Carry out system hibernation, including saving the image.
20409 */
b3bbd485 20410@@ -685,6 +696,8 @@ int hibernate(void)
20411 return -EPERM;
20412 }
20413
20414+ pm_in_action = true;
1a6e0f06 20415+
20416 lock_system_sleep();
20417 /* The snapshot device should not be opened while we're running */
20418 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
b3bbd485 20419@@ -763,6 +776,7 @@ int hibernate(void)
20420 atomic_inc(&snapshot_device_available);
20421 Unlock:
20422 unlock_system_sleep();
20423+ pm_in_action = false;
20424 pr_info("hibernation exit\n");
20425
20426 return error;
20427diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
20428index c0bc2c89697a..b89605fe0e88 100644
20429--- a/kernel/power/suspend.c
20430+++ b/kernel/power/suspend.c
20431@@ -27,6 +27,7 @@
20432 #include <linux/export.h>
20433 #include <linux/suspend.h>
20434 #include <linux/syscore_ops.h>
20435+#include <linux/swait.h>
20436 #include <linux/ftrace.h>
20437 #include <trace/events/power.h>
20438 #include <linux/compiler.h>
20439@@ -57,7 +58,7 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
20440
20441 static const struct platform_suspend_ops *suspend_ops;
20442 static const struct platform_s2idle_ops *s2idle_ops;
20443-static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head);
20444+static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
20445
20446 enum s2idle_states __read_mostly s2idle_state;
20447 static DEFINE_RAW_SPINLOCK(s2idle_lock);
20448@@ -91,8 +92,8 @@ static void s2idle_enter(void)
20449 /* Push all the CPUs into the idle loop. */
20450 wake_up_all_idle_cpus();
20451 /* Make the current CPU wait so it can enter the idle loop too. */
20452- wait_event(s2idle_wait_head,
20453- s2idle_state == S2IDLE_STATE_WAKE);
20454+ swait_event(s2idle_wait_head,
20455+ s2idle_state == S2IDLE_STATE_WAKE);
20456
20457 cpuidle_pause();
20458 put_online_cpus();
20459@@ -159,7 +160,7 @@ void s2idle_wake(void)
20460 raw_spin_lock_irqsave(&s2idle_lock, flags);
20461 if (s2idle_state > S2IDLE_STATE_NONE) {
20462 s2idle_state = S2IDLE_STATE_WAKE;
20463- wake_up(&s2idle_wait_head);
20464+ swake_up(&s2idle_wait_head);
20465 }
20466 raw_spin_unlock_irqrestore(&s2idle_lock, flags);
20467 }
20468@@ -428,6 +429,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20469 arch_suspend_disable_irqs();
20470 BUG_ON(!irqs_disabled());
20471
20472+ system_state = SYSTEM_SUSPEND;
1a6e0f06 20473+
20474 error = syscore_suspend();
20475 if (!error) {
20476 *wakeup = pm_wakeup_pending();
b3bbd485 20477@@ -443,6 +446,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
20478 syscore_resume();
20479 }
20480
20481+ system_state = SYSTEM_RUNNING;
1a6e0f06 20482+
20483 arch_suspend_enable_irqs();
20484 BUG_ON(irqs_disabled());
20485
b3bbd485 20486@@ -589,6 +594,8 @@ static int enter_state(suspend_state_t state)
20487 return error;
20488 }
20489
20490+bool pm_in_action;
1a6e0f06 20491+
20492 /**
20493 * pm_suspend - Externally visible function for suspending the system.
20494 * @state: System sleep state to enter.
b3bbd485 20495@@ -603,6 +610,7 @@ int pm_suspend(suspend_state_t state)
20496 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
20497 return -EINVAL;
20498
20499+ pm_in_action = true;
20500 pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
20501 error = enter_state(state);
20502 if (error) {
b3bbd485 20503@@ -612,6 +620,7 @@ int pm_suspend(suspend_state_t state)
20504 suspend_stats.success++;
20505 }
20506 pr_info("suspend exit\n");
20507+ pm_in_action = false;
20508 return error;
20509 }
20510 EXPORT_SYMBOL(pm_suspend);
20511diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
20512index f0223a7d9ed1..13fd0bcf2367 100644
20513--- a/kernel/printk/printk.c
20514+++ b/kernel/printk/printk.c
20515@@ -1348,6 +1348,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20516 {
20517 char *text;
20518 int len = 0;
20519+ int attempts = 0;
20520+ int num_msg;
20521
20522 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
20523 if (!text)
b3bbd485 20524@@ -1359,6 +1361,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20525 u64 seq;
20526 u32 idx;
20527
20528+try_again:
20529+ attempts++;
20530+ if (attempts > 10) {
20531+ len = -EBUSY;
20532+ goto out;
20533+ }
20534+ num_msg = 0;
20535+
20536 /*
20537 * Find first record that fits, including all following records,
20538 * into the user-provided buffer for this dump.
b3bbd485 20539@@ -1371,6 +1381,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20540 len += msg_print_text(msg, true, NULL, 0);
20541 idx = log_next(idx);
20542 seq++;
20543+ num_msg++;
20544+ if (num_msg > 5) {
20545+ num_msg = 0;
20546+ logbuf_unlock_irq();
20547+ logbuf_lock_irq();
20548+ if (clear_seq < log_first_seq)
20549+ goto try_again;
20550+ }
20551 }
20552
20553 /* move first record forward until length fits into the buffer */
b3bbd485 20554@@ -1382,6 +1400,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20555 len -= msg_print_text(msg, true, NULL, 0);
20556 idx = log_next(idx);
20557 seq++;
20558+ num_msg++;
20559+ if (num_msg > 5) {
20560+ num_msg = 0;
20561+ logbuf_unlock_irq();
20562+ logbuf_lock_irq();
20563+ if (clear_seq < log_first_seq)
20564+ goto try_again;
20565+ }
20566 }
20567
20568 /* last message fitting into this dump */
b3bbd485 20569@@ -1420,6 +1446,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
20570 clear_seq = log_next_seq;
20571 clear_idx = log_next_idx;
20572 }
20573+out:
20574 logbuf_unlock_irq();
20575
20576 kfree(text);
b3bbd485 20577@@ -1558,6 +1585,12 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
20578 if (!console_drivers)
20579 return;
20580
20581+ if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20582+ if (in_irq() || in_nmi())
20583+ return;
1a6e0f06 20584+ }
1a6e0f06 20585+
20586+ migrate_disable();
20587 for_each_console(con) {
20588 if (exclusive_console && con != exclusive_console)
20589 continue;
b3bbd485 20590@@ -1573,6 +1606,7 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
20591 else
20592 con->write(con, text, len);
20593 }
20594+ migrate_enable();
20595 }
20596
20597 int printk_delay_msec __read_mostly;
b3bbd485 20598@@ -1757,12 +1791,22 @@ asmlinkage int vprintk_emit(int facility, int level,
20599
20600 /* If called from the scheduler, we can not call up(). */
20601 if (!in_sched) {
20602+ int may_trylock = 1;
1a6e0f06 20603+
20604+#ifdef CONFIG_PREEMPT_RT_FULL
20605+ /*
20606+ * we can't take a sleeping lock with IRQs or preeption disabled
20607+ * so we can't print in these contexts
20608+ */
20609+ if (!(preempt_count() == 0 && !irqs_disabled()))
20610+ may_trylock = 0;
1a6e0f06 20611+#endif
20612 /*
20613 * Try to acquire and then immediately release the console
20614 * semaphore. The release will print out buffers and wake up
20615 * /dev/kmsg and syslog() users.
20616 */
20617- if (console_trylock())
20618+ if (may_trylock && console_trylock())
20619 console_unlock();
20620 }
1a6e0f06 20621
b3bbd485 20622@@ -1872,26 +1916,6 @@ static bool suppress_message_printing(int level) { return false; }
1a6e0f06 20623
e4b2b4a8 20624 #endif /* CONFIG_PRINTK */
1a6e0f06 20625
20626-#ifdef CONFIG_EARLY_PRINTK
20627-struct console *early_console;
20628-
20629-asmlinkage __visible void early_printk(const char *fmt, ...)
20630-{
20631- va_list ap;
20632- char buf[512];
20633- int n;
20634-
20635- if (!early_console)
20636- return;
20637-
20638- va_start(ap, fmt);
20639- n = vscnprintf(buf, sizeof(buf), fmt, ap);
20640- va_end(ap);
20641-
20642- early_console->write(early_console, buf, n);
20643-}
20644-#endif
20645-
20646 static int __add_preferred_console(char *name, int idx, char *options,
20647 char *brl_options)
20648 {
b3bbd485 20649@@ -2238,10 +2262,15 @@ void console_unlock(void)
20650 console_seq++;
20651 raw_spin_unlock(&logbuf_lock);
1a6e0f06 20652
20653+#ifdef CONFIG_PREEMPT_RT_FULL
20654+ printk_safe_exit_irqrestore(flags);
20655+ call_console_drivers(ext_text, ext_len, text, len);
1a6e0f06 20656+#else
20657 stop_critical_timings(); /* don't trace print latency */
20658 call_console_drivers(ext_text, ext_len, text, len);
20659 start_critical_timings();
20660 printk_safe_exit_irqrestore(flags);
1a6e0f06 20661+#endif
1a6e0f06 20662
20663 if (do_cond_resched)
20664 cond_resched();
20665@@ -2295,6 +2324,11 @@ void console_unblank(void)
20666 {
20667 struct console *c;
20668
20669+ if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
20670+ if (in_irq() || in_nmi())
20671+ return;
20672+ }
1a6e0f06 20673+
20674 /*
20675 * console_unblank can no longer be called in interrupt context unless
20676 * oops_in_progress is set to 1..
20677diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
20678index 64f8046586b6..a24e16bef51c 100644
20679--- a/kernel/printk/printk_safe.c
20680+++ b/kernel/printk/printk_safe.c
20681@@ -22,6 +22,7 @@
20682 #include <linux/cpumask.h>
20683 #include <linux/irq_work.h>
20684 #include <linux/printk.h>
20685+#include <linux/console.h>
20686
20687 #include "internal.h"
20688
20689@@ -373,8 +374,74 @@ void __printk_safe_exit(void)
20690 this_cpu_dec(printk_context);
20691 }
20692
20693+#ifdef CONFIG_EARLY_PRINTK
20694+struct console *early_console;
20695+
20696+static void early_vprintk(const char *fmt, va_list ap)
20697+{
20698+ if (early_console) {
20699+ char buf[512];
20700+ int n = vscnprintf(buf, sizeof(buf), fmt, ap);
20701+
20702+ early_console->write(early_console, buf, n);
20703+ }
20704+}
20705+
20706+asmlinkage void early_printk(const char *fmt, ...)
20707+{
20708+ va_list ap;
20709+
20710+ va_start(ap, fmt);
20711+ early_vprintk(fmt, ap);
20712+ va_end(ap);
20713+}
20714+
20715+/*
20716+ * This is independent of any log levels - a global
20717+ * kill switch that turns off all of printk.
20718+ *
20719+ * Used by the NMI watchdog if early-printk is enabled.
20720+ */
20721+static bool __read_mostly printk_killswitch;
20722+
20723+static int __init force_early_printk_setup(char *str)
20724+{
20725+ printk_killswitch = true;
20726+ return 0;
20727+}
20728+early_param("force_early_printk", force_early_printk_setup);
20729+
20730+void printk_kill(void)
20731+{
20732+ printk_killswitch = true;
20733+}
20734+
20735+#ifdef CONFIG_PRINTK
20736+static int forced_early_printk(const char *fmt, va_list ap)
20737+{
20738+ if (!printk_killswitch)
20739+ return 0;
20740+ early_vprintk(fmt, ap);
20741+ return 1;
20742+}
20743+#endif
20744+
20745+#else
20746+static inline int forced_early_printk(const char *fmt, va_list ap)
20747+{
20748+ return 0;
20749+}
20750+#endif
20751+
20752 __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
20753 {
20754+ /*
20755+ * Fall back to early_printk if a debugging subsystem has
20756+ * killed printk output
20757+ */
20758+ if (unlikely(forced_early_printk(fmt, args)))
20759+ return 1;
20760+
20761 /*
20762 * Try to use the main logbuf even in NMI. But avoid calling console
20763 * drivers that might have their own locks.
20764diff --git a/kernel/ptrace.c b/kernel/ptrace.c
20765index 84b1367935e4..b32a86f63522 100644
20766--- a/kernel/ptrace.c
20767+++ b/kernel/ptrace.c
20768@@ -175,7 +175,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
e4b2b4a8
JK
20769
20770 spin_lock_irq(&task->sighand->siglock);
20771 if (task_is_traced(task) && !__fatal_signal_pending(task)) {
20772- task->state = __TASK_TRACED;
20773+ unsigned long flags;
1a6e0f06 20774+
20775+ raw_spin_lock_irqsave(&task->pi_lock, flags);
20776+ if (task->state & __TASK_TRACED)
20777+ task->state = __TASK_TRACED;
20778+ else
20779+ task->saved_state = __TASK_TRACED;
20780+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20781 ret = true;
20782 }
20783 spin_unlock_irq(&task->sighand->siglock);
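On PREEMPT_RT, spinlocks are sleeping locks, so a traced task that blocks on one parks its original state in task->saved_state while task->state reflects the sleeping-lock wait; the hunk above therefore stores __TASK_TRACED into whichever field currently holds the "real" state. A simplified, illustrative-only sketch of that pattern (saved_state only exists with the RT series applied; the helper name is hypothetical):

#include <linux/sched.h>
#include <linux/spinlock.h>

static void example_set_state_rt_aware(struct task_struct *task, long new_state)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&task->pi_lock, flags);
        if (task->state & new_state)
                task->state = new_state;        /* task really sleeps in this state */
        else
                task->saved_state = new_state;  /* parked while blocked on an RT lock */
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
}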
b3bbd485
JK
20784diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
20785index 9210379c0353..0be2c96fb640 100644
20786--- a/kernel/rcu/Kconfig
20787+++ b/kernel/rcu/Kconfig
20788@@ -36,7 +36,7 @@ config TINY_RCU
1a6e0f06 20789
e4b2b4a8
JK
20790 config RCU_EXPERT
20791 bool "Make expert-level adjustments to RCU configuration"
20792- default n
20793+ default y if PREEMPT_RT_FULL
20794 help
20795 This option needs to be enabled if you wish to make
20796 expert-level adjustments to RCU configuration. By default,
b3bbd485 20797@@ -172,7 +172,7 @@ config RCU_FANOUT_LEAF
e4b2b4a8
JK
20798
20799 config RCU_FAST_NO_HZ
20800 bool "Accelerate last non-dyntick-idle CPU's grace periods"
20801- depends on NO_HZ_COMMON && SMP && RCU_EXPERT
20802+ depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
20803 default n
20804 help
20805 This option permits CPUs to enter dynticks-idle state even if
b3bbd485 20806@@ -191,7 +191,7 @@ config RCU_FAST_NO_HZ
e4b2b4a8
JK
20807 config RCU_BOOST
20808 bool "Enable RCU priority boosting"
20809 depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
20810- default n
20811+ default y if PREEMPT_RT_FULL
20812 help
20813 This option boosts the priority of preempted RCU readers that
20814 block the current preemptible RCU grace period for too long.
b3bbd485
JK
20815diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
20816index e4b43fef89f5..0b056c30e9b1 100644
20817--- a/kernel/rcu/rcu.h
20818+++ b/kernel/rcu/rcu.h
20819@@ -462,18 +462,26 @@ static inline void show_rcu_gp_kthreads(void) { }
e4b2b4a8
JK
20820 extern unsigned long rcutorture_testseq;
20821 extern unsigned long rcutorture_vernum;
20822 unsigned long rcu_batches_started(void);
20823-unsigned long rcu_batches_started_bh(void);
20824 unsigned long rcu_batches_started_sched(void);
20825 unsigned long rcu_batches_completed(void);
20826-unsigned long rcu_batches_completed_bh(void);
20827 unsigned long rcu_batches_completed_sched(void);
20828 unsigned long rcu_exp_batches_completed(void);
20829 unsigned long rcu_exp_batches_completed_sched(void);
20830 unsigned long srcu_batches_completed(struct srcu_struct *sp);
20831 void show_rcu_gp_kthreads(void);
20832 void rcu_force_quiescent_state(void);
20833-void rcu_bh_force_quiescent_state(void);
20834 void rcu_sched_force_quiescent_state(void);
1a6e0f06 20835+
e4b2b4a8
JK
20836+#ifndef CONFIG_PREEMPT_RT_FULL
20837+void rcu_bh_force_quiescent_state(void);
20838+unsigned long rcu_batches_started_bh(void);
20839+unsigned long rcu_batches_completed_bh(void);
1a6e0f06 20840+#else
e4b2b4a8
JK
20841+# define rcu_bh_force_quiescent_state rcu_force_quiescent_state
20842+# define rcu_batches_completed_bh rcu_batches_completed
20843+# define rcu_batches_started_bh rcu_batches_completed
1a6e0f06 20844+#endif
e4b2b4a8
JK
20845+
20846 #endif /* #else #ifdef CONFIG_TINY_RCU */
1a6e0f06 20847
e4b2b4a8 20848 #ifdef CONFIG_RCU_NOCB_CPU
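With the mapping above, RCU-bh accessors compile on both configurations: without PREEMPT_RT_FULL they query the separate rcu_bh flavor, with it they collapse onto the consolidated flavor. Callers do not change. A hedged sketch under that assumption (the debug helper is illustrative, not part of the patch):

#include <linux/kernel.h>
#include "rcu.h"        /* internal kernel/rcu/ header carrying the mapping above */

static void example_dump_bh_counters(void)
{
        pr_info("rcu_bh: started=%lu completed=%lu\n",
                rcu_batches_started_bh(),
                rcu_batches_completed_bh());
}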
b3bbd485
JK
20849diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
20850index 7649fcd2c4c7..88cba7c2956c 100644
20851--- a/kernel/rcu/rcu_segcblist.c
20852+++ b/kernel/rcu/rcu_segcblist.c
e4b2b4a8
JK
20853@@ -23,6 +23,7 @@
20854 #include <linux/types.h>
20855 #include <linux/kernel.h>
20856 #include <linux/interrupt.h>
20857+#include <linux/rcupdate.h>
1a6e0f06 20858
e4b2b4a8 20859 #include "rcu_segcblist.h"
1a6e0f06 20860
b3bbd485
JK
20861diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
20862index 45f2ffbc1e78..2e9dbb734d5a 100644
20863--- a/kernel/rcu/rcutorture.c
20864+++ b/kernel/rcu/rcutorture.c
20865@@ -417,6 +417,7 @@ static struct rcu_torture_ops rcu_ops = {
e4b2b4a8 20866 .name = "rcu"
1a6e0f06
JK
20867 };
20868
e4b2b4a8
JK
20869+#ifndef CONFIG_PREEMPT_RT_FULL
20870 /*
20871 * Definitions for rcu_bh torture testing.
1a6e0f06 20872 */
b3bbd485 20873@@ -456,6 +457,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
e4b2b4a8
JK
20874 .name = "rcu_bh"
20875 };
1a6e0f06 20876
e4b2b4a8
JK
20877+#else
20878+static struct rcu_torture_ops rcu_bh_ops = {
20879+ .ttype = INVALID_RCU_FLAVOR,
20880+};
20881+#endif
20882+
1a6e0f06 20883 /*
e4b2b4a8
JK
20884 * Don't even think about trying any of these in real life!!!
20885 * The names include "busted", and they really mean it!
b3bbd485
JK
20886diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
20887index 6d5880089ff6..0e3b2bd3f2ac 100644
20888--- a/kernel/rcu/srcutree.c
20889+++ b/kernel/rcu/srcutree.c
e4b2b4a8
JK
20890@@ -36,6 +36,8 @@
20891 #include <linux/delay.h>
20892 #include <linux/module.h>
20893 #include <linux/srcu.h>
20894+#include <linux/cpu.h>
20895+#include <linux/locallock.h>
1a6e0f06 20896
e4b2b4a8
JK
20897 #include "rcu.h"
20898 #include "rcu_segcblist.h"
b3bbd485 20899@@ -53,6 +55,33 @@ static void srcu_invoke_callbacks(struct work_struct *work);
e4b2b4a8
JK
20900 static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
20901 static void process_srcu(struct work_struct *work);
20902
20903+/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
20904+#define spin_lock_rcu_node(p) \
20905+do { \
20906+ spin_lock(&ACCESS_PRIVATE(p, lock)); \
20907+ smp_mb__after_unlock_lock(); \
20908+} while (0)
20909+
20910+#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
20911+
20912+#define spin_lock_irq_rcu_node(p) \
20913+do { \
20914+ spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
20915+ smp_mb__after_unlock_lock(); \
20916+} while (0)
20917+
20918+#define spin_unlock_irq_rcu_node(p) \
20919+ spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
20920+
20921+#define spin_lock_irqsave_rcu_node(p, flags) \
20922+do { \
20923+ spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
20924+ smp_mb__after_unlock_lock(); \
20925+} while (0)
20926+
20927+#define spin_unlock_irqrestore_rcu_node(p, flags) \
20928+ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
20929+
20930 /*
20931 * Initialize SRCU combining tree. Note that statically allocated
20932 * srcu_struct structures might already have srcu_read_lock() and
b3bbd485 20933@@ -77,7 +106,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
e4b2b4a8
JK
20934
20935 /* Each pass through this loop initializes one srcu_node structure. */
20936 rcu_for_each_node_breadth_first(sp, snp) {
20937- raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20938+ spin_lock_init(&ACCESS_PRIVATE(snp, lock));
20939 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
20940 ARRAY_SIZE(snp->srcu_data_have_cbs));
20941 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
b3bbd485 20942@@ -111,7 +140,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
e4b2b4a8
JK
20943 snp_first = sp->level[level];
20944 for_each_possible_cpu(cpu) {
20945 sdp = per_cpu_ptr(sp->sda, cpu);
20946- raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20947+ spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
20948 rcu_segcblist_init(&sdp->srcu_cblist);
20949 sdp->srcu_cblist_invoking = false;
20950 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
b3bbd485 20951@@ -170,7 +199,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
e4b2b4a8
JK
20952 /* Don't re-initialize a lock while it is held. */
20953 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
20954 lockdep_init_map(&sp->dep_map, name, key, 0);
20955- raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20956+ spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20957 return init_srcu_struct_fields(sp, false);
20958 }
20959 EXPORT_SYMBOL_GPL(__init_srcu_struct);
b3bbd485 20960@@ -187,7 +216,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
1a6e0f06 20961 */
e4b2b4a8
JK
20962 int init_srcu_struct(struct srcu_struct *sp)
20963 {
20964- raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20965+ spin_lock_init(&ACCESS_PRIVATE(sp, lock));
20966 return init_srcu_struct_fields(sp, false);
20967 }
20968 EXPORT_SYMBOL_GPL(init_srcu_struct);
b3bbd485 20969@@ -210,13 +239,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp)
e4b2b4a8
JK
20970 /* The smp_load_acquire() pairs with the smp_store_release(). */
20971 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
20972 return; /* Already initialized. */
20973- raw_spin_lock_irqsave_rcu_node(sp, flags);
20974+ spin_lock_irqsave_rcu_node(sp, flags);
20975 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
20976- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20977+ spin_unlock_irqrestore_rcu_node(sp, flags);
20978 return;
20979 }
20980 init_srcu_struct_fields(sp, true);
20981- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
20982+ spin_unlock_irqrestore_rcu_node(sp, flags);
1a6e0f06
JK
20983 }
20984
e4b2b4a8 20985 /*
b3bbd485
JK
20986@@ -424,21 +453,6 @@ static void srcu_gp_start(struct srcu_struct *sp)
20987 WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
e4b2b4a8 20988 }
1a6e0f06 20989
b3bbd485 20990-/*
e4b2b4a8
JK
20991- * Track online CPUs to guide callback workqueue placement.
20992- */
20993-DEFINE_PER_CPU(bool, srcu_online);
20994-
20995-void srcu_online_cpu(unsigned int cpu)
20996-{
20997- WRITE_ONCE(per_cpu(srcu_online, cpu), true);
20998-}
20999-
21000-void srcu_offline_cpu(unsigned int cpu)
21001-{
21002- WRITE_ONCE(per_cpu(srcu_online, cpu), false);
21003-}
21004-
b3bbd485 21005 /*
e4b2b4a8
JK
21006 * Place the workqueue handler on the specified CPU if online, otherwise
21007 * just run it wherever. This is useful for placing workqueue handlers
b3bbd485 21008@@ -450,12 +464,12 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1a6e0f06 21009 {
e4b2b4a8
JK
21010 bool ret;
21011
21012- preempt_disable();
21013- if (READ_ONCE(per_cpu(srcu_online, cpu)))
21014+ cpus_read_lock();
21015+ if (cpu_online(cpu))
21016 ret = queue_delayed_work_on(cpu, wq, dwork, delay);
21017 else
21018 ret = queue_delayed_work(wq, dwork, delay);
21019- preempt_enable();
21020+ cpus_read_unlock();
21021 return ret;
1a6e0f06
JK
21022 }
21023
b3bbd485 21024@@ -513,7 +527,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
e4b2b4a8
JK
21025 mutex_lock(&sp->srcu_cb_mutex);
21026
21027 /* End the current grace period. */
21028- raw_spin_lock_irq_rcu_node(sp);
21029+ spin_lock_irq_rcu_node(sp);
21030 idx = rcu_seq_state(sp->srcu_gp_seq);
21031 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
21032 cbdelay = srcu_get_delay(sp);
b3bbd485 21033@@ -522,7 +536,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
e4b2b4a8
JK
21034 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
21035 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
21036 sp->srcu_gp_seq_needed_exp = gpseq;
21037- raw_spin_unlock_irq_rcu_node(sp);
21038+ spin_unlock_irq_rcu_node(sp);
21039 mutex_unlock(&sp->srcu_gp_mutex);
21040 /* A new grace period can start at this point. But only one. */
21041
b3bbd485 21042@@ -530,7 +544,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
e4b2b4a8
JK
21043 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
21044 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
21045 rcu_for_each_node_breadth_first(sp, snp) {
21046- raw_spin_lock_irq_rcu_node(snp);
21047+ spin_lock_irq_rcu_node(snp);
21048 cbs = false;
21049 if (snp >= sp->level[rcu_num_lvls - 1])
21050 cbs = snp->srcu_have_cbs[idx] == gpseq;
b3bbd485 21051@@ -540,7 +554,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
e4b2b4a8
JK
21052 snp->srcu_gp_seq_needed_exp = gpseq;
21053 mask = snp->srcu_data_have_cbs[idx];
21054 snp->srcu_data_have_cbs[idx] = 0;
21055- raw_spin_unlock_irq_rcu_node(snp);
21056+ spin_unlock_irq_rcu_node(snp);
21057 if (cbs)
21058 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
21059
b3bbd485 21060@@ -548,11 +562,11 @@ static void srcu_gp_end(struct srcu_struct *sp)
e4b2b4a8
JK
21061 if (!(gpseq & counter_wrap_check))
21062 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
21063 sdp = per_cpu_ptr(sp->sda, cpu);
21064- raw_spin_lock_irqsave_rcu_node(sdp, flags);
21065+ spin_lock_irqsave_rcu_node(sdp, flags);
21066 if (ULONG_CMP_GE(gpseq,
21067 sdp->srcu_gp_seq_needed + 100))
21068 sdp->srcu_gp_seq_needed = gpseq;
21069- raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
21070+ spin_unlock_irqrestore_rcu_node(sdp, flags);
21071 }
21072 }
1a6e0f06 21073
b3bbd485 21074@@ -560,17 +574,17 @@ static void srcu_gp_end(struct srcu_struct *sp)
e4b2b4a8
JK
21075 mutex_unlock(&sp->srcu_cb_mutex);
21076
21077 /* Start a new grace period if needed. */
21078- raw_spin_lock_irq_rcu_node(sp);
21079+ spin_lock_irq_rcu_node(sp);
21080 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
21081 if (!rcu_seq_state(gpseq) &&
21082 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
21083 srcu_gp_start(sp);
21084- raw_spin_unlock_irq_rcu_node(sp);
21085+ spin_unlock_irq_rcu_node(sp);
21086 /* Throttle expedited grace periods: Should be rare! */
21087 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
21088 ? 0 : SRCU_INTERVAL);
21089 } else {
21090- raw_spin_unlock_irq_rcu_node(sp);
21091+ spin_unlock_irq_rcu_node(sp);
21092 }
21093 }
1a6e0f06 21094
b3bbd485 21095@@ -590,18 +604,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
e4b2b4a8
JK
21096 if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
21097 ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
21098 return;
21099- raw_spin_lock_irqsave_rcu_node(snp, flags);
21100+ spin_lock_irqsave_rcu_node(snp, flags);
21101 if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
21102- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21103+ spin_unlock_irqrestore_rcu_node(snp, flags);
21104 return;
21105 }
21106 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
21107- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21108+ spin_unlock_irqrestore_rcu_node(snp, flags);
21109 }
21110- raw_spin_lock_irqsave_rcu_node(sp, flags);
21111+ spin_lock_irqsave_rcu_node(sp, flags);
21112 if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
21113 sp->srcu_gp_seq_needed_exp = s;
21114- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
21115+ spin_unlock_irqrestore_rcu_node(sp, flags);
21116 }
1a6e0f06 21117
e4b2b4a8 21118 /*
b3bbd485 21119@@ -623,12 +637,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
e4b2b4a8
JK
21120 for (; snp != NULL; snp = snp->srcu_parent) {
21121 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
21122 return; /* GP already done and CBs recorded. */
21123- raw_spin_lock_irqsave_rcu_node(snp, flags);
21124+ spin_lock_irqsave_rcu_node(snp, flags);
21125 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
21126 snp_seq = snp->srcu_have_cbs[idx];
21127 if (snp == sdp->mynode && snp_seq == s)
21128 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
21129- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21130+ spin_unlock_irqrestore_rcu_node(snp, flags);
21131 if (snp == sdp->mynode && snp_seq != s) {
21132 srcu_schedule_cbs_sdp(sdp, do_norm
21133 ? SRCU_INTERVAL
b3bbd485 21134@@ -644,11 +658,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
e4b2b4a8
JK
21135 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
21136 if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
21137 snp->srcu_gp_seq_needed_exp = s;
21138- raw_spin_unlock_irqrestore_rcu_node(snp, flags);
21139+ spin_unlock_irqrestore_rcu_node(snp, flags);
21140 }
21141
21142 /* Top of tree, must ensure the grace period will be started. */
21143- raw_spin_lock_irqsave_rcu_node(sp, flags);
21144+ spin_lock_irqsave_rcu_node(sp, flags);
21145 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
21146 /*
21147 * Record need for grace period s. Pair with load
b3bbd485 21148@@ -667,7 +681,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
e4b2b4a8
JK
21149 queue_delayed_work(system_power_efficient_wq, &sp->work,
21150 srcu_get_delay(sp));
21151 }
21152- raw_spin_unlock_irqrestore_rcu_node(sp, flags);
21153+ spin_unlock_irqrestore_rcu_node(sp, flags);
1a6e0f06 21154 }
1a6e0f06 21155
e4b2b4a8 21156 /*
b3bbd485 21157@@ -736,6 +750,8 @@ static void srcu_flip(struct srcu_struct *sp)
e4b2b4a8
JK
21158 * negligible when amortized over that time period, and the extra latency
21159 * of a needlessly non-expedited grace period is similarly negligible.
21160 */
21161+static DEFINE_LOCAL_IRQ_LOCK(sp_llock);
21162+
21163 static bool srcu_might_be_idle(struct srcu_struct *sp)
1a6e0f06 21164 {
e4b2b4a8 21165 unsigned long curseq;
b3bbd485 21166@@ -744,13 +760,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp)
e4b2b4a8 21167 unsigned long t;
1a6e0f06 21168
e4b2b4a8
JK
21169 /* If the local srcu_data structure has callbacks, not idle. */
21170- local_irq_save(flags);
21171+ local_lock_irqsave(sp_llock, flags);
21172 sdp = this_cpu_ptr(sp->sda);
21173 if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
21174- local_irq_restore(flags);
21175+ local_unlock_irqrestore(sp_llock, flags);
21176 return false; /* Callbacks already present, so not idle. */
21177 }
21178- local_irq_restore(flags);
21179+ local_unlock_irqrestore(sp_llock, flags);
1a6e0f06 21180
e4b2b4a8
JK
21181 /*
21182 * No local callbacks, so probabilistically probe global state.
b3bbd485 21183@@ -828,9 +844,9 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
e4b2b4a8
JK
21184 return;
21185 }
21186 rhp->func = func;
21187- local_irq_save(flags);
21188+ local_lock_irqsave(sp_llock, flags);
21189 sdp = this_cpu_ptr(sp->sda);
21190- raw_spin_lock_rcu_node(sdp);
21191+ spin_lock_rcu_node(sdp);
21192 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
21193 rcu_segcblist_advance(&sdp->srcu_cblist,
21194 rcu_seq_current(&sp->srcu_gp_seq));
b3bbd485 21195@@ -844,7 +860,8 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
e4b2b4a8
JK
21196 sdp->srcu_gp_seq_needed_exp = s;
21197 needexp = true;
21198 }
21199- raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
21200+ spin_unlock_rcu_node(sdp);
21201+ local_unlock_irqrestore(sp_llock, flags);
21202 if (needgp)
21203 srcu_funnel_gp_start(sp, sdp, s, do_norm);
21204 else if (needexp)
b3bbd485 21205@@ -900,7 +917,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
e4b2b4a8
JK
21206
21207 /*
21208 * Make sure that later code is ordered after the SRCU grace
21209- * period. This pairs with the raw_spin_lock_irq_rcu_node()
21210+ * period. This pairs with the spin_lock_irq_rcu_node()
21211 * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
21212 * because the current CPU might have been totally uninvolved with
21213 * (and thus unordered against) that grace period.
b3bbd485 21214@@ -1024,7 +1041,7 @@ void srcu_barrier(struct srcu_struct *sp)
e4b2b4a8
JK
21215 */
21216 for_each_possible_cpu(cpu) {
21217 sdp = per_cpu_ptr(sp->sda, cpu);
21218- raw_spin_lock_irq_rcu_node(sdp);
21219+ spin_lock_irq_rcu_node(sdp);
21220 atomic_inc(&sp->srcu_barrier_cpu_cnt);
21221 sdp->srcu_barrier_head.func = srcu_barrier_cb;
21222 debug_rcu_head_queue(&sdp->srcu_barrier_head);
b3bbd485 21223@@ -1033,7 +1050,7 @@ void srcu_barrier(struct srcu_struct *sp)
e4b2b4a8
JK
21224 debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
21225 atomic_dec(&sp->srcu_barrier_cpu_cnt);
21226 }
21227- raw_spin_unlock_irq_rcu_node(sdp);
21228+ spin_unlock_irq_rcu_node(sdp);
21229 }
21230
21231 /* Remove the initial count, at which point reaching zero can happen. */
b3bbd485 21232@@ -1082,17 +1099,17 @@ static void srcu_advance_state(struct srcu_struct *sp)
e4b2b4a8
JK
21233 */
21234 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
21235 if (idx == SRCU_STATE_IDLE) {
21236- raw_spin_lock_irq_rcu_node(sp);
21237+ spin_lock_irq_rcu_node(sp);
21238 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
21239 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
21240- raw_spin_unlock_irq_rcu_node(sp);
21241+ spin_unlock_irq_rcu_node(sp);
21242 mutex_unlock(&sp->srcu_gp_mutex);
21243 return;
21244 }
21245 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
21246 if (idx == SRCU_STATE_IDLE)
21247 srcu_gp_start(sp);
21248- raw_spin_unlock_irq_rcu_node(sp);
21249+ spin_unlock_irq_rcu_node(sp);
21250 if (idx != SRCU_STATE_IDLE) {
21251 mutex_unlock(&sp->srcu_gp_mutex);
21252 return; /* Someone else started the grace period. */
b3bbd485 21253@@ -1141,19 +1158,19 @@ static void srcu_invoke_callbacks(struct work_struct *work)
e4b2b4a8
JK
21254 sdp = container_of(work, struct srcu_data, work.work);
21255 sp = sdp->sp;
21256 rcu_cblist_init(&ready_cbs);
21257- raw_spin_lock_irq_rcu_node(sdp);
21258+ spin_lock_irq_rcu_node(sdp);
21259 rcu_segcblist_advance(&sdp->srcu_cblist,
21260 rcu_seq_current(&sp->srcu_gp_seq));
21261 if (sdp->srcu_cblist_invoking ||
21262 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
21263- raw_spin_unlock_irq_rcu_node(sdp);
21264+ spin_unlock_irq_rcu_node(sdp);
21265 return; /* Someone else on the job or nothing to do. */
21266 }
21267
21268 /* We are on the job! Extract and invoke ready callbacks. */
21269 sdp->srcu_cblist_invoking = true;
21270 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
21271- raw_spin_unlock_irq_rcu_node(sdp);
21272+ spin_unlock_irq_rcu_node(sdp);
21273 rhp = rcu_cblist_dequeue(&ready_cbs);
21274 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
21275 debug_rcu_head_unqueue(rhp);
b3bbd485 21276@@ -1166,13 +1183,13 @@ static void srcu_invoke_callbacks(struct work_struct *work)
e4b2b4a8
JK
21277 * Update counts, accelerate new callbacks, and if needed,
21278 * schedule another round of callback invocation.
21279 */
21280- raw_spin_lock_irq_rcu_node(sdp);
21281+ spin_lock_irq_rcu_node(sdp);
21282 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
21283 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
21284 rcu_seq_snap(&sp->srcu_gp_seq));
21285 sdp->srcu_cblist_invoking = false;
21286 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
21287- raw_spin_unlock_irq_rcu_node(sdp);
21288+ spin_unlock_irq_rcu_node(sdp);
21289 if (more)
21290 srcu_schedule_cbs_sdp(sdp, 0);
21291 }
b3bbd485 21292@@ -1185,7 +1202,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
e4b2b4a8
JK
21293 {
21294 bool pushgp = true;
21295
21296- raw_spin_lock_irq_rcu_node(sp);
21297+ spin_lock_irq_rcu_node(sp);
21298 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
21299 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
21300 /* All requests fulfilled, time to go idle. */
b3bbd485 21301@@ -1195,7 +1212,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
e4b2b4a8
JK
21302 /* Outstanding request and no GP. Start one. */
21303 srcu_gp_start(sp);
21304 }
21305- raw_spin_unlock_irq_rcu_node(sp);
21306+ spin_unlock_irq_rcu_node(sp);
21307
21308 if (pushgp)
21309 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
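The sp_llock conversion above is the usual -rt local-lock pattern: on a non-RT build local_lock_irqsave() degrades to plain local_irq_save(), while on RT it takes a per-CPU sleeping lock so the section stays preemptible but still serialized per CPU. A generic sketch of the same pattern using the locallock.h API that this hunk relies on; the per-CPU data and function names are made up:

#include <linux/locallock.h>    /* RT series header, as used above */
#include <linux/percpu.h>

static DEFINE_LOCAL_IRQ_LOCK(example_llock);
static DEFINE_PER_CPU(unsigned long, example_counter);

/* Hypothetical per-CPU update:
 * non-RT: behaves like local_irq_save()/local_irq_restore();
 * RT:     takes a per-CPU spinlock, so the section may be preempted.
 */
static void example_percpu_inc(void)
{
        unsigned long flags;

        local_lock_irqsave(example_llock, flags);
        this_cpu_inc(example_counter);
        local_unlock_irqrestore(example_llock, flags);
}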
b3bbd485
JK
21310diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
21311index 3e3650e94ae6..0a722b56d90b 100644
21312--- a/kernel/rcu/tree.c
21313+++ b/kernel/rcu/tree.c
e4b2b4a8
JK
21314@@ -58,6 +58,11 @@
21315 #include <linux/trace_events.h>
21316 #include <linux/suspend.h>
21317 #include <linux/ftrace.h>
21318+#include <linux/delay.h>
21319+#include <linux/gfp.h>
21320+#include <linux/oom.h>
21321+#include <linux/smpboot.h>
21322+#include "../time/tick-internal.h"
21323
21324 #include "tree.h"
21325 #include "rcu.h"
b3bbd485 21326@@ -243,6 +248,19 @@ void rcu_sched_qs(void)
e4b2b4a8
JK
21327 this_cpu_ptr(&rcu_sched_data), true);
21328 }
21329
21330+#ifdef CONFIG_PREEMPT_RT_FULL
21331+static void rcu_preempt_qs(void);
1a6e0f06 21332+
e4b2b4a8
JK
21333+void rcu_bh_qs(void)
21334+{
21335+ unsigned long flags;
1a6e0f06 21336+
e4b2b4a8
JK
21337+ /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
21338+ local_irq_save(flags);
21339+ rcu_preempt_qs();
21340+ local_irq_restore(flags);
21341+}
1a6e0f06 21342+#else
e4b2b4a8
JK
21343 void rcu_bh_qs(void)
21344 {
21345 RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
b3bbd485 21346@@ -253,6 +271,7 @@ void rcu_bh_qs(void)
e4b2b4a8
JK
21347 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
21348 }
21349 }
1a6e0f06 21350+#endif
1a6e0f06 21351
e4b2b4a8
JK
21352 /*
21353 * Steal a bit from the bottom of ->dynticks for idle entry/exit
b3bbd485 21354@@ -564,11 +583,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
e4b2b4a8
JK
21355 /*
21356 * Return the number of RCU BH batches started thus far for debug & stats.
21357 */
21358+#ifndef CONFIG_PREEMPT_RT_FULL
21359 unsigned long rcu_batches_started_bh(void)
21360 {
21361 return rcu_bh_state.gpnum;
21362 }
21363 EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
21364+#endif
1a6e0f06 21365
e4b2b4a8
JK
21366 /*
21367 * Return the number of RCU batches completed thus far for debug & stats.
b3bbd485 21368@@ -588,6 +609,7 @@ unsigned long rcu_batches_completed_sched(void)
e4b2b4a8
JK
21369 }
21370 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
1a6e0f06 21371
e4b2b4a8
JK
21372+#ifndef CONFIG_PREEMPT_RT_FULL
21373 /*
21374 * Return the number of RCU BH batches completed thus far for debug & stats.
21375 */
b3bbd485 21376@@ -596,6 +618,7 @@ unsigned long rcu_batches_completed_bh(void)
e4b2b4a8
JK
21377 return rcu_bh_state.completed;
21378 }
21379 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
1a6e0f06 21380+#endif
e4b2b4a8
JK
21381
21382 /*
21383 * Return the number of RCU expedited batches completed thus far for
b3bbd485 21384@@ -619,6 +642,7 @@ unsigned long rcu_exp_batches_completed_sched(void)
1a6e0f06 21385 }
e4b2b4a8 21386 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
1a6e0f06 21387
e4b2b4a8
JK
21388+#ifndef CONFIG_PREEMPT_RT_FULL
21389 /*
21390 * Force a quiescent state.
21391 */
b3bbd485 21392@@ -637,6 +661,13 @@ void rcu_bh_force_quiescent_state(void)
1a6e0f06 21393 }
e4b2b4a8 21394 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
1a6e0f06 21395
1a6e0f06 21396+#else
e4b2b4a8
JK
21397+void rcu_force_quiescent_state(void)
21398+{
21399+}
21400+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1a6e0f06 21401+#endif
e4b2b4a8
JK
21402+
21403 /*
21404 * Force a quiescent state for RCU-sched.
21405 */
b3bbd485 21406@@ -687,9 +718,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
e4b2b4a8
JK
21407 case RCU_FLAVOR:
21408 rsp = rcu_state_p;
21409 break;
21410+#ifndef CONFIG_PREEMPT_RT_FULL
21411 case RCU_BH_FLAVOR:
21412 rsp = &rcu_bh_state;
21413 break;
21414+#endif
21415 case RCU_SCHED_FLAVOR:
21416 rsp = &rcu_sched_state;
21417 break;
b3bbd485 21418@@ -2918,18 +2951,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
e4b2b4a8
JK
21419 /*
21420 * Do RCU core processing for the current CPU.
21421 */
21422-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
21423+static __latent_entropy void rcu_process_callbacks(void)
21424 {
21425 struct rcu_state *rsp;
1a6e0f06 21426
e4b2b4a8
JK
21427 if (cpu_is_offline(smp_processor_id()))
21428 return;
21429- trace_rcu_utilization(TPS("Start RCU core"));
21430 for_each_rcu_flavor(rsp)
21431 __rcu_process_callbacks(rsp);
21432- trace_rcu_utilization(TPS("End RCU core"));
1a6e0f06
JK
21433 }
21434
e4b2b4a8
JK
21435+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21436 /*
21437 * Schedule RCU callback invocation. If the specified type of RCU
21438 * does not support RCU priority boosting, just do a direct call,
b3bbd485 21439@@ -2941,19 +2973,106 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1a6e0f06 21440 {
e4b2b4a8
JK
21441 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
21442 return;
21443- if (likely(!rsp->boost)) {
21444- rcu_do_batch(rsp, rdp);
b3bbd485
JK
21445- return;
21446- }
21447- invoke_rcu_callbacks_kthread();
e4b2b4a8 21448+ rcu_do_batch(rsp, rdp);
b3bbd485
JK
21449 }
21450
e4b2b4a8
JK
21451+static void rcu_wake_cond(struct task_struct *t, int status)
21452+{
21453+ /*
21454+ * If the thread is yielding, only wake it when this
21455+ * is invoked from idle
21456+ */
21457+ if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
21458+ wake_up_process(t);
21459+}
1a6e0f06 21460+
e4b2b4a8
JK
21461+/*
21462+ * Wake up this CPU's rcuc kthread to do RCU core processing.
21463+ */
b3bbd485
JK
21464 static void invoke_rcu_core(void)
21465 {
21466- if (cpu_online(smp_processor_id()))
21467- raise_softirq(RCU_SOFTIRQ);
e4b2b4a8
JK
21468+ unsigned long flags;
21469+ struct task_struct *t;
1a6e0f06 21470+
e4b2b4a8 21471+ if (!cpu_online(smp_processor_id()))
b3bbd485 21472+ return;
e4b2b4a8
JK
21473+ local_irq_save(flags);
21474+ __this_cpu_write(rcu_cpu_has_work, 1);
21475+ t = __this_cpu_read(rcu_cpu_kthread_task);
21476+ if (t != NULL && current != t)
21477+ rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
21478+ local_irq_restore(flags);
21479+}
1a6e0f06 21480+
e4b2b4a8
JK
21481+static void rcu_cpu_kthread_park(unsigned int cpu)
21482+{
21483+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21484+}
1a6e0f06 21485+
e4b2b4a8 21486+static int rcu_cpu_kthread_should_run(unsigned int cpu)
1a6e0f06 21487+{
e4b2b4a8 21488+ return __this_cpu_read(rcu_cpu_has_work);
b3bbd485
JK
21489 }
21490
e4b2b4a8
JK
21491+/*
21492+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
21493+ * RCU softirq used in flavors and configurations of RCU that do not
21494+ * support RCU priority boosting.
21495+ */
21496+static void rcu_cpu_kthread(unsigned int cpu)
21497+{
21498+ unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21499+ char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21500+ int spincnt;
21501+
21502+ for (spincnt = 0; spincnt < 10; spincnt++) {
21503+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21504+ local_bh_disable();
21505+ *statusp = RCU_KTHREAD_RUNNING;
21506+ this_cpu_inc(rcu_cpu_kthread_loops);
21507+ local_irq_disable();
21508+ work = *workp;
21509+ *workp = 0;
21510+ local_irq_enable();
21511+ if (work)
21512+ rcu_process_callbacks();
21513+ local_bh_enable();
21514+ if (*workp == 0) {
21515+ trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21516+ *statusp = RCU_KTHREAD_WAITING;
21517+ return;
21518+ }
b3bbd485 21519+ }
e4b2b4a8
JK
21520+ *statusp = RCU_KTHREAD_YIELDING;
21521+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21522+ schedule_timeout_interruptible(2);
21523+ trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21524+ *statusp = RCU_KTHREAD_WAITING;
b3bbd485
JK
21525+}
21526+
e4b2b4a8
JK
21527+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21528+ .store = &rcu_cpu_kthread_task,
21529+ .thread_should_run = rcu_cpu_kthread_should_run,
21530+ .thread_fn = rcu_cpu_kthread,
21531+ .thread_comm = "rcuc/%u",
21532+ .setup = rcu_cpu_kthread_setup,
21533+ .park = rcu_cpu_kthread_park,
21534+};
21535+
21536+/*
21537+ * Spawn per-CPU RCU core processing kthreads.
21538+ */
21539+static int __init rcu_spawn_core_kthreads(void)
b3bbd485 21540+{
e4b2b4a8
JK
21541+ int cpu;
21542+
21543+ for_each_possible_cpu(cpu)
21544+ per_cpu(rcu_cpu_has_work, cpu) = 0;
21545+ BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21546+ return 0;
b3bbd485 21547+}
e4b2b4a8 21548+early_initcall(rcu_spawn_core_kthreads);
b3bbd485 21549+
e4b2b4a8
JK
21550 /*
21551 * Handle any core-RCU processing required by a call_rcu() invocation.
b3bbd485
JK
21552 */
21553@@ -3113,6 +3232,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
1a6e0f06 21554 }
e4b2b4a8 21555 EXPORT_SYMBOL_GPL(call_rcu_sched);
1a6e0f06 21556
e4b2b4a8 21557+#ifndef CONFIG_PREEMPT_RT_FULL
1a6e0f06 21558 /**
e4b2b4a8
JK
21559 * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
21560 * @head: structure to be used for queueing the RCU updates.
b3bbd485 21561@@ -3140,6 +3260,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
e4b2b4a8 21562 __call_rcu(head, func, &rcu_bh_state, -1, 0);
1a6e0f06 21563 }
e4b2b4a8
JK
21564 EXPORT_SYMBOL_GPL(call_rcu_bh);
21565+#endif
1a6e0f06 21566
e4b2b4a8
JK
21567 /*
21568 * Queue an RCU callback for lazy invocation after a grace period.
b3bbd485 21569@@ -3225,6 +3346,7 @@ void synchronize_sched(void)
e4b2b4a8
JK
21570 }
21571 EXPORT_SYMBOL_GPL(synchronize_sched);
1a6e0f06 21572
e4b2b4a8
JK
21573+#ifndef CONFIG_PREEMPT_RT_FULL
21574 /**
21575 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
21576 *
b3bbd485 21577@@ -3251,6 +3373,7 @@ void synchronize_rcu_bh(void)
e4b2b4a8
JK
21578 wait_rcu_gp(call_rcu_bh);
21579 }
21580 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
21581+#endif
1a6e0f06 21582
e4b2b4a8
JK
21583 /**
21584 * get_state_synchronize_rcu - Snapshot current RCU state
b3bbd485 21585@@ -3601,6 +3724,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
e4b2b4a8
JK
21586 mutex_unlock(&rsp->barrier_mutex);
21587 }
1a6e0f06 21588
e4b2b4a8
JK
21589+#ifndef CONFIG_PREEMPT_RT_FULL
21590 /**
21591 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
21592 */
b3bbd485 21593@@ -3609,6 +3733,7 @@ void rcu_barrier_bh(void)
e4b2b4a8
JK
21594 _rcu_barrier(&rcu_bh_state);
21595 }
21596 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
21597+#endif
1a6e0f06 21598
e4b2b4a8
JK
21599 /**
21600 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
b3bbd485 21601@@ -3741,8 +3866,6 @@ int rcutree_online_cpu(unsigned int cpu)
e4b2b4a8
JK
21602 {
21603 sync_sched_exp_online_cleanup(cpu);
21604 rcutree_affinity_setting(cpu, -1);
21605- if (IS_ENABLED(CONFIG_TREE_SRCU))
21606- srcu_online_cpu(cpu);
21607 return 0;
21608 }
1a6e0f06 21609
b3bbd485 21610@@ -3753,8 +3876,6 @@ int rcutree_online_cpu(unsigned int cpu)
e4b2b4a8
JK
21611 int rcutree_offline_cpu(unsigned int cpu)
21612 {
21613 rcutree_affinity_setting(cpu, cpu);
21614- if (IS_ENABLED(CONFIG_TREE_SRCU))
21615- srcu_offline_cpu(cpu);
21616 return 0;
21617 }
1a6e0f06 21618
b3bbd485 21619@@ -4184,12 +4305,13 @@ void __init rcu_init(void)
1a6e0f06 21620
e4b2b4a8
JK
21621 rcu_bootup_announce();
21622 rcu_init_geometry();
21623+#ifndef CONFIG_PREEMPT_RT_FULL
21624 rcu_init_one(&rcu_bh_state);
21625+#endif
21626 rcu_init_one(&rcu_sched_state);
21627 if (dump_tree)
21628 rcu_dump_rcu_node_tree(&rcu_sched_state);
21629 __rcu_init_preempt();
21630- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1a6e0f06 21631
e4b2b4a8
JK
21632 /*
21633 * We don't need protection against CPU-hotplug here because
b3bbd485 21634@@ -4200,8 +4322,6 @@ void __init rcu_init(void)
e4b2b4a8
JK
21635 for_each_online_cpu(cpu) {
21636 rcutree_prepare_cpu(cpu);
21637 rcu_cpu_starting(cpu);
21638- if (IS_ENABLED(CONFIG_TREE_SRCU))
21639- srcu_online_cpu(cpu);
21640 }
1a6e0f06
JK
21641 }
21642
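The tree.c changes above drop RCU_SOFTIRQ and hand core processing to per-CPU rcuc kthreads registered through the smpboot infrastructure. For reference, the smp_hotplug_thread API used there looks roughly like this in isolation; the thread name and callbacks below are invented for illustration and are not the rcuc implementation:

#include <linux/smpboot.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/init.h>

static DEFINE_PER_CPU(struct task_struct *, example_thread);
static DEFINE_PER_CPU(int, example_has_work);

static int example_should_run(unsigned int cpu)
{
        return __this_cpu_read(example_has_work);
}

static void example_thread_fn(unsigned int cpu)
{
        __this_cpu_write(example_has_work, 0);
        /* ... process this CPU's pending work ... */
}

static struct smp_hotplug_thread example_threads = {
        .store                  = &example_thread,
        .thread_should_run      = example_should_run,
        .thread_fn              = example_thread_fn,
        .thread_comm            = "example/%u",
};

static int __init example_spawn_threads(void)
{
        return smpboot_register_percpu_thread(&example_threads);
}
early_initcall(example_spawn_threads);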
b3bbd485
JK
21643diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
21644index 8e1f285f0a70..7acc23da94e2 100644
21645--- a/kernel/rcu/tree.h
21646+++ b/kernel/rcu/tree.h
21647@@ -427,7 +427,9 @@ extern struct list_head rcu_struct_flavors;
e4b2b4a8
JK
21648 */
21649 extern struct rcu_state rcu_sched_state;
1a6e0f06 21650
e4b2b4a8
JK
21651+#ifndef CONFIG_PREEMPT_RT_FULL
21652 extern struct rcu_state rcu_bh_state;
21653+#endif
1a6e0f06 21654
e4b2b4a8
JK
21655 #ifdef CONFIG_PREEMPT_RCU
21656 extern struct rcu_state rcu_preempt_state;
b3bbd485 21657@@ -436,12 +438,10 @@ extern struct rcu_state rcu_preempt_state;
e4b2b4a8
JK
21658 int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
21659 bool rcu_eqs_special_set(int cpu);
1a6e0f06 21660
e4b2b4a8
JK
21661-#ifdef CONFIG_RCU_BOOST
21662 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21663 DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
21664 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21665 DECLARE_PER_CPU(char, rcu_cpu_has_work);
21666-#endif /* #ifdef CONFIG_RCU_BOOST */
1a6e0f06 21667
e4b2b4a8
JK
21668 #ifndef RCU_TREE_NONCORE
21669
b3bbd485 21670@@ -461,10 +461,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
e4b2b4a8
JK
21671 static void __init __rcu_init_preempt(void);
21672 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
21673 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
21674-static void invoke_rcu_callbacks_kthread(void);
21675 static bool rcu_is_callbacks_kthread(void);
21676+static void rcu_cpu_kthread_setup(unsigned int cpu);
21677 #ifdef CONFIG_RCU_BOOST
21678-static void rcu_preempt_do_callbacks(void);
21679 static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
21680 struct rcu_node *rnp);
21681 #endif /* #ifdef CONFIG_RCU_BOOST */
b3bbd485
JK
21682diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
21683index 8b3102d22823..17ee8d1f38c4 100644
21684--- a/kernel/rcu/tree_plugin.h
21685+++ b/kernel/rcu/tree_plugin.h
e4b2b4a8
JK
21686@@ -24,39 +24,16 @@
21687 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21688 */
21689
21690-#include <linux/delay.h>
21691-#include <linux/gfp.h>
21692-#include <linux/oom.h>
21693-#include <linux/sched/debug.h>
21694-#include <linux/smpboot.h>
21695-#include <uapi/linux/sched/types.h>
21696-#include "../time/tick-internal.h"
21697-
21698-#ifdef CONFIG_RCU_BOOST
21699-
21700 #include "../locking/rtmutex_common.h"
21701
21702 /*
21703 * Control variables for per-CPU and per-rcu_node kthreads. These
21704 * handle all flavors of RCU.
21705 */
21706-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21707 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
21708 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
21709 DEFINE_PER_CPU(char, rcu_cpu_has_work);
21710
21711-#else /* #ifdef CONFIG_RCU_BOOST */
21712-
21713-/*
21714- * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
21715- * all uses are in dead code. Provide a definition to keep the compiler
21716- * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
21717- * This probably needs to be excluded from -rt builds.
21718- */
21719-#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
21720-
21721-#endif /* #else #ifdef CONFIG_RCU_BOOST */
21722-
21723 #ifdef CONFIG_RCU_NOCB_CPU
21724 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
21725 static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
b3bbd485 21726@@ -324,9 +301,13 @@ static void rcu_preempt_note_context_switch(bool preempt)
e4b2b4a8
JK
21727 struct task_struct *t = current;
21728 struct rcu_data *rdp;
21729 struct rcu_node *rnp;
21730+ int sleeping_l = 0;
21731
21732 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
21733- WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
21734+#if defined(CONFIG_PREEMPT_RT_FULL)
21735+ sleeping_l = t->sleeping_lock;
21736+#endif
21737+ WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l);
21738 if (t->rcu_read_lock_nesting > 0 &&
21739 !t->rcu_read_unlock_special.b.blocked) {
21740
b3bbd485 21741@@ -463,7 +444,7 @@ void rcu_read_unlock_special(struct task_struct *t)
e4b2b4a8
JK
21742 }
21743
21744 /* Hardware IRQ handlers cannot block, complain if they get here. */
21745- if (in_irq() || in_serving_softirq()) {
21746+ if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
21747 lockdep_rcu_suspicious(__FILE__, __LINE__,
21748 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
21749 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
b3bbd485 21750@@ -530,7 +511,7 @@ void rcu_read_unlock_special(struct task_struct *t)
e4b2b4a8
JK
21751
21752 /* Unboost if we were boosted. */
21753 if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
21754- rt_mutex_unlock(&rnp->boost_mtx);
21755+ rt_mutex_futex_unlock(&rnp->boost_mtx);
21756
21757 /*
21758 * If this was the last task on the expedited lists,
b3bbd485 21759@@ -684,15 +665,6 @@ static void rcu_preempt_check_callbacks(void)
e4b2b4a8
JK
21760 t->rcu_read_unlock_special.b.need_qs = true;
21761 }
21762
21763-#ifdef CONFIG_RCU_BOOST
21764-
21765-static void rcu_preempt_do_callbacks(void)
21766-{
21767- rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
21768-}
21769-
21770-#endif /* #ifdef CONFIG_RCU_BOOST */
21771-
21772 /**
21773 * call_rcu() - Queue an RCU callback for invocation after a grace period.
21774 * @head: structure to be used for queueing the RCU updates.
b3bbd485 21775@@ -915,20 +887,23 @@ void exit_rcu(void)
e4b2b4a8
JK
21776
21777 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
1a6e0f06 21778
1a6e0f06 21779+/*
e4b2b4a8 21780+ * If boosting, set rcuc kthreads to realtime priority.
1a6e0f06 21781+ */
e4b2b4a8 21782+static void rcu_cpu_kthread_setup(unsigned int cpu)
1a6e0f06 21783+{
b3bbd485 21784+#ifdef CONFIG_RCU_BOOST
e4b2b4a8 21785+ struct sched_param sp;
b3bbd485
JK
21786+
21787+ sp.sched_priority = kthread_prio;
21788+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21789+#endif /* #ifdef CONFIG_RCU_BOOST */
21790+}
21791+
21792 #ifdef CONFIG_RCU_BOOST
21793
21794 #include "../locking/rtmutex_common.h"
e4b2b4a8 21795
e4b2b4a8
JK
21796-static void rcu_wake_cond(struct task_struct *t, int status)
21797-{
21798- /*
21799- * If the thread is yielding, only wake it when this
21800- * is invoked from idle
21801- */
21802- if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
21803- wake_up_process(t);
b3bbd485
JK
21804-}
21805-
e4b2b4a8
JK
21806 /*
21807 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
21808 * or ->boost_tasks, advancing the pointer to the next task in the
b3bbd485
JK
21809@@ -1070,23 +1045,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
21810 }
e4b2b4a8 21811 }
1a6e0f06 21812
b3bbd485 21813-/*
e4b2b4a8
JK
21814- * Wake up the per-CPU kthread to invoke RCU callbacks.
21815- */
21816-static void invoke_rcu_callbacks_kthread(void)
21817-{
21818- unsigned long flags;
21819-
21820- local_irq_save(flags);
21821- __this_cpu_write(rcu_cpu_has_work, 1);
21822- if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
21823- current != __this_cpu_read(rcu_cpu_kthread_task)) {
21824- rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
21825- __this_cpu_read(rcu_cpu_kthread_status));
21826- }
21827- local_irq_restore(flags);
21828-}
21829-
b3bbd485 21830 /*
e4b2b4a8
JK
21831 * Is the current CPU running the RCU-callbacks kthread?
21832 * Caller must have preemption disabled.
b3bbd485 21833@@ -1141,67 +1099,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
e4b2b4a8
JK
21834 return 0;
21835 }
1a6e0f06 21836
e4b2b4a8
JK
21837-static void rcu_kthread_do_work(void)
21838-{
21839- rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
21840- rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
21841- rcu_preempt_do_callbacks();
21842-}
21843-
21844-static void rcu_cpu_kthread_setup(unsigned int cpu)
21845-{
21846- struct sched_param sp;
21847-
21848- sp.sched_priority = kthread_prio;
21849- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
21850-}
21851-
21852-static void rcu_cpu_kthread_park(unsigned int cpu)
21853-{
21854- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21855-}
21856-
21857-static int rcu_cpu_kthread_should_run(unsigned int cpu)
21858-{
21859- return __this_cpu_read(rcu_cpu_has_work);
21860-}
21861-
21862-/*
21863- * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
21864- * RCU softirq used in flavors and configurations of RCU that do not
21865- * support RCU priority boosting.
21866- */
21867-static void rcu_cpu_kthread(unsigned int cpu)
21868-{
21869- unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21870- char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21871- int spincnt;
21872-
21873- for (spincnt = 0; spincnt < 10; spincnt++) {
21874- trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21875- local_bh_disable();
21876- *statusp = RCU_KTHREAD_RUNNING;
21877- this_cpu_inc(rcu_cpu_kthread_loops);
21878- local_irq_disable();
21879- work = *workp;
21880- *workp = 0;
21881- local_irq_enable();
21882- if (work)
21883- rcu_kthread_do_work();
21884- local_bh_enable();
21885- if (*workp == 0) {
21886- trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21887- *statusp = RCU_KTHREAD_WAITING;
21888- return;
21889- }
21890- }
21891- *statusp = RCU_KTHREAD_YIELDING;
21892- trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21893- schedule_timeout_interruptible(2);
21894- trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21895- *statusp = RCU_KTHREAD_WAITING;
21896-}
21897-
21898 /*
21899 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
21900 * served by the rcu_node in question. The CPU hotplug lock is still
b3bbd485 21901@@ -1232,26 +1129,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
e4b2b4a8
JK
21902 free_cpumask_var(cm);
21903 }
1a6e0f06 21904
e4b2b4a8
JK
21905-static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21906- .store = &rcu_cpu_kthread_task,
21907- .thread_should_run = rcu_cpu_kthread_should_run,
21908- .thread_fn = rcu_cpu_kthread,
21909- .thread_comm = "rcuc/%u",
21910- .setup = rcu_cpu_kthread_setup,
21911- .park = rcu_cpu_kthread_park,
21912-};
21913-
21914 /*
21915 * Spawn boost kthreads -- called as soon as the scheduler is running.
21916 */
21917 static void __init rcu_spawn_boost_kthreads(void)
21918 {
21919 struct rcu_node *rnp;
21920- int cpu;
21921-
21922- for_each_possible_cpu(cpu)
21923- per_cpu(rcu_cpu_has_work, cpu) = 0;
21924- BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21925 rcu_for_each_leaf_node(rcu_state_p, rnp)
21926 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
21927 }
b3bbd485 21928@@ -1274,11 +1157,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
e4b2b4a8
JK
21929 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
21930 }
1a6e0f06 21931
e4b2b4a8
JK
21932-static void invoke_rcu_callbacks_kthread(void)
21933-{
21934- WARN_ON_ONCE(1);
21935-}
21936-
21937 static bool rcu_is_callbacks_kthread(void)
21938 {
21939 return false;
b3bbd485 21940@@ -1302,7 +1180,7 @@ static void rcu_prepare_kthreads(int cpu)
1a6e0f06 21941
e4b2b4a8 21942 #endif /* #else #ifdef CONFIG_RCU_BOOST */
1a6e0f06 21943
e4b2b4a8
JK
21944-#if !defined(CONFIG_RCU_FAST_NO_HZ)
21945+#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
1a6e0f06 21946
e4b2b4a8
JK
21947 /*
21948 * Check to see if any future RCU-related work will need to be done
b3bbd485 21949@@ -1318,7 +1196,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
e4b2b4a8
JK
21950 *nextevt = KTIME_MAX;
21951 return rcu_cpu_has_callbacks(NULL);
1a6e0f06 21952 }
e4b2b4a8 21953+#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
1a6e0f06 21954
e4b2b4a8
JK
21955+#if !defined(CONFIG_RCU_FAST_NO_HZ)
21956 /*
21957 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
21958 * after it.
b3bbd485 21959@@ -1414,6 +1294,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
e4b2b4a8
JK
21960 return cbs_ready;
21961 }
1f39f580 21962
e4b2b4a8
JK
21963+#ifndef CONFIG_PREEMPT_RT_FULL
21964+
21965 /*
21966 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
21967 * to invoke. If the CPU has callbacks, try to advance them. Tell the
b3bbd485 21968@@ -1456,6 +1338,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
e4b2b4a8
JK
21969 *nextevt = basemono + dj * TICK_NSEC;
21970 return 0;
21971 }
21972+#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
1f39f580 21973
e4b2b4a8
JK
21974 /*
21975 * Prepare a CPU for idle from an RCU perspective. The first major task
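The rcu_cpu_kthread_setup() hook added in the tree_plugin.h hunk above raises the rcuc threads to SCHED_FIFO when RCU_BOOST is configured, using the in-kernel scheduler API. A minimal stand-alone sketch of that call; the fixed priority stands in for the rcutree.kthread_prio= value used by the patch and the function name is hypothetical:

#include <linux/sched.h>
#include <uapi/linux/sched/types.h>     /* struct sched_param */

static void example_make_current_fifo(void)
{
        struct sched_param sp = { .sched_priority = 1 };

        sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}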
b3bbd485
JK
21976diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
21977index 7a577bd989a4..2006a09680aa 100644
21978--- a/kernel/rcu/update.c
21979+++ b/kernel/rcu/update.c
21980@@ -66,7 +66,7 @@ extern int rcu_expedited; /* from sysctl */
e4b2b4a8
JK
21981 module_param(rcu_expedited, int, 0);
21982 extern int rcu_normal; /* from sysctl */
21983 module_param(rcu_normal, int, 0);
21984-static int rcu_normal_after_boot;
21985+static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
21986 module_param(rcu_normal_after_boot, int, 0);
21987 #endif /* #ifndef CONFIG_TINY_RCU */
1f39f580 21988
b3bbd485 21989@@ -333,6 +333,7 @@ int rcu_read_lock_held(void)
e4b2b4a8
JK
21990 }
21991 EXPORT_SYMBOL_GPL(rcu_read_lock_held);
1f39f580 21992
e4b2b4a8
JK
21993+#ifndef CONFIG_PREEMPT_RT_FULL
21994 /**
21995 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
21996 *
b3bbd485 21997@@ -359,6 +360,7 @@ int rcu_read_lock_bh_held(void)
e4b2b4a8
JK
21998 return in_softirq() || irqs_disabled();
21999 }
22000 EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
22001+#endif
1f39f580 22002
e4b2b4a8 22003 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
1f39f580 22004
b3bbd485
JK
22005diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
22006index a9ee16bbc693..9943019095e9 100644
22007--- a/kernel/sched/Makefile
22008+++ b/kernel/sched/Makefile
22009@@ -18,7 +18,7 @@ endif
22010
22011 obj-y += core.o loadavg.o clock.o cputime.o
22012 obj-y += idle_task.o fair.o rt.o deadline.o
22013-obj-y += wait.o wait_bit.o swait.o completion.o idle.o
22014+obj-y += wait.o wait_bit.o swait.o swork.o completion.o idle.o
22015 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
22016 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
22017 obj-$(CONFIG_SCHEDSTATS) += stats.o
22018diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
22019index 2ddaec40956f..0fe2982e46a0 100644
22020--- a/kernel/sched/completion.c
22021+++ b/kernel/sched/completion.c
22022@@ -32,7 +32,7 @@ void complete(struct completion *x)
e4b2b4a8
JK
22023 {
22024 unsigned long flags;
1f39f580 22025
e4b2b4a8
JK
22026- spin_lock_irqsave(&x->wait.lock, flags);
22027+ raw_spin_lock_irqsave(&x->wait.lock, flags);
1f39f580 22028
e4b2b4a8
JK
22029 /*
22030 * Perform commit of crossrelease here.
b3bbd485 22031@@ -41,8 +41,8 @@ void complete(struct completion *x)
1f39f580 22032
e4b2b4a8
JK
22033 if (x->done != UINT_MAX)
22034 x->done++;
22035- __wake_up_locked(&x->wait, TASK_NORMAL, 1);
22036- spin_unlock_irqrestore(&x->wait.lock, flags);
22037+ swake_up_locked(&x->wait);
22038+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22039 }
22040 EXPORT_SYMBOL(complete);
1f39f580 22041
b3bbd485 22042@@ -66,10 +66,10 @@ void complete_all(struct completion *x)
e4b2b4a8
JK
22043 {
22044 unsigned long flags;
1f39f580 22045
e4b2b4a8
JK
22046- spin_lock_irqsave(&x->wait.lock, flags);
22047+ raw_spin_lock_irqsave(&x->wait.lock, flags);
22048 x->done = UINT_MAX;
22049- __wake_up_locked(&x->wait, TASK_NORMAL, 0);
22050- spin_unlock_irqrestore(&x->wait.lock, flags);
22051+ swake_up_all_locked(&x->wait);
22052+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22053 }
22054 EXPORT_SYMBOL(complete_all);
1f39f580 22055
b3bbd485 22056@@ -78,20 +78,20 @@ do_wait_for_common(struct completion *x,
e4b2b4a8
JK
22057 long (*action)(long), long timeout, int state)
22058 {
22059 if (!x->done) {
22060- DECLARE_WAITQUEUE(wait, current);
22061+ DECLARE_SWAITQUEUE(wait);
1f39f580 22062
e4b2b4a8
JK
22063- __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
22064+ __prepare_to_swait(&x->wait, &wait);
22065 do {
22066 if (signal_pending_state(state, current)) {
22067 timeout = -ERESTARTSYS;
22068 break;
22069 }
22070 __set_current_state(state);
22071- spin_unlock_irq(&x->wait.lock);
22072+ raw_spin_unlock_irq(&x->wait.lock);
22073 timeout = action(timeout);
22074- spin_lock_irq(&x->wait.lock);
22075+ raw_spin_lock_irq(&x->wait.lock);
22076 } while (!x->done && timeout);
22077- __remove_wait_queue(&x->wait, &wait);
22078+ __finish_swait(&x->wait, &wait);
22079 if (!x->done)
22080 return timeout;
22081 }
b3bbd485 22082@@ -108,9 +108,9 @@ __wait_for_common(struct completion *x,
1f39f580 22083
e4b2b4a8 22084 complete_acquire(x);
1f39f580 22085
e4b2b4a8
JK
22086- spin_lock_irq(&x->wait.lock);
22087+ raw_spin_lock_irq(&x->wait.lock);
22088 timeout = do_wait_for_common(x, action, timeout, state);
22089- spin_unlock_irq(&x->wait.lock);
22090+ raw_spin_unlock_irq(&x->wait.lock);
1f39f580 22091
e4b2b4a8
JK
22092 complete_release(x);
22093
b3bbd485 22094@@ -299,12 +299,12 @@ bool try_wait_for_completion(struct completion *x)
e4b2b4a8
JK
22095 if (!READ_ONCE(x->done))
22096 return 0;
22097
22098- spin_lock_irqsave(&x->wait.lock, flags);
22099+ raw_spin_lock_irqsave(&x->wait.lock, flags);
22100 if (!x->done)
22101 ret = 0;
22102 else if (x->done != UINT_MAX)
22103 x->done--;
22104- spin_unlock_irqrestore(&x->wait.lock, flags);
22105+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
1f39f580
JK
22106 return ret;
22107 }
e4b2b4a8 22108 EXPORT_SYMBOL(try_wait_for_completion);
b3bbd485 22109@@ -330,8 +330,8 @@ bool completion_done(struct completion *x)
e4b2b4a8
JK
22110 * otherwise we can end up freeing the completion before complete()
22111 * is done referencing it.
22112 */
22113- spin_lock_irqsave(&x->wait.lock, flags);
22114- spin_unlock_irqrestore(&x->wait.lock, flags);
22115+ raw_spin_lock_irqsave(&x->wait.lock, flags);
22116+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22117 return true;
22118 }
22119 EXPORT_SYMBOL(completion_done);
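The completion.c conversion above switches the wait-queue lock to a raw spinlock and wakes waiters through the simple-waitqueue helpers, so complete() remains safe from contexts that must not sleep on RT (the matching completion.h change elsewhere in this series moves struct completion to a swait queue). The public API is unchanged; a short usage sketch under that assumption, with illustrative names:

#include <linux/completion.h>
#include <linux/sched.h>

static DECLARE_COMPLETION(example_done);

/* Waiter side: blocks until the event fires; unaffected by the patch. */
static void example_wait(void)
{
        wait_for_completion(&example_done);
}

/* Completer side: with the raw-lock/swait conversion this stays callable
 * from hard interrupt context even on PREEMPT_RT.
 */
static void example_signal(void)
{
        complete(&example_done);
}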
b3bbd485
JK
22120diff --git a/kernel/sched/core.c b/kernel/sched/core.c
22121index 4e89ed8a0fb2..6e6bd5262f23 100644
22122--- a/kernel/sched/core.c
22123+++ b/kernel/sched/core.c
22124@@ -59,7 +59,11 @@ const_debug unsigned int sysctl_sched_features =
e4b2b4a8
JK
22125 * Number of tasks to iterate in a single balance run.
22126 * Limited because this is done with IRQs disabled.
22127 */
22128+#ifndef CONFIG_PREEMPT_RT_FULL
22129 const_debug unsigned int sysctl_sched_nr_migrate = 32;
22130+#else
22131+const_debug unsigned int sysctl_sched_nr_migrate = 8;
22132+#endif
1f39f580 22133
e4b2b4a8
JK
22134 /*
22135 * period over which we average the RT time consumption, measured
b3bbd485 22136@@ -341,7 +345,7 @@ static void init_rq_hrtick(struct rq *rq)
e4b2b4a8
JK
22137 rq->hrtick_csd.info = rq;
22138 #endif
1f39f580 22139
e4b2b4a8
JK
22140- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22141+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
22142 rq->hrtick_timer.function = hrtick;
22143 }
22144 #else /* CONFIG_SCHED_HRTICK */
b3bbd485 22145@@ -423,9 +427,15 @@ static bool set_nr_if_polling(struct task_struct *p)
e4b2b4a8
JK
22146 #endif
22147 #endif
1f39f580 22148
e4b2b4a8
JK
22149-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
22150+void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
22151+ bool sleeper)
22152 {
22153- struct wake_q_node *node = &task->wake_q;
22154+ struct wake_q_node *node;
22155+
22156+ if (sleeper)
22157+ node = &task->wake_q_sleeper;
22158+ else
22159+ node = &task->wake_q;
1f39f580 22160
e4b2b4a8
JK
22161 /*
22162 * Atomically grab the task, if ->wake_q is !nil already it means
b3bbd485 22163@@ -447,24 +457,32 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
e4b2b4a8
JK
22164 head->lastp = &node->next;
22165 }
22166
22167-void wake_up_q(struct wake_q_head *head)
22168+void __wake_up_q(struct wake_q_head *head, bool sleeper)
22169 {
22170 struct wake_q_node *node = head->first;
22171
22172 while (node != WAKE_Q_TAIL) {
22173 struct task_struct *task;
22174
22175- task = container_of(node, struct task_struct, wake_q);
22176+ if (sleeper)
22177+ task = container_of(node, struct task_struct, wake_q_sleeper);
22178+ else
22179+ task = container_of(node, struct task_struct, wake_q);
22180 BUG_ON(!task);
22181 /* Task can safely be re-inserted now: */
22182 node = node->next;
22183- task->wake_q.next = NULL;
22184-
22185+ if (sleeper)
22186+ task->wake_q_sleeper.next = NULL;
22187+ else
22188+ task->wake_q.next = NULL;
22189 /*
22190 * wake_up_process() implies a wmb() to pair with the queueing
22191 * in wake_q_add() so as not to miss wakeups.
22192 */
22193- wake_up_process(task);
22194+ if (sleeper)
22195+ wake_up_lock_sleeper(task);
22196+ else
22197+ wake_up_process(task);
22198 put_task_struct(task);
1f39f580 22199 }
e4b2b4a8 22200 }
b3bbd485 22201@@ -500,6 +518,48 @@ void resched_curr(struct rq *rq)
e4b2b4a8
JK
22202 trace_sched_wake_idle_without_ipi(cpu);
22203 }
1f39f580 22204
e4b2b4a8
JK
22205+#ifdef CONFIG_PREEMPT_LAZY
22206+
22207+static int tsk_is_polling(struct task_struct *p)
22208+{
22209+#ifdef TIF_POLLING_NRFLAG
22210+ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
22211+#else
22212+ return 0;
22213+#endif
22214+}
22215+
22216+void resched_curr_lazy(struct rq *rq)
22217+{
22218+ struct task_struct *curr = rq->curr;
22219+ int cpu;
22220+
22221+ if (!sched_feat(PREEMPT_LAZY)) {
22222+ resched_curr(rq);
22223+ return;
22224+ }
22225+
22226+ lockdep_assert_held(&rq->lock);
22227+
22228+ if (test_tsk_need_resched(curr))
22229+ return;
22230+
22231+ if (test_tsk_need_resched_lazy(curr))
22232+ return;
22233+
22234+ set_tsk_need_resched_lazy(curr);
22235+
22236+ cpu = cpu_of(rq);
22237+ if (cpu == smp_processor_id())
22238+ return;
22239+
22240+ /* NEED_RESCHED_LAZY must be visible before we test polling */
22241+ smp_mb();
22242+ if (!tsk_is_polling(curr))
22243+ smp_send_reschedule(cpu);
22244+}
22245+#endif
22246+
22247 void resched_cpu(int cpu)
1f39f580 22248 {
e4b2b4a8 22249 struct rq *rq = cpu_rq(cpu);
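The resched_curr_lazy() addition above only flags a lazy reschedule unless the PREEMPT_LAZY feature is off or a request is already pending; the hard TIF_NEED_RESCHED path stays with resched_curr(). A compact sketch of that decision ladder, with plain booleans standing in for the thread-info flags (names are illustrative, and the cross-CPU IPI/polling handling is omitted):

#include <stdio.h>
#include <stdbool.h>

struct cur {
	bool need_resched;                     /* TIF_NEED_RESCHED analog      */
	bool need_resched_lazy;                /* TIF_NEED_RESCHED_LAZY analog */
};

static bool preempt_lazy_enabled = true;       /* sched_feat(PREEMPT_LAZY) analog */

static void resched_hard(struct cur *c) { c->need_resched = true; }

static void resched_lazy(struct cur *c)
{
	if (!preempt_lazy_enabled) {           /* feature off: behave like resched_curr() */
		resched_hard(c);
		return;
	}
	if (c->need_resched)                   /* a hard request is already pending */
		return;
	if (c->need_resched_lazy)              /* a lazy request is already pending */
		return;
	c->need_resched_lazy = true;           /* only mark it; no immediate preemption */
}

int main(void)
{
	struct cur c = { 0 };

	resched_lazy(&c);
	printf("need_resched=%d need_resched_lazy=%d\n",
	       c.need_resched, c.need_resched_lazy);
	return 0;
}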
b3bbd485 22250@@ -523,11 +583,14 @@ void resched_cpu(int cpu)
e4b2b4a8
JK
22251 */
22252 int get_nohz_timer_target(void)
22253 {
22254- int i, cpu = smp_processor_id();
22255+ int i, cpu;
22256 struct sched_domain *sd;
1f39f580 22257
e4b2b4a8
JK
22258+ preempt_disable_rt();
22259+ cpu = smp_processor_id();
22260+
22261 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
22262- return cpu;
22263+ goto preempt_en_rt;
1f39f580 22264
e4b2b4a8
JK
22265 rcu_read_lock();
22266 for_each_domain(cpu, sd) {
b3bbd485 22267@@ -546,6 +609,8 @@ int get_nohz_timer_target(void)
e4b2b4a8
JK
22268 cpu = housekeeping_any_cpu();
22269 unlock:
22270 rcu_read_unlock();
22271+preempt_en_rt:
22272+ preempt_enable_rt();
22273 return cpu;
1f39f580
JK
22274 }
22275
b3bbd485 22276@@ -912,10 +977,10 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
e4b2b4a8
JK
22277 */
22278 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
1f39f580 22279 {
e4b2b4a8
JK
22280- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
22281+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
22282 return false;
1f39f580 22283
b3bbd485
JK
22284- if (is_per_cpu_kthread(p))
22285+ if (is_per_cpu_kthread(p) || __migrate_disabled(p))
22286 return cpu_online(cpu);
22287
22288 return cpu_active(cpu);
22289@@ -1007,7 +1072,7 @@ static int migration_cpu_stop(void *data)
e4b2b4a8 22290 local_irq_disable();
1f39f580 22291 /*
e4b2b4a8
JK
22292 * We need to explicitly wake pending tasks before running
22293- * __migrate_task() such that we will not miss enforcing cpus_allowed
22294+ * __migrate_task() such that we will not miss enforcing cpus_ptr
22295 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
22296 */
22297 sched_ttwu_pending();
b3bbd485 22298@@ -1038,11 +1103,19 @@ static int migration_cpu_stop(void *data)
e4b2b4a8
JK
22299 */
22300 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1f39f580 22301 {
e4b2b4a8
JK
22302- cpumask_copy(&p->cpus_allowed, new_mask);
22303+ cpumask_copy(&p->cpus_mask, new_mask);
22304 p->nr_cpus_allowed = cpumask_weight(new_mask);
22305 }
1f39f580 22306
e4b2b4a8 22307-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
b3bbd485 22308+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
e4b2b4a8
JK
22309+int __migrate_disabled(struct task_struct *p)
22310+{
22311+ return p->migrate_disable;
22312+}
22313+#endif
22314+
22315+static void __do_set_cpus_allowed_tail(struct task_struct *p,
22316+ const struct cpumask *new_mask)
22317 {
22318 struct rq *rq = task_rq(p);
22319 bool queued, running;
b3bbd485 22320@@ -1071,6 +1144,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
e4b2b4a8 22321 set_curr_task(rq, p);
1f39f580
JK
22322 }
22323
e4b2b4a8
JK
22324+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22325+{
b3bbd485 22326+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
e4b2b4a8
JK
22327+ if (__migrate_disabled(p)) {
22328+ lockdep_assert_held(&p->pi_lock);
22329+
22330+ cpumask_copy(&p->cpus_mask, new_mask);
22331+ p->migrate_disable_update = 1;
22332+ return;
22333+ }
22334+#endif
22335+ __do_set_cpus_allowed_tail(p, new_mask);
22336+}
22337+
22338 /*
22339 * Change a given task's CPU affinity. Migrate the thread to a
22340 * proper CPU and schedule it away if the CPU it's executing on
b3bbd485 22341@@ -1108,7 +1195,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
e4b2b4a8
JK
22342 goto out;
22343 }
1f39f580 22344
e4b2b4a8
JK
22345- if (cpumask_equal(&p->cpus_allowed, new_mask))
22346+ if (cpumask_equal(p->cpus_ptr, new_mask))
22347 goto out;
1f39f580 22348
e4b2b4a8 22349 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
b3bbd485 22350@@ -1129,9 +1216,16 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
e4b2b4a8 22351 }
1f39f580 22352
e4b2b4a8
JK
22353 /* Can the task run on the task's current CPU? If so, we're done */
22354- if (cpumask_test_cpu(task_cpu(p), new_mask))
22355+ if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
22356 goto out;
1f39f580 22357
b3bbd485 22358+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
e4b2b4a8
JK
22359+ if (__migrate_disabled(p)) {
22360+ p->migrate_disable_update = 1;
22361+ goto out;
22362+ }
22363+#endif
22364+
22365 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
22366 if (task_running(rq, p) || p->state == TASK_WAKING) {
22367 struct migration_arg arg = { p, dest_cpu };
b3bbd485 22368@@ -1269,10 +1363,10 @@ static int migrate_swap_stop(void *data)
e4b2b4a8
JK
22369 if (task_cpu(arg->src_task) != arg->src_cpu)
22370 goto unlock;
1f39f580 22371
e4b2b4a8
JK
22372- if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
22373+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
22374 goto unlock;
1f39f580 22375
e4b2b4a8
JK
22376- if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
22377+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
22378 goto unlock;
1a6e0f06 22379
e4b2b4a8 22380 __migrate_swap_task(arg->src_task, arg->dst_cpu);
b3bbd485 22381@@ -1313,10 +1407,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
e4b2b4a8
JK
22382 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
22383 goto out;
1a6e0f06 22384
e4b2b4a8
JK
22385- if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
22386+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
22387 goto out;
1a6e0f06 22388
e4b2b4a8
JK
22389- if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
22390+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
22391 goto out;
1a6e0f06 22392
e4b2b4a8 22393 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
b3bbd485 22394@@ -1326,6 +1420,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
e4b2b4a8 22395 return ret;
1a6e0f06
JK
22396 }
22397
e4b2b4a8
JK
22398+static bool check_task_state(struct task_struct *p, long match_state)
22399+{
22400+ bool match = false;
22401+
22402+ raw_spin_lock_irq(&p->pi_lock);
22403+ if (p->state == match_state || p->saved_state == match_state)
22404+ match = true;
22405+ raw_spin_unlock_irq(&p->pi_lock);
22406+
22407+ return match;
22408+}
22409+
22410 /*
22411 * wait_task_inactive - wait for a thread to unschedule.
22412 *
b3bbd485 22413@@ -1370,7 +1476,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
e4b2b4a8
JK
22414 * is actually now running somewhere else!
22415 */
22416 while (task_running(rq, p)) {
22417- if (match_state && unlikely(p->state != match_state))
22418+ if (match_state && !check_task_state(p, match_state))
22419 return 0;
22420 cpu_relax();
22421 }
b3bbd485 22422@@ -1385,7 +1491,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
e4b2b4a8
JK
22423 running = task_running(rq, p);
22424 queued = task_on_rq_queued(p);
22425 ncsw = 0;
22426- if (!match_state || p->state == match_state)
22427+ if (!match_state || p->state == match_state ||
22428+ p->saved_state == match_state)
22429 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
22430 task_rq_unlock(rq, p, &rf);
1a6e0f06 22431
b3bbd485 22432@@ -1460,7 +1567,7 @@ void kick_process(struct task_struct *p)
e4b2b4a8 22433 EXPORT_SYMBOL_GPL(kick_process);
1a6e0f06 22434
e4b2b4a8
JK
22435 /*
22436- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
22437+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
22438 *
22439 * A few notes on cpu_active vs cpu_online:
22440 *
b3bbd485 22441@@ -1500,14 +1607,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
e4b2b4a8
JK
22442 for_each_cpu(dest_cpu, nodemask) {
22443 if (!cpu_active(dest_cpu))
22444 continue;
22445- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
22446+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
22447 return dest_cpu;
22448 }
22449 }
c7c16703 22450
e4b2b4a8
JK
22451 for (;;) {
22452 /* Any allowed, online CPU? */
22453- for_each_cpu(dest_cpu, &p->cpus_allowed) {
22454+ for_each_cpu(dest_cpu, p->cpus_ptr) {
22455 if (!is_cpu_allowed(p, dest_cpu))
22456 continue;
22457
b3bbd485 22458@@ -1551,7 +1658,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
e4b2b4a8
JK
22459 }
22460
22461 /*
22462- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
22463+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
22464 */
22465 static inline
22466 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
b3bbd485 22467@@ -1561,11 +1668,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
e4b2b4a8
JK
22468 if (p->nr_cpus_allowed > 1)
22469 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22470 else
22471- cpu = cpumask_any(&p->cpus_allowed);
22472+ cpu = cpumask_any(p->cpus_ptr);
22473
22474 /*
22475 * In order not to call set_task_cpu() on a blocking task we need
22476- * to rely on ttwu() to place the task on a valid ->cpus_allowed
22477+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
22478 * CPU.
22479 *
22480 * Since this is common to all placement strategies, this lives here.
b3bbd485 22481@@ -1668,10 +1775,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
1a6e0f06 22482 {
e4b2b4a8
JK
22483 activate_task(rq, p, en_flags);
22484 p->on_rq = TASK_ON_RQ_QUEUED;
22485-
22486- /* If a worker is waking up, notify the workqueue: */
22487- if (p->flags & PF_WQ_WORKER)
22488- wq_worker_waking_up(p, cpu_of(rq));
22489 }
c7c16703 22490
e4b2b4a8 22491 /*
b3bbd485 22492@@ -1995,8 +2098,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
e4b2b4a8
JK
22493 */
22494 raw_spin_lock_irqsave(&p->pi_lock, flags);
22495 smp_mb__after_spinlock();
22496- if (!(p->state & state))
22497+ if (!(p->state & state)) {
22498+ /*
22499+ * The task might be running due to a spinlock sleeper
22500+ * wakeup. Check the saved state and set it to running
22501+ * if the wakeup condition is true.
22502+ */
22503+ if (!(wake_flags & WF_LOCK_SLEEPER)) {
22504+ if (p->saved_state & state) {
22505+ p->saved_state = TASK_RUNNING;
22506+ success = 1;
22507+ }
22508+ }
22509 goto out;
22510+ }
22511+
c7c16703 22512+ /*
e4b2b4a8
JK
22513+ * If this is a regular wakeup, then we can unconditionally
22514+ * clear the saved state of a "lock sleeper".
c7c16703 22515+ */
e4b2b4a8
JK
22516+ if (!(wake_flags & WF_LOCK_SLEEPER))
22517+ p->saved_state = TASK_RUNNING;
1a6e0f06 22518
e4b2b4a8 22519 trace_sched_waking(p);
1a6e0f06 22520
b3bbd485
JK
22521@@ -2092,56 +2214,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
22522 return success;
1a6e0f06 22523 }
1a6e0f06 22524
b3bbd485 22525-/**
e4b2b4a8
JK
22526- * try_to_wake_up_local - try to wake up a local task with rq lock held
22527- * @p: the thread to be awakened
22528- * @rf: request-queue flags for pinning
22529- *
22530- * Put @p on the run-queue if it's not already there. The caller must
22531- * ensure that this_rq() is locked, @p is bound to this_rq() and not
22532- * the current task.
22533- */
22534-static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
22535-{
22536- struct rq *rq = task_rq(p);
22537-
22538- if (WARN_ON_ONCE(rq != this_rq()) ||
22539- WARN_ON_ONCE(p == current))
22540- return;
22541-
22542- lockdep_assert_held(&rq->lock);
22543-
22544- if (!raw_spin_trylock(&p->pi_lock)) {
22545- /*
22546- * This is OK, because current is on_cpu, which avoids it being
22547- * picked for load-balance and preemption/IRQs are still
22548- * disabled avoiding further scheduler activity on it and we've
22549- * not yet picked a replacement task.
22550- */
22551- rq_unlock(rq, rf);
22552- raw_spin_lock(&p->pi_lock);
22553- rq_relock(rq, rf);
22554- }
22555-
22556- if (!(p->state & TASK_NORMAL))
22557- goto out;
22558-
22559- trace_sched_waking(p);
22560-
22561- if (!task_on_rq_queued(p)) {
22562- if (p->in_iowait) {
22563- delayacct_blkio_end(p);
22564- atomic_dec(&rq->nr_iowait);
22565- }
22566- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
22567- }
22568-
22569- ttwu_do_wakeup(rq, p, 0, rf);
22570- ttwu_stat(p, smp_processor_id(), 0);
22571-out:
22572- raw_spin_unlock(&p->pi_lock);
22573-}
22574-
b3bbd485 22575 /**
e4b2b4a8
JK
22576 * wake_up_process - Wake up a specific process
22577 * @p: The process to be woken up.
b3bbd485 22578@@ -2160,6 +2232,18 @@ int wake_up_process(struct task_struct *p)
e4b2b4a8
JK
22579 }
22580 EXPORT_SYMBOL(wake_up_process);
22581
22582+/**
22583+ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
22584+ * @p: The process to be woken up.
22585+ *
22586+ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
22587+ * the nature of the wakeup.
1a6e0f06 22588+ */
e4b2b4a8 22589+int wake_up_lock_sleeper(struct task_struct *p)
1a6e0f06 22590+{
e4b2b4a8 22591+ return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
1a6e0f06 22592+}
1a6e0f06 22593+
e4b2b4a8 22594 int wake_up_state(struct task_struct *p, unsigned int state)
1a6e0f06 22595 {
e4b2b4a8 22596 return try_to_wake_up(p, state, 0);
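The try_to_wake_up() and wake_up_lock_sleeper() changes above hinge on p->saved_state: while a task blocks on an rtmutex-backed spinlock, its original sleep state is kept there, so a regular wakeup arriving in the meantime is recorded by clearing saved_state rather than being lost. A stand-alone user-space sketch of that state handling (field names and constant values are illustrative; locking and the actual enqueue are omitted):

#include <stdio.h>

#define TASK_RUNNING         0x0
#define TASK_INTERRUPTIBLE   0x1
#define TASK_UNINTERRUPTIBLE 0x2
#define WF_LOCK_SLEEPER      0x08

struct task {
	unsigned int state;        /* current (possibly lock-sleeper) state          */
	unsigned int saved_state;  /* state the task had before blocking on the lock */
};

static int try_wake(struct task *p, unsigned int wake_state, int wake_flags)
{
	if (!(p->state & wake_state)) {
		/* not in the expected state: maybe the real state is stashed */
		if (!(wake_flags & WF_LOCK_SLEEPER) && (p->saved_state & wake_state)) {
			p->saved_state = TASK_RUNNING;   /* deliver the wakeup lazily */
			return 1;
		}
		return 0;
	}
	if (!(wake_flags & WF_LOCK_SLEEPER))
		p->saved_state = TASK_RUNNING;           /* regular wakeup clears it */
	p->state = TASK_RUNNING;
	return 1;
}

int main(void)
{
	/* the task slept in TASK_INTERRUPTIBLE, then blocked on a sleeping
	   spinlock, which moved it to TASK_UNINTERRUPTIBLE and saved the old state */
	struct task p = { .state = TASK_UNINTERRUPTIBLE,
			  .saved_state = TASK_INTERRUPTIBLE };

	printf("signal wakeup accepted: %d\n",
	       try_wake(&p, TASK_INTERRUPTIBLE, 0));               /* hits saved_state */
	printf("lock wakeup accepted:   %d\n",
	       try_wake(&p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER));
	return 0;
}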
b3bbd485 22597@@ -2420,6 +2504,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
e4b2b4a8
JK
22598 p->on_cpu = 0;
22599 #endif
22600 init_task_preempt_count(p);
22601+#ifdef CONFIG_HAVE_PREEMPT_LAZY
22602+ task_thread_info(p)->preempt_lazy_count = 0;
1a6e0f06 22603+#endif
e4b2b4a8
JK
22604 #ifdef CONFIG_SMP
22605 plist_node_init(&p->pushable_tasks, MAX_PRIO);
22606 RB_CLEAR_NODE(&p->pushable_dl_tasks);
b3bbd485 22607@@ -2462,7 +2549,7 @@ void wake_up_new_task(struct task_struct *p)
e4b2b4a8
JK
22608 #ifdef CONFIG_SMP
22609 /*
22610 * Fork balancing, do it here and not earlier because:
22611- * - cpus_allowed can change in the fork path
22612+ * - cpus_ptr can change in the fork path
22613 * - any previously selected CPU might disappear through hotplug
22614 *
22615 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
b3bbd485 22616@@ -2675,21 +2762,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
e4b2b4a8 22617 finish_arch_post_lock_switch();
1a6e0f06 22618
e4b2b4a8
JK
22619 fire_sched_in_preempt_notifiers(current);
22620+ /*
22621+ * We use mmdrop_delayed() here so we don't have to do the
22622+ * full __mmdrop() when we are the last user.
22623+ */
22624 if (mm)
22625- mmdrop(mm);
22626+ mmdrop_delayed(mm);
22627 if (unlikely(prev_state == TASK_DEAD)) {
22628 if (prev->sched_class->task_dead)
22629 prev->sched_class->task_dead(prev);
1a6e0f06 22630
e4b2b4a8
JK
22631- /*
22632- * Remove function-return probe instances associated with this
22633- * task and put them back on the free list.
22634- */
22635- kprobe_flush_task(prev);
22636-
22637- /* Task is done with its stack. */
22638- put_task_stack(prev);
22639-
22640 put_task_struct(prev);
22641 }
1a6e0f06 22642
b3bbd485 22643@@ -3336,25 +3418,13 @@ static void __sched notrace __schedule(bool preempt)
e4b2b4a8
JK
22644 atomic_inc(&rq->nr_iowait);
22645 delayacct_blkio_start();
22646 }
22647-
22648- /*
22649- * If a worker went to sleep, notify and ask workqueue
22650- * whether it wants to wake up a task to maintain
22651- * concurrency.
22652- */
22653- if (prev->flags & PF_WQ_WORKER) {
22654- struct task_struct *to_wakeup;
22655-
22656- to_wakeup = wq_worker_sleeping(prev);
22657- if (to_wakeup)
22658- try_to_wake_up_local(to_wakeup, &rf);
22659- }
22660 }
22661 switch_count = &prev->nvcsw;
22662 }
1a6e0f06 22663
e4b2b4a8
JK
22664 next = pick_next_task(rq, prev, &rf);
22665 clear_tsk_need_resched(prev);
22666+ clear_tsk_need_resched_lazy(prev);
22667 clear_preempt_need_resched();
1a6e0f06 22668
e4b2b4a8 22669 if (likely(prev != next)) {
b3bbd485 22670@@ -3407,8 +3477,24 @@ void __noreturn do_task_dead(void)
1a6e0f06 22671
e4b2b4a8
JK
22672 static inline void sched_submit_work(struct task_struct *tsk)
22673 {
22674- if (!tsk->state || tsk_is_pi_blocked(tsk))
22675+ if (!tsk->state)
b3bbd485 22676 return;
1a6e0f06 22677+ /*
e4b2b4a8
JK
22678+ * If a worker went to sleep, notify and ask workqueue whether
22679+ * it wants to wake up a task to maintain concurrency.
b3bbd485
JK
22680+ * As this function is called inside the schedule() context,
22681+ * we disable preemption to avoid it calling schedule() again
22682+ * during the possible wakeup of a kworker.
1a6e0f06 22683+ */
b3bbd485
JK
22684+ if (tsk->flags & PF_WQ_WORKER) {
22685+ preempt_disable();
e4b2b4a8 22686+ wq_worker_sleeping(tsk);
b3bbd485
JK
22687+ preempt_enable_no_resched();
22688+ }
1a6e0f06 22689+
e4b2b4a8 22690+ if (tsk_is_pi_blocked(tsk))
b3bbd485 22691+ return;
1a6e0f06 22692+
1a6e0f06 22693 /*
e4b2b4a8
JK
22694 * If we are going to sleep and we have plugged IO queued,
22695 * make sure to submit it to avoid deadlocks.
b3bbd485 22696@@ -3417,6 +3503,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
e4b2b4a8
JK
22697 blk_schedule_flush_plug(tsk);
22698 }
1a6e0f06 22699
e4b2b4a8
JK
22700+static void sched_update_worker(struct task_struct *tsk)
22701+{
22702+ if (tsk->flags & PF_WQ_WORKER)
22703+ wq_worker_running(tsk);
22704+}
22705+
22706 asmlinkage __visible void __sched schedule(void)
1a6e0f06 22707 {
e4b2b4a8 22708 struct task_struct *tsk = current;
b3bbd485 22709@@ -3427,6 +3519,7 @@ asmlinkage __visible void __sched schedule(void)
e4b2b4a8
JK
22710 __schedule(false);
22711 sched_preempt_enable_no_resched();
22712 } while (need_resched());
22713+ sched_update_worker(tsk);
22714 }
22715 EXPORT_SYMBOL(schedule);
1a6e0f06 22716
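The hunks above move the workqueue notification out of __schedule() and into sched_submit_work()/sched_update_worker(): a worker tells the pool it is about to sleep (with preemption disabled around the call) and announces itself again after schedule() returns. A rough pthread analog of that hook pattern, where an atomic count of runnable workers replaces the workqueue pool bookkeeping (all names here are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int nr_running;                  /* workers currently not blocked */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  need_worker = PTHREAD_COND_INITIALIZER;

/* analogous to wq_worker_sleeping(): called just before a worker blocks */
static void worker_sleeping(void)
{
	if (atomic_fetch_sub(&nr_running, 1) == 1) {
		/* last runnable worker is about to block: wake a spare one */
		pthread_mutex_lock(&pool_lock);
		pthread_cond_signal(&need_worker);
		pthread_mutex_unlock(&pool_lock);
	}
}

/* analogous to wq_worker_running(): called after the worker resumes */
static void worker_running(void)
{
	atomic_fetch_add(&nr_running, 1);
}

static void worker_blocking_section(void)
{
	worker_sleeping();                     /* sched_submit_work() analog   */
	usleep(1000);                          /* stands in for the real sleep */
	worker_running();                      /* sched_update_worker() analog */
}

int main(void)
{
	atomic_store(&nr_running, 1);
	worker_blocking_section();
	printf("runnable workers: %d\n", atomic_load(&nr_running));
	return 0;
}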
b3bbd485 22717@@ -3515,6 +3608,30 @@ static void __sched notrace preempt_schedule_common(void)
e4b2b4a8
JK
22718 } while (need_resched());
22719 }
1a6e0f06 22720
e4b2b4a8
JK
22721+#ifdef CONFIG_PREEMPT_LAZY
22722+/*
22723+ * If TIF_NEED_RESCHED is set, we allow being scheduled away, since that is
22724+ * set by an RT task. Otherwise we try to avoid being scheduled out as long
22725+ * as the preempt_lazy_count counter is > 0.
22726+ */
22727+static __always_inline int preemptible_lazy(void)
22728+{
22729+ if (test_thread_flag(TIF_NEED_RESCHED))
22730+ return 1;
22731+ if (current_thread_info()->preempt_lazy_count)
22732+ return 0;
22733+ return 1;
22734+}
22735+
1a6e0f06 22736+#else
e4b2b4a8
JK
22737+
22738+static inline int preemptible_lazy(void)
22739+{
22740+ return 1;
22741+}
22742+
1a6e0f06 22743+#endif
e4b2b4a8
JK
22744+
22745 #ifdef CONFIG_PREEMPT
22746 /*
22747 * this is the entry point to schedule() from in-kernel preemption
b3bbd485 22748@@ -3529,7 +3646,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
e4b2b4a8
JK
22749 */
22750 if (likely(!preemptible()))
22751 return;
22752-
22753+ if (!preemptible_lazy())
22754+ return;
22755 preempt_schedule_common();
22756 }
22757 NOKPROBE_SYMBOL(preempt_schedule);
b3bbd485 22758@@ -3556,6 +3674,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
e4b2b4a8
JK
22759 if (likely(!preemptible()))
22760 return;
1a6e0f06 22761
e4b2b4a8
JK
22762+ if (!preemptible_lazy())
22763+ return;
22764+
22765 do {
22766 /*
22767 * Because the function tracer can trace preempt_count_sub()
b3bbd485 22768@@ -3578,7 +3699,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
e4b2b4a8
JK
22769 * an infinite recursion.
22770 */
22771 prev_ctx = exception_enter();
22772+ /*
22773+ * The add/subtract must not be traced by the function
22774+ * tracer. But we still want to account for the
22775+ * preempt off latency tracer. Since the _notrace versions
22776+ * of add/subtract skip the accounting for the latency tracer,
22777+ * we must force it manually.
22778+ */
22779+ start_critical_timings();
22780 __schedule(true);
22781+ stop_critical_timings();
22782 exception_exit(prev_ctx);
1a6e0f06 22783
e4b2b4a8 22784 preempt_latency_stop(1);
b3bbd485 22785@@ -4164,7 +4294,7 @@ static int __sched_setscheduler(struct task_struct *p,
e4b2b4a8
JK
22786 * the entire root_domain to become SCHED_DEADLINE. We
22787 * will also fail if there's no bandwidth available.
22788 */
22789- if (!cpumask_subset(span, &p->cpus_allowed) ||
22790+ if (!cpumask_subset(span, p->cpus_ptr) ||
22791 rq->rd->dl_bw.bw == 0) {
22792 task_rq_unlock(rq, p, &rf);
22793 return -EPERM;
b3bbd485 22794@@ -4758,7 +4888,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
e4b2b4a8 22795 goto out_unlock;
1a6e0f06 22796
e4b2b4a8
JK
22797 raw_spin_lock_irqsave(&p->pi_lock, flags);
22798- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
22799+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
22800 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
22801
22802 out_unlock:
b3bbd485 22803@@ -4877,6 +5007,7 @@ int __cond_resched_lock(spinlock_t *lock)
e4b2b4a8
JK
22804 }
22805 EXPORT_SYMBOL(__cond_resched_lock);
22806
22807+#ifndef CONFIG_PREEMPT_RT_FULL
22808 int __sched __cond_resched_softirq(void)
22809 {
22810 BUG_ON(!in_softirq());
b3bbd485 22811@@ -4890,6 +5021,7 @@ int __sched __cond_resched_softirq(void)
1a6e0f06
JK
22812 return 0;
22813 }
e4b2b4a8
JK
22814 EXPORT_SYMBOL(__cond_resched_softirq);
22815+#endif
1a6e0f06 22816
e4b2b4a8
JK
22817 /**
22818 * yield - yield the current processor to other threads.
b3bbd485 22819@@ -5284,7 +5416,9 @@ void init_idle(struct task_struct *idle, int cpu)
1a6e0f06 22820
e4b2b4a8
JK
22821 /* Set the preempt count _outside_ the spinlocks! */
22822 init_idle_preempt_count(idle, cpu);
22823-
22824+#ifdef CONFIG_HAVE_PREEMPT_LAZY
22825+ task_thread_info(idle)->preempt_lazy_count = 0;
1a6e0f06 22826+#endif
e4b2b4a8
JK
22827 /*
22828 * The idle tasks have their own, simple scheduling class:
22829 */
b3bbd485 22830@@ -5323,7 +5457,7 @@ int task_can_attach(struct task_struct *p,
e4b2b4a8
JK
22831 * allowed nodes is unnecessary. Thus, cpusets are not
22832 * applicable for such threads. This prevents checking for
22833 * success of set_cpus_allowed_ptr() on all attached tasks
22834- * before cpus_allowed may be changed.
22835+ * before cpus_mask may be changed.
22836 */
22837 if (p->flags & PF_NO_SETAFFINITY) {
22838 ret = -EINVAL;
b3bbd485 22839@@ -5350,7 +5484,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
e4b2b4a8
JK
22840 if (curr_cpu == target_cpu)
22841 return 0;
22842
22843- if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
22844+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
22845 return -EINVAL;
22846
22847 /* TODO: This is not properly updating schedstats */
b3bbd485 22848@@ -5389,6 +5523,8 @@ void sched_setnuma(struct task_struct *p, int nid)
e4b2b4a8 22849 #endif /* CONFIG_NUMA_BALANCING */
1a6e0f06 22850
e4b2b4a8
JK
22851 #ifdef CONFIG_HOTPLUG_CPU
22852+static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
22853+
22854 /*
22855 * Ensure that the idle task is using init_mm right before its CPU goes
22856 * offline.
b3bbd485 22857@@ -5403,7 +5539,12 @@ void idle_task_exit(void)
e4b2b4a8
JK
22858 switch_mm(mm, &init_mm, current);
22859 finish_arch_post_lock_switch();
22860 }
22861- mmdrop(mm);
22862+ /*
22863+ * Defer the cleanup to an alive cpu. On RT we can neither
22864+ * call mmdrop() nor mmdrop_delayed() from here.
22865+ */
22866+ per_cpu(idle_last_mm, smp_processor_id()) = mm;
22867+
1a6e0f06 22868 }
1a6e0f06 22869
e4b2b4a8 22870 /*
b3bbd485 22871@@ -5487,7 +5628,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
e4b2b4a8
JK
22872 put_prev_task(rq, next);
22873
22874 /*
22875- * Rules for changing task_struct::cpus_allowed are holding
22876+ * Rules for changing task_struct::cpus_mask are holding
22877 * both pi_lock and rq->lock, such that holding either
22878 * stabilizes the mask.
22879 *
b3bbd485 22880@@ -5718,6 +5859,10 @@ int sched_cpu_dying(unsigned int cpu)
e4b2b4a8
JK
22881 update_max_interval();
22882 nohz_balance_exit_idle(cpu);
22883 hrtick_clear(rq);
22884+ if (per_cpu(idle_last_mm, cpu)) {
22885+ mmdrop_delayed(per_cpu(idle_last_mm, cpu));
22886+ per_cpu(idle_last_mm, cpu) = NULL;
22887+ }
22888 return 0;
1a6e0f06 22889 }
e4b2b4a8 22890 #endif
b3bbd485 22891@@ -5964,7 +6109,7 @@ void __init sched_init(void)
e4b2b4a8
JK
22892 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
22893 static inline int preempt_count_equals(int preempt_offset)
22894 {
22895- int nested = preempt_count() + rcu_preempt_depth();
22896+ int nested = preempt_count() + sched_rcu_preempt_depth();
1a6e0f06 22897
e4b2b4a8
JK
22898 return (nested == preempt_offset);
22899 }
b3bbd485 22900@@ -6756,3 +6901,196 @@ const u32 sched_prio_to_wmult[40] = {
e4b2b4a8
JK
22901 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
22902 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
22903 };
1a6e0f06 22904+
b3bbd485 22905+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
1a6e0f06 22906+
e4b2b4a8
JK
22907+static inline void
22908+update_nr_migratory(struct task_struct *p, long delta)
1a6e0f06 22909+{
e4b2b4a8
JK
22910+ if (unlikely((p->sched_class == &rt_sched_class ||
22911+ p->sched_class == &dl_sched_class) &&
22912+ p->nr_cpus_allowed > 1)) {
22913+ if (p->sched_class == &rt_sched_class)
22914+ task_rq(p)->rt.rt_nr_migratory += delta;
22915+ else
22916+ task_rq(p)->dl.dl_nr_migratory += delta;
22917+ }
1a6e0f06
JK
22918+}
22919+
e4b2b4a8
JK
22920+static inline void
22921+migrate_disable_update_cpus_allowed(struct task_struct *p)
22922+{
22923+ struct rq *rq;
22924+ struct rq_flags rf;
1a6e0f06 22925+
e4b2b4a8
JK
22926+ p->cpus_ptr = cpumask_of(smp_processor_id());
22927+
22928+ rq = task_rq_lock(p, &rf);
22929+ update_nr_migratory(p, -1);
22930+ p->nr_cpus_allowed = 1;
22931+ task_rq_unlock(rq, p, &rf);
22932+}
22933+
22934+static inline void
22935+migrate_enable_update_cpus_allowed(struct task_struct *p)
1a6e0f06 22936+{
e4b2b4a8
JK
22937+ struct rq *rq;
22938+ struct rq_flags rf;
22939+
22940+ p->cpus_ptr = &p->cpus_mask;
22941+
22942+ rq = task_rq_lock(p, &rf);
22943+ p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
22944+ update_nr_migratory(p, 1);
22945+ task_rq_unlock(rq, p, &rf);
1a6e0f06 22946+}
1a6e0f06 22947+
e4b2b4a8
JK
22948+void migrate_disable(void)
22949+{
22950+ struct task_struct *p = current;
22951+
22952+ if (in_atomic() || irqs_disabled()) {
22953+#ifdef CONFIG_SCHED_DEBUG
22954+ p->migrate_disable_atomic++;
1a6e0f06 22955+#endif
e4b2b4a8
JK
22956+ return;
22957+ }
22958+#ifdef CONFIG_SCHED_DEBUG
22959+ if (unlikely(p->migrate_disable_atomic)) {
22960+ tracing_off();
22961+ WARN_ON_ONCE(1);
22962+ }
1a6e0f06 22963+#endif
1a6e0f06 22964+
e4b2b4a8
JK
22965+ if (p->migrate_disable) {
22966+ p->migrate_disable++;
22967+ return;
22968+ }
22969+
22970+ preempt_disable();
22971+ preempt_lazy_disable();
22972+ pin_current_cpu();
22973+
22974+ migrate_disable_update_cpus_allowed(p);
22975+ p->migrate_disable = 1;
22976+
22977+ preempt_enable();
1a6e0f06 22978+}
e4b2b4a8 22979+EXPORT_SYMBOL(migrate_disable);
1a6e0f06 22980+
e4b2b4a8 22981+void migrate_enable(void)
1a6e0f06 22982+{
e4b2b4a8
JK
22983+ struct task_struct *p = current;
22984+
22985+ if (in_atomic() || irqs_disabled()) {
22986+#ifdef CONFIG_SCHED_DEBUG
22987+ p->migrate_disable_atomic--;
22988+#endif
22989+ return;
22990+ }
22991+
22992+#ifdef CONFIG_SCHED_DEBUG
22993+ if (unlikely(p->migrate_disable_atomic)) {
22994+ tracing_off();
22995+ WARN_ON_ONCE(1);
22996+ }
22997+#endif
22998+
22999+ WARN_ON_ONCE(p->migrate_disable <= 0);
23000+ if (p->migrate_disable > 1) {
23001+ p->migrate_disable--;
23002+ return;
23003+ }
23004+
23005+ preempt_disable();
23006+
23007+ p->migrate_disable = 0;
23008+ migrate_enable_update_cpus_allowed(p);
23009+
23010+ if (p->migrate_disable_update) {
23011+ struct rq *rq;
23012+ struct rq_flags rf;
23013+
23014+ rq = task_rq_lock(p, &rf);
23015+ update_rq_clock(rq);
23016+
23017+ __do_set_cpus_allowed_tail(p, &p->cpus_mask);
23018+ task_rq_unlock(rq, p, &rf);
23019+
23020+ p->migrate_disable_update = 0;
23021+
23022+ WARN_ON(smp_processor_id() != task_cpu(p));
23023+ if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
23024+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
23025+ struct migration_arg arg;
23026+ unsigned int dest_cpu;
23027+
23028+ if (p->flags & PF_KTHREAD) {
23029+ /*
23030+ * Kernel threads are allowed on online && !active CPUs
23031+ */
23032+ cpu_valid_mask = cpu_online_mask;
23033+ }
23034+ dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_mask);
23035+ arg.task = p;
23036+ arg.dest_cpu = dest_cpu;
23037+
23038+ unpin_current_cpu();
23039+ preempt_lazy_enable();
23040+ preempt_enable();
23041+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
23042+ tlb_migrate_finish(p->mm);
23043+
23044+ return;
23045+ }
23046+ }
23047+ unpin_current_cpu();
23048+ preempt_lazy_enable();
23049+ preempt_enable();
1a6e0f06 23050+}
e4b2b4a8 23051+EXPORT_SYMBOL(migrate_enable);
1a6e0f06 23052+
e4b2b4a8
JK
23053+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
23054+void migrate_disable(void)
23055+{
b3bbd485 23056+#ifdef CONFIG_SCHED_DEBUG
e4b2b4a8
JK
23057+ struct task_struct *p = current;
23058+
23059+ if (in_atomic() || irqs_disabled()) {
e4b2b4a8 23060+ p->migrate_disable_atomic++;
e4b2b4a8
JK
23061+ return;
23062+ }
b3bbd485 23063+
e4b2b4a8
JK
23064+ if (unlikely(p->migrate_disable_atomic)) {
23065+ tracing_off();
23066+ WARN_ON_ONCE(1);
23067+ }
e4b2b4a8
JK
23068+
23069+ p->migrate_disable++;
b3bbd485
JK
23070+#endif
23071+ barrier();
e4b2b4a8
JK
23072+}
23073+EXPORT_SYMBOL(migrate_disable);
23074+
23075+void migrate_enable(void)
23076+{
b3bbd485 23077+#ifdef CONFIG_SCHED_DEBUG
e4b2b4a8
JK
23078+ struct task_struct *p = current;
23079+
23080+ if (in_atomic() || irqs_disabled()) {
e4b2b4a8 23081+ p->migrate_disable_atomic--;
e4b2b4a8
JK
23082+ return;
23083+ }
23084+
e4b2b4a8
JK
23085+ if (unlikely(p->migrate_disable_atomic)) {
23086+ tracing_off();
23087+ WARN_ON_ONCE(1);
23088+ }
1a6e0f06 23089+
e4b2b4a8
JK
23090+ WARN_ON_ONCE(p->migrate_disable <= 0);
23091+ p->migrate_disable--;
b3bbd485
JK
23092+#endif
23093+ barrier();
e4b2b4a8
JK
23094+}
23095+EXPORT_SYMBOL(migrate_enable);
23096+#endif
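The migrate_disable()/migrate_enable() pair above is a nesting counter: the outermost disable pins the task to the current CPU by pointing cpus_ptr at a single-CPU mask, and an affinity change requested while pinned is only recorded (migrate_disable_update) and then applied by the outermost enable. A single-threaded user-space sketch of that defer-until-unpinned scheme, using plain ints as stand-in CPU masks (names and types are illustrative; the preemption handling, rq locking and stop_one_cpu() migration are omitted):

#include <stdio.h>
#include <stdbool.h>

struct task {
	int  migrate_disable;                  /* nesting depth                      */
	bool update_pending;                   /* affinity changed while pinned      */
	int  allowed_mask;                     /* stand-in for cpus_mask             */
	int  effective_mask;                   /* stand-in for what cpus_ptr selects */
};

static void set_affinity(struct task *t, int mask)
{
	t->allowed_mask = mask;
	if (t->migrate_disable)
		t->update_pending = true;      /* defer, like migrate_disable_update */
	else
		t->effective_mask = mask;
}

static void migrate_disable(struct task *t)
{
	if (t->migrate_disable++)
		return;                        /* already pinned, just nest */
	t->effective_mask = 1 << 0;            /* pinned to the "current CPU" */
}

static void migrate_enable(struct task *t)
{
	if (--t->migrate_disable)
		return;                        /* still nested */
	t->effective_mask = t->allowed_mask;
	if (t->update_pending) {
		t->update_pending = false;
		/* the kernel would migrate away here if the current CPU
		   is no longer part of the allowed mask */
	}
}

int main(void)
{
	struct task t = { .allowed_mask = 0xf, .effective_mask = 0xf };

	migrate_disable(&t);
	set_affinity(&t, 0x3);                 /* deferred while pinned */
	printf("pinned:   effective=0x%x\n", t.effective_mask);
	migrate_enable(&t);
	printf("unpinned: effective=0x%x\n", t.effective_mask);
	return 0;
}

Deferring the update is what keeps the pinned section honest: the task never observes an affinity narrower than a single CPU while code relies on staying where it is.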
b3bbd485
JK
23097diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
23098index 8d9562d890d3..91a0702fe3df 100644
23099--- a/kernel/sched/cpudeadline.c
23100+++ b/kernel/sched/cpudeadline.c
23101@@ -127,13 +127,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
e4b2b4a8 23102 const struct sched_dl_entity *dl_se = &p->dl;
1a6e0f06 23103
e4b2b4a8
JK
23104 if (later_mask &&
23105- cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
23106+ cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
23107 return 1;
23108 } else {
23109 int best_cpu = cpudl_maximum(cp);
23110 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
23111
23112- if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
23113+ if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
23114 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
23115 if (later_mask)
23116 cpumask_set_cpu(best_cpu, later_mask);
b3bbd485
JK
23117diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
23118index 2511aba36b89..7b9bc1de0e6c 100644
23119--- a/kernel/sched/cpupri.c
23120+++ b/kernel/sched/cpupri.c
23121@@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
e4b2b4a8
JK
23122 if (skip)
23123 continue;
1a6e0f06 23124
e4b2b4a8
JK
23125- if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
23126+ if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
23127 continue;
1a6e0f06 23128
e4b2b4a8
JK
23129 if (lowest_mask) {
23130- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
23131+ cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
23132
23133 /*
23134 * We have to ensure that we have at least one bit
b3bbd485
JK
23135diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
23136index b2589c7e9439..28a75a9526ac 100644
23137--- a/kernel/sched/deadline.c
23138+++ b/kernel/sched/deadline.c
23139@@ -504,7 +504,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
e4b2b4a8
JK
23140 * If we cannot preempt any rq, fall back to pick any
23141 * online cpu.
23142 */
23143- cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
23144+ cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
23145 if (cpu >= nr_cpu_ids) {
23146 /*
23147 * Fail to find any suitable cpu.
b3bbd485 23148@@ -1020,7 +1020,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
1a6e0f06 23149 {
e4b2b4a8 23150 struct hrtimer *timer = &dl_se->dl_timer;
1a6e0f06 23151
e4b2b4a8
JK
23152- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23153+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
23154 timer->function = dl_task_timer;
23155 }
23156
b3bbd485 23157@@ -1753,7 +1753,7 @@ static void set_curr_task_dl(struct rq *rq)
e4b2b4a8
JK
23158 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
23159 {
23160 if (!task_running(rq, p) &&
23161- cpumask_test_cpu(cpu, &p->cpus_allowed))
23162+ cpumask_test_cpu(cpu, p->cpus_ptr))
23163 return 1;
23164 return 0;
23165 }
b3bbd485 23166@@ -1903,7 +1903,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
e4b2b4a8
JK
23167 /* Retry if something changed. */
23168 if (double_lock_balance(rq, later_rq)) {
23169 if (unlikely(task_rq(task) != rq ||
23170- !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
23171+ !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
23172 task_running(rq, task) ||
23173 !dl_task(task) ||
23174 !task_on_rq_queued(task))) {
b3bbd485
JK
23175diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
23176index 2f93e4a2d9f6..b5b43861c2b6 100644
23177--- a/kernel/sched/debug.c
23178+++ b/kernel/sched/debug.c
23179@@ -1017,6 +1017,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
e4b2b4a8
JK
23180 P(dl.runtime);
23181 P(dl.deadline);
1a6e0f06 23182 }
b3bbd485 23183+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
e4b2b4a8
JK
23184+ P(migrate_disable);
23185+#endif
23186+ P(nr_cpus_allowed);
23187 #undef PN_SCHEDSTAT
23188 #undef PN
23189 #undef __PN
b3bbd485 23190diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
5dd41b01 23191index b2d699f28304..20e7d867af7a 100644
b3bbd485
JK
23192--- a/kernel/sched/fair.c
23193+++ b/kernel/sched/fair.c
5dd41b01 23194@@ -1598,7 +1598,7 @@ static void task_numa_compare(struct task_numa_env *env,
e4b2b4a8
JK
23195 */
23196 if (cur) {
23197 /* Skip this swap candidate if cannot move to the source cpu */
23198- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
23199+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
23200 goto unlock;
1a6e0f06 23201
e4b2b4a8 23202 /*
5dd41b01 23203@@ -1708,7 +1708,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
1a6e0f06 23204
e4b2b4a8
JK
23205 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
23206 /* Skip this CPU if the source task cannot migrate */
23207- if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
23208+ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
23209 continue;
1a6e0f06 23210
e4b2b4a8 23211 env->dst_cpu = cpu;
5dd41b01 23212@@ -3842,7 +3842,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
e4b2b4a8
JK
23213 ideal_runtime = sched_slice(cfs_rq, curr);
23214 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
23215 if (delta_exec > ideal_runtime) {
23216- resched_curr(rq_of(cfs_rq));
23217+ resched_curr_lazy(rq_of(cfs_rq));
23218 /*
23219 * The current task ran long enough, ensure it doesn't get
23220 * re-elected due to buddy favours.
5dd41b01 23221@@ -3866,7 +3866,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
e4b2b4a8 23222 return;
1a6e0f06 23223
e4b2b4a8
JK
23224 if (delta > ideal_runtime)
23225- resched_curr(rq_of(cfs_rq));
23226+ resched_curr_lazy(rq_of(cfs_rq));
23227 }
1a6e0f06 23228
e4b2b4a8 23229 static void
5dd41b01 23230@@ -4008,7 +4008,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
e4b2b4a8
JK
23231 * validating it and just reschedule.
23232 */
23233 if (queued) {
23234- resched_curr(rq_of(cfs_rq));
23235+ resched_curr_lazy(rq_of(cfs_rq));
1a6e0f06 23236 return;
e4b2b4a8
JK
23237 }
23238 /*
5dd41b01 23239@@ -4190,7 +4190,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
e4b2b4a8
JK
23240 * hierarchy can be throttled
23241 */
23242 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
23243- resched_curr(rq_of(cfs_rq));
23244+ resched_curr_lazy(rq_of(cfs_rq));
1a6e0f06 23245 }
1a6e0f06 23246
e4b2b4a8 23247 static __always_inline
5dd41b01 23248@@ -4686,9 +4686,9 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
b3bbd485
JK
23249 cfs_b->period = ns_to_ktime(default_cfs_period());
23250
23251 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
23252- hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
23253+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
23254 cfs_b->period_timer.function = sched_cfs_period_timer;
23255- hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23256+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
23257 cfs_b->slack_timer.function = sched_cfs_slack_timer;
23258 }
23259
5dd41b01 23260@@ -4839,7 +4839,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1a6e0f06 23261
e4b2b4a8
JK
23262 if (delta < 0) {
23263 if (rq->curr == p)
23264- resched_curr(rq);
23265+ resched_curr_lazy(rq);
23266 return;
23267 }
23268 hrtick_start(rq, delta);
5dd41b01 23269@@ -5477,7 +5477,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1a6e0f06 23270
e4b2b4a8
JK
23271 /* Skip over this group if it has no CPUs allowed */
23272 if (!cpumask_intersects(sched_group_span(group),
23273- &p->cpus_allowed))
23274+ p->cpus_ptr))
23275 continue;
1a6e0f06 23276
e4b2b4a8 23277 local_group = cpumask_test_cpu(this_cpu,
5dd41b01 23278@@ -5597,7 +5597,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
e4b2b4a8
JK
23279 return cpumask_first(sched_group_span(group));
23280
23281 /* Traverse only the allowed CPUs */
23282- for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
23283+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
23284 if (idle_cpu(i)) {
23285 struct rq *rq = cpu_rq(i);
23286 struct cpuidle_state *idle = idle_get_state(rq);
5dd41b01 23287@@ -5700,7 +5700,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
e4b2b4a8
JK
23288 if (!test_idle_cores(target, false))
23289 return -1;
23290
23291- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
23292+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
23293
23294 for_each_cpu_wrap(core, cpus, target) {
23295 bool idle = true;
5dd41b01 23296@@ -5734,7 +5734,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
e4b2b4a8
JK
23297 return -1;
23298
23299 for_each_cpu(cpu, cpu_smt_mask(target)) {
23300- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
23301+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
23302 continue;
23303 if (idle_cpu(cpu))
23304 return cpu;
5dd41b01 23305@@ -5797,7 +5797,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
e4b2b4a8
JK
23306 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
23307 if (!--nr)
23308 return -1;
23309- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
23310+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
23311 continue;
23312 if (idle_cpu(cpu))
23313 break;
5dd41b01 23314@@ -5952,7 +5952,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
e4b2b4a8
JK
23315 if (sd_flag & SD_BALANCE_WAKE) {
23316 record_wakee(p);
23317 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
23318- && cpumask_test_cpu(cpu, &p->cpus_allowed);
23319+ && cpumask_test_cpu(cpu, p->cpus_ptr);
23320 }
1a6e0f06 23321
e4b2b4a8 23322 rcu_read_lock();
5dd41b01 23323@@ -6233,7 +6233,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
e4b2b4a8 23324 return;
1a6e0f06 23325
e4b2b4a8
JK
23326 preempt:
23327- resched_curr(rq);
23328+ resched_curr_lazy(rq);
23329 /*
23330 * Only set the backward buddy when the current task is still
23331 * on the rq. This can happen when a wakeup gets interleaved
5dd41b01 23332@@ -6701,14 +6701,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
e4b2b4a8
JK
23333 /*
23334 * We do not migrate tasks that are:
23335 * 1) throttled_lb_pair, or
23336- * 2) cannot be migrated to this CPU due to cpus_allowed, or
23337+ * 2) cannot be migrated to this CPU due to cpus_ptr, or
23338 * 3) running (obviously), or
23339 * 4) are cache-hot on their current CPU.
23340 */
23341 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
23342 return 0;
1a6e0f06 23343
e4b2b4a8
JK
23344- if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
23345+ if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
23346 int cpu;
1a6e0f06 23347
e4b2b4a8 23348 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
5dd41b01 23349@@ -6728,7 +6728,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
1a6e0f06 23350
e4b2b4a8
JK
23351 /* Prevent to re-select dst_cpu via env's cpus */
23352 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
23353- if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
23354+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
23355 env->flags |= LBF_DST_PINNED;
23356 env->new_dst_cpu = cpu;
23357 break;
5dd41b01 23358@@ -7297,7 +7297,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
1a6e0f06 23359
e4b2b4a8
JK
23360 /*
23361 * Group imbalance indicates (and tries to solve) the problem where balancing
23362- * groups is inadequate due to ->cpus_allowed constraints.
23363+ * groups is inadequate due to ->cpus_ptr constraints.
23364 *
23365 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
23366 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
5dd41b01 23367@@ -7873,7 +7873,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
1a6e0f06 23368 /*
e4b2b4a8
JK
23369 * If the busiest group is imbalanced the below checks don't
23370 * work because they assume all things are equal, which typically
23371- * isn't true due to cpus_allowed constraints and the like.
23372+ * isn't true due to cpus_ptr constraints and the like.
23373 */
23374 if (busiest->group_type == group_imbalanced)
23375 goto force_balance;
5dd41b01 23376@@ -8265,7 +8265,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
e4b2b4a8
JK
23377 * if the curr task on busiest cpu can't be
23378 * moved to this_cpu
23379 */
23380- if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
23381+ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
23382 raw_spin_unlock_irqrestore(&busiest->lock,
23383 flags);
23384 env.flags |= LBF_ALL_PINNED;
5dd41b01 23385@@ -9087,7 +9087,7 @@ static void task_fork_fair(struct task_struct *p)
e4b2b4a8
JK
23386 * 'current' within the tree based on its new key value.
23387 */
23388 swap(curr->vruntime, se->vruntime);
23389- resched_curr(rq);
23390+ resched_curr_lazy(rq);
1a6e0f06 23391 }
e4b2b4a8
JK
23392
23393 se->vruntime -= cfs_rq->min_vruntime;
5dd41b01 23394@@ -9111,7 +9111,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
e4b2b4a8
JK
23395 */
23396 if (rq->curr == p) {
23397 if (p->prio > oldprio)
23398- resched_curr(rq);
23399+ resched_curr_lazy(rq);
23400 } else
23401 check_preempt_curr(rq, p, 0);
23402 }
b3bbd485
JK
23403diff --git a/kernel/sched/features.h b/kernel/sched/features.h
23404index 9552fd5854bf..fb069998b518 100644
23405--- a/kernel/sched/features.h
23406+++ b/kernel/sched/features.h
23407@@ -46,11 +46,19 @@ SCHED_FEAT(LB_BIAS, true)
e4b2b4a8
JK
23408 */
23409 SCHED_FEAT(NONTASK_CAPACITY, true)
23410
23411+#ifdef CONFIG_PREEMPT_RT_FULL
23412+SCHED_FEAT(TTWU_QUEUE, false)
23413+# ifdef CONFIG_PREEMPT_LAZY
23414+SCHED_FEAT(PREEMPT_LAZY, true)
23415+# endif
23416+#else
23417+
23418 /*
23419 * Queue remote wakeups on the target CPU and process them
23420 * using the scheduler IPI. Reduces rq->lock contention/bounces.
23421 */
23422 SCHED_FEAT(TTWU_QUEUE, true)
1a6e0f06
JK
23423+#endif
23424
e4b2b4a8
JK
23425 /*
23426 * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
b3bbd485
JK
23427diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
23428index cb9a5b8532fa..6c72332dab3f 100644
23429--- a/kernel/sched/rt.c
23430+++ b/kernel/sched/rt.c
23431@@ -47,8 +47,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
c7c16703 23432
e4b2b4a8 23433 raw_spin_lock_init(&rt_b->rt_runtime_lock);
c7c16703 23434
e4b2b4a8
JK
23435- hrtimer_init(&rt_b->rt_period_timer,
23436- CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23437+ hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
23438+ HRTIMER_MODE_REL_HARD);
23439 rt_b->rt_period_timer.function = sched_rt_period_timer;
23440 }
c7c16703 23441
b3bbd485 23442@@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
e4b2b4a8
JK
23443 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
23444 {
23445 if (!task_running(rq, p) &&
23446- cpumask_test_cpu(cpu, &p->cpus_allowed))
23447+ cpumask_test_cpu(cpu, p->cpus_ptr))
23448 return 1;
23449 return 0;
c7c16703 23450 }
b3bbd485 23451@@ -1731,7 +1731,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
e4b2b4a8
JK
23452 * Also make sure that it wasn't scheduled on its rq.
23453 */
23454 if (unlikely(task_rq(task) != rq ||
23455- !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
23456+ !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
23457 task_running(rq, task) ||
23458 !rt_task(task) ||
23459 !task_on_rq_queued(task))) {
b3bbd485
JK
23460diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
23461index b29376169f3f..96481980c8c7 100644
23462--- a/kernel/sched/sched.h
23463+++ b/kernel/sched/sched.h
23464@@ -1354,6 +1354,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
e4b2b4a8
JK
23465 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
23466 #define WF_FORK 0x02 /* child wakeup after fork */
23467 #define WF_MIGRATED 0x4 /* internal use, task got migrated */
23468+#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
c7c16703 23469
e4b2b4a8
JK
23470 /*
23471 * To aid in avoiding the subversion of "niceness" due to uneven distribution
b3bbd485 23472@@ -1545,6 +1546,15 @@ extern void init_sched_fair_class(void);
e4b2b4a8
JK
23473 extern void resched_curr(struct rq *rq);
23474 extern void resched_cpu(int cpu);
23475
23476+#ifdef CONFIG_PREEMPT_LAZY
23477+extern void resched_curr_lazy(struct rq *rq);
23478+#else
23479+static inline void resched_curr_lazy(struct rq *rq)
1a6e0f06 23480+{
e4b2b4a8 23481+ resched_curr(rq);
1a6e0f06 23482+}
1a6e0f06
JK
23483+#endif
23484+
e4b2b4a8
JK
23485 extern struct rt_bandwidth def_rt_bandwidth;
23486 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
23487
b3bbd485
JK
23488diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
23489index 9ff1555341ed..b14638a05ec9 100644
23490--- a/kernel/sched/swait.c
23491+++ b/kernel/sched/swait.c
e4b2b4a8
JK
23492@@ -1,6 +1,7 @@
23493 // SPDX-License-Identifier: GPL-2.0
23494 #include <linux/sched/signal.h>
23495 #include <linux/swait.h>
23496+#include <linux/suspend.h>
23497
23498 void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
23499 struct lock_class_key *key)
b3bbd485 23500@@ -30,6 +31,25 @@ void swake_up_locked(struct swait_queue_head *q)
e4b2b4a8
JK
23501 }
23502 EXPORT_SYMBOL(swake_up_locked);
23503
23504+void swake_up_all_locked(struct swait_queue_head *q)
1a6e0f06 23505+{
e4b2b4a8
JK
23506+ struct swait_queue *curr;
23507+ int wakes = 0;
1a6e0f06 23508+
e4b2b4a8 23509+ while (!list_empty(&q->task_list)) {
1a6e0f06 23510+
e4b2b4a8
JK
23511+ curr = list_first_entry(&q->task_list, typeof(*curr),
23512+ task_list);
23513+ wake_up_process(curr->task);
23514+ list_del_init(&curr->task_list);
23515+ wakes++;
23516+ }
23517+ if (pm_in_action)
23518+ return;
23519+ WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
1a6e0f06 23520+}
e4b2b4a8 23521+EXPORT_SYMBOL(swake_up_all_locked);
1a6e0f06 23522+
e4b2b4a8
JK
23523 void swake_up(struct swait_queue_head *q)
23524 {
23525 unsigned long flags;
b3bbd485 23526@@ -49,6 +69,7 @@ void swake_up_all(struct swait_queue_head *q)
e4b2b4a8
JK
23527 struct swait_queue *curr;
23528 LIST_HEAD(tmp);
23529
23530+ WARN_ON(irqs_disabled());
23531 raw_spin_lock_irq(&q->lock);
23532 list_splice_init(&q->task_list, &tmp);
23533 while (!list_empty(&tmp)) {
b3bbd485
JK
23534diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
23535new file mode 100644
23536index 000000000000..1950f40ca725
23537--- /dev/null
23538+++ b/kernel/sched/swork.c
e4b2b4a8 23539@@ -0,0 +1,173 @@
1a6e0f06 23540+/*
e4b2b4a8
JK
23541+ * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
23542+ *
23543+ * Provides a framework for enqueuing callbacks from irq context in a
23544+ * PREEMPT_RT_FULL-safe way. The callbacks are executed in kthread context.
1a6e0f06 23545+ */
1a6e0f06 23546+
e4b2b4a8
JK
23547+#include <linux/swait.h>
23548+#include <linux/swork.h>
23549+#include <linux/kthread.h>
23550+#include <linux/slab.h>
23551+#include <linux/spinlock.h>
23552+#include <linux/export.h>
23553+
23554+#define SWORK_EVENT_PENDING (1 << 0)
23555+
23556+static DEFINE_MUTEX(worker_mutex);
23557+static struct sworker *glob_worker;
23558+
23559+struct sworker {
23560+ struct list_head events;
23561+ struct swait_queue_head wq;
1a6e0f06 23562+
e4b2b4a8
JK
23563+ raw_spinlock_t lock;
23564+
23565+ struct task_struct *task;
23566+ int refs;
23567+};
1a6e0f06 23568+
e4b2b4a8 23569+static bool swork_readable(struct sworker *worker)
1a6e0f06 23570+{
e4b2b4a8 23571+ bool r;
1a6e0f06 23572+
e4b2b4a8
JK
23573+ if (kthread_should_stop())
23574+ return true;
23575+
23576+ raw_spin_lock_irq(&worker->lock);
23577+ r = !list_empty(&worker->events);
23578+ raw_spin_unlock_irq(&worker->lock);
23579+
23580+ return r;
1a6e0f06 23581+}
1a6e0f06 23582+
e4b2b4a8 23583+static int swork_kthread(void *arg)
1a6e0f06 23584+{
e4b2b4a8 23585+ struct sworker *worker = arg;
1a6e0f06 23586+
e4b2b4a8
JK
23587+ for (;;) {
23588+ swait_event_interruptible(worker->wq,
23589+ swork_readable(worker));
23590+ if (kthread_should_stop())
23591+ break;
1a6e0f06 23592+
e4b2b4a8
JK
23593+ raw_spin_lock_irq(&worker->lock);
23594+ while (!list_empty(&worker->events)) {
23595+ struct swork_event *sev;
1a6e0f06 23596+
e4b2b4a8
JK
23597+ sev = list_first_entry(&worker->events,
23598+ struct swork_event, item);
23599+ list_del(&sev->item);
23600+ raw_spin_unlock_irq(&worker->lock);
1a6e0f06 23601+
e4b2b4a8
JK
23602+ WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
23603+ &sev->flags));
23604+ sev->func(sev);
23605+ raw_spin_lock_irq(&worker->lock);
23606+ }
23607+ raw_spin_unlock_irq(&worker->lock);
23608+ }
23609+ return 0;
1a6e0f06 23610+}
1a6e0f06 23611+
e4b2b4a8 23612+static struct sworker *swork_create(void)
1a6e0f06 23613+{
e4b2b4a8 23614+ struct sworker *worker;
1a6e0f06 23615+
e4b2b4a8
JK
23616+ worker = kzalloc(sizeof(*worker), GFP_KERNEL);
23617+ if (!worker)
23618+ return ERR_PTR(-ENOMEM);
1a6e0f06 23619+
e4b2b4a8
JK
23620+ INIT_LIST_HEAD(&worker->events);
23621+ raw_spin_lock_init(&worker->lock);
23622+ init_swait_queue_head(&worker->wq);
1a6e0f06 23623+
e4b2b4a8
JK
23624+ worker->task = kthread_run(swork_kthread, worker, "kswork");
23625+ if (IS_ERR(worker->task)) {
23626+ kfree(worker);
23627+ return ERR_PTR(-ENOMEM);
1a6e0f06 23628+ }
1a6e0f06 23629+
e4b2b4a8 23630+ return worker;
1a6e0f06 23631+}
1a6e0f06 23632+
e4b2b4a8 23633+static void swork_destroy(struct sworker *worker)
1a6e0f06 23634+{
e4b2b4a8
JK
23635+ kthread_stop(worker->task);
23636+
23637+ WARN_ON(!list_empty(&worker->events));
23638+ kfree(worker);
1a6e0f06 23639+}
1a6e0f06 23640+
e4b2b4a8
JK
23641+/**
23642+ * swork_queue - queue swork
23643+ *
23644+ * Returns %false if @work was already on a queue, %true otherwise.
23645+ *
23646+ * The work is queued and processed on a random CPU
23647+ */
23648+bool swork_queue(struct swork_event *sev)
1a6e0f06 23649+{
e4b2b4a8 23650+ unsigned long flags;
1a6e0f06 23651+
e4b2b4a8
JK
23652+ if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
23653+ return false;
1a6e0f06 23654+
e4b2b4a8
JK
23655+ raw_spin_lock_irqsave(&glob_worker->lock, flags);
23656+ list_add_tail(&sev->item, &glob_worker->events);
23657+ raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
1a6e0f06 23658+
e4b2b4a8
JK
23659+ swake_up(&glob_worker->wq);
23660+ return true;
1a6e0f06 23661+}
e4b2b4a8 23662+EXPORT_SYMBOL_GPL(swork_queue);
1a6e0f06 23663+
e4b2b4a8
JK
23664+/**
23665+ * swork_get - get an instance of the sworker
23666+ *
23667+ * Returns a negative error code if the initialization of the worker did not
23668+ * succeed, %0 otherwise.
23669+ *
23670+ */
23671+int swork_get(void)
1a6e0f06 23672+{
e4b2b4a8 23673+ struct sworker *worker;
1a6e0f06 23674+
e4b2b4a8
JK
23675+ mutex_lock(&worker_mutex);
23676+ if (!glob_worker) {
23677+ worker = swork_create();
23678+ if (IS_ERR(worker)) {
23679+ mutex_unlock(&worker_mutex);
23680+ return -ENOMEM;
23681+ }
1a6e0f06 23682+
e4b2b4a8
JK
23683+ glob_worker = worker;
23684+ }
1a6e0f06 23685+
e4b2b4a8
JK
23686+ glob_worker->refs++;
23687+ mutex_unlock(&worker_mutex);
1a6e0f06 23688+
e4b2b4a8 23689+ return 0;
1a6e0f06 23690+}
e4b2b4a8 23691+EXPORT_SYMBOL_GPL(swork_get);
1a6e0f06 23692+
e4b2b4a8
JK
23693+/**
23694+ * swork_put - puts an instance of the sworker
23695+ *
23696+ * Will destroy the sworker thread. This function must not be called until all
23697+ * queued events have been completed.
1a6e0f06 23698+ */
e4b2b4a8 23699+void swork_put(void)
1a6e0f06 23700+{
e4b2b4a8 23701+ mutex_lock(&worker_mutex);
1a6e0f06 23702+
e4b2b4a8
JK
23703+ glob_worker->refs--;
23704+ if (glob_worker->refs > 0)
23705+ goto out;
1a6e0f06 23706+
e4b2b4a8
JK
23707+ swork_destroy(glob_worker);
23708+ glob_worker = NULL;
23709+out:
23710+ mutex_unlock(&worker_mutex);
1a6e0f06 23711+}
e4b2b4a8 23712+EXPORT_SYMBOL_GPL(swork_put);
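kernel/sched/swork.c above implements a small "queue from a context that cannot sleep, run the callback in a kthread" helper. A user-space pthread analog of the same producer/worker shape, with a pending flag guarding against double-enqueue the way SWORK_EVENT_PENDING does (this illustrates the pattern, not the kernel API; the event layout and locking here are simplified assumptions):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct event {
	struct event *next;
	bool pending;                          /* SWORK_EVENT_PENDING analog */
	void (*func)(struct event *);
};

static struct event *queue;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool stop;

static bool event_queue(struct event *ev)      /* swork_queue() analog */
{
	pthread_mutex_lock(&lock);
	if (ev->pending) {                     /* already queued */
		pthread_mutex_unlock(&lock);
		return false;
	}
	ev->pending = true;
	ev->next = queue;
	queue = ev;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	return true;
}

static void *worker(void *arg)                 /* swork_kthread() analog */
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!stop || queue) {
		while (!queue && !stop)
			pthread_cond_wait(&cond, &lock);
		while (queue) {
			struct event *ev = queue;

			queue = ev->next;
			ev->pending = false;
			pthread_mutex_unlock(&lock);
			ev->func(ev);          /* run the callback outside the lock */
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void hello(struct event *ev) { (void)ev; puts("callback ran"); }

int main(void)
{
	pthread_t tid;
	struct event ev = { .func = hello };

	pthread_create(&tid, NULL, worker, NULL);
	event_queue(&ev);

	pthread_mutex_lock(&lock);
	stop = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(tid, NULL);
	return 0;
}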
b3bbd485
JK
23713diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
23714index 659e075ef70b..bb22e3620a90 100644
23715--- a/kernel/sched/topology.c
23716+++ b/kernel/sched/topology.c
23717@@ -286,6 +286,7 @@ static int init_rootdomain(struct root_domain *rd)
e4b2b4a8
JK
23718 rd->rto_cpu = -1;
23719 raw_spin_lock_init(&rd->rto_lock);
23720 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
23721+ rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
23722 #endif
23723
23724 init_dl_bw(&rd->dl_bw);
b3bbd485
JK
23725diff --git a/kernel/signal.c b/kernel/signal.c
23726index 4439ba9dc5d9..d8f75a030292 100644
23727--- a/kernel/signal.c
23728+++ b/kernel/signal.c
e4b2b4a8
JK
23729@@ -19,6 +19,7 @@
23730 #include <linux/sched/task.h>
23731 #include <linux/sched/task_stack.h>
23732 #include <linux/sched/cputime.h>
23733+#include <linux/sched/rt.h>
23734 #include <linux/fs.h>
23735 #include <linux/tty.h>
23736 #include <linux/binfmts.h>
b3bbd485 23737@@ -360,13 +361,30 @@ static bool task_participate_group_stop(struct task_struct *task)
e4b2b4a8
JK
23738 return false;
23739 }
23740
23741+static inline struct sigqueue *get_task_cache(struct task_struct *t)
1a6e0f06 23742+{
e4b2b4a8 23743+ struct sigqueue *q = t->sigqueue_cache;
1a6e0f06 23744+
e4b2b4a8
JK
23745+ if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
23746+ return NULL;
23747+ return q;
1a6e0f06 23748+}
1a6e0f06 23749+
e4b2b4a8 23750+static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
1a6e0f06 23751+{
e4b2b4a8
JK
23752+ if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
23753+ return 0;
23754+ return 1;
1a6e0f06 23755+}
1a6e0f06 23756+
e4b2b4a8
JK
23757 /*
23758 * allocate a new signal queue record
23759 * - this may be called without locks if and only if t == current, otherwise an
23760 * appropriate lock must be held to stop the target task from exiting
23761 */
23762 static struct sigqueue *
23763-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
23764+__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
23765+ int override_rlimit, int fromslab)
23766 {
23767 struct sigqueue *q = NULL;
23768 struct user_struct *user;
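The get_task_cache()/put_task_cache() helpers above keep at most one sigqueue entry per task in a lock-free, single-slot cache that is claimed and refilled with cmpxchg(). A user-space sketch of that one-slot cache using C11 atomics, with malloc()/free() standing in for the slab allocator (names are illustrative; the RT-priority gate and the accounting in sigqueue_free_current() are omitted):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct sigqueue { int sig; };

static _Atomic(struct sigqueue *) sigqueue_cache;   /* t->sigqueue_cache analog */

static struct sigqueue *get_cached(void)
{
	struct sigqueue *q = atomic_load(&sigqueue_cache);

	/* claim the cached entry only if nobody raced us for it */
	if (!q || !atomic_compare_exchange_strong(&sigqueue_cache, &q, NULL))
		return NULL;
	return q;
}

static int put_cached(struct sigqueue *q)
{
	struct sigqueue *expected = NULL;

	/* refill the slot only if it is currently empty; 0 means cached */
	return atomic_compare_exchange_strong(&sigqueue_cache, &expected, q) ? 0 : 1;
}

static struct sigqueue *alloc_sigqueue(void)
{
	struct sigqueue *q = get_cached();

	return q ? q : malloc(sizeof(*q));     /* fall back to the allocator */
}

static void free_sigqueue(struct sigqueue *q)
{
	if (put_cached(q))                     /* cache already full: really free it */
		free(q);
}

int main(void)
{
	struct sigqueue *a = alloc_sigqueue(); /* cache empty: allocated     */

	free_sigqueue(a);                      /* goes into the cache        */
	struct sigqueue *b = alloc_sigqueue(); /* comes back from the cache  */
	printf("reused cached entry: %s\n", a == b ? "yes" : "no");
	free_sigqueue(b);
	free(get_cached());                    /* drain the cache at exit    */
	return 0;
}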
b3bbd485 23769@@ -383,7 +401,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
e4b2b4a8
JK
23770 if (override_rlimit ||
23771 atomic_read(&user->sigpending) <=
23772 task_rlimit(t, RLIMIT_SIGPENDING)) {
23773- q = kmem_cache_alloc(sigqueue_cachep, flags);
23774+ if (!fromslab)
23775+ q = get_task_cache(t);
23776+ if (!q)
23777+ q = kmem_cache_alloc(sigqueue_cachep, flags);
23778 } else {
23779 print_dropped_signal(sig);
23780 }
b3bbd485 23781@@ -400,6 +421,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
e4b2b4a8
JK
23782 return q;
23783 }
23784
23785+static struct sigqueue *
23786+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
23787+ int override_rlimit)
1a6e0f06 23788+{
e4b2b4a8 23789+ return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
1a6e0f06 23790+}
1a6e0f06 23791+
e4b2b4a8
JK
23792 static void __sigqueue_free(struct sigqueue *q)
23793 {
23794 if (q->flags & SIGQUEUE_PREALLOC)
b3bbd485 23795@@ -409,6 +437,21 @@ static void __sigqueue_free(struct sigqueue *q)
e4b2b4a8
JK
23796 kmem_cache_free(sigqueue_cachep, q);
23797 }
23798
23799+static void sigqueue_free_current(struct sigqueue *q)
1a6e0f06 23800+{
e4b2b4a8
JK
23801+ struct user_struct *up;
23802+
23803+ if (q->flags & SIGQUEUE_PREALLOC)
23804+ return;
23805+
23806+ up = q->user;
23807+ if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
23808+ atomic_dec(&up->sigpending);
23809+ free_uid(up);
23810+ } else
23811+ __sigqueue_free(q);
1a6e0f06 23812+}
1a6e0f06 23813+
e4b2b4a8
JK
23814 void flush_sigqueue(struct sigpending *queue)
23815 {
23816 struct sigqueue *q;
b3bbd485
JK
23817@@ -421,6 +464,21 @@ void flush_sigqueue(struct sigpending *queue)
23818 }
e4b2b4a8
JK
23819 }
23820
b3bbd485 23821+/*
e4b2b4a8
JK
23822+ * Called from __exit_signal. Flush tsk->pending and
23823+ * tsk->sigqueue_cache
23824+ */
23825+void flush_task_sigqueue(struct task_struct *tsk)
1a6e0f06 23826+{
e4b2b4a8 23827+ struct sigqueue *q;
1a6e0f06 23828+
e4b2b4a8 23829+ flush_sigqueue(&tsk->pending);
1a6e0f06 23830+
e4b2b4a8
JK
23831+ q = get_task_cache(tsk);
23832+ if (q)
23833+ kmem_cache_free(sigqueue_cachep, q);
1a6e0f06
JK
23834+}
23835+
b3bbd485 23836 /*
e4b2b4a8
JK
23837 * Flush all pending signals for this kthread.
23838 */
b3bbd485 23839@@ -542,7 +600,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
e4b2b4a8
JK
23840 (info->si_code == SI_TIMER) &&
23841 (info->si_sys_private);
23842
23843- __sigqueue_free(first);
23844+ sigqueue_free_current(first);
23845 } else {
23846 /*
23847 * Ok, it wasn't in the queue. This must be
b3bbd485 23848@@ -578,6 +636,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
e4b2b4a8
JK
23849 bool resched_timer = false;
23850 int signr;
23851
23852+ WARN_ON_ONCE(tsk != current);
23853+
23854 /* We only dequeue private signals from ourselves, we don't let
23855 * signalfd steal them
23856 */
b3bbd485 23857@@ -1177,8 +1237,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
e4b2b4a8
JK
23858 * We don't want to have recursive SIGSEGV's etc, for example,
23859 * that is why we also clear SIGNAL_UNKILLABLE.
23860 */
23861-int
23862-force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23863+static int
23864+do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
23865 {
23866 unsigned long int flags;
23867 int ret, blocked, ignored;
b3bbd485 23868@@ -1207,6 +1267,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
e4b2b4a8
JK
23869 return ret;
23870 }
23871
23872+int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1a6e0f06 23873+{
e4b2b4a8
JK
23874+/*
23875+ * On some archs, PREEMPT_RT has to delay sending a signal from a trap
23876+ * since it cannot enable preemption, and the signal code's spin_locks
23877+ * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
23878+ * send the signal on exit of the trap.
23879+ */
23880+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
23881+ if (in_atomic()) {
23882+ if (WARN_ON_ONCE(t != current))
23883+ return 0;
23884+ if (WARN_ON_ONCE(t->forced_info.si_signo))
23885+ return 0;
1a6e0f06 23886+
e4b2b4a8
JK
23887+ if (is_si_special(info)) {
23888+ WARN_ON_ONCE(info != SEND_SIG_PRIV);
23889+ t->forced_info.si_signo = sig;
23890+ t->forced_info.si_errno = 0;
23891+ t->forced_info.si_code = SI_KERNEL;
23892+ t->forced_info.si_pid = 0;
23893+ t->forced_info.si_uid = 0;
23894+ } else {
23895+ t->forced_info = *info;
23896+ }
1a6e0f06 23897+
e4b2b4a8
JK
23898+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
23899+ return 0;
23900+ }
23901+#endif
23902+ return do_force_sig_info(sig, info, t);
1a6e0f06 23903+}
1a6e0f06 23904+
e4b2b4a8
JK
23905 /*
23906 * Nuke all other threads in the group.
23907 */
b3bbd485 23908@@ -1241,12 +1334,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
e4b2b4a8
JK
23909 * Disable interrupts early to avoid deadlocks.
23910 * See rcu_read_unlock() comment header for details.
23911 */
23912- local_irq_save(*flags);
23913+ local_irq_save_nort(*flags);
23914 rcu_read_lock();
23915 sighand = rcu_dereference(tsk->sighand);
23916 if (unlikely(sighand == NULL)) {
23917 rcu_read_unlock();
23918- local_irq_restore(*flags);
23919+ local_irq_restore_nort(*flags);
23920 break;
23921 }
23922 /*
b3bbd485 23923@@ -1267,7 +1360,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
e4b2b4a8
JK
23924 }
23925 spin_unlock(&sighand->siglock);
23926 rcu_read_unlock();
23927- local_irq_restore(*flags);
23928+ local_irq_restore_nort(*flags);
23929 }
23930
23931 return sighand;
b3bbd485 23932@@ -1514,7 +1607,8 @@ EXPORT_SYMBOL(kill_pid);
e4b2b4a8
JK
23933 */
23934 struct sigqueue *sigqueue_alloc(void)
23935 {
23936- struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
23937+ /* Preallocated sigqueue objects always come from the slab cache! */
23938+ struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
23939
23940 if (q)
23941 q->flags |= SIGQUEUE_PREALLOC;
b3bbd485 23942@@ -1888,15 +1982,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
e4b2b4a8
JK
23943 if (gstop_done && ptrace_reparented(current))
23944 do_notify_parent_cldstop(current, false, why);
23945
23946- /*
23947- * Don't want to allow preemption here, because
23948- * sys_ptrace() needs this task to be inactive.
23949- *
23950- * XXX: implement read_unlock_no_resched().
23951- */
23952- preempt_disable();
23953 read_unlock(&tasklist_lock);
23954- preempt_enable_no_resched();
23955 freezable_schedule();
23956 } else {
23957 /*
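/*
 * [Editor's sketch, not part of the patch] The per-task sigqueue cache added
 * above (get_task_cache()/put_task_cache()) is a single-slot, lock-free
 * stash built on cmpxchg(). The same idea in isolation, with hypothetical
 * names:
 */
#include <linux/atomic.h>

struct one_slot_cache {
	void *obj;
};

static void *slot_take(struct one_slot_cache *c)
{
	void *o = c->obj;

	/* Claim the cached object only if nobody raced us; otherwise report empty. */
	if (cmpxchg(&c->obj, o, NULL) != o)
		return NULL;
	return o;
}

static int slot_put(struct one_slot_cache *c, void *o)
{
	/* Returns 0 if the object was stashed, 1 if the slot was already occupied. */
	return cmpxchg(&c->obj, NULL, o) == NULL ? 0 : 1;
}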
b3bbd485 23958diff --git a/kernel/softirq.c b/kernel/softirq.c
5dd41b01 23959index a4c87cf27f9d..583c9ecf04e3 100644
b3bbd485
JK
23960--- a/kernel/softirq.c
23961+++ b/kernel/softirq.c
e4b2b4a8
JK
23962@@ -21,11 +21,14 @@
23963 #include <linux/freezer.h>
23964 #include <linux/kthread.h>
23965 #include <linux/rcupdate.h>
23966+#include <linux/delay.h>
23967 #include <linux/ftrace.h>
23968 #include <linux/smp.h>
23969 #include <linux/smpboot.h>
23970 #include <linux/tick.h>
23971+#include <linux/locallock.h>
23972 #include <linux/irq.h>
23973+#include <linux/sched/types.h>
23974
23975 #define CREATE_TRACE_POINTS
23976 #include <trace/events/irq.h>
b3bbd485 23977@@ -56,12 +59,108 @@ EXPORT_SYMBOL(irq_stat);
e4b2b4a8
JK
23978 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
23979
23980 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
23981+#ifdef CONFIG_PREEMPT_RT_FULL
23982+#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
23983+DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
23984+#endif
23985
23986 const char * const softirq_to_name[NR_SOFTIRQS] = {
23987 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
23988 "TASKLET", "SCHED", "HRTIMER", "RCU"
23989 };
23990
23991+#ifdef CONFIG_NO_HZ_COMMON
23992+# ifdef CONFIG_PREEMPT_RT_FULL
1a6e0f06 23993+
e4b2b4a8
JK
23994+struct softirq_runner {
23995+ struct task_struct *runner[NR_SOFTIRQS];
23996+};
1a6e0f06 23997+
e4b2b4a8 23998+static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
1a6e0f06 23999+
e4b2b4a8 24000+static inline void softirq_set_runner(unsigned int sirq)
1a6e0f06 24001+{
e4b2b4a8 24002+ struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
1a6e0f06 24003+
e4b2b4a8 24004+ sr->runner[sirq] = current;
1a6e0f06 24005+}
1a6e0f06 24006+
e4b2b4a8 24007+static inline void softirq_clr_runner(unsigned int sirq)
1a6e0f06 24008+{
e4b2b4a8
JK
24009+ struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24010+
24011+ sr->runner[sirq] = NULL;
1a6e0f06 24012+}
1a6e0f06 24013+
e4b2b4a8
JK
24014+/*
24015+ * On preempt-rt a softirq running context might be blocked on a
24016+ * lock. There might be no other runnable task on this CPU because the
24017+ * lock owner runs on some other CPU. So we have to go into idle with
24018+ * the pending bit set. Therefore we need to check this, otherwise we
24019+ * warn about false positives, which confuses users and defeats the
24020+ * whole purpose of this test.
1a6e0f06 24021+ *
e4b2b4a8 24022+ * This code is called with interrupts disabled.
1a6e0f06 24023+ */
e4b2b4a8 24024+void softirq_check_pending_idle(void)
1a6e0f06 24025+{
e4b2b4a8
JK
24026+ static int rate_limit;
24027+ struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24028+ u32 warnpending;
24029+ int i;
24030+
24031+ if (rate_limit >= 10)
24032+ return;
24033+
24034+ warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
24035+ for (i = 0; i < NR_SOFTIRQS; i++) {
24036+ struct task_struct *tsk = sr->runner[i];
24037+
24038+ /*
24039+ * The wakeup code in rtmutex.c wakes up the task
24040+ * _before_ it sets pi_blocked_on to NULL under
24041+ * tsk->pi_lock. So we need to check for both: state
24042+ * and pi_blocked_on.
24043+ */
24044+ if (tsk) {
24045+ raw_spin_lock(&tsk->pi_lock);
24046+ if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
24047+ /* Clear all bits pending in that task */
24048+ warnpending &= ~(tsk->softirqs_raised);
24049+ warnpending &= ~(1 << i);
24050+ }
24051+ raw_spin_unlock(&tsk->pi_lock);
24052+ }
1a6e0f06 24053+ }
e4b2b4a8
JK
24054+
24055+ if (warnpending) {
24056+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
24057+ warnpending);
24058+ rate_limit++;
24059+ }
24060+}
24061+# else
24062+/*
24063+ * On !PREEMPT_RT we just printk rate limited:
24064+ */
24065+void softirq_check_pending_idle(void)
1a6e0f06 24066+{
e4b2b4a8
JK
24067+ static int rate_limit;
24068+
5dd41b01 24069+ if (rate_limit < 10 && !in_softirq() &&
e4b2b4a8
JK
24070+ (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
24071+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
24072+ local_softirq_pending());
24073+ rate_limit++;
24074+ }
1a6e0f06 24075+}
e4b2b4a8
JK
24076+# endif
24077+
24078+#else /* !CONFIG_NO_HZ_COMMON */
24079+static inline void softirq_set_runner(unsigned int sirq) { }
24080+static inline void softirq_clr_runner(unsigned int sirq) { }
24081+#endif
1a6e0f06
JK
24082+
24083 /*
e4b2b4a8
JK
24084 * we cannot loop indefinitely here to avoid userspace starvation,
24085 * but we also don't want to introduce a worst case 1/HZ latency
b3bbd485 24086@@ -77,6 +176,38 @@ static void wakeup_softirqd(void)
e4b2b4a8 24087 wake_up_process(tsk);
1a6e0f06
JK
24088 }
24089
e4b2b4a8
JK
24090+#ifdef CONFIG_PREEMPT_RT_FULL
24091+static void wakeup_timer_softirqd(void)
1a6e0f06 24092+{
e4b2b4a8
JK
24093+ /* Interrupts are disabled: no need to stop preemption */
24094+ struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
24095+
24096+ if (tsk && tsk->state != TASK_RUNNING)
24097+ wake_up_process(tsk);
1a6e0f06 24098+}
e4b2b4a8 24099+#endif
1a6e0f06 24100+
e4b2b4a8
JK
24101+static void handle_softirq(unsigned int vec_nr)
24102+{
24103+ struct softirq_action *h = softirq_vec + vec_nr;
24104+ int prev_count;
1a6e0f06 24105+
e4b2b4a8 24106+ prev_count = preempt_count();
1a6e0f06 24107+
e4b2b4a8 24108+ kstat_incr_softirqs_this_cpu(vec_nr);
1a6e0f06 24109+
e4b2b4a8
JK
24110+ trace_softirq_entry(vec_nr);
24111+ h->action(h);
24112+ trace_softirq_exit(vec_nr);
24113+ if (unlikely(prev_count != preempt_count())) {
24114+ pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
24115+ vec_nr, softirq_to_name[vec_nr], h->action,
24116+ prev_count, preempt_count());
24117+ preempt_count_set(prev_count);
24118+ }
1a6e0f06
JK
24119+}
24120+
e4b2b4a8 24121+#ifndef CONFIG_PREEMPT_RT_FULL
1a6e0f06 24122 /*
e4b2b4a8
JK
24123 * If ksoftirqd is scheduled, we do not want to process pending softirqs
24124 * right now. Let ksoftirqd handle this at its own rate, to get fairness,
b3bbd485 24125@@ -92,6 +223,47 @@ static bool ksoftirqd_running(unsigned long pending)
e4b2b4a8 24126 return tsk && (tsk->state == TASK_RUNNING);
1a6e0f06
JK
24127 }
24128
e4b2b4a8 24129+static inline int ksoftirqd_softirq_pending(void)
1a6e0f06 24130+{
e4b2b4a8 24131+ return local_softirq_pending();
1a6e0f06
JK
24132+}
24133+
e4b2b4a8 24134+static void handle_pending_softirqs(u32 pending)
1a6e0f06 24135+{
e4b2b4a8
JK
24136+ struct softirq_action *h = softirq_vec;
24137+ int softirq_bit;
1a6e0f06 24138+
e4b2b4a8
JK
24139+ local_irq_enable();
24140+
24141+ h = softirq_vec;
24142+
24143+ while ((softirq_bit = ffs(pending))) {
24144+ unsigned int vec_nr;
24145+
24146+ h += softirq_bit - 1;
24147+ vec_nr = h - softirq_vec;
24148+ handle_softirq(vec_nr);
24149+
24150+ h++;
24151+ pending >>= softirq_bit;
1a6e0f06 24152+ }
e4b2b4a8
JK
24153+
24154+ rcu_bh_qs();
24155+ local_irq_disable();
1a6e0f06 24156+}
e4b2b4a8
JK
24157+
24158+static void run_ksoftirqd(unsigned int cpu)
1a6e0f06 24159+{
e4b2b4a8
JK
24160+ local_irq_disable();
24161+ if (ksoftirqd_softirq_pending()) {
24162+ __do_softirq();
24163+ local_irq_enable();
24164+ cond_resched_rcu_qs();
24165+ return;
24166+ }
24167+ local_irq_enable();
1a6e0f06 24168+}
1a6e0f06 24169+
e4b2b4a8
JK
24170 /*
24171 * preempt_count and SOFTIRQ_OFFSET usage:
24172 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
b3bbd485 24173@@ -247,10 +419,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
e4b2b4a8
JK
24174 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
24175 unsigned long old_flags = current->flags;
24176 int max_restart = MAX_SOFTIRQ_RESTART;
24177- struct softirq_action *h;
24178 bool in_hardirq;
24179 __u32 pending;
24180- int softirq_bit;
24181
24182 /*
24183 * Mask out PF_MEMALLOC s current task context is borrowed for the
b3bbd485 24184@@ -269,36 +439,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
e4b2b4a8
JK
24185 /* Reset the pending bitmask before enabling irqs */
24186 set_softirq_pending(0);
24187
24188- local_irq_enable();
24189-
24190- h = softirq_vec;
24191-
24192- while ((softirq_bit = ffs(pending))) {
24193- unsigned int vec_nr;
24194- int prev_count;
24195-
24196- h += softirq_bit - 1;
24197-
24198- vec_nr = h - softirq_vec;
24199- prev_count = preempt_count();
24200-
24201- kstat_incr_softirqs_this_cpu(vec_nr);
24202-
24203- trace_softirq_entry(vec_nr);
24204- h->action(h);
24205- trace_softirq_exit(vec_nr);
24206- if (unlikely(prev_count != preempt_count())) {
24207- pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
24208- vec_nr, softirq_to_name[vec_nr], h->action,
24209- prev_count, preempt_count());
24210- preempt_count_set(prev_count);
24211- }
24212- h++;
24213- pending >>= softirq_bit;
24214- }
24215-
24216- rcu_bh_qs();
24217- local_irq_disable();
24218+ handle_pending_softirqs(pending);
24219
24220 pending = local_softirq_pending();
24221 if (pending) {
b3bbd485
JK
24222@@ -334,6 +475,309 @@ asmlinkage __visible void do_softirq(void)
24223 local_irq_restore(flags);
e4b2b4a8
JK
24224 }
24225
b3bbd485 24226+/*
e4b2b4a8 24227+ * This function must run with irqs disabled!
1a6e0f06 24228+ */
e4b2b4a8 24229+void raise_softirq_irqoff(unsigned int nr)
1a6e0f06 24230+{
e4b2b4a8 24231+ __raise_softirq_irqoff(nr);
1a6e0f06
JK
24232+
24233+ /*
e4b2b4a8
JK
24234+ * If we're in an interrupt or softirq, we're done
24235+ * (this also catches softirq-disabled code). We will
24236+ * actually run the softirq once we return from
24237+ * the irq or softirq.
24238+ *
24239+ * Otherwise we wake up ksoftirqd to make sure we
24240+ * schedule the softirq soon.
1a6e0f06 24241+ */
e4b2b4a8
JK
24242+ if (!in_interrupt())
24243+ wakeup_softirqd();
24244+}
1a6e0f06 24245+
e4b2b4a8
JK
24246+void __raise_softirq_irqoff(unsigned int nr)
24247+{
24248+ trace_softirq_raise(nr);
24249+ or_softirq_pending(1UL << nr);
24250+}
1a6e0f06 24251+
e4b2b4a8
JK
24252+static inline void local_bh_disable_nort(void) { local_bh_disable(); }
24253+static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
24254+static void ksoftirqd_set_sched_params(unsigned int cpu) { }
1a6e0f06 24255+
e4b2b4a8 24256+#else /* !PREEMPT_RT_FULL */
1a6e0f06 24257+
e4b2b4a8
JK
24258+/*
24259+ * On RT we serialize softirq execution with a cpu local lock per softirq
24260+ */
24261+static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
1a6e0f06 24262+
e4b2b4a8
JK
24263+void __init softirq_early_init(void)
24264+{
24265+ int i;
1a6e0f06 24266+
e4b2b4a8
JK
24267+ for (i = 0; i < NR_SOFTIRQS; i++)
24268+ local_irq_lock_init(local_softirq_locks[i]);
24269+}
1a6e0f06 24270+
e4b2b4a8
JK
24271+static void lock_softirq(int which)
24272+{
24273+ local_lock(local_softirq_locks[which]);
24274+}
1a6e0f06 24275+
e4b2b4a8
JK
24276+static void unlock_softirq(int which)
24277+{
24278+ local_unlock(local_softirq_locks[which]);
24279+}
1a6e0f06 24280+
e4b2b4a8
JK
24281+static void do_single_softirq(int which)
24282+{
24283+ unsigned long old_flags = current->flags;
1a6e0f06 24284+
e4b2b4a8
JK
24285+ current->flags &= ~PF_MEMALLOC;
24286+ vtime_account_irq_enter(current);
24287+ current->flags |= PF_IN_SOFTIRQ;
24288+ lockdep_softirq_enter();
24289+ local_irq_enable();
24290+ handle_softirq(which);
24291+ local_irq_disable();
24292+ lockdep_softirq_exit();
24293+ current->flags &= ~PF_IN_SOFTIRQ;
24294+ vtime_account_irq_enter(current);
24295+ current_restore_flags(old_flags, PF_MEMALLOC);
1a6e0f06
JK
24296+}
24297+
1a6e0f06 24298+/*
e4b2b4a8
JK
24299+ * Called with interrupts disabled. Process softirqs which were raised
24300+ * in current context (or on behalf of ksoftirqd).
1a6e0f06 24301+ */
e4b2b4a8 24302+static void do_current_softirqs(void)
1a6e0f06 24303+{
e4b2b4a8
JK
24304+ while (current->softirqs_raised) {
24305+ int i = __ffs(current->softirqs_raised);
24306+ unsigned int pending, mask = (1U << i);
1a6e0f06 24307+
e4b2b4a8
JK
24308+ current->softirqs_raised &= ~mask;
24309+ local_irq_enable();
1a6e0f06 24310+
e4b2b4a8
JK
24311+ /*
24312+ * If the lock is contended, we boost the owner to
24313+ * process the softirq or leave the critical section
24314+ * now.
24315+ */
24316+ lock_softirq(i);
24317+ local_irq_disable();
24318+ softirq_set_runner(i);
24319+ /*
24320+ * Check with the local_softirq_pending() bits,
24321+ * whether we need to process this still or if someone
24322+ * else took care of it.
24323+ */
24324+ pending = local_softirq_pending();
24325+ if (pending & mask) {
24326+ set_softirq_pending(pending & ~mask);
24327+ do_single_softirq(i);
24328+ }
24329+ softirq_clr_runner(i);
24330+ WARN_ON(current->softirq_nestcnt != 1);
24331+ local_irq_enable();
24332+ unlock_softirq(i);
24333+ local_irq_disable();
1a6e0f06 24334+ }
1a6e0f06
JK
24335+}
24336+
e4b2b4a8 24337+void __local_bh_disable(void)
1a6e0f06 24338+{
e4b2b4a8
JK
24339+ if (++current->softirq_nestcnt == 1)
24340+ migrate_disable();
24341+}
24342+EXPORT_SYMBOL(__local_bh_disable);
1a6e0f06 24343+
e4b2b4a8
JK
24344+void __local_bh_enable(void)
24345+{
24346+ if (WARN_ON(current->softirq_nestcnt == 0))
24347+ return;
1a6e0f06 24348+
e4b2b4a8
JK
24349+ local_irq_disable();
24350+ if (current->softirq_nestcnt == 1 && current->softirqs_raised)
24351+ do_current_softirqs();
24352+ local_irq_enable();
1a6e0f06 24353+
e4b2b4a8
JK
24354+ if (--current->softirq_nestcnt == 0)
24355+ migrate_enable();
1a6e0f06 24356+}
e4b2b4a8 24357+EXPORT_SYMBOL(__local_bh_enable);
1a6e0f06 24358+
e4b2b4a8 24359+void _local_bh_enable(void)
1a6e0f06 24360+{
e4b2b4a8
JK
24361+ if (WARN_ON(current->softirq_nestcnt == 0))
24362+ return;
24363+ if (--current->softirq_nestcnt == 0)
24364+ migrate_enable();
1a6e0f06 24365+}
e4b2b4a8 24366+EXPORT_SYMBOL(_local_bh_enable);
1a6e0f06 24367+
e4b2b4a8 24368+int in_serving_softirq(void)
1a6e0f06 24369+{
e4b2b4a8 24370+ return current->flags & PF_IN_SOFTIRQ;
1a6e0f06 24371+}
e4b2b4a8 24372+EXPORT_SYMBOL(in_serving_softirq);
1a6e0f06 24373+
e4b2b4a8
JK
24374+/* Called with preemption disabled */
24375+static void run_ksoftirqd(unsigned int cpu)
1a6e0f06 24376+{
e4b2b4a8
JK
24377+ local_irq_disable();
24378+ current->softirq_nestcnt++;
24379+
24380+ do_current_softirqs();
24381+ current->softirq_nestcnt--;
24382+ local_irq_enable();
24383+ cond_resched_rcu_qs();
1a6e0f06 24384+}
1a6e0f06 24385+
e4b2b4a8
JK
24386+/*
24387+ * Called from netif_rx_ni(). Preemption enabled, but migration
24388+ * disabled. So the cpu can't go away under us.
24389+ */
24390+void thread_do_softirq(void)
1a6e0f06 24391+{
e4b2b4a8
JK
24392+ if (!in_serving_softirq() && current->softirqs_raised) {
24393+ current->softirq_nestcnt++;
24394+ do_current_softirqs();
24395+ current->softirq_nestcnt--;
24396+ }
1a6e0f06 24397+}
1a6e0f06 24398+
e4b2b4a8 24399+static void do_raise_softirq_irqoff(unsigned int nr)
1a6e0f06 24400+{
e4b2b4a8
JK
24401+ unsigned int mask;
24402+
24403+ mask = 1UL << nr;
24404+
24405+ trace_softirq_raise(nr);
24406+ or_softirq_pending(mask);
24407+
24408+ /*
24409+ * If we are not in a hard interrupt and inside a bh disabled
24410+ * region, we simply raise the flag on current. local_bh_enable()
24411+ * will make sure that the softirq is executed. Otherwise we
24412+ * delegate it to ksoftirqd.
24413+ */
24414+ if (!in_irq() && current->softirq_nestcnt)
24415+ current->softirqs_raised |= mask;
24416+ else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
24417+ return;
24418+
24419+ if (mask & TIMER_SOFTIRQS)
24420+ __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24421+ else
24422+ __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
1a6e0f06 24423+}
1a6e0f06 24424+
e4b2b4a8 24425+static void wakeup_proper_softirq(unsigned int nr)
1a6e0f06 24426+{
e4b2b4a8
JK
24427+ if ((1UL << nr) & TIMER_SOFTIRQS)
24428+ wakeup_timer_softirqd();
24429+ else
24430+ wakeup_softirqd();
1a6e0f06 24431+}
1a6e0f06 24432+
e4b2b4a8 24433+void __raise_softirq_irqoff(unsigned int nr)
1a6e0f06 24434+{
e4b2b4a8
JK
24435+ do_raise_softirq_irqoff(nr);
24436+ if (!in_irq() && !current->softirq_nestcnt)
24437+ wakeup_proper_softirq(nr);
1a6e0f06 24438+}
1a6e0f06 24439+
e4b2b4a8
JK
24440+/*
24441+ * Same as __raise_softirq_irqoff(), but the softirq will be processed by ksoftirqd
24442+ */
24443+void __raise_softirq_irqoff_ksoft(unsigned int nr)
1a6e0f06 24444+{
e4b2b4a8 24445+ unsigned int mask;
1a6e0f06 24446+
e4b2b4a8
JK
24447+ if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
24448+ !__this_cpu_read(ktimer_softirqd)))
24449+ return;
24450+ mask = 1UL << nr;
1a6e0f06 24451+
e4b2b4a8
JK
24452+ trace_softirq_raise(nr);
24453+ or_softirq_pending(mask);
24454+ if (mask & TIMER_SOFTIRQS)
24455+ __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24456+ else
24457+ __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24458+ wakeup_proper_softirq(nr);
1a6e0f06 24459+}
1a6e0f06
JK
24460+
24461+/*
e4b2b4a8 24462+ * This function must run with irqs disabled!
1a6e0f06 24463+ */
e4b2b4a8 24464+void raise_softirq_irqoff(unsigned int nr)
1a6e0f06 24465+{
e4b2b4a8
JK
24466+ do_raise_softirq_irqoff(nr);
24467+
24468+ /*
24469+ * If we're in a hard interrupt we let the irq return code deal
24470+ * with the wakeup of ksoftirqd.
24471+ */
24472+ if (in_irq())
24473+ return;
24474+ /*
24475+ * If we are in thread context but outside of a bh disabled
24476+ * region, we need to wake ksoftirqd as well.
24477+ *
24478+ * CHECKME: Some of the places which do that could be wrapped
24479+ * into local_bh_disable/enable pairs. Though it's unclear
24480+ * whether this is worth the effort. To find those places just
24481+ * raise a WARN() if the condition is met.
24482+ */
24483+ if (!current->softirq_nestcnt)
24484+ wakeup_proper_softirq(nr);
1a6e0f06 24485+}
1a6e0f06 24486+
e4b2b4a8 24487+static inline int ksoftirqd_softirq_pending(void)
1a6e0f06 24488+{
e4b2b4a8
JK
24489+ return current->softirqs_raised;
24490+}
1a6e0f06 24491+
e4b2b4a8
JK
24492+static inline void local_bh_disable_nort(void) { }
24493+static inline void _local_bh_enable_nort(void) { }
24494+
24495+static inline void ksoftirqd_set_sched_params(unsigned int cpu)
24496+{
24497+ /* Take over all but timer pending softirqs when starting */
24498+ local_irq_disable();
24499+ current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
24500+ local_irq_enable();
1a6e0f06 24501+}
1a6e0f06 24502+
e4b2b4a8 24503+static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
1a6e0f06 24504+{
e4b2b4a8 24505+ struct sched_param param = { .sched_priority = 1 };
1a6e0f06 24506+
e4b2b4a8
JK
24507+ sched_setscheduler(current, SCHED_FIFO, &param);
24508+
24509+ /* Take over timer pending softirqs when starting */
24510+ local_irq_disable();
24511+ current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
24512+ local_irq_enable();
1a6e0f06 24513+}
1a6e0f06 24514+
e4b2b4a8
JK
24515+static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
24516+ bool online)
1a6e0f06 24517+{
e4b2b4a8 24518+ struct sched_param param = { .sched_priority = 0 };
1a6e0f06 24519+
e4b2b4a8 24520+ sched_setscheduler(current, SCHED_NORMAL, &param);
1a6e0f06 24521+}
1a6e0f06 24522+
e4b2b4a8 24523+static int ktimer_softirqd_should_run(unsigned int cpu)
1a6e0f06 24524+{
e4b2b4a8 24525+ return current->softirqs_raised;
1a6e0f06 24526+}
1a6e0f06 24527+
e4b2b4a8 24528+#endif /* PREEMPT_RT_FULL */
b3bbd485 24529 /*
e4b2b4a8
JK
24530 * Enter an interrupt context.
24531 */
b3bbd485 24532@@ -345,9 +789,9 @@ void irq_enter(void)
e4b2b4a8
JK
24533 * Prevent raise_softirq from needlessly waking up ksoftirqd
24534 * here, as softirq will be serviced on return from interrupt.
24535 */
24536- local_bh_disable();
24537+ local_bh_disable_nort();
24538 tick_irq_enter();
24539- _local_bh_enable();
24540+ _local_bh_enable_nort();
24541 }
24542
24543 __irq_enter();
b3bbd485 24544@@ -355,6 +799,7 @@ void irq_enter(void)
e4b2b4a8
JK
24545
24546 static inline void invoke_softirq(void)
24547 {
24548+#ifndef CONFIG_PREEMPT_RT_FULL
24549 if (ksoftirqd_running(local_softirq_pending()))
24550 return;
24551
b3bbd485 24552@@ -377,6 +822,18 @@ static inline void invoke_softirq(void)
e4b2b4a8
JK
24553 } else {
24554 wakeup_softirqd();
24555 }
24556+#else /* PREEMPT_RT_FULL */
24557+ unsigned long flags;
24558+
24559+ local_irq_save(flags);
24560+ if (__this_cpu_read(ksoftirqd) &&
24561+ __this_cpu_read(ksoftirqd)->softirqs_raised)
24562+ wakeup_softirqd();
24563+ if (__this_cpu_read(ktimer_softirqd) &&
24564+ __this_cpu_read(ktimer_softirqd)->softirqs_raised)
24565+ wakeup_timer_softirqd();
24566+ local_irq_restore(flags);
24567+#endif
24568 }
24569
24570 static inline void tick_irq_exit(void)
b3bbd485 24571@@ -385,7 +842,8 @@ static inline void tick_irq_exit(void)
e4b2b4a8
JK
24572 int cpu = smp_processor_id();
24573
24574 /* Make sure that timer wheel updates are propagated */
24575- if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
e4b2b4a8 24576+ if ((idle_cpu(cpu) || tick_nohz_full_cpu(cpu)) &&
b3bbd485 24577+ !need_resched() && !local_softirq_pending()) {
e4b2b4a8
JK
24578 if (!in_irq())
24579 tick_nohz_irq_exit();
24580 }
b3bbd485 24581@@ -413,26 +871,6 @@ void irq_exit(void)
e4b2b4a8
JK
24582 trace_hardirq_exit(); /* must be last! */
24583 }
24584
24585-/*
24586- * This function must run with irqs disabled!
24587- */
24588-inline void raise_softirq_irqoff(unsigned int nr)
24589-{
24590- __raise_softirq_irqoff(nr);
24591-
24592- /*
24593- * If we're in an interrupt or softirq, we're done
24594- * (this also catches softirq-disabled code). We will
24595- * actually run the softirq once we return from
24596- * the irq or softirq.
24597- *
24598- * Otherwise we wake up ksoftirqd to make sure we
24599- * schedule the softirq soon.
24600- */
24601- if (!in_interrupt())
24602- wakeup_softirqd();
24603-}
24604-
24605 void raise_softirq(unsigned int nr)
24606 {
24607 unsigned long flags;
b3bbd485 24608@@ -442,12 +880,6 @@ void raise_softirq(unsigned int nr)
e4b2b4a8
JK
24609 local_irq_restore(flags);
24610 }
24611
24612-void __raise_softirq_irqoff(unsigned int nr)
24613-{
24614- trace_softirq_raise(nr);
24615- or_softirq_pending(1UL << nr);
24616-}
24617-
24618 void open_softirq(int nr, void (*action)(struct softirq_action *))
24619 {
24620 softirq_vec[nr].action = action;
b3bbd485 24621@@ -464,15 +896,45 @@ struct tasklet_head {
e4b2b4a8
JK
24622 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
24623 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
24624
24625+static inline void
24626+__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
1a6e0f06 24627+{
e4b2b4a8
JK
24628+ if (tasklet_trylock(t)) {
24629+again:
24630+ /* We may have been preempted before tasklet_trylock
24631+ * and __tasklet_action may have already run.
24632+ * So double-check the sched bit while the tasklet
24633+ * is locked before adding it to the list.
24634+ */
24635+ if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
24636+ t->next = NULL;
24637+ *head->tail = t;
24638+ head->tail = &(t->next);
24639+ raise_softirq_irqoff(nr);
24640+ tasklet_unlock(t);
24641+ } else {
24642+ /* This is subtle. If we hit the corner case above,
24643+ * it is possible that we get preempted right here,
24644+ * and another task has successfully called
24645+ * tasklet_schedule(), then this function, and
24646+ * failed on the trylock. Thus we must be sure,
24647+ * before releasing the tasklet lock, that the
24648+ * SCHED_BIT is clear. Otherwise the tasklet
24649+ * may get its SCHED_BIT set, but not be added
24650+ * to the list.
24651+ */
24652+ if (!tasklet_tryunlock(t))
24653+ goto again;
24654+ }
24655+ }
1a6e0f06 24656+}
1a6e0f06 24657+
e4b2b4a8
JK
24658 void __tasklet_schedule(struct tasklet_struct *t)
24659 {
24660 unsigned long flags;
24661
24662 local_irq_save(flags);
24663- t->next = NULL;
24664- *__this_cpu_read(tasklet_vec.tail) = t;
24665- __this_cpu_write(tasklet_vec.tail, &(t->next));
24666- raise_softirq_irqoff(TASKLET_SOFTIRQ);
24667+ __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
24668 local_irq_restore(flags);
24669 }
24670 EXPORT_SYMBOL(__tasklet_schedule);
b3bbd485 24671@@ -482,50 +944,108 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
e4b2b4a8
JK
24672 unsigned long flags;
24673
24674 local_irq_save(flags);
24675- t->next = NULL;
24676- *__this_cpu_read(tasklet_hi_vec.tail) = t;
24677- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24678- raise_softirq_irqoff(HI_SOFTIRQ);
24679+ __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
24680 local_irq_restore(flags);
24681 }
24682 EXPORT_SYMBOL(__tasklet_hi_schedule);
24683
24684-static __latent_entropy void tasklet_action(struct softirq_action *a)
24685+void tasklet_enable(struct tasklet_struct *t)
24686 {
24687- struct tasklet_struct *list;
24688+ if (!atomic_dec_and_test(&t->count))
24689+ return;
24690+ if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
24691+ tasklet_schedule(t);
1a6e0f06 24692+}
e4b2b4a8
JK
24693+EXPORT_SYMBOL(tasklet_enable);
24694
24695- local_irq_disable();
24696- list = __this_cpu_read(tasklet_vec.head);
24697- __this_cpu_write(tasklet_vec.head, NULL);
24698- __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24699- local_irq_enable();
24700+static void __tasklet_action(struct softirq_action *a,
24701+ struct tasklet_struct *list)
24702+{
24703+ int loops = 1000000;
24704
24705 while (list) {
24706 struct tasklet_struct *t = list;
24707
24708 list = list->next;
24709
24710- if (tasklet_trylock(t)) {
24711- if (!atomic_read(&t->count)) {
24712- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24713- &t->state))
24714- BUG();
24715- t->func(t->data);
24716- tasklet_unlock(t);
24717- continue;
24718- }
24719- tasklet_unlock(t);
24720+ /*
24721+ * Should always succeed - after a tasklet got on the
24722+ * list (after getting the SCHED bit set from 0 to 1),
24723+ * nothing but the tasklet softirq it got queued to can
24724+ * lock it:
24725+ */
24726+ if (!tasklet_trylock(t)) {
24727+ WARN_ON(1);
24728+ continue;
24729 }
24730
24731- local_irq_disable();
24732 t->next = NULL;
24733- *__this_cpu_read(tasklet_vec.tail) = t;
24734- __this_cpu_write(tasklet_vec.tail, &(t->next));
24735- __raise_softirq_irqoff(TASKLET_SOFTIRQ);
24736- local_irq_enable();
1a6e0f06 24737+
e4b2b4a8
JK
24738+ /*
24739+ * If we cannot handle the tasklet because it's disabled,
24740+ * mark it as pending. tasklet_enable() will later
24741+ * re-schedule the tasklet.
24742+ */
24743+ if (unlikely(atomic_read(&t->count))) {
24744+out_disabled:
24745+ /* implicit unlock: */
24746+ wmb();
24747+ t->state = TASKLET_STATEF_PENDING;
24748+ continue;
24749+ }
1a6e0f06 24750+
e4b2b4a8
JK
24751+ /*
24752+ * From this point on the tasklet might be rescheduled
24753+ * on another CPU, but it can only be added to another
24754+ * CPU's tasklet list if we unlock the tasklet (which we
24755+ * don't do yet).
24756+ */
24757+ if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24758+ WARN_ON(1);
1a6e0f06 24759+
e4b2b4a8
JK
24760+again:
24761+ t->func(t->data);
1a6e0f06 24762+
e4b2b4a8
JK
24763+ /*
24764+ * Try to unlock the tasklet. We must use cmpxchg, because
24765+ * another CPU might have scheduled or disabled the tasklet.
24766+ * We only allow the STATE_RUN -> 0 transition here.
24767+ */
24768+ while (!tasklet_tryunlock(t)) {
24769+ /*
24770+ * If it got disabled meanwhile, bail out:
24771+ */
24772+ if (atomic_read(&t->count))
24773+ goto out_disabled;
24774+ /*
24775+ * If it got scheduled meanwhile, re-execute
24776+ * the tasklet function:
24777+ */
24778+ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
24779+ goto again;
24780+ if (!--loops) {
24781+ printk("hm, tasklet state: %08lx\n", t->state);
24782+ WARN_ON(1);
24783+ tasklet_unlock(t);
24784+ break;
24785+ }
24786+ }
24787 }
24788 }
24789
24790+static __latent_entropy void tasklet_action(struct softirq_action *a)
24791+{
24792+ struct tasklet_struct *list;
1a6e0f06 24793+
e4b2b4a8
JK
24794+ local_irq_disable();
24795+ list = __this_cpu_read(tasklet_vec.head);
24796+ __this_cpu_write(tasklet_vec.head, NULL);
24797+ __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
24798+ local_irq_enable();
1a6e0f06 24799+
e4b2b4a8 24800+ __tasklet_action(a, list);
1a6e0f06 24801+}
e4b2b4a8
JK
24802+
24803 static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
24804 {
24805 struct tasklet_struct *list;
b3bbd485 24806@@ -536,30 +1056,7 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
e4b2b4a8
JK
24807 __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
24808 local_irq_enable();
24809
24810- while (list) {
24811- struct tasklet_struct *t = list;
24812-
24813- list = list->next;
24814-
24815- if (tasklet_trylock(t)) {
24816- if (!atomic_read(&t->count)) {
24817- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
24818- &t->state))
24819- BUG();
24820- t->func(t->data);
24821- tasklet_unlock(t);
24822- continue;
24823- }
24824- tasklet_unlock(t);
24825- }
24826-
24827- local_irq_disable();
24828- t->next = NULL;
24829- *__this_cpu_read(tasklet_hi_vec.tail) = t;
24830- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
24831- __raise_softirq_irqoff(HI_SOFTIRQ);
24832- local_irq_enable();
24833- }
24834+ __tasklet_action(a, list);
24835 }
24836
24837 void tasklet_init(struct tasklet_struct *t,
b3bbd485 24838@@ -580,7 +1077,7 @@ void tasklet_kill(struct tasklet_struct *t)
e4b2b4a8
JK
24839
24840 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
24841 do {
24842- yield();
24843+ msleep(1);
24844 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
24845 }
24846 tasklet_unlock_wait(t);
b3bbd485 24847@@ -588,57 +1085,6 @@ void tasklet_kill(struct tasklet_struct *t)
e4b2b4a8
JK
24848 }
24849 EXPORT_SYMBOL(tasklet_kill);
24850
24851-/*
24852- * tasklet_hrtimer
24853- */
24854-
24855-/*
24856- * The trampoline is called when the hrtimer expires. It schedules a tasklet
24857- * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
24858- * hrtimer callback, but from softirq context.
24859- */
24860-static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
24861-{
24862- struct tasklet_hrtimer *ttimer =
24863- container_of(timer, struct tasklet_hrtimer, timer);
24864-
24865- tasklet_hi_schedule(&ttimer->tasklet);
24866- return HRTIMER_NORESTART;
24867-}
24868-
24869-/*
24870- * Helper function which calls the hrtimer callback from
24871- * tasklet/softirq context
24872- */
24873-static void __tasklet_hrtimer_trampoline(unsigned long data)
24874-{
24875- struct tasklet_hrtimer *ttimer = (void *)data;
24876- enum hrtimer_restart restart;
24877-
24878- restart = ttimer->function(&ttimer->timer);
24879- if (restart != HRTIMER_NORESTART)
24880- hrtimer_restart(&ttimer->timer);
24881-}
24882-
24883-/**
24884- * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
24885- * @ttimer: tasklet_hrtimer which is initialized
24886- * @function: hrtimer callback function which gets called from softirq context
24887- * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
24888- * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
24889- */
24890-void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
24891- enum hrtimer_restart (*function)(struct hrtimer *),
24892- clockid_t which_clock, enum hrtimer_mode mode)
24893-{
24894- hrtimer_init(&ttimer->timer, which_clock, mode);
24895- ttimer->timer.function = __hrtimer_tasklet_trampoline;
24896- tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
24897- (unsigned long)ttimer);
24898- ttimer->function = function;
24899-}
24900-EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
24901-
24902 void __init softirq_init(void)
24903 {
24904 int cpu;
b3bbd485 24905@@ -654,25 +1100,26 @@ void __init softirq_init(void)
e4b2b4a8
JK
24906 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
24907 }
24908
24909-static int ksoftirqd_should_run(unsigned int cpu)
24910-{
24911- return local_softirq_pending();
24912-}
24913-
24914-static void run_ksoftirqd(unsigned int cpu)
24915+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
24916+void tasklet_unlock_wait(struct tasklet_struct *t)
24917 {
24918- local_irq_disable();
24919- if (local_softirq_pending()) {
24920+ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
24921 /*
24922- * We can safely run softirq on inline stack, as we are not deep
24923- * in the task stack here.
24924+ * Hack for now to avoid this busy-loop:
24925 */
24926- __do_softirq();
24927- local_irq_enable();
24928- cond_resched_rcu_qs();
24929- return;
24930+#ifdef CONFIG_PREEMPT_RT_FULL
24931+ msleep(1);
1a6e0f06 24932+#else
e4b2b4a8
JK
24933+ barrier();
24934+#endif
24935 }
24936- local_irq_enable();
1a6e0f06 24937+}
e4b2b4a8 24938+EXPORT_SYMBOL(tasklet_unlock_wait);
1a6e0f06
JK
24939+#endif
24940+
e4b2b4a8 24941+static int ksoftirqd_should_run(unsigned int cpu)
1a6e0f06 24942+{
e4b2b4a8
JK
24943+ return ksoftirqd_softirq_pending();
24944 }
1a6e0f06 24945
e4b2b4a8 24946 #ifdef CONFIG_HOTPLUG_CPU
b3bbd485 24947@@ -739,17 +1186,31 @@ static int takeover_tasklets(unsigned int cpu)
e4b2b4a8
JK
24948
24949 static struct smp_hotplug_thread softirq_threads = {
24950 .store = &ksoftirqd,
24951+ .setup = ksoftirqd_set_sched_params,
24952 .thread_should_run = ksoftirqd_should_run,
24953 .thread_fn = run_ksoftirqd,
24954 .thread_comm = "ksoftirqd/%u",
24955 };
24956
24957+#ifdef CONFIG_PREEMPT_RT_FULL
24958+static struct smp_hotplug_thread softirq_timer_threads = {
24959+ .store = &ktimer_softirqd,
24960+ .setup = ktimer_softirqd_set_sched_params,
24961+ .cleanup = ktimer_softirqd_clr_sched_params,
24962+ .thread_should_run = ktimer_softirqd_should_run,
24963+ .thread_fn = run_ksoftirqd,
24964+ .thread_comm = "ktimersoftd/%u",
24965+};
24966+#endif
1a6e0f06 24967+
e4b2b4a8
JK
24968 static __init int spawn_ksoftirqd(void)
24969 {
24970 cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
24971 takeover_tasklets);
24972 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
24973-
24974+#ifdef CONFIG_PREEMPT_RT_FULL
24975+ BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
24976+#endif
24977 return 0;
24978 }
24979 early_initcall(spawn_ksoftirqd);
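/*
 * [Editor's sketch, not part of the patch] With the PREEMPT_RT_FULL softirq
 * rework above, local_bh_disable() only bumps current->softirq_nestcnt and
 * disables migration; a softirq raised inside the section is recorded in
 * current->softirqs_raised and is run from local_bh_enable() via
 * do_current_softirqs(), instead of being punted to ksoftirqd. The
 * example_* names are hypothetical:
 */
#include <linux/interrupt.h>

static void example_tasklet_fn(unsigned long data)
{
}
static DECLARE_TASKLET(example_tasklet, example_tasklet_fn, 0);

static void example_bh_section(void)
{
	local_bh_disable();			/* RT: ++softirq_nestcnt, migrate_disable() */
	tasklet_schedule(&example_tasklet);	/* TASKLET_SOFTIRQ is only flagged on current */
	local_bh_enable();			/* RT: do_current_softirqs() runs the tasklet here */
}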
b3bbd485
JK
24980diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
24981index 067cb83f37ea..56f2f2e01229 100644
24982--- a/kernel/stop_machine.c
24983+++ b/kernel/stop_machine.c
24984@@ -503,6 +503,8 @@ static void cpu_stopper_thread(unsigned int cpu)
e4b2b4a8
JK
24985 struct cpu_stop_done *done = work->done;
24986 int ret;
24987
24988+ /* XXX */
1a6e0f06 24989+
e4b2b4a8
JK
24990 /* cpu stop callbacks must not sleep, make in_atomic() == T */
24991 preempt_count_inc();
24992 ret = fn(arg);
b3bbd485 24993diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
5dd41b01 24994index fa5de5e8de61..6020ee66e517 100644
b3bbd485
JK
24995--- a/kernel/time/alarmtimer.c
24996+++ b/kernel/time/alarmtimer.c
24997@@ -436,7 +436,7 @@ int alarm_cancel(struct alarm *alarm)
e4b2b4a8
JK
24998 int ret = alarm_try_to_cancel(alarm);
24999 if (ret >= 0)
25000 return ret;
25001- cpu_relax();
25002+ hrtimer_wait_for_timer(&alarm->timer);
25003 }
25004 }
25005 EXPORT_SYMBOL_GPL(alarm_cancel);
b3bbd485
JK
25006diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
25007index d00e85ac10d6..b59e009087a9 100644
25008--- a/kernel/time/hrtimer.c
25009+++ b/kernel/time/hrtimer.c
25010@@ -59,6 +59,15 @@
25011
e4b2b4a8 25012 #include "tick-internal.h"
1a6e0f06 25013
b3bbd485 25014+/*
e4b2b4a8
JK
25015+ * Masks for selecting the soft and hard context timers from
25016+ * cpu_base->active
25017+ */
25018+#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT)
25019+#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1)
25020+#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
25021+#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
25022+
b3bbd485 25023 /*
e4b2b4a8
JK
25024 * The timer bases:
25025 *
e4b2b4a8
JK
25026@@ -70,7 +79,6 @@
25027 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
1a6e0f06 25028 {
e4b2b4a8
JK
25029 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
25030- .seq = SEQCNT_ZERO(hrtimer_bases.seq),
25031 .clock_base =
25032 {
25033 {
b3bbd485 25034@@ -93,6 +101,26 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
e4b2b4a8
JK
25035 .clockid = CLOCK_TAI,
25036 .get_time = &ktime_get_clocktai,
25037 },
25038+ {
25039+ .index = HRTIMER_BASE_MONOTONIC_SOFT,
25040+ .clockid = CLOCK_MONOTONIC,
25041+ .get_time = &ktime_get,
25042+ },
25043+ {
25044+ .index = HRTIMER_BASE_REALTIME_SOFT,
25045+ .clockid = CLOCK_REALTIME,
25046+ .get_time = &ktime_get_real,
25047+ },
25048+ {
25049+ .index = HRTIMER_BASE_BOOTTIME_SOFT,
25050+ .clockid = CLOCK_BOOTTIME,
25051+ .get_time = &ktime_get_boottime,
25052+ },
25053+ {
25054+ .index = HRTIMER_BASE_TAI_SOFT,
25055+ .clockid = CLOCK_TAI,
25056+ .get_time = &ktime_get_clocktai,
25057+ },
25058 }
25059 };
1a6e0f06 25060
b3bbd485 25061@@ -118,7 +146,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
e4b2b4a8
JK
25062 * timer->base->cpu_base
25063 */
25064 static struct hrtimer_cpu_base migration_cpu_base = {
25065- .seq = SEQCNT_ZERO(migration_cpu_base),
25066 .clock_base = { { .cpu_base = &migration_cpu_base, }, },
25067 };
1a6e0f06 25068
b3bbd485 25069@@ -156,45 +183,33 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
1a6e0f06
JK
25070 }
25071
25072 /*
e4b2b4a8
JK
25073- * With HIGHRES=y we do not migrate the timer when it is expiring
25074- * before the next event on the target cpu because we cannot reprogram
25075- * the target cpu hardware and we would cause it to fire late.
25076+ * We do not migrate the timer when it is expiring before the next
25077+ * event on the target cpu. When high resolution is enabled, we cannot
25078+ * reprogram the target cpu hardware and we would cause it to fire
25079+ * late. To keep it simple, we handle the high resolution enabled and
25080+ * disabled cases the same way.
25081 *
25082 * Called with cpu_base->lock of target cpu held.
25083 */
25084 static int
25085 hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
1a6e0f06 25086 {
e4b2b4a8
JK
25087-#ifdef CONFIG_HIGH_RES_TIMERS
25088 ktime_t expires;
1a6e0f06 25089
e4b2b4a8
JK
25090- if (!new_base->cpu_base->hres_active)
25091- return 0;
25092-
25093 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
25094- return expires <= new_base->cpu_base->expires_next;
25095-#else
25096- return 0;
25097-#endif
25098+ return expires < new_base->cpu_base->expires_next;
25099 }
1a6e0f06 25100
e4b2b4a8
JK
25101-#ifdef CONFIG_NO_HZ_COMMON
25102-static inline
25103-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
25104- int pinned)
25105-{
25106- if (pinned || !base->migration_enabled)
25107- return base;
25108- return &per_cpu(hrtimer_bases, get_nohz_timer_target());
25109-}
25110-#else
25111 static inline
25112 struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
25113 int pinned)
25114 {
25115+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
25116+ if (static_branch_unlikely(&timers_migration_enabled) && !pinned)
25117+ return &per_cpu(hrtimer_bases, get_nohz_timer_target());
25118+#endif
25119 return base;
25120 }
25121-#endif
1a6e0f06 25122
e4b2b4a8
JK
25123 /*
25124 * We switch the timer base to a power-optimized selected CPU target,
b3bbd485 25125@@ -396,7 +411,8 @@ static inline void debug_hrtimer_init(struct hrtimer *timer)
e4b2b4a8
JK
25126 debug_object_init(timer, &hrtimer_debug_descr);
25127 }
1a6e0f06 25128
e4b2b4a8
JK
25129-static inline void debug_hrtimer_activate(struct hrtimer *timer)
25130+static inline void debug_hrtimer_activate(struct hrtimer *timer,
25131+ enum hrtimer_mode mode)
25132 {
25133 debug_object_activate(timer, &hrtimer_debug_descr);
25134 }
b3bbd485 25135@@ -429,8 +445,10 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer)
e4b2b4a8 25136 EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
1a6e0f06 25137
e4b2b4a8
JK
25138 #else
25139+
25140 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
25141-static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
25142+static inline void debug_hrtimer_activate(struct hrtimer *timer,
25143+ enum hrtimer_mode mode) { }
25144 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
25145 #endif
1a6e0f06 25146
b3bbd485 25147@@ -442,10 +460,11 @@ debug_init(struct hrtimer *timer, clockid_t clockid,
e4b2b4a8 25148 trace_hrtimer_init(timer, clockid, mode);
1a6e0f06 25149 }
1a6e0f06 25150
e4b2b4a8
JK
25151-static inline void debug_activate(struct hrtimer *timer)
25152+static inline void debug_activate(struct hrtimer *timer,
25153+ enum hrtimer_mode mode)
25154 {
25155- debug_hrtimer_activate(timer);
25156- trace_hrtimer_start(timer);
25157+ debug_hrtimer_activate(timer, mode);
25158+ trace_hrtimer_start(timer, mode);
25159 }
1a6e0f06 25160
e4b2b4a8 25161 static inline void debug_deactivate(struct hrtimer *timer)
b3bbd485 25162@@ -454,35 +473,43 @@ static inline void debug_deactivate(struct hrtimer *timer)
e4b2b4a8 25163 trace_hrtimer_cancel(timer);
1a6e0f06
JK
25164 }
25165
e4b2b4a8
JK
25166-#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
25167-static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
25168- struct hrtimer *timer)
25169+static struct hrtimer_clock_base *
25170+__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
25171 {
25172-#ifdef CONFIG_HIGH_RES_TIMERS
25173- cpu_base->next_timer = timer;
25174-#endif
25175+ unsigned int idx;
1a6e0f06 25176+
e4b2b4a8
JK
25177+ if (!*active)
25178+ return NULL;
1a6e0f06 25179+
e4b2b4a8
JK
25180+ idx = __ffs(*active);
25181+ *active &= ~(1U << idx);
1a6e0f06 25182+
e4b2b4a8
JK
25183+ return &cpu_base->clock_base[idx];
25184 }
25185
25186-static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
25187+#define for_each_active_base(base, cpu_base, active) \
25188+ while ((base = __next_base((cpu_base), &(active))))
1a6e0f06 25189+
e4b2b4a8
JK
25190+static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
25191+ unsigned int active,
25192+ ktime_t expires_next)
25193 {
25194- struct hrtimer_clock_base *base = cpu_base->clock_base;
25195- unsigned int active = cpu_base->active_bases;
25196- ktime_t expires, expires_next = KTIME_MAX;
25197+ struct hrtimer_clock_base *base;
25198+ ktime_t expires;
25199
25200- hrtimer_update_next_timer(cpu_base, NULL);
25201- for (; active; base++, active >>= 1) {
25202+ for_each_active_base(base, cpu_base, active) {
25203 struct timerqueue_node *next;
25204 struct hrtimer *timer;
25205
25206- if (!(active & 0x01))
25207- continue;
25208-
25209 next = timerqueue_getnext(&base->active);
25210 timer = container_of(next, struct hrtimer, node);
25211 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
25212 if (expires < expires_next) {
25213 expires_next = expires;
25214- hrtimer_update_next_timer(cpu_base, timer);
25215+ if (timer->is_soft)
25216+ cpu_base->softirq_next_timer = timer;
25217+ else
25218+ cpu_base->next_timer = timer;
25219 }
25220 }
25221 /*
b3bbd485 25222@@ -494,7 +521,47 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
e4b2b4a8
JK
25223 expires_next = 0;
25224 return expires_next;
25225 }
25226-#endif
1a6e0f06 25227+
e4b2b4a8
JK
25228+/*
25229+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
25230+ * does not set cpu_base::*expires_next; that is done by hrtimer_reprogram().
25231+ *
25232+ * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases;
25233+ * those timers will get run whenever the softirq gets handled. At the end of
25234+ * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
25235+ *
25236+ * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
25237+ * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
25238+ * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
25239+ *
25240+ * @active_mask must be one of:
25241+ * - HRTIMER_ACTIVE_ALL,
25242+ * - HRTIMER_ACTIVE_SOFT, or
25243+ * - HRTIMER_ACTIVE_HARD.
25244+ */
25245+static ktime_t
25246+__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
1a6e0f06 25247+{
e4b2b4a8
JK
25248+ unsigned int active;
25249+ struct hrtimer *next_timer = NULL;
25250+ ktime_t expires_next = KTIME_MAX;
1a6e0f06 25251+
e4b2b4a8
JK
25252+ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
25253+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
25254+ cpu_base->softirq_next_timer = NULL;
25255+ expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
25256+
25257+ next_timer = cpu_base->softirq_next_timer;
1a6e0f06 25258+ }
1a6e0f06 25259+
e4b2b4a8
JK
25260+ if (active_mask & HRTIMER_ACTIVE_HARD) {
25261+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
25262+ cpu_base->next_timer = next_timer;
25263+ expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
25264+ }
1a6e0f06 25265+
e4b2b4a8 25266+ return expires_next;
1a6e0f06 25267+}
e4b2b4a8
JK
25268
25269 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
25270 {
b3bbd485 25271@@ -502,36 +569,14 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
e4b2b4a8
JK
25272 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
25273 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
25274
25275- return ktime_get_update_offsets_now(&base->clock_was_set_seq,
25276+ ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
25277 offs_real, offs_boot, offs_tai);
25278-}
25279-
25280-/* High resolution timer related functions */
25281-#ifdef CONFIG_HIGH_RES_TIMERS
25282-
25283-/*
25284- * High resolution timer enabled ?
25285- */
25286-static bool hrtimer_hres_enabled __read_mostly = true;
25287-unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
25288-EXPORT_SYMBOL_GPL(hrtimer_resolution);
25289-
25290-/*
25291- * Enable / Disable high resolution mode
25292- */
25293-static int __init setup_hrtimer_hres(char *str)
25294-{
25295- return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
25296-}
25297
25298-__setup("highres=", setup_hrtimer_hres);
25299+ base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
25300+ base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
25301+ base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
25302
25303-/*
25304- * hrtimer_high_res_enabled - query, if the highres mode is enabled
25305- */
25306-static inline int hrtimer_is_hres_enabled(void)
25307-{
25308- return hrtimer_hres_enabled;
25309+ return now;
25310 }
25311
1a6e0f06 25312 /*
b3bbd485 25313@@ -539,7 +584,8 @@ static inline int hrtimer_is_hres_enabled(void)
1a6e0f06 25314 */
e4b2b4a8 25315 static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
1a6e0f06 25316 {
e4b2b4a8
JK
25317- return cpu_base->hres_active;
25318+ return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
25319+ cpu_base->hres_active : 0;
25320 }
1a6e0f06 25321
e4b2b4a8 25322 static inline int hrtimer_hres_active(void)
b3bbd485 25323@@ -557,10 +603,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
e4b2b4a8
JK
25324 {
25325 ktime_t expires_next;
1a6e0f06 25326
e4b2b4a8
JK
25327- if (!cpu_base->hres_active)
25328- return;
25329+ /*
25330+ * Find the current next expiration time.
25331+ */
25332+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
1a6e0f06 25333
e4b2b4a8
JK
25334- expires_next = __hrtimer_get_next_event(cpu_base);
25335+ if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
25336+ /*
25337+ * When the softirq is activated, hrtimer has to be
25338+ * programmed with the first hard hrtimer because soft
25339+ * timer interrupt could occur too late.
25340+ */
25341+ if (cpu_base->softirq_activated)
25342+ expires_next = __hrtimer_get_next_event(cpu_base,
25343+ HRTIMER_ACTIVE_HARD);
25344+ else
25345+ cpu_base->softirq_expires_next = expires_next;
1a6e0f06
JK
25346+ }
25347
e4b2b4a8
JK
25348 if (skip_equal && expires_next == cpu_base->expires_next)
25349 return;
b3bbd485 25350@@ -568,6 +627,9 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
e4b2b4a8 25351 cpu_base->expires_next = expires_next;
1a6e0f06
JK
25352
25353 /*
e4b2b4a8
JK
25354+ * If hres is not active, hardware does not have to be
25355+ * reprogrammed yet.
25356+ *
25357 * If a hang was detected in the last timer interrupt then we
25358 * leave the hang delay active in the hardware. We want the
25359 * system to make progress. That also prevents the following
b3bbd485 25360@@ -581,83 +643,38 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
e4b2b4a8
JK
25361 * set. So we'd effectivly block all timers until the T2 event
25362 * fires.
1a6e0f06 25363 */
e4b2b4a8
JK
25364- if (cpu_base->hang_detected)
25365+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
25366 return;
1a6e0f06 25367
e4b2b4a8
JK
25368 tick_program_event(cpu_base->expires_next, 1);
25369 }
1a6e0f06 25370
e4b2b4a8
JK
25371+/* High resolution timer related functions */
25372+#ifdef CONFIG_HIGH_RES_TIMERS
25373+
25374 /*
25375- * When a timer is enqueued and expires earlier than the already enqueued
25376- * timers, we have to check, whether it expires earlier than the timer for
25377- * which the clock event device was armed.
25378- *
25379- * Called with interrupts disabled and base->cpu_base.lock held
25380+ * High resolution timer enabled ?
1a6e0f06 25381 */
e4b2b4a8
JK
25382-static void hrtimer_reprogram(struct hrtimer *timer,
25383- struct hrtimer_clock_base *base)
25384-{
25385- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25386- ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
25387-
25388- WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
25389-
25390- /*
25391- * If the timer is not on the current cpu, we cannot reprogram
25392- * the other cpus clock event device.
25393- */
25394- if (base->cpu_base != cpu_base)
25395- return;
25396-
25397- /*
25398- * If the hrtimer interrupt is running, then it will
25399- * reevaluate the clock bases and reprogram the clock event
25400- * device. The callbacks are always executed in hard interrupt
25401- * context so we don't need an extra check for a running
25402- * callback.
25403- */
25404- if (cpu_base->in_hrtirq)
25405- return;
25406-
25407- /*
25408- * CLOCK_REALTIME timer might be requested with an absolute
25409- * expiry time which is less than base->offset. Set it to 0.
25410- */
25411- if (expires < 0)
25412- expires = 0;
25413-
25414- if (expires >= cpu_base->expires_next)
25415- return;
25416-
25417- /* Update the pointer to the next expiring timer */
25418- cpu_base->next_timer = timer;
25419-
25420- /*
25421- * If a hang was detected in the last timer interrupt then we
25422- * do not schedule a timer which is earlier than the expiry
25423- * which we enforced in the hang detection. We want the system
25424- * to make progress.
25425- */
25426- if (cpu_base->hang_detected)
25427- return;
25428+static bool hrtimer_hres_enabled __read_mostly = true;
25429+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
25430+EXPORT_SYMBOL_GPL(hrtimer_resolution);
25431
25432- /*
25433- * Program the timer hardware. We enforce the expiry for
25434- * events which are already in the past.
25435- */
25436- cpu_base->expires_next = expires;
25437- tick_program_event(expires, 1);
25438+/*
25439+ * Enable / Disable high resolution mode
25440+ */
25441+static int __init setup_hrtimer_hres(char *str)
25442+{
25443+ return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
1a6e0f06
JK
25444 }
25445
e4b2b4a8
JK
25446+__setup("highres=", setup_hrtimer_hres);
25447+
25448 /*
25449- * Initialize the high resolution related parts of cpu_base
25450+ * hrtimer_is_hres_enabled - query whether the highres mode is enabled
25451 */
25452-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
25453+static inline int hrtimer_is_hres_enabled(void)
1a6e0f06 25454 {
e4b2b4a8
JK
25455- base->expires_next = KTIME_MAX;
25456- base->hang_detected = 0;
25457- base->hres_active = 0;
25458- base->next_timer = NULL;
25459+ return hrtimer_hres_enabled;
1a6e0f06
JK
25460 }
25461
e4b2b4a8 25462 /*
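
The relocated option handling keeps the long-standing "highres=" switch; kstrtobool() accepts 0/1, y/n and on/off, and a parse error simply leaves the default in place. The same __setup()/kstrtobool() idiom in isolation (hypothetical option name, illustration only):

    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/string.h>

    static bool demo_feature __read_mostly = true;

    static int __init setup_demo_feature(char *str)
    {
            /* returning 1 tells the boot code the option was consumed */
            return (kstrtobool(str, &demo_feature) == 0);
    }
    __setup("demo_feature=", setup_demo_feature);

Booting with "highres=off" therefore keeps hrtimer_hres_enabled false, hrtimer_is_hres_enabled() reports 0, and the kernel never switches to high-resolution mode.
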
b3bbd485 25463@@ -669,7 +686,7 @@ static void retrigger_next_event(void *arg)
1a6e0f06 25464 {
e4b2b4a8 25465 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
1a6e0f06 25466
e4b2b4a8
JK
25467- if (!base->hres_active)
25468+ if (!__hrtimer_hres_active(base))
25469 return;
1a6e0f06 25470
e4b2b4a8 25471 raw_spin_lock(&base->lock);
b3bbd485 25472@@ -698,6 +715,29 @@ static void hrtimer_switch_to_hres(void)
e4b2b4a8
JK
25473 retrigger_next_event(NULL);
25474 }
1a6e0f06 25475
e4b2b4a8
JK
25476+#ifdef CONFIG_PREEMPT_RT_FULL
25477+
25478+static struct swork_event clock_set_delay_work;
25479+
25480+static void run_clock_set_delay(struct swork_event *event)
25481+{
25482+ clock_was_set();
25483+}
25484+
25485+void clock_was_set_delayed(void)
25486+{
25487+ swork_queue(&clock_set_delay_work);
25488+}
25489+
25490+static __init int create_clock_set_delay_thread(void)
25491+{
25492+ WARN_ON(swork_get());
25493+ INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
25494+ return 0;
25495+}
25496+early_initcall(create_clock_set_delay_thread);
25497+#else /* PREEMPT_RT_FULL */
25498+
25499 static void clock_was_set_work(struct work_struct *work)
1a6e0f06 25500 {
e4b2b4a8 25501 clock_was_set();
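
On PREEMPT_RT_FULL the deferred clock_was_set() call is routed through the simple-work ("swork") infrastructure provided by this patch series instead of the regular work_struct path kept below. The deferral pattern in isolation (sketch; swork_* is an RT-patch API, not mainline, and the usual kernel headers are assumed):

    static struct swork_event demo_event;

    static void demo_cb(struct swork_event *event)
    {
            pr_info("deferred call running in sworker thread context\n");
    }

    static __init int demo_swork_init(void)
    {
            WARN_ON(swork_get());              /* bring up the sworker thread */
            INIT_SWORK(&demo_event, demo_cb);
            return 0;
    }
    early_initcall(demo_swork_init);

    /* safe to call from hard interrupt context: */
    static void demo_trigger(void)
    {
            swork_queue(&demo_event);
    }
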
b3bbd485 25502@@ -713,25 +753,105 @@ void clock_was_set_delayed(void)
e4b2b4a8
JK
25503 {
25504 schedule_work(&hrtimer_work);
1a6e0f06 25505 }
e4b2b4a8 25506+#endif
1a6e0f06 25507
e4b2b4a8 25508 #else
1a6e0f06 25509
e4b2b4a8
JK
25510-static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
25511-static inline int hrtimer_hres_active(void) { return 0; }
25512 static inline int hrtimer_is_hres_enabled(void) { return 0; }
25513 static inline void hrtimer_switch_to_hres(void) { }
25514-static inline void
25515-hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
25516-static inline int hrtimer_reprogram(struct hrtimer *timer,
25517- struct hrtimer_clock_base *base)
25518-{
25519- return 0;
25520-}
25521-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
25522 static inline void retrigger_next_event(void *arg) { }
1a6e0f06 25523
e4b2b4a8 25524 #endif /* CONFIG_HIGH_RES_TIMERS */
1a6e0f06 25525
b3bbd485 25526+/*
e4b2b4a8
JK
25527+ * When a timer is enqueued and expires earlier than the already enqueued
25528+ * timers, we have to check, whether it expires earlier than the timer for
25529+ * which the clock event device was armed.
25530+ *
25531+ * Called with interrupts disabled and base->cpu_base.lock held
25532+ */
25533+static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
25534+{
25535+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25536+ struct hrtimer_clock_base *base = timer->base;
25537+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
25538+
25539+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
25540+
25541+ /*
25542+ * CLOCK_REALTIME timer might be requested with an absolute
25543+ * expiry time which is less than base->offset. Set it to 0.
25544+ */
25545+ if (expires < 0)
25546+ expires = 0;
25547+
25548+ if (timer->is_soft) {
25549+ /*
25550+	 * A soft hrtimer could be started on a remote CPU. In this
25551+ * case softirq_expires_next needs to be updated on the
25552+ * remote CPU. The soft hrtimer will not expire before the
25553+ * first hard hrtimer on the remote CPU -
25554+ * hrtimer_check_target() prevents this case.
25555+ */
25556+ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
25557+
25558+ if (timer_cpu_base->softirq_activated)
25559+ return;
25560+
25561+ if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
25562+ return;
25563+
25564+ timer_cpu_base->softirq_next_timer = timer;
25565+ timer_cpu_base->softirq_expires_next = expires;
25566+
25567+ if (!ktime_before(expires, timer_cpu_base->expires_next) ||
25568+ !reprogram)
25569+ return;
25570+ }
25571+
25572+ /*
25573+ * If the timer is not on the current cpu, we cannot reprogram
25574+ * the other cpus clock event device.
25575+ */
25576+ if (base->cpu_base != cpu_base)
25577+ return;
25578+
25579+ /*
25580+ * If the hrtimer interrupt is running, then it will
25581+ * reevaluate the clock bases and reprogram the clock event
25582+ * device. The callbacks are always executed in hard interrupt
25583+ * context so we don't need an extra check for a running
25584+ * callback.
25585+ */
25586+ if (cpu_base->in_hrtirq)
25587+ return;
25588+
25589+ if (expires >= cpu_base->expires_next)
25590+ return;
25591+
25592+ /* Update the pointer to the next expiring timer */
25593+ cpu_base->next_timer = timer;
25594+ cpu_base->expires_next = expires;
25595+
25596+ /*
25597+ * If hres is not active, hardware does not have to be
25598+ * programmed yet.
25599+ *
25600+ * If a hang was detected in the last timer interrupt then we
25601+ * do not schedule a timer which is earlier than the expiry
25602+ * which we enforced in the hang detection. We want the system
25603+ * to make progress.
25604+ */
25605+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
25606+ return;
25607+
25608+ /*
25609+ * Program the timer hardware. We enforce the expiry for
25610+ * events which are already in the past.
25611+ */
25612+ tick_program_event(expires, 1);
25613+}
25614+
b3bbd485 25615 /*
e4b2b4a8
JK
25616 * Clock realtime was set
25617 *
b3bbd485 25618@@ -830,6 +950,33 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
1a6e0f06 25619 }
e4b2b4a8 25620 EXPORT_SYMBOL_GPL(hrtimer_forward);
1a6e0f06 25621
e4b2b4a8
JK
25622+#ifdef CONFIG_PREEMPT_RT_BASE
25623+# define wake_up_timer_waiters(b) wake_up(&(b)->wait)
25624+
25625+/**
25626+ * hrtimer_wait_for_timer - Wait for a running timer
1a6e0f06 25627+ *
e4b2b4a8 25628+ * @timer: timer to wait for
1a6e0f06 25629+ *
e4b2b4a8
JK
25630+ * The function waits on the waitqueue of the timer base in case
25631+ * the timer's callback function is currently executing. The
25632+ * waitqueue is woken up after the timer callback function has
25633+ * finished execution.
1a6e0f06 25634+ */
e4b2b4a8 25635+void hrtimer_wait_for_timer(const struct hrtimer *timer)
1a6e0f06 25636+{
e4b2b4a8 25637+ struct hrtimer_clock_base *base = timer->base;
1a6e0f06 25638+
e4b2b4a8
JK
25639+ if (base && base->cpu_base &&
25640+ base->index >= HRTIMER_BASE_MONOTONIC_SOFT)
25641+ wait_event(base->cpu_base->wait,
25642+ !(hrtimer_callback_running(timer)));
1a6e0f06 25643+}
1a6e0f06 25644+
1a6e0f06 25645+#else
e4b2b4a8 25646+# define wake_up_timer_waiters(b) do { } while (0)
1a6e0f06 25647+#endif
e4b2b4a8
JK
25648+
25649 /*
25650 * enqueue_hrtimer - internal function to (re)start a timer
25651 *
b3bbd485 25652@@ -839,9 +986,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
e4b2b4a8 25653 * Returns 1 when the new timer is the leftmost timer in the tree.
1a6e0f06 25654 */
e4b2b4a8
JK
25655 static int enqueue_hrtimer(struct hrtimer *timer,
25656- struct hrtimer_clock_base *base)
25657+ struct hrtimer_clock_base *base,
25658+ enum hrtimer_mode mode)
1a6e0f06 25659 {
e4b2b4a8
JK
25660- debug_activate(timer);
25661+ debug_activate(timer, mode);
1a6e0f06 25662
e4b2b4a8
JK
25663 base->cpu_base->active_bases |= 1 << base->index;
25664
b3bbd485 25665@@ -874,7 +1022,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
e4b2b4a8
JK
25666 if (!timerqueue_del(&base->active, &timer->node))
25667 cpu_base->active_bases &= ~(1 << base->index);
25668
25669-#ifdef CONFIG_HIGH_RES_TIMERS
25670 /*
25671 * Note: If reprogram is false we do not update
25672 * cpu_base->next_timer. This happens when we remove the first
b3bbd485 25673@@ -885,7 +1032,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
e4b2b4a8
JK
25674 */
25675 if (reprogram && timer == cpu_base->next_timer)
25676 hrtimer_force_reprogram(cpu_base, 1);
25677-#endif
25678 }
1a6e0f06 25679
e4b2b4a8 25680 /*
b3bbd485 25681@@ -934,22 +1080,36 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
e4b2b4a8 25682 return tim;
1a6e0f06 25683 }
1a6e0f06 25684
e4b2b4a8
JK
25685-/**
25686- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
25687- * @timer: the timer to be added
25688- * @tim: expiry time
25689- * @delta_ns: "slack" range for the timer
25690- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
25691- * relative (HRTIMER_MODE_REL)
25692- */
25693-void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25694- u64 delta_ns, const enum hrtimer_mode mode)
25695+static void
25696+hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
1a6e0f06 25697 {
e4b2b4a8
JK
25698- struct hrtimer_clock_base *base, *new_base;
25699- unsigned long flags;
25700- int leftmost;
25701+ ktime_t expires;
1a6e0f06 25702
e4b2b4a8 25703- base = lock_hrtimer_base(timer, &flags);
1a6e0f06 25704+ /*
e4b2b4a8 25705+ * Find the next SOFT expiration.
1a6e0f06 25706+ */
e4b2b4a8 25707+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
1a6e0f06 25708+
e4b2b4a8
JK
25709+ /*
25710+ * reprogramming needs to be triggered, even if the next soft
25711+	 * hrtimer expires at the same time as the next hard
25712+ * hrtimer. cpu_base->softirq_expires_next needs to be updated!
25713+ */
25714+ if (expires == KTIME_MAX)
25715+ return;
1a6e0f06 25716+
e4b2b4a8
JK
25717+ /*
25718+ * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
25719+ * cpu_base->*expires_next is only set by hrtimer_reprogram()
25720+ */
25721+ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
1a6e0f06 25722+}
1a6e0f06 25723+
e4b2b4a8
JK
25724+static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25725+ u64 delta_ns, const enum hrtimer_mode mode,
25726+ struct hrtimer_clock_base *base)
1a6e0f06 25727+{
e4b2b4a8
JK
25728+ struct hrtimer_clock_base *new_base;
25729
25730 /* Remove an active timer from the queue: */
25731 remove_hrtimer(timer, base, true);
b3bbd485 25732@@ -964,21 +1124,37 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
e4b2b4a8
JK
25733 /* Switch the timer base, if necessary: */
25734 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
25735
25736- leftmost = enqueue_hrtimer(timer, new_base);
25737- if (!leftmost)
25738- goto unlock;
25739+ return enqueue_hrtimer(timer, new_base, mode);
1a6e0f06 25740+}
1a6e0f06 25741+
e4b2b4a8
JK
25742+/**
25743+ * hrtimer_start_range_ns - (re)start an hrtimer
25744+ * @timer: the timer to be added
25745+ * @tim: expiry time
25746+ * @delta_ns: "slack" range for the timer
25747+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
25748+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
25749+ *		softirq-based mode is considered for debugging purposes only!
25750+ */
25751+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25752+ u64 delta_ns, const enum hrtimer_mode mode)
1a6e0f06 25753+{
e4b2b4a8
JK
25754+ struct hrtimer_clock_base *base;
25755+ unsigned long flags;
1a6e0f06
JK
25756+
25757+ /*
e4b2b4a8
JK
25758+ * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
25759+ * match.
1a6e0f06 25760+ */
e4b2b4a8
JK
25761+#ifndef CONFIG_PREEMPT_RT_BASE
25762+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
1a6e0f06 25763+#endif
1a6e0f06 25764+
e4b2b4a8
JK
25765+ base = lock_hrtimer_base(timer, &flags);
25766+
25767+ if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
25768+ hrtimer_reprogram(timer, true);
25769
25770- if (!hrtimer_is_hres_active(timer)) {
25771- /*
25772- * Kick to reschedule the next tick to handle the new timer
25773- * on dynticks target.
25774- */
25775- if (new_base->cpu_base->nohz_active)
25776- wake_up_nohz_cpu(new_base->cpu_base->cpu);
25777- } else {
25778- hrtimer_reprogram(timer, new_base);
25779- }
25780-unlock:
25781 unlock_hrtimer_base(timer, &flags);
25782 }
25783 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
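
With the soft/hard split the expiry context of a timer is fixed by the mode bits handed to hrtimer_init() and hrtimer_start(), as the updated kerneldoc above spells out. A hypothetical driver-style usage sketch (the demo_* names are made up):

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static struct hrtimer demo_timer;

    static enum hrtimer_restart demo_fn(struct hrtimer *t)
    {
            /* runs in softirq context because of the _SOFT mode below */
            return HRTIMER_NORESTART;
    }

    static void demo_arm(void)
    {
            hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
            demo_timer.function = demo_fn;
            hrtimer_start(&demo_timer, ms_to_ktime(10), HRTIMER_MODE_REL_SOFT);
    }

On PREEMPT_RT_FULL the __hrtimer_init() change further below additionally forces timers initialised without an explicit _HARD bit into soft mode.
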
b3bbd485 25784@@ -1035,7 +1211,7 @@ int hrtimer_cancel(struct hrtimer *timer)
e4b2b4a8
JK
25785
25786 if (ret >= 0)
25787 return ret;
25788- cpu_relax();
25789+ hrtimer_wait_for_timer(timer);
25790 }
25791 }
25792 EXPORT_SYMBOL_GPL(hrtimer_cancel);
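
Replacing cpu_relax() matters on RT because the canceller may be a high-priority task that preempts the very softirq thread which has to finish the running callback; busy-waiting would then never terminate. A simplified sketch of the resulting loop (illustrative only):

    static int demo_cancel_sync(struct hrtimer *timer)
    {
            int ret;

            for (;;) {
                    ret = hrtimer_try_to_cancel(timer);
                    if (ret >= 0)           /* 0: was inactive, 1: cancelled */
                            return ret;
                    /*
                     * ret == -1: the callback is running.  Sleep on the
                     * timer base's waitqueue so the softirq thread can
                     * finish instead of being starved.
                     */
                    hrtimer_wait_for_timer(timer);
            }
    }
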
b3bbd485 25793@@ -1076,7 +1252,7 @@ u64 hrtimer_get_next_event(void)
e4b2b4a8
JK
25794 raw_spin_lock_irqsave(&cpu_base->lock, flags);
25795
25796 if (!__hrtimer_hres_active(cpu_base))
25797- expires = __hrtimer_get_next_event(cpu_base);
25798+ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
25799
25800 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25801
b3bbd485 25802@@ -1099,8 +1275,16 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
e4b2b4a8
JK
25803 static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25804 enum hrtimer_mode mode)
25805 {
25806- struct hrtimer_cpu_base *cpu_base;
25807+ bool softtimer;
25808 int base;
25809+ struct hrtimer_cpu_base *cpu_base;
25810+
25811+ softtimer = !!(mode & HRTIMER_MODE_SOFT);
25812+#ifdef CONFIG_PREEMPT_RT_FULL
25813+ if (!softtimer && !(mode & HRTIMER_MODE_HARD))
25814+ softtimer = true;
1a6e0f06 25815+#endif
e4b2b4a8
JK
25816+ base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
25817
25818 memset(timer, 0, sizeof(struct hrtimer));
25819
b3bbd485 25820@@ -1114,7 +1298,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
e4b2b4a8
JK
25821 if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
25822 clock_id = CLOCK_MONOTONIC;
25823
25824- base = hrtimer_clockid_to_base(clock_id);
25825+ base += hrtimer_clockid_to_base(clock_id);
25826+ timer->is_soft = softtimer;
25827 timer->base = &cpu_base->clock_base[base];
25828 timerqueue_init(&timer->node);
25829 }
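
The index arithmetic assumes the soft clock bases simply mirror the hard ones in the upper half of the clock_base[] array, i.e. soft index = hard index + HRTIMER_MAX_CLOCK_BASES/2. An illustrative layout, assuming the usual four hard clocks (names abridged, not copied from the header):

    enum demo_hrtimer_base_type {
            DEMO_BASE_MONOTONIC,            /* 0: hard bases ...            */
            DEMO_BASE_REALTIME,
            DEMO_BASE_BOOTTIME,
            DEMO_BASE_TAI,
            DEMO_BASE_MONOTONIC_SOFT,       /* 4: soft = hard + MAX/2       */
            DEMO_BASE_REALTIME_SOFT,
            DEMO_BASE_BOOTTIME_SOFT,
            DEMO_BASE_TAI_SOFT,
            DEMO_MAX_CLOCK_BASES,           /* 8                            */
    };
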
b3bbd485 25830@@ -1123,7 +1308,13 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
e4b2b4a8
JK
25831 * hrtimer_init - initialize a timer to the given clock
25832 * @timer: the timer to be initialized
25833 * @clock_id: the clock to be used
25834- * @mode: timer mode abs/rel
25835+ * @mode:       The modes which are relevant for initialization:
25836+ * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
25837+ * HRTIMER_MODE_REL_SOFT
25838+ *
25839+ * The PINNED variants of the above can be handed in,
25840+ * but the PINNED bit is ignored as pinning happens
25841+ * when the hrtimer is started
1a6e0f06 25842 */
e4b2b4a8
JK
25843 void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25844 enum hrtimer_mode mode)
b3bbd485 25845@@ -1142,19 +1333,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
e4b2b4a8
JK
25846 */
25847 bool hrtimer_active(const struct hrtimer *timer)
25848 {
25849- struct hrtimer_cpu_base *cpu_base;
25850+ struct hrtimer_clock_base *base;
25851 unsigned int seq;
25852
25853 do {
25854- cpu_base = READ_ONCE(timer->base->cpu_base);
25855- seq = raw_read_seqcount_begin(&cpu_base->seq);
25856+ base = READ_ONCE(timer->base);
25857+ seq = raw_read_seqcount_begin(&base->seq);
25858
25859 if (timer->state != HRTIMER_STATE_INACTIVE ||
25860- cpu_base->running == timer)
25861+ base->running == timer)
25862 return true;
25863
25864- } while (read_seqcount_retry(&cpu_base->seq, seq) ||
25865- cpu_base != READ_ONCE(timer->base->cpu_base));
25866+ } while (read_seqcount_retry(&base->seq, seq) ||
25867+ base != READ_ONCE(timer->base));
25868
25869 return false;
25870 }
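
hrtimer_active() stays lockless; it now retries against the per-clock-base seqcount and, in addition, against a concurrent switch of the clock base itself. For reference, the bare seqcount read/retry idiom in isolation (self-contained sketch, not taken from this file; the writer side must be serialised by the caller):

    #include <linux/seqlock.h>
    #include <linux/types.h>

    static seqcount_t demo_seq = SEQCNT_ZERO(demo_seq);
    static u64 demo_a, demo_b;

    static void demo_write(u64 a, u64 b)    /* writers serialised externally */
    {
            write_seqcount_begin(&demo_seq);
            demo_a = a;
            demo_b = b;
            write_seqcount_end(&demo_seq);
    }

    static u64 demo_read(void)              /* lockless reader */
    {
            unsigned int seq;
            u64 a, b;

            do {
                    seq = read_seqcount_begin(&demo_seq);
                    a = demo_a;
                    b = demo_b;
            } while (read_seqcount_retry(&demo_seq, seq));

            return a + b;
    }
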
b3bbd485 25871@@ -1180,7 +1371,8 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
e4b2b4a8
JK
25872
25873 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25874 struct hrtimer_clock_base *base,
25875- struct hrtimer *timer, ktime_t *now)
25876+ struct hrtimer *timer, ktime_t *now,
25877+ unsigned long flags)
25878 {
25879 enum hrtimer_restart (*fn)(struct hrtimer *);
25880 int restart;
b3bbd485 25881@@ -1188,16 +1380,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
e4b2b4a8
JK
25882 lockdep_assert_held(&cpu_base->lock);
25883
25884 debug_deactivate(timer);
25885- cpu_base->running = timer;
25886+ base->running = timer;
25887
25888 /*
25889 * Separate the ->running assignment from the ->state assignment.
25890 *
25891 * As with a regular write barrier, this ensures the read side in
25892- * hrtimer_active() cannot observe cpu_base->running == NULL &&
25893+ * hrtimer_active() cannot observe base->running == NULL &&
25894 * timer->state == INACTIVE.
25895 */
25896- raw_write_seqcount_barrier(&cpu_base->seq);
25897+ raw_write_seqcount_barrier(&base->seq);
25898
25899 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
25900 fn = timer->function;
b3bbd485 25901@@ -1211,15 +1403,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
e4b2b4a8
JK
25902 timer->is_rel = false;
25903
25904 /*
25905- * Because we run timers from hardirq context, there is no chance
25906- * they get migrated to another cpu, therefore its safe to unlock
25907- * the timer base.
25908+ * The timer is marked as running in the cpu base, so it is
25909+ * protected against migration to a different CPU even if the lock
25910+ * is dropped.
25911 */
25912- raw_spin_unlock(&cpu_base->lock);
25913+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25914 trace_hrtimer_expire_entry(timer, now);
25915 restart = fn(timer);
25916 trace_hrtimer_expire_exit(timer);
25917- raw_spin_lock(&cpu_base->lock);
25918+ raw_spin_lock_irq(&cpu_base->lock);
25919
25920 /*
25921 * Note: We clear the running state after enqueue_hrtimer and
b3bbd485 25922@@ -1232,33 +1424,31 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
e4b2b4a8
JK
25923 */
25924 if (restart != HRTIMER_NORESTART &&
25925 !(timer->state & HRTIMER_STATE_ENQUEUED))
25926- enqueue_hrtimer(timer, base);
25927+ enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
25928
25929 /*
25930 * Separate the ->running assignment from the ->state assignment.
25931 *
25932 * As with a regular write barrier, this ensures the read side in
25933- * hrtimer_active() cannot observe cpu_base->running == NULL &&
25934+ * hrtimer_active() cannot observe base->running.timer == NULL &&
25935 * timer->state == INACTIVE.
25936 */
25937- raw_write_seqcount_barrier(&cpu_base->seq);
25938+ raw_write_seqcount_barrier(&base->seq);
1a6e0f06 25939
e4b2b4a8
JK
25940- WARN_ON_ONCE(cpu_base->running != timer);
25941- cpu_base->running = NULL;
25942+ WARN_ON_ONCE(base->running != timer);
25943+ base->running = NULL;
25944 }
1a6e0f06 25945
e4b2b4a8
JK
25946-static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25947+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
25948+ unsigned long flags, unsigned int active_mask)
25949 {
25950- struct hrtimer_clock_base *base = cpu_base->clock_base;
25951- unsigned int active = cpu_base->active_bases;
25952+ struct hrtimer_clock_base *base;
25953+ unsigned int active = cpu_base->active_bases & active_mask;
25954
25955- for (; active; base++, active >>= 1) {
25956+ for_each_active_base(base, cpu_base, active) {
25957 struct timerqueue_node *node;
25958 ktime_t basenow;
25959
25960- if (!(active & 0x01))
25961- continue;
25962-
25963 basenow = ktime_add(now, base->offset);
25964
25965 while ((node = timerqueue_getnext(&base->active))) {
b3bbd485 25966@@ -1281,11 +1471,29 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
e4b2b4a8
JK
25967 if (basenow < hrtimer_get_softexpires_tv64(timer))
25968 break;
25969
25970- __run_hrtimer(cpu_base, base, timer, &basenow);
25971+ __run_hrtimer(cpu_base, base, timer, &basenow, flags);
25972 }
25973 }
25974 }
25975
25976+static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
1a6e0f06 25977+{
e4b2b4a8
JK
25978+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
25979+ unsigned long flags;
25980+ ktime_t now;
1a6e0f06 25981+
e4b2b4a8 25982+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
1a6e0f06 25983+
e4b2b4a8
JK
25984+ now = hrtimer_update_base(cpu_base);
25985+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
25986+
25987+ cpu_base->softirq_activated = 0;
25988+ hrtimer_update_softirq_timer(cpu_base, true);
25989+
25990+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
25991+ wake_up_timer_waiters(cpu_base);
25992+}
25993+
25994 #ifdef CONFIG_HIGH_RES_TIMERS
1a6e0f06 25995
e4b2b4a8 25996 /*
b3bbd485 25997@@ -1296,13 +1504,14 @@ void hrtimer_interrupt(struct clock_event_device *dev)
e4b2b4a8
JK
25998 {
25999 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
26000 ktime_t expires_next, now, entry_time, delta;
26001+ unsigned long flags;
26002 int retries = 0;
1a6e0f06 26003
e4b2b4a8
JK
26004 BUG_ON(!cpu_base->hres_active);
26005 cpu_base->nr_events++;
26006 dev->next_event = KTIME_MAX;
1a6e0f06 26007
e4b2b4a8
JK
26008- raw_spin_lock(&cpu_base->lock);
26009+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
26010 entry_time = now = hrtimer_update_base(cpu_base);
26011 retry:
26012 cpu_base->in_hrtirq = 1;
b3bbd485 26013@@ -1315,17 +1524,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
e4b2b4a8
JK
26014 */
26015 cpu_base->expires_next = KTIME_MAX;
1a6e0f06 26016
e4b2b4a8
JK
26017- __hrtimer_run_queues(cpu_base, now);
26018+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
26019+ cpu_base->softirq_expires_next = KTIME_MAX;
26020+ cpu_base->softirq_activated = 1;
26021+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
26022+ }
1a6e0f06 26023+
e4b2b4a8 26024+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
1a6e0f06 26025
e4b2b4a8
JK
26026 /* Reevaluate the clock bases for the next expiry */
26027- expires_next = __hrtimer_get_next_event(cpu_base);
26028+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
26029 /*
26030 * Store the new expiry value so the migration code can verify
26031 * against it.
26032 */
26033 cpu_base->expires_next = expires_next;
26034 cpu_base->in_hrtirq = 0;
26035- raw_spin_unlock(&cpu_base->lock);
26036+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
26037
26038 /* Reprogramming necessary ? */
26039 if (!tick_program_event(expires_next, 0)) {
b3bbd485 26040@@ -1346,7 +1561,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
e4b2b4a8
JK
26041 * Acquire base lock for updating the offsets and retrieving
26042 * the current time.
26043 */
26044- raw_spin_lock(&cpu_base->lock);
26045+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
26046 now = hrtimer_update_base(cpu_base);
26047 cpu_base->nr_retries++;
26048 if (++retries < 3)
b3bbd485 26049@@ -1359,7 +1574,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
e4b2b4a8
JK
26050 */
26051 cpu_base->nr_hangs++;
26052 cpu_base->hang_detected = 1;
26053- raw_spin_unlock(&cpu_base->lock);
26054+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
26055+
26056 delta = ktime_sub(now, entry_time);
26057 if ((unsigned int)delta > cpu_base->max_hang_time)
26058 cpu_base->max_hang_time = (unsigned int) delta;
b3bbd485 26059@@ -1401,6 +1617,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
e4b2b4a8
JK
26060 void hrtimer_run_queues(void)
26061 {
26062 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
26063+ unsigned long flags;
26064 ktime_t now;
1a6e0f06 26065
e4b2b4a8 26066 if (__hrtimer_hres_active(cpu_base))
b3bbd485 26067@@ -1418,10 +1635,17 @@ void hrtimer_run_queues(void)
e4b2b4a8
JK
26068 return;
26069 }
1a6e0f06 26070
e4b2b4a8
JK
26071- raw_spin_lock(&cpu_base->lock);
26072+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
26073 now = hrtimer_update_base(cpu_base);
26074- __hrtimer_run_queues(cpu_base, now);
26075- raw_spin_unlock(&cpu_base->lock);
26076+
26077+ if (!ktime_before(now, cpu_base->softirq_expires_next)) {
26078+ cpu_base->softirq_expires_next = KTIME_MAX;
26079+ cpu_base->softirq_activated = 1;
26080+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
26081+ }
26082+
26083+ __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
26084+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1a6e0f06
JK
26085 }
26086
e4b2b4a8 26087 /*
b3bbd485 26088@@ -1440,13 +1664,65 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
e4b2b4a8 26089 return HRTIMER_NORESTART;
1a6e0f06
JK
26090 }
26091
e4b2b4a8
JK
26092-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
26093+#ifdef CONFIG_PREEMPT_RT_FULL
26094+static bool task_is_realtime(struct task_struct *tsk)
1a6e0f06 26095 {
e4b2b4a8 26096+ int policy = tsk->policy;
1a6e0f06 26097+
e4b2b4a8
JK
26098+ if (policy == SCHED_FIFO || policy == SCHED_RR)
26099+ return true;
26100+ if (policy == SCHED_DEADLINE)
26101+ return true;
26102+ return false;
26103+}
1a6e0f06 26104+#endif
e4b2b4a8
JK
26105+
26106+static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
26107+ clockid_t clock_id,
26108+ enum hrtimer_mode mode,
26109+ struct task_struct *task)
26110+{
26111+#ifdef CONFIG_PREEMPT_RT_FULL
26112+ if (!(mode & (HRTIMER_MODE_SOFT | HRTIMER_MODE_HARD))) {
26113+ if (task_is_realtime(current) || system_state != SYSTEM_RUNNING)
26114+ mode |= HRTIMER_MODE_HARD;
26115+ else
26116+ mode |= HRTIMER_MODE_SOFT;
26117+ }
1a6e0f06 26118+#endif
e4b2b4a8
JK
26119+ __hrtimer_init(&sl->timer, clock_id, mode);
26120 sl->timer.function = hrtimer_wakeup;
26121 sl->task = task;
26122 }
26123+
26124+/**
26125+ * hrtimer_init_sleeper - initialize sleeper to the given clock
26126+ * @sl: sleeper to be initialized
26127+ * @clock_id: the clock to be used
26128+ * @mode: timer mode abs/rel
26129+ * @task: the task to wake up
26130+ */
26131+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
26132+ enum hrtimer_mode mode, struct task_struct *task)
26133+{
26134+ debug_init(&sl->timer, clock_id, mode);
26135+ __hrtimer_init_sleeper(sl, clock_id, mode, task);
26136+
26137+}
26138 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1a6e0f06 26139
e4b2b4a8
JK
26140+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
26141+void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
26142+ clockid_t clock_id,
26143+ enum hrtimer_mode mode,
26144+ struct task_struct *task)
26145+{
26146+ debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
26147+ __hrtimer_init_sleeper(sl, clock_id, mode, task);
26148+}
26149+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
26150+#endif
1a6e0f06 26151+
e4b2b4a8
JK
26152 int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
26153 {
26154 switch(restart->nanosleep.type) {
b3bbd485 26155@@ -1470,8 +1746,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
e4b2b4a8
JK
26156 {
26157 struct restart_block *restart;
1a6e0f06 26158
e4b2b4a8
JK
26159- hrtimer_init_sleeper(t, current);
26160-
26161 do {
26162 set_current_state(TASK_INTERRUPTIBLE);
26163 hrtimer_start_expires(&t->timer, mode);
b3bbd485 26164@@ -1508,10 +1782,9 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
e4b2b4a8
JK
26165 struct hrtimer_sleeper t;
26166 int ret;
1a6e0f06 26167
e4b2b4a8
JK
26168- hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
26169- HRTIMER_MODE_ABS);
26170+ hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
26171+ HRTIMER_MODE_ABS, current);
26172 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
26173-
26174 ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
26175 destroy_hrtimer_on_stack(&t.timer);
26176 return ret;
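
The sleeper is now set up together with its timer in a single call; on PREEMPT_RT_FULL __hrtimer_init_sleeper() above also picks hard expiry context for SCHED_FIFO/RR/DEADLINE callers. A condensed sketch of the call sequence the nanosleep paths use (only meaningful inside this file, since do_nanosleep() is a static helper here):

    static int demo_abs_sleep(clockid_t clockid, ktime_t abs_expiry)
    {
            struct hrtimer_sleeper t;
            int ret;

            hrtimer_init_sleeper_on_stack(&t, clockid, HRTIMER_MODE_ABS, current);
            hrtimer_set_expires(&t.timer, abs_expiry);
            ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
            destroy_hrtimer_on_stack(&t.timer);
            return ret;
    }
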
b3bbd485 26177@@ -1529,7 +1802,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,
e4b2b4a8
JK
26178 if (dl_task(current) || rt_task(current))
26179 slack = 0;
1a6e0f06 26180
e4b2b4a8
JK
26181- hrtimer_init_on_stack(&t.timer, clockid, mode);
26182+ hrtimer_init_sleeper_on_stack(&t, clockid, mode, current);
26183 hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
26184 ret = do_nanosleep(&t, mode);
26185 if (ret != -ERESTART_RESTARTBLOCK)
b3bbd485 26186@@ -1585,6 +1858,27 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
e4b2b4a8
JK
26187 }
26188 #endif
1a6e0f06 26189
e4b2b4a8
JK
26190+#ifdef CONFIG_PREEMPT_RT_FULL
26191+/*
26192+ * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
26193+ */
26194+void cpu_chill(void)
26195+{
26196+ ktime_t chill_time;
26197+ unsigned int freeze_flag = current->flags & PF_NOFREEZE;
26198+
26199+ chill_time = ktime_set(0, NSEC_PER_MSEC);
26200+ set_current_state(TASK_UNINTERRUPTIBLE);
26201+ current->flags |= PF_NOFREEZE;
26202+ sleeping_lock_inc();
26203+ schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD);
26204+ sleeping_lock_dec();
26205+ if (!freeze_flag)
26206+ current->flags &= ~PF_NOFREEZE;
26207+}
26208+EXPORT_SYMBOL(cpu_chill);
26209+#endif
26210+
26211 /*
26212 * Functions related to boot-time initialization:
26213 */
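
cpu_chill() exists for retry loops that would otherwise spin with cpu_relax() and, on RT, never let a preempted lock holder run. A hypothetical conversion of such a loop (the lock name is made up; the real patch performs this substitution in several subsystems):

    static void demo_lock_retry(spinlock_t *demo_lock)
    {
            while (!spin_trylock(demo_lock)) {
    #ifdef CONFIG_PREEMPT_RT_FULL
                    cpu_chill();            /* sleep 1 ms, let the holder run */
    #else
                    cpu_relax();
    #endif
            }
            /* ... critical section ... */
            spin_unlock(demo_lock);
    }
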
b3bbd485 26214@@ -1598,9 +1892,17 @@ int hrtimers_prepare_cpu(unsigned int cpu)
e4b2b4a8
JK
26215 timerqueue_init_head(&cpu_base->clock_base[i].active);
26216 }
1a6e0f06 26217
e4b2b4a8
JK
26218- cpu_base->active_bases = 0;
26219 cpu_base->cpu = cpu;
26220- hrtimer_init_hres(cpu_base);
26221+ cpu_base->active_bases = 0;
26222+ cpu_base->hres_active = 0;
26223+ cpu_base->hang_detected = 0;
26224+ cpu_base->next_timer = NULL;
26225+ cpu_base->softirq_next_timer = NULL;
26226+ cpu_base->expires_next = KTIME_MAX;
26227+ cpu_base->softirq_expires_next = KTIME_MAX;
26228+#ifdef CONFIG_PREEMPT_RT_BASE
26229+ init_waitqueue_head(&cpu_base->wait);
26230+#endif
26231 return 0;
26232 }
1a6e0f06 26233
b3bbd485 26234@@ -1632,7 +1934,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
e4b2b4a8
JK
26235 * sort out already expired timers and reprogram the
26236 * event device.
26237 */
26238- enqueue_hrtimer(timer, new_base);
26239+ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
26240 }
26241 }
1a6e0f06 26242
b3bbd485 26243@@ -1644,6 +1946,12 @@ int hrtimers_dead_cpu(unsigned int scpu)
e4b2b4a8
JK
26244 BUG_ON(cpu_online(scpu));
26245 tick_cancel_sched_timer(scpu);
26246
26247+ /*
26248+	 * This BH disable ensures that raise_softirq_irqoff() does
26249+	 * not wake up ksoftirqd (and acquire the pi-lock) while
26250+	 * holding the cpu_base lock
26251+ */
26252+ local_bh_disable();
1a6e0f06 26253 local_irq_disable();
e4b2b4a8
JK
26254 old_base = &per_cpu(hrtimer_bases, scpu);
26255 new_base = this_cpu_ptr(&hrtimer_bases);
b3bbd485 26256@@ -1659,12 +1967,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
e4b2b4a8
JK
26257 &new_base->clock_base[i]);
26258 }
1a6e0f06 26259
e4b2b4a8
JK
26260+ /*
26261+ * The migration might have changed the first expiring softirq
26262+ * timer on this CPU. Update it.
26263+ */
26264+ hrtimer_update_softirq_timer(new_base, false);
26265+
26266 raw_spin_unlock(&old_base->lock);
26267 raw_spin_unlock(&new_base->lock);
26268
26269 /* Check, if we got expired work to do */
26270 __hrtimer_peek_ahead_timers();
1a6e0f06 26271 local_irq_enable();
e4b2b4a8
JK
26272+ local_bh_enable();
26273 return 0;
26274 }
1a6e0f06 26275
b3bbd485 26276@@ -1673,18 +1988,19 @@ int hrtimers_dead_cpu(unsigned int scpu)
e4b2b4a8
JK
26277 void __init hrtimers_init(void)
26278 {
26279 hrtimers_prepare_cpu(smp_processor_id());
26280+ open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
1a6e0f06
JK
26281 }
26282
1a6e0f06 26283 /**
e4b2b4a8
JK
26284 * schedule_hrtimeout_range_clock - sleep until timeout
26285 * @expires: timeout value (ktime_t)
26286 * @delta: slack in expires timeout (ktime_t)
26287- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
26288- * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
26289+ * @mode: timer mode
26290+ * @clock_id: timer clock to be used
1a6e0f06 26291 */
e4b2b4a8
JK
26292 int __sched
26293 schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
26294- const enum hrtimer_mode mode, int clock)
26295+ const enum hrtimer_mode mode, clockid_t clock_id)
26296 {
26297 struct hrtimer_sleeper t;
26298
b3bbd485 26299@@ -1705,11 +2021,9 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
e4b2b4a8 26300 return -EINTR;
1a6e0f06
JK
26301 }
26302
e4b2b4a8
JK
26303- hrtimer_init_on_stack(&t.timer, clock, mode);
26304+ hrtimer_init_sleeper_on_stack(&t, clock_id, mode, current);
26305 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
26306
26307- hrtimer_init_sleeper(&t, current);
26308-
26309 hrtimer_start_expires(&t.timer, mode);
26310
26311 if (likely(t.task))
b3bbd485 26312@@ -1727,7 +2041,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
e4b2b4a8
JK
26313 * schedule_hrtimeout_range - sleep until timeout
26314 * @expires: timeout value (ktime_t)
26315 * @delta: slack in expires timeout (ktime_t)
26316- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
26317+ * @mode: timer mode
26318 *
26319 * Make the current task sleep until the given expiry time has
26320 * elapsed. The routine will return immediately unless
b3bbd485 26321@@ -1766,7 +2080,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
e4b2b4a8
JK
26322 /**
26323 * schedule_hrtimeout - sleep until timeout
26324 * @expires: timeout value (ktime_t)
26325- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
26326+ * @mode: timer mode
26327 *
26328 * Make the current task sleep until the given expiry time has
26329 * elapsed. The routine will return immediately unless
b3bbd485
JK
26330diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
26331index f26acef5d7b4..760f38528365 100644
26332--- a/kernel/time/itimer.c
26333+++ b/kernel/time/itimer.c
26334@@ -214,6 +214,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
e4b2b4a8
JK
26335 /* We are sharing ->siglock with it_real_fn() */
26336 if (hrtimer_try_to_cancel(timer) < 0) {
26337 spin_unlock_irq(&tsk->sighand->siglock);
26338+ hrtimer_wait_for_timer(&tsk->signal->real_timer);
26339 goto again;
26340 }
26341 expires = timeval_to_ktime(value->it_value);
b3bbd485
JK
26342diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
26343index 497719127bf9..62acb8914c9e 100644
26344--- a/kernel/time/jiffies.c
26345+++ b/kernel/time/jiffies.c
26346@@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
e4b2b4a8
JK
26347 .max_cycles = 10,
26348 };
26349
26350-__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
26351+__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
26352+__cacheline_aligned_in_smp seqcount_t jiffies_seq;
26353
26354 #if (BITS_PER_LONG < 64)
26355 u64 get_jiffies_64(void)
b3bbd485 26356@@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
e4b2b4a8
JK
26357 u64 ret;
26358
26359 do {
26360- seq = read_seqbegin(&jiffies_lock);
26361+ seq = read_seqcount_begin(&jiffies_seq);
26362 ret = jiffies_64;
26363- } while (read_seqretry(&jiffies_lock, seq));
26364+ } while (read_seqcount_retry(&jiffies_seq, seq));
26365 return ret;
1a6e0f06 26366 }
e4b2b4a8 26367 EXPORT_SYMBOL(get_jiffies_64);
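
Splitting the seqlock into a raw spinlock plus a separate seqcount keeps the writer a non-sleeping lock on RT while readers remain lock-free. The resulting writer/reader pairing, consolidated from the jiffies hunks in this patch (sketch):

    /* writer side, as in tick_periodic()/xtime_update() */
    static void demo_jiffies_update(unsigned long ticks)
    {
            raw_spin_lock(&jiffies_lock);
            write_seqcount_begin(&jiffies_seq);
            do_timer(ticks);
            write_seqcount_end(&jiffies_seq);
            raw_spin_unlock(&jiffies_lock);
    }

    /* reader side, as in get_jiffies_64() */
    static u64 demo_get_jiffies_64(void)
    {
            unsigned int seq;
            u64 ret;

            do {
                    seq = read_seqcount_begin(&jiffies_seq);
                    ret = jiffies_64;
            } while (read_seqcount_retry(&jiffies_seq, seq));
            return ret;
    }
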
b3bbd485 26368diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
5dd41b01 26369index 2da660d53a4b..c7b7d047d12e 100644
b3bbd485
JK
26370--- a/kernel/time/posix-cpu-timers.c
26371+++ b/kernel/time/posix-cpu-timers.c
e4b2b4a8
JK
26372@@ -3,8 +3,10 @@
26373 * Implement CPU time clocks for the POSIX clock interface.
26374 */
1a6e0f06 26375
e4b2b4a8
JK
26376+#include <uapi/linux/sched/types.h>
26377 #include <linux/sched/signal.h>
26378 #include <linux/sched/cputime.h>
26379+#include <linux/sched/rt.h>
26380 #include <linux/posix-timers.h>
26381 #include <linux/errno.h>
26382 #include <linux/math64.h>
26383@@ -14,6 +16,7 @@
26384 #include <linux/tick.h>
26385 #include <linux/workqueue.h>
26386 #include <linux/compat.h>
26387+#include <linux/smpboot.h>
1a6e0f06 26388
e4b2b4a8 26389 #include "posix-timers.h"
1a6e0f06 26390
b3bbd485 26391@@ -603,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
e4b2b4a8
JK
26392 /*
26393 * Disarm any old timer after extracting its expiry time.
26394 */
26395- WARN_ON_ONCE(!irqs_disabled());
26396+ WARN_ON_ONCE_NONRT(!irqs_disabled());
1a6e0f06 26397
e4b2b4a8
JK
26398 ret = 0;
26399 old_incr = timer->it.cpu.incr;
b3bbd485 26400@@ -1034,7 +1037,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
e4b2b4a8
JK
26401 /*
26402 * Now re-arm for the new expiry time.
26403 */
26404- WARN_ON_ONCE(!irqs_disabled());
26405+ WARN_ON_ONCE_NONRT(!irqs_disabled());
26406 arm_timer(timer);
26407 unlock:
26408 unlock_task_sighand(p, &flags);
b3bbd485 26409@@ -1119,13 +1122,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
e4b2b4a8
JK
26410 * already updated our counts. We need to check if any timers fire now.
26411 * Interrupts are disabled.
26412 */
26413-void run_posix_cpu_timers(struct task_struct *tsk)
26414+static void __run_posix_cpu_timers(struct task_struct *tsk)
26415 {
26416 LIST_HEAD(firing);
26417 struct k_itimer *timer, *next;
26418 unsigned long flags;
1a6e0f06 26419
e4b2b4a8
JK
26420- WARN_ON_ONCE(!irqs_disabled());
26421+ WARN_ON_ONCE_NONRT(!irqs_disabled());
1a6e0f06 26422
e4b2b4a8
JK
26423 /*
26424 * The fast path checks that there are no expired thread or thread
b3bbd485 26425@@ -1179,6 +1182,152 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1a6e0f06 26426 }
1a6e0f06 26427 }
1a6e0f06 26428
e4b2b4a8
JK
26429+#ifdef CONFIG_PREEMPT_RT_BASE
26430+#include <linux/kthread.h>
26431+#include <linux/cpu.h>
26432+DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
26433+DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
26434+DEFINE_PER_CPU(bool, posix_timer_th_active);
1a6e0f06 26435+
e4b2b4a8 26436+static void posix_cpu_kthread_fn(unsigned int cpu)
1a6e0f06 26437+{
e4b2b4a8
JK
26438+ struct task_struct *tsk = NULL;
26439+ struct task_struct *next = NULL;
1a6e0f06 26440+
e4b2b4a8
JK
26441+ BUG_ON(per_cpu(posix_timer_task, cpu) != current);
26442+
26443+ /* grab task list */
26444+ raw_local_irq_disable();
26445+ tsk = per_cpu(posix_timer_tasklist, cpu);
26446+ per_cpu(posix_timer_tasklist, cpu) = NULL;
26447+ raw_local_irq_enable();
26448+
26449+	/* it's possible the list is empty, just return */
26450+ if (!tsk)
26451+ return;
26452+
26453+ /* Process task list */
26454+ while (1) {
26455+ /* save next */
26456+ next = tsk->posix_timer_list;
26457+
26458+ /* run the task timers, clear its ptr and
26459+ * unreference it
26460+ */
26461+ __run_posix_cpu_timers(tsk);
26462+ tsk->posix_timer_list = NULL;
26463+ put_task_struct(tsk);
26464+
26465+ /* check if this is the last on the list */
26466+ if (next == tsk)
26467+ break;
26468+ tsk = next;
1a6e0f06
JK
26469+ }
26470+}
26471+
e4b2b4a8 26472+static inline int __fastpath_timer_check(struct task_struct *tsk)
1a6e0f06 26473+{
e4b2b4a8
JK
26474+ /* tsk == current, ensure it is safe to use ->signal/sighand */
26475+ if (unlikely(tsk->exit_state))
26476+ return 0;
1a6e0f06 26477+
e4b2b4a8
JK
26478+ if (!task_cputime_zero(&tsk->cputime_expires))
26479+ return 1;
26480+
26481+ if (!task_cputime_zero(&tsk->signal->cputime_expires))
26482+ return 1;
26483+
26484+ return 0;
1a6e0f06
JK
26485+}
26486+
e4b2b4a8
JK
26487+void run_posix_cpu_timers(struct task_struct *tsk)
26488+{
26489+ unsigned int cpu = smp_processor_id();
26490+ struct task_struct *tasklist;
1a6e0f06 26491+
e4b2b4a8
JK
26492+ BUG_ON(!irqs_disabled());
26493+
26494+ if (per_cpu(posix_timer_th_active, cpu) != true)
26495+ return;
26496+
26497+ /* get per-cpu references */
26498+ tasklist = per_cpu(posix_timer_tasklist, cpu);
26499+
26500+ /* check to see if we're already queued */
26501+ if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
26502+ get_task_struct(tsk);
26503+ if (tasklist) {
26504+ tsk->posix_timer_list = tasklist;
26505+ } else {
26506+ /*
26507+ * The list is terminated by a self-pointing
26508+ * task_struct
26509+ */
26510+ tsk->posix_timer_list = tsk;
26511+ }
26512+ per_cpu(posix_timer_tasklist, cpu) = tsk;
26513+
26514+ wake_up_process(per_cpu(posix_timer_task, cpu));
26515+ }
26516+}
26517+
26518+static int posix_cpu_kthread_should_run(unsigned int cpu)
1a6e0f06 26519+{
e4b2b4a8 26520+ return __this_cpu_read(posix_timer_tasklist) != NULL;
1a6e0f06 26521+}
1a6e0f06 26522+
e4b2b4a8 26523+static void posix_cpu_kthread_park(unsigned int cpu)
1a6e0f06 26524+{
e4b2b4a8 26525+ this_cpu_write(posix_timer_th_active, false);
1a6e0f06
JK
26526+}
26527+
e4b2b4a8 26528+static void posix_cpu_kthread_unpark(unsigned int cpu)
1a6e0f06 26529+{
e4b2b4a8 26530+ this_cpu_write(posix_timer_th_active, true);
1a6e0f06 26531+}
1a6e0f06 26532+
e4b2b4a8
JK
26533+static void posix_cpu_kthread_setup(unsigned int cpu)
26534+{
26535+ struct sched_param sp;
26536+
26537+ sp.sched_priority = MAX_RT_PRIO - 1;
26538+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
26539+ posix_cpu_kthread_unpark(cpu);
26540+}
26541+
26542+static struct smp_hotplug_thread posix_cpu_thread = {
26543+ .store = &posix_timer_task,
26544+ .thread_should_run = posix_cpu_kthread_should_run,
26545+ .thread_fn = posix_cpu_kthread_fn,
26546+ .thread_comm = "posixcputmr/%u",
26547+ .setup = posix_cpu_kthread_setup,
26548+ .park = posix_cpu_kthread_park,
26549+ .unpark = posix_cpu_kthread_unpark,
26550+};
26551+
26552+static int __init posix_cpu_thread_init(void)
1a6e0f06 26553+{
e4b2b4a8
JK
26554+ /* Start one for boot CPU. */
26555+ unsigned long cpu;
26556+ int ret;
26557+
26558+ /* init the per-cpu posix_timer_tasklets */
26559+ for_each_possible_cpu(cpu)
26560+ per_cpu(posix_timer_tasklist, cpu) = NULL;
26561+
26562+ ret = smpboot_register_percpu_thread(&posix_cpu_thread);
26563+ WARN_ON(ret);
26564+
1a6e0f06
JK
26565+ return 0;
26566+}
e4b2b4a8
JK
26567+early_initcall(posix_cpu_thread_init);
26568+#else /* CONFIG_PREEMPT_RT_BASE */
26569+void run_posix_cpu_timers(struct task_struct *tsk)
26570+{
26571+ __run_posix_cpu_timers(tsk);
26572+}
26573+#endif /* CONFIG_PREEMPT_RT_BASE */
26574+
26575 /*
26576 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
26577 * The tsk->sighand->siglock must be held by the caller.
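
The per-CPU posix_timer_tasklist is a singly linked list threaded through the task_struct field this patch adds (posix_timer_list); the tail element points at itself, which is what terminates the walk in posix_cpu_kthread_fn(). The traversal invariant in isolation (sketch, mirroring the code above minus the actual timer processing):

    static void demo_walk_tasklist(struct task_struct *head)
    {
            struct task_struct *tsk = head, *next;

            while (tsk) {
                    next = tsk->posix_timer_list;   /* save before clearing */
                    tsk->posix_timer_list = NULL;
                    put_task_struct(tsk);           /* drop the enqueue reference */
                    if (next == tsk)                /* tail points at itself */
                            break;
                    tsk = next;
            }
    }
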
b3bbd485 26578diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
5dd41b01 26579index 55d45fe2cc17..5a59538f3d16 100644
b3bbd485
JK
26580--- a/kernel/time/posix-timers.c
26581+++ b/kernel/time/posix-timers.c
5dd41b01 26582@@ -443,6 +443,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
e4b2b4a8
JK
26583 static struct pid *good_sigevent(sigevent_t * event)
26584 {
26585 struct task_struct *rtn = current->group_leader;
26586+ int sig = event->sigev_signo;
26587
26588 switch (event->sigev_notify) {
26589 case SIGEV_SIGNAL | SIGEV_THREAD_ID:
5dd41b01 26590@@ -452,7 +453,8 @@ static struct pid *good_sigevent(sigevent_t * event)
e4b2b4a8
JK
26591 /* FALLTHRU */
26592 case SIGEV_SIGNAL:
26593 case SIGEV_THREAD:
26594- if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
26595+ if (sig <= 0 || sig > SIGRTMAX ||
26596+ sig_kernel_only(sig) || sig_kernel_coredump(sig))
26597 return NULL;
26598 /* FALLTHRU */
26599 case SIGEV_NONE:
5dd41b01 26600@@ -478,7 +480,7 @@ static struct k_itimer * alloc_posix_timer(void)
e4b2b4a8
JK
26601
26602 static void k_itimer_rcu_free(struct rcu_head *head)
26603 {
26604- struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
26605+ struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
26606
26607 kmem_cache_free(posix_timers_cache, tmr);
26608 }
5dd41b01 26609@@ -495,7 +497,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
e4b2b4a8
JK
26610 }
26611 put_pid(tmr->it_pid);
26612 sigqueue_free(tmr->sigq);
26613- call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
26614+ call_rcu(&tmr->rcu, k_itimer_rcu_free);
26615 }
26616
26617 static int common_timer_create(struct k_itimer *new_timer)
5dd41b01 26618@@ -834,6 +836,22 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
e4b2b4a8
JK
26619 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
26620 }
26621
26622+/*
26623+ * Protected by RCU!
26624+ */
26625+static void timer_wait_for_callback(const struct k_clock *kc, struct k_itimer *timr)
26626+{
26627+#ifdef CONFIG_PREEMPT_RT_FULL
26628+ if (kc->timer_arm == common_hrtimer_arm)
26629+ hrtimer_wait_for_timer(&timr->it.real.timer);
26630+ else if (kc == &alarm_clock)
26631+ hrtimer_wait_for_timer(&timr->it.alarm.alarmtimer.timer);
26632+ else
26633+ /* FIXME: Whacky hack for posix-cpu-timers */
26634+ schedule_timeout(1);
1a6e0f06 26635+#endif
e4b2b4a8 26636+}
1a6e0f06 26637+
e4b2b4a8
JK
26638 static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
26639 {
26640 return hrtimer_try_to_cancel(&timr->it.real.timer);
5dd41b01 26641@@ -898,6 +916,7 @@ static int do_timer_settime(timer_t timer_id, int flags,
e4b2b4a8
JK
26642 if (!timr)
26643 return -EINVAL;
26644
26645+ rcu_read_lock();
26646 kc = timr->kclock;
26647 if (WARN_ON_ONCE(!kc || !kc->timer_set))
26648 error = -EINVAL;
5dd41b01 26649@@ -906,9 +925,12 @@ static int do_timer_settime(timer_t timer_id, int flags,
e4b2b4a8
JK
26650
26651 unlock_timer(timr, flag);
26652 if (error == TIMER_RETRY) {
26653+ timer_wait_for_callback(kc, timr);
26654 old_spec64 = NULL; // We already got the old time...
26655+ rcu_read_unlock();
26656 goto retry;
26657 }
26658+ rcu_read_unlock();
26659
26660 return error;
26661 }
5dd41b01 26662@@ -990,10 +1012,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
e4b2b4a8
JK
26663 if (!timer)
26664 return -EINVAL;
26665
26666+ rcu_read_lock();
26667 if (timer_delete_hook(timer) == TIMER_RETRY) {
26668 unlock_timer(timer, flags);
26669+ timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
26670+ timer);
26671+ rcu_read_unlock();
26672 goto retry_delete;
26673 }
26674+ rcu_read_unlock();
26675
26676 spin_lock(&current->sighand->siglock);
26677 list_del(&timer->list);
5dd41b01 26678@@ -1019,8 +1046,18 @@ static void itimer_delete(struct k_itimer *timer)
e4b2b4a8
JK
26679 retry_delete:
26680 spin_lock_irqsave(&timer->it_lock, flags);
26681
26682+ /* On RT we can race with a deletion */
26683+ if (!timer->it_signal) {
26684+ unlock_timer(timer, flags);
26685+ return;
26686+ }
26687+
26688 if (timer_delete_hook(timer) == TIMER_RETRY) {
26689+ rcu_read_lock();
26690 unlock_timer(timer, flags);
26691+ timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
26692+ timer);
26693+ rcu_read_unlock();
26694 goto retry_delete;
26695 }
26696 list_del(&timer->list);
b3bbd485
JK
26697diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
26698index 58045eb976c3..f0a34afbc252 100644
26699--- a/kernel/time/tick-broadcast-hrtimer.c
26700+++ b/kernel/time/tick-broadcast-hrtimer.c
26701@@ -106,7 +106,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
e4b2b4a8
JK
26702
26703 void tick_setup_hrtimer_broadcast(void)
1a6e0f06 26704 {
e4b2b4a8
JK
26705- hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26706+ hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26707 bctimer.function = bc_handler;
26708 clockevents_register_device(&ce_broadcast_hrtimer);
26709 }
b3bbd485
JK
26710diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
26711index 49edc1c4f3e6..7a87a4488a5e 100644
26712--- a/kernel/time/tick-common.c
26713+++ b/kernel/time/tick-common.c
26714@@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
e4b2b4a8
JK
26715 static void tick_periodic(int cpu)
26716 {
26717 if (tick_do_timer_cpu == cpu) {
26718- write_seqlock(&jiffies_lock);
26719+ raw_spin_lock(&jiffies_lock);
26720+ write_seqcount_begin(&jiffies_seq);
1a6e0f06 26721
e4b2b4a8
JK
26722 /* Keep track of the next tick event */
26723 tick_next_period = ktime_add(tick_next_period, tick_period);
1a6e0f06 26724
e4b2b4a8
JK
26725 do_timer(1);
26726- write_sequnlock(&jiffies_lock);
26727+ write_seqcount_end(&jiffies_seq);
26728+ raw_spin_unlock(&jiffies_lock);
26729 update_wall_time();
26730 }
1a6e0f06 26731
b3bbd485 26732@@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
e4b2b4a8 26733 ktime_t next;
1a6e0f06 26734
e4b2b4a8
JK
26735 do {
26736- seq = read_seqbegin(&jiffies_lock);
26737+ seq = read_seqcount_begin(&jiffies_seq);
26738 next = tick_next_period;
26739- } while (read_seqretry(&jiffies_lock, seq));
26740+ } while (read_seqcount_retry(&jiffies_seq, seq));
1a6e0f06 26741
e4b2b4a8
JK
26742 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
26743
b3bbd485
JK
26744@@ -490,6 +492,7 @@ void tick_freeze(void)
26745 if (tick_freeze_depth == num_online_cpus()) {
26746 trace_suspend_resume(TPS("timekeeping_freeze"),
26747 smp_processor_id(), true);
26748+ system_state = SYSTEM_SUSPEND;
26749 timekeeping_suspend();
26750 } else {
26751 tick_suspend_local();
26752@@ -513,6 +516,7 @@ void tick_unfreeze(void)
26753
26754 if (tick_freeze_depth == num_online_cpus()) {
26755 timekeeping_resume();
26756+ system_state = SYSTEM_RUNNING;
26757 trace_suspend_resume(TPS("timekeeping_freeze"),
26758 smp_processor_id(), false);
26759 } else {
26760diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
26761index f8e1845aa464..e277284c2831 100644
26762--- a/kernel/time/tick-internal.h
26763+++ b/kernel/time/tick-internal.h
26764@@ -150,16 +150,15 @@ static inline void tick_nohz_init(void) { }
e4b2b4a8
JK
26765
26766 #ifdef CONFIG_NO_HZ_COMMON
26767 extern unsigned long tick_nohz_active;
26768-#else
26769+extern void timers_update_nohz(void);
26770+# ifdef CONFIG_SMP
26771+extern struct static_key_false timers_migration_enabled;
26772+# endif
26773+#else /* CONFIG_NO_HZ_COMMON */
26774+static inline void timers_update_nohz(void) { }
26775 #define tick_nohz_active (0)
26776 #endif
26777
26778-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26779-extern void timers_update_migration(bool update_nohz);
26780-#else
26781-static inline void timers_update_migration(bool update_nohz) { }
26782-#endif
26783-
26784 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26785
26786 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
b3bbd485 26787diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
5dd41b01 26788index a8fa0a896b78..643b36a0b8e1 100644
b3bbd485
JK
26789--- a/kernel/time/tick-sched.c
26790+++ b/kernel/time/tick-sched.c
26791@@ -66,7 +66,8 @@ static void tick_do_update_jiffies64(ktime_t now)
1a6e0f06
JK
26792 return;
26793
e4b2b4a8
JK
26794 /* Reevaluate with jiffies_lock held */
26795- write_seqlock(&jiffies_lock);
26796+ raw_spin_lock(&jiffies_lock);
26797+ write_seqcount_begin(&jiffies_seq);
26798
26799 delta = ktime_sub(now, last_jiffies_update);
26800 if (delta >= tick_period) {
b3bbd485 26801@@ -89,10 +90,12 @@ static void tick_do_update_jiffies64(ktime_t now)
e4b2b4a8
JK
26802 /* Keep the tick_next_period variable up to date */
26803 tick_next_period = ktime_add(last_jiffies_update, tick_period);
26804 } else {
26805- write_sequnlock(&jiffies_lock);
26806+ write_seqcount_end(&jiffies_seq);
26807+ raw_spin_unlock(&jiffies_lock);
26808 return;
1a6e0f06 26809 }
e4b2b4a8
JK
26810- write_sequnlock(&jiffies_lock);
26811+ write_seqcount_end(&jiffies_seq);
26812+ raw_spin_unlock(&jiffies_lock);
26813 update_wall_time();
26814 }
26815
b3bbd485 26816@@ -103,12 +106,14 @@ static ktime_t tick_init_jiffy_update(void)
e4b2b4a8
JK
26817 {
26818 ktime_t period;
26819
26820- write_seqlock(&jiffies_lock);
26821+ raw_spin_lock(&jiffies_lock);
26822+ write_seqcount_begin(&jiffies_seq);
26823 /* Did we start the jiffies update yet ? */
26824 if (last_jiffies_update == 0)
26825 last_jiffies_update = tick_next_period;
26826 period = last_jiffies_update;
26827- write_sequnlock(&jiffies_lock);
26828+ write_seqcount_end(&jiffies_seq);
26829+ raw_spin_unlock(&jiffies_lock);
26830 return period;
1a6e0f06
JK
26831 }
26832
b3bbd485 26833@@ -225,6 +230,7 @@ static void nohz_full_kick_func(struct irq_work *work)
e4b2b4a8
JK
26834
26835 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
26836 .func = nohz_full_kick_func,
26837+ .flags = IRQ_WORK_HARD_IRQ,
26838 };
26839
1a6e0f06 26840 /*
b3bbd485 26841@@ -689,10 +695,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
1a6e0f06 26842
e4b2b4a8
JK
26843 /* Read jiffies and the time when jiffies were updated last */
26844 do {
26845- seq = read_seqbegin(&jiffies_lock);
26846+ seq = read_seqcount_begin(&jiffies_seq);
26847 basemono = last_jiffies_update;
26848 basejiff = jiffies;
26849- } while (read_seqretry(&jiffies_lock, seq));
26850+ } while (read_seqcount_retry(&jiffies_seq, seq));
26851 ts->last_jiffies = basejiff;
1a6e0f06 26852
e4b2b4a8 26853 /*
5dd41b01 26854@@ -906,14 +912,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
e4b2b4a8
JK
26855 return false;
26856
26857 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
26858- static int ratelimit;
26859-
b3a53f05 26860- if (ratelimit < 10 && !in_softirq() &&
e4b2b4a8
JK
26861- (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
26862- pr_warn("NOHZ: local_softirq_pending %02x\n",
26863- (unsigned int) local_softirq_pending());
26864- ratelimit++;
26865- }
26866+ softirq_check_pending_idle();
26867 return false;
1a6e0f06 26868 }
1a6e0f06 26869
b3bbd485 26870@@ -1132,7 +1131,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
e4b2b4a8
JK
26871 ts->nohz_mode = mode;
26872 /* One update is enough */
26873 if (!test_and_set_bit(0, &tick_nohz_active))
26874- timers_update_migration(true);
26875+ timers_update_nohz();
26876 }
1a6e0f06 26877
e4b2b4a8 26878 /**
b3bbd485 26879@@ -1250,7 +1249,7 @@ void tick_setup_sched_timer(void)
e4b2b4a8
JK
26880 /*
26881 * Emulate tick processing via per-CPU hrtimers:
26882 */
26883- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26884+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
26885 ts->sched_timer.function = tick_sched_timer;
26886
26887 /* Get the next period (per-CPU) */
b3bbd485
JK
26888diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
26889index 2cafb49aa65e..2720f2c29a6d 100644
26890--- a/kernel/time/timekeeping.c
26891+++ b/kernel/time/timekeeping.c
26892@@ -2326,8 +2326,10 @@ EXPORT_SYMBOL(hardpps);
e4b2b4a8
JK
26893 */
26894 void xtime_update(unsigned long ticks)
1a6e0f06 26895 {
e4b2b4a8
JK
26896- write_seqlock(&jiffies_lock);
26897+ raw_spin_lock(&jiffies_lock);
26898+ write_seqcount_begin(&jiffies_seq);
26899 do_timer(ticks);
26900- write_sequnlock(&jiffies_lock);
26901+ write_seqcount_end(&jiffies_seq);
26902+ raw_spin_unlock(&jiffies_lock);
26903 update_wall_time();
26904 }
b3bbd485
JK
26905diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
26906index c9f9af339914..0c0f52bf1927 100644
26907--- a/kernel/time/timekeeping.h
26908+++ b/kernel/time/timekeeping.h
26909@@ -18,7 +18,8 @@ extern void timekeeping_resume(void);
e4b2b4a8
JK
26910 extern void do_timer(unsigned long ticks);
26911 extern void update_wall_time(void);
1a6e0f06 26912
e4b2b4a8
JK
26913-extern seqlock_t jiffies_lock;
26914+extern raw_spinlock_t jiffies_lock;
26915+extern seqcount_t jiffies_seq;
1a6e0f06 26916
e4b2b4a8
JK
26917 #define CS_NAME_LEN 32
26918
b3bbd485
JK
26919diff --git a/kernel/time/timer.c b/kernel/time/timer.c
26920index f17c76a1a05f..5fadd754ce20 100644
26921--- a/kernel/time/timer.c
26922+++ b/kernel/time/timer.c
e4b2b4a8
JK
26923@@ -44,6 +44,7 @@
26924 #include <linux/sched/debug.h>
26925 #include <linux/slab.h>
26926 #include <linux/compat.h>
26927+#include <linux/swait.h>
26928
26929 #include <linux/uaccess.h>
26930 #include <asm/unistd.h>
b3bbd485 26931@@ -197,11 +198,12 @@ EXPORT_SYMBOL(jiffies_64);
e4b2b4a8
JK
26932 struct timer_base {
26933 raw_spinlock_t lock;
26934 struct timer_list *running_timer;
26935+#ifdef CONFIG_PREEMPT_RT_FULL
26936+ struct swait_queue_head wait_for_running_timer;
1a6e0f06 26937+#endif
e4b2b4a8
JK
26938 unsigned long clk;
26939 unsigned long next_expiry;
26940 unsigned int cpu;
26941- bool migration_enabled;
26942- bool nohz_active;
26943 bool is_idle;
26944 bool must_forward_clk;
26945 DECLARE_BITMAP(pending_map, WHEEL_SIZE);
b3bbd485 26946@@ -210,45 +212,73 @@ struct timer_base {
1a6e0f06 26947
e4b2b4a8 26948 static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
1a6e0f06 26949
e4b2b4a8
JK
26950-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
26951+#ifdef CONFIG_NO_HZ_COMMON
1a6e0f06 26952+
e4b2b4a8
JK
26953+static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
26954+static DEFINE_MUTEX(timer_keys_mutex);
26955+
26956+static struct swork_event timer_update_swork;
26957+
26958+#ifdef CONFIG_SMP
26959 unsigned int sysctl_timer_migration = 1;
1a6e0f06 26960
e4b2b4a8
JK
26961-void timers_update_migration(bool update_nohz)
26962+DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
1a6e0f06 26963+
e4b2b4a8
JK
26964+static void timers_update_migration(void)
26965 {
26966 bool on = sysctl_timer_migration && tick_nohz_active;
26967- unsigned int cpu;
1a6e0f06 26968
e4b2b4a8
JK
26969- /* Avoid the loop, if nothing to update */
26970- if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
26971- return;
26972+ if (on)
26973+ static_branch_enable(&timers_migration_enabled);
26974+ else
26975+ static_branch_disable(&timers_migration_enabled);
26976+}
26977+#else
26978+static inline void timers_update_migration(void) { }
26979+#endif /* !CONFIG_SMP */
26980
26981- for_each_possible_cpu(cpu) {
26982- per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
26983- per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
26984- per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
26985- if (!update_nohz)
26986- continue;
26987- per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
26988- per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
26989- per_cpu(hrtimer_bases.nohz_active, cpu) = true;
26990- }
26991+static void timer_update_keys(struct swork_event *event)
26992+{
26993+ mutex_lock(&timer_keys_mutex);
26994+ timers_update_migration();
26995+ static_branch_enable(&timers_nohz_active);
26996+ mutex_unlock(&timer_keys_mutex);
b3bbd485
JK
26997+}
26998+
e4b2b4a8
JK
26999+void timers_update_nohz(void)
27000+{
27001+ swork_queue(&timer_update_swork);
27002+}
1a6e0f06 27003+
e4b2b4a8 27004+static __init int hrtimer_init_thread(void)
1a6e0f06 27005+{
e4b2b4a8
JK
27006+ WARN_ON(swork_get());
27007+ INIT_SWORK(&timer_update_swork, timer_update_keys);
27008+ return 0;
b3bbd485 27009 }
e4b2b4a8 27010+early_initcall(hrtimer_init_thread);
b3bbd485 27011
e4b2b4a8
JK
27012 int timer_migration_handler(struct ctl_table *table, int write,
27013 void __user *buffer, size_t *lenp,
27014 loff_t *ppos)
27015 {
27016- static DEFINE_MUTEX(mutex);
27017 int ret;
27018
27019- mutex_lock(&mutex);
27020+ mutex_lock(&timer_keys_mutex);
27021 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
27022 if (!ret && write)
27023- timers_update_migration(false);
27024- mutex_unlock(&mutex);
27025+ timers_update_migration();
27026+ mutex_unlock(&timer_keys_mutex);
27027 return ret;
27028 }
27029-#endif
27030+
27031+static inline bool is_timers_nohz_active(void)
27032+{
27033+ return static_branch_unlikely(&timers_nohz_active);
1a6e0f06
JK
27034+}
27035+#else
e4b2b4a8
JK
27036+static inline bool is_timers_nohz_active(void) { return false; }
27037+#endif /* NO_HZ_COMMON */
27038
27039 static unsigned long round_jiffies_common(unsigned long j, int cpu,
27040 bool force_up)
b3bbd485 27041@@ -534,7 +564,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer)
e4b2b4a8
JK
27042 static void
27043 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
1a6e0f06 27044 {
e4b2b4a8
JK
27045- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
27046+ if (!is_timers_nohz_active())
27047 return;
1a6e0f06 27048
e4b2b4a8 27049 /*
b3bbd485 27050@@ -840,21 +870,20 @@ static inline struct timer_base *get_timer_base(u32 tflags)
e4b2b4a8
JK
27051 return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
27052 }
1a6e0f06 27053
e4b2b4a8
JK
27054-#ifdef CONFIG_NO_HZ_COMMON
27055 static inline struct timer_base *
27056 get_target_base(struct timer_base *base, unsigned tflags)
1a6e0f06 27057 {
e4b2b4a8
JK
27058-#ifdef CONFIG_SMP
27059- if ((tflags & TIMER_PINNED) || !base->migration_enabled)
27060- return get_timer_this_cpu_base(tflags);
27061- return get_timer_cpu_base(tflags, get_nohz_timer_target());
27062-#else
27063- return get_timer_this_cpu_base(tflags);
27064+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
27065+ if (static_branch_unlikely(&timers_migration_enabled) &&
27066+ !(tflags & TIMER_PINNED))
27067+ return get_timer_cpu_base(tflags, get_nohz_timer_target());
27068 #endif
27069+ return get_timer_this_cpu_base(tflags);
1a6e0f06 27070 }
1a6e0f06 27071
e4b2b4a8
JK
27072 static inline void forward_timer_base(struct timer_base *base)
27073 {
27074+#ifdef CONFIG_NO_HZ_COMMON
27075 unsigned long jnow;
1a6e0f06 27076
e4b2b4a8 27077 /*
b3bbd485 27078@@ -878,16 +907,8 @@ static inline void forward_timer_base(struct timer_base *base)
e4b2b4a8
JK
27079 base->clk = jnow;
27080 else
27081 base->clk = base->next_expiry;
27082-}
27083-#else
27084-static inline struct timer_base *
27085-get_target_base(struct timer_base *base, unsigned tflags)
27086-{
27087- return get_timer_this_cpu_base(tflags);
27088-}
27089-
27090-static inline void forward_timer_base(struct timer_base *base) { }
27091 #endif
27092+}
1a6e0f06 27093
1a6e0f06 27094
1a6e0f06 27095 /*
b3bbd485 27096@@ -1130,6 +1151,33 @@ void add_timer_on(struct timer_list *timer, int cpu)
1a6e0f06 27097 }
e4b2b4a8 27098 EXPORT_SYMBOL_GPL(add_timer_on);
1a6e0f06 27099
e4b2b4a8
JK
27100+#ifdef CONFIG_PREEMPT_RT_FULL
27101+/*
27102+ * Wait for a running timer
27103+ */
27104+static void wait_for_running_timer(struct timer_list *timer)
27105+{
27106+ struct timer_base *base;
27107+ u32 tf = timer->flags;
27108+
27109+ if (tf & TIMER_MIGRATING)
27110+ return;
27111+
27112+ base = get_timer_base(tf);
27113+ swait_event(base->wait_for_running_timer,
27114+ base->running_timer != timer);
27115+}
27116+
27117+# define wakeup_timer_waiters(b) swake_up_all(&(b)->wait_for_running_timer)
1a6e0f06 27118+#else
e4b2b4a8 27119+static inline void wait_for_running_timer(struct timer_list *timer)
1a6e0f06 27120+{
e4b2b4a8 27121+ cpu_relax();
1a6e0f06 27122+}
e4b2b4a8
JK
27123+
27124+# define wakeup_timer_waiters(b) do { } while (0)
1a6e0f06
JK
27125+#endif
27126+
e4b2b4a8
JK
27127 /**
27128 * del_timer - deactivate a timer.
27129 * @timer: the timer to be deactivated
b3bbd485 27130@@ -1185,7 +1233,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
e4b2b4a8
JK
27131 }
27132 EXPORT_SYMBOL(try_to_del_timer_sync);
27133
27134-#ifdef CONFIG_SMP
27135+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
27136 /**
27137 * del_timer_sync - deactivate a timer and wait for the handler to finish.
27138 * @timer: the timer to be deactivated
b3bbd485 27139@@ -1245,7 +1293,7 @@ int del_timer_sync(struct timer_list *timer)
e4b2b4a8
JK
27140 int ret = try_to_del_timer_sync(timer);
27141 if (ret >= 0)
27142 return ret;
27143- cpu_relax();
27144+ wait_for_running_timer(timer);
27145 }
27146 }
27147 EXPORT_SYMBOL(del_timer_sync);
b3bbd485 27148@@ -1309,13 +1357,16 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
e4b2b4a8
JK
27149 fn = timer->function;
27150 data = timer->data;
27151
27152- if (timer->flags & TIMER_IRQSAFE) {
27153+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
27154+ timer->flags & TIMER_IRQSAFE) {
27155 raw_spin_unlock(&base->lock);
27156 call_timer_fn(timer, fn, data);
27157+ base->running_timer = NULL;
27158 raw_spin_lock(&base->lock);
27159 } else {
27160 raw_spin_unlock_irq(&base->lock);
27161 call_timer_fn(timer, fn, data);
27162+ base->running_timer = NULL;
27163 raw_spin_lock_irq(&base->lock);
27164 }
27165 }
b3bbd485 27166@@ -1586,7 +1637,7 @@ void update_process_times(int user_tick)
e4b2b4a8 27167 account_process_tick(p, user_tick);
e4b2b4a8
JK
27168 run_local_timers();
27169 rcu_check_callbacks(user_tick);
27170-#ifdef CONFIG_IRQ_WORK
27171+#if defined(CONFIG_IRQ_WORK)
27172 if (in_irq())
27173 irq_work_tick();
27174 #endif
b3bbd485 27175@@ -1633,8 +1684,8 @@ static inline void __run_timers(struct timer_base *base)
e4b2b4a8
JK
27176 while (levels--)
27177 expire_timers(base, heads + levels);
27178 }
27179- base->running_timer = NULL;
27180 raw_spin_unlock_irq(&base->lock);
27181+ wakeup_timer_waiters(base);
27182 }
27183
1a6e0f06 27184 /*
b3bbd485 27185@@ -1644,6 +1695,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
e4b2b4a8
JK
27186 {
27187 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
27188
27189+ irq_work_tick_soft();
b3bbd485
JK
27190 __run_timers(base);
27191 if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
27192 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
27193@@ -1867,6 +1919,9 @@ static void __init init_timer_cpu(int cpu)
e4b2b4a8
JK
27194 base->cpu = cpu;
27195 raw_spin_lock_init(&base->lock);
27196 base->clk = jiffies;
27197+#ifdef CONFIG_PREEMPT_RT_FULL
27198+ init_swait_queue_head(&base->wait_for_running_timer);
1a6e0f06 27199+#endif
e4b2b4a8
JK
27200 }
27201 }
27202
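On PREEMPT_RT_FULL the timer hunks above make del_timer_sync() sleep on a per-base swait queue until the running callback has returned, instead of busy-waiting with cpu_relax(). A rough kernel-context sketch of that handshake; the demo_* names are hypothetical and the setup (lock and swait-queue init, as init_timer_cpu() does above) is assumed to have run already:

#include <linux/swait.h>
#include <linux/spinlock.h>

struct demo_base {
	raw_spinlock_t		lock;		/* assume raw_spin_lock_init() ran */
	void			*running;	/* callback currently executing */
	struct swait_queue_head	waiters;	/* assume init_swait_queue_head() ran */
};

/* expiry path: run the handler, clear the marker, wake the waiters */
static void demo_expire(struct demo_base *base, void *cb, void (*fn)(void))
{
	raw_spin_lock_irq(&base->lock);
	base->running = cb;
	raw_spin_unlock_irq(&base->lock);

	fn();					/* handler runs preemptibly on RT */

	raw_spin_lock_irq(&base->lock);
	base->running = NULL;
	raw_spin_unlock_irq(&base->lock);

	swake_up_all(&base->waiters);
}

/* cancel path: sleep until the handler we raced with is done */
static void demo_cancel_sync(struct demo_base *base, void *cb)
{
	swait_event(base->waiters, base->running != cb);
}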
b3bbd485
JK
27203diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
27204index 4ad6f6ca18c1..55d39a3fbdf7 100644
27205--- a/kernel/trace/Kconfig
27206+++ b/kernel/trace/Kconfig
27207@@ -585,7 +585,10 @@ config HIST_TRIGGERS
e4b2b4a8
JK
27208 event activity as an initial guide for further investigation
27209 using more advanced tools.
27210
27211- See Documentation/trace/events.txt.
27212+ Inter-event tracing of quantities such as latencies is also
27213+ supported using hist triggers under this option.
27214+
27215+ See Documentation/trace/histogram.txt.
27216 If in doubt, say N.
27217
27218 config MMIOTRACE_TEST
b3bbd485 27219diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
5dd41b01 27220index a1d5e0949dcf..e8ca1e01facd 100644
b3bbd485
JK
27221--- a/kernel/trace/ring_buffer.c
27222+++ b/kernel/trace/ring_buffer.c
27223@@ -41,6 +41,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
e4b2b4a8
JK
27224 RINGBUF_TYPE_PADDING);
27225 trace_seq_printf(s, "\ttime_extend : type == %d\n",
27226 RINGBUF_TYPE_TIME_EXTEND);
27227+ trace_seq_printf(s, "\ttime_stamp : type == %d\n",
27228+ RINGBUF_TYPE_TIME_STAMP);
27229 trace_seq_printf(s, "\tdata max type_len == %d\n",
27230 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
27231
b3bbd485 27232@@ -140,12 +142,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
e4b2b4a8
JK
27233
27234 enum {
27235 RB_LEN_TIME_EXTEND = 8,
27236- RB_LEN_TIME_STAMP = 16,
27237+ RB_LEN_TIME_STAMP = 8,
27238 };
27239
27240 #define skip_time_extend(event) \
27241 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
27242
27243+#define extended_time(event) \
27244+ (event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
27245+
27246 static inline int rb_null_event(struct ring_buffer_event *event)
1a6e0f06 27247 {
e4b2b4a8 27248 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
b3bbd485 27249@@ -209,7 +214,7 @@ rb_event_ts_length(struct ring_buffer_event *event)
e4b2b4a8
JK
27250 {
27251 unsigned len = 0;
1a6e0f06 27252
e4b2b4a8
JK
27253- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
27254+ if (extended_time(event)) {
27255 /* time extends include the data event after it */
27256 len = RB_LEN_TIME_EXTEND;
27257 event = skip_time_extend(event);
b3bbd485 27258@@ -231,7 +236,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
e4b2b4a8
JK
27259 {
27260 unsigned length;
1a6e0f06 27261
e4b2b4a8
JK
27262- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
27263+ if (extended_time(event))
27264 event = skip_time_extend(event);
27265
27266 length = rb_event_length(event);
b3bbd485 27267@@ -248,7 +253,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
e4b2b4a8
JK
27268 static __always_inline void *
27269 rb_event_data(struct ring_buffer_event *event)
1a6e0f06 27270 {
e4b2b4a8
JK
27271- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
27272+ if (extended_time(event))
27273 event = skip_time_extend(event);
27274 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
27275 /* If length is in len field, then array[0] has the data */
b3bbd485 27276@@ -275,6 +280,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
e4b2b4a8
JK
27277 #define TS_MASK ((1ULL << TS_SHIFT) - 1)
27278 #define TS_DELTA_TEST (~TS_MASK)
1a6e0f06 27279
e4b2b4a8
JK
27280+/**
27281+ * ring_buffer_event_time_stamp - return the event's extended timestamp
27282+ * @event: the event to get the timestamp of
27283+ *
27284+ * Returns the extended timestamp associated with a data event.
27285+ * An extended time_stamp is a 64-bit timestamp represented
27286+ * internally in a special way that makes the best use of space
27287+ * contained within a ring buffer event. This function decodes
27288+ * it and maps it to a straight u64 value.
27289+ */
27290+u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
1a6e0f06 27291+{
e4b2b4a8 27292+ u64 ts;
1a6e0f06 27293+
e4b2b4a8
JK
27294+ ts = event->array[0];
27295+ ts <<= TS_SHIFT;
27296+ ts += event->time_delta;
1a6e0f06 27297+
e4b2b4a8
JK
27298+ return ts;
27299+}
27300+
27301 /* Flag when events were overwritten */
27302 #define RB_MISSED_EVENTS (1 << 31)
27303 /* Missed count stored at end */
b3bbd485 27304@@ -451,6 +477,7 @@ struct ring_buffer_per_cpu {
e4b2b4a8
JK
27305 struct buffer_page *reader_page;
27306 unsigned long lost_events;
27307 unsigned long last_overrun;
27308+ unsigned long nest;
27309 local_t entries_bytes;
27310 local_t entries;
27311 local_t overrun;
b3bbd485 27312@@ -488,6 +515,7 @@ struct ring_buffer {
e4b2b4a8
JK
27313 u64 (*clock)(void);
27314
27315 struct rb_irq_work irq_work;
27316+ bool time_stamp_abs;
27317 };
27318
27319 struct ring_buffer_iter {
b3bbd485 27320@@ -1387,6 +1415,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
e4b2b4a8 27321 buffer->clock = clock;
1a6e0f06
JK
27322 }
27323
e4b2b4a8 27324+void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
1a6e0f06 27325+{
e4b2b4a8 27326+ buffer->time_stamp_abs = abs;
1a6e0f06
JK
27327+}
27328+
e4b2b4a8 27329+bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
1a6e0f06 27330+{
e4b2b4a8 27331+ return buffer->time_stamp_abs;
1a6e0f06
JK
27332+}
27333+
e4b2b4a8
JK
27334 static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
27335
27336 static inline unsigned long rb_page_entries(struct buffer_page *bpage)
5dd41b01 27337@@ -2219,12 +2257,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
e4b2b4a8
JK
27338
27339 /* Slow path, do not inline */
27340 static noinline struct ring_buffer_event *
27341-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
27342+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
27343 {
27344- event->type_len = RINGBUF_TYPE_TIME_EXTEND;
27345+ if (abs)
27346+ event->type_len = RINGBUF_TYPE_TIME_STAMP;
27347+ else
27348+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
27349
27350- /* Not the first event on the page? */
27351- if (rb_event_index(event)) {
27352+ /* Not the first event on the page, or not delta? */
27353+ if (abs || rb_event_index(event)) {
27354 event->time_delta = delta & TS_MASK;
27355 event->array[0] = delta >> TS_SHIFT;
27356 } else {
5dd41b01 27357@@ -2267,7 +2308,9 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
e4b2b4a8
JK
27358 * add it to the start of the reserved space.
27359 */
27360 if (unlikely(info->add_timestamp)) {
27361- event = rb_add_time_stamp(event, delta);
27362+ bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
27363+
27364+ event = rb_add_time_stamp(event, info->delta, abs);
27365 length -= RB_LEN_TIME_EXTEND;
27366 delta = 0;
27367 }
5dd41b01 27368@@ -2455,7 +2498,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer
e4b2b4a8
JK
27369
27370 static inline void rb_event_discard(struct ring_buffer_event *event)
27371 {
27372- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
27373+ if (extended_time(event))
27374 event = skip_time_extend(event);
27375
27376 /* array[0] holds the actual length for the discarded event */
5dd41b01 27377@@ -2499,10 +2542,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
e4b2b4a8
JK
27378 cpu_buffer->write_stamp =
27379 cpu_buffer->commit_page->page->time_stamp;
27380 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
27381- delta = event->array[0];
27382- delta <<= TS_SHIFT;
27383- delta += event->time_delta;
27384+ delta = ring_buffer_event_time_stamp(event);
27385 cpu_buffer->write_stamp += delta;
27386+ } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
27387+ delta = ring_buffer_event_time_stamp(event);
27388+ cpu_buffer->write_stamp = delta;
27389 } else
27390 cpu_buffer->write_stamp += event->time_delta;
27391 }
5dd41b01 27392@@ -2585,22 +2629,19 @@ static __always_inline int
e4b2b4a8
JK
27393 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
27394 {
27395 unsigned int val = cpu_buffer->current_context;
27396+ unsigned long pc = preempt_count();
27397 int bit;
27398
27399- if (in_interrupt()) {
27400- if (in_nmi())
27401- bit = RB_CTX_NMI;
27402- else if (in_irq())
27403- bit = RB_CTX_IRQ;
27404- else
27405- bit = RB_CTX_SOFTIRQ;
27406- } else
27407+ if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
27408 bit = RB_CTX_NORMAL;
27409+ else
27410+ bit = pc & NMI_MASK ? RB_CTX_NMI :
27411+ pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
27412
27413- if (unlikely(val & (1 << bit)))
27414+ if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
27415 return 1;
27416
27417- val |= (1 << bit);
27418+ val |= (1 << (bit + cpu_buffer->nest));
27419 cpu_buffer->current_context = val;
27420
27421 return 0;
5dd41b01 27422@@ -2609,7 +2650,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
e4b2b4a8
JK
27423 static __always_inline void
27424 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
27425 {
27426- cpu_buffer->current_context &= cpu_buffer->current_context - 1;
27427+ cpu_buffer->current_context &=
27428+ cpu_buffer->current_context - (1 << cpu_buffer->nest);
27429+}
27430+
27431+/* The recursive locking above uses 4 bits */
27432+#define NESTED_BITS 4
27433+
27434+/**
27435+ * ring_buffer_nest_start - Allow to trace while nested
27436+ * @buffer: The ring buffer to modify
27437+ *
27438+ * The ring buffer has a safety mechanism to prevent recursion.
27439+ * But there may be a case where a trace needs to be done while
27440+ * tracing something else. In this case, calling this function
27441+ * will allow this function to nest within a currently active
27442+ * ring_buffer_lock_reserve().
27443+ *
27444+ * Call this function before calling another ring_buffer_lock_reserve() and
27445+ * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
1a6e0f06 27446+ */
e4b2b4a8 27447+void ring_buffer_nest_start(struct ring_buffer *buffer)
1a6e0f06 27448+{
e4b2b4a8
JK
27449+ struct ring_buffer_per_cpu *cpu_buffer;
27450+ int cpu;
1a6e0f06 27451+
e4b2b4a8
JK
27452+ /* Enabled by ring_buffer_nest_end() */
27453+ preempt_disable_notrace();
27454+ cpu = raw_smp_processor_id();
27455+ cpu_buffer = buffer->buffers[cpu];
27456+ /* This is the shift value for the above recursive locking */
27457+ cpu_buffer->nest += NESTED_BITS;
1a6e0f06
JK
27458+}
27459+
e4b2b4a8
JK
27460+/**
27461+ * ring_buffer_nest_end - Allow to trace while nested
27462+ * @buffer: The ring buffer to modify
27463+ *
27464+ * Must be called after ring_buffer_nest_start() and after the
27465+ * ring_buffer_unlock_commit().
1a6e0f06 27466+ */
e4b2b4a8 27467+void ring_buffer_nest_end(struct ring_buffer *buffer)
1a6e0f06 27468+{
e4b2b4a8 27469+ struct ring_buffer_per_cpu *cpu_buffer;
1a6e0f06
JK
27470+ int cpu;
27471+
e4b2b4a8
JK
27472+ /* disabled by ring_buffer_nest_start() */
27473+ cpu = raw_smp_processor_id();
27474+ cpu_buffer = buffer->buffers[cpu];
27476+ /* This is the shift value for the above recursive locking */
27476+ cpu_buffer->nest -= NESTED_BITS;
27477+ preempt_enable_notrace();
27478 }
27479
27480 /**
5dd41b01 27481@@ -2685,7 +2776,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
e4b2b4a8
JK
27482 * If this is the first commit on the page, then it has the same
27483 * timestamp as the page itself.
27484 */
27485- if (!tail)
27486+ if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
27487 info->delta = 0;
27488
27489 /* See if we shot pass the end of this buffer page */
5dd41b01 27490@@ -2762,8 +2853,11 @@ rb_reserve_next_event(struct ring_buffer *buffer,
e4b2b4a8
JK
27491 /* make sure this diff is calculated here */
27492 barrier();
27493
27494- /* Did the write stamp get updated already? */
27495- if (likely(info.ts >= cpu_buffer->write_stamp)) {
27496+ if (ring_buffer_time_stamp_abs(buffer)) {
27497+ info.delta = info.ts;
27498+ rb_handle_timestamp(cpu_buffer, &info);
27499+ } else /* Did the write stamp get updated already? */
27500+ if (likely(info.ts >= cpu_buffer->write_stamp)) {
27501 info.delta = diff;
27502 if (unlikely(test_time_stamp(info.delta)))
27503 rb_handle_timestamp(cpu_buffer, &info);
5dd41b01 27504@@ -3461,14 +3555,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
e4b2b4a8
JK
27505 return;
27506
27507 case RINGBUF_TYPE_TIME_EXTEND:
27508- delta = event->array[0];
27509- delta <<= TS_SHIFT;
27510- delta += event->time_delta;
27511+ delta = ring_buffer_event_time_stamp(event);
27512 cpu_buffer->read_stamp += delta;
27513 return;
27514
27515 case RINGBUF_TYPE_TIME_STAMP:
27516- /* FIXME: not implemented */
27517+ delta = ring_buffer_event_time_stamp(event);
27518+ cpu_buffer->read_stamp = delta;
27519 return;
27520
27521 case RINGBUF_TYPE_DATA:
5dd41b01 27522@@ -3492,14 +3585,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
e4b2b4a8
JK
27523 return;
27524
27525 case RINGBUF_TYPE_TIME_EXTEND:
27526- delta = event->array[0];
27527- delta <<= TS_SHIFT;
27528- delta += event->time_delta;
27529+ delta = ring_buffer_event_time_stamp(event);
27530 iter->read_stamp += delta;
27531 return;
27532
27533 case RINGBUF_TYPE_TIME_STAMP:
27534- /* FIXME: not implemented */
27535+ delta = ring_buffer_event_time_stamp(event);
27536+ iter->read_stamp = delta;
27537 return;
27538
27539 case RINGBUF_TYPE_DATA:
5dd41b01 27540@@ -3723,6 +3815,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
e4b2b4a8
JK
27541 struct buffer_page *reader;
27542 int nr_loops = 0;
27543
27544+ if (ts)
27545+ *ts = 0;
27546 again:
27547 /*
27548 * We repeat when a time extend is encountered.
5dd41b01 27549@@ -3759,12 +3853,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
e4b2b4a8
JK
27550 goto again;
27551
27552 case RINGBUF_TYPE_TIME_STAMP:
27553- /* FIXME: not implemented */
27554+ if (ts) {
27555+ *ts = ring_buffer_event_time_stamp(event);
27556+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
27557+ cpu_buffer->cpu, ts);
27558+ }
27559+ /* Internal data, OK to advance */
27560 rb_advance_reader(cpu_buffer);
27561 goto again;
27562
27563 case RINGBUF_TYPE_DATA:
27564- if (ts) {
27565+ if (ts && !(*ts)) {
27566 *ts = cpu_buffer->read_stamp + event->time_delta;
27567 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
27568 cpu_buffer->cpu, ts);
5dd41b01 27569@@ -3789,6 +3888,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
e4b2b4a8
JK
27570 struct ring_buffer_event *event;
27571 int nr_loops = 0;
27572
27573+ if (ts)
27574+ *ts = 0;
27575+
27576 cpu_buffer = iter->cpu_buffer;
27577 buffer = cpu_buffer->buffer;
27578
5dd41b01 27579@@ -3841,12 +3943,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
e4b2b4a8
JK
27580 goto again;
27581
27582 case RINGBUF_TYPE_TIME_STAMP:
27583- /* FIXME: not implemented */
27584+ if (ts) {
27585+ *ts = ring_buffer_event_time_stamp(event);
27586+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
27587+ cpu_buffer->cpu, ts);
27588+ }
27589+ /* Internal data, OK to advance */
27590 rb_advance_iter(iter);
27591 goto again;
27592
27593 case RINGBUF_TYPE_DATA:
27594- if (ts) {
27595+ if (ts && !(*ts)) {
27596 *ts = iter->read_stamp + event->time_delta;
27597 ring_buffer_normalize_time_stamp(buffer,
27598 cpu_buffer->cpu, ts);
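ring_buffer_event_time_stamp() above reassembles an absolute timestamp from the low bits kept in time_delta and the high bits kept in array[0]; TS_SHIFT is 27 in ring_buffer.c. A small stand-alone C sketch of that packing, with simplified field types chosen only for illustration:

#include <stdint.h>
#include <stdio.h>

#define TS_SHIFT 27
#define TS_MASK  ((1ULL << TS_SHIFT) - 1)

struct demo_event {
	uint32_t time_delta;	/* low TS_SHIFT bits of the timestamp */
	uint64_t array0;	/* high bits, i.e. event->array[0] */
};

/* what rb_add_time_stamp() does for an absolute RINGBUF_TYPE_TIME_STAMP */
static void demo_pack(struct demo_event *e, uint64_t ts)
{
	e->time_delta = ts & TS_MASK;
	e->array0     = ts >> TS_SHIFT;
}

/* what ring_buffer_event_time_stamp() does when reading it back */
static uint64_t demo_unpack(const struct demo_event *e)
{
	return (e->array0 << TS_SHIFT) + e->time_delta;
}

int main(void)
{
	struct demo_event e;
	uint64_t ts = 123456789012345ULL;	/* e.g. nanoseconds */

	demo_pack(&e, ts);
	printf("%llu -> %llu\n", (unsigned long long)ts,
	       (unsigned long long)demo_unpack(&e));
	return 0;
}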
b3bbd485
JK
27599diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
27600index e9cbb96cd99e..4fc60e5ec4b9 100644
27601--- a/kernel/trace/trace.c
27602+++ b/kernel/trace/trace.c
27603@@ -1170,6 +1170,14 @@ static struct {
e4b2b4a8
JK
27604 ARCH_TRACE_CLOCKS
27605 };
27606
27607+bool trace_clock_in_ns(struct trace_array *tr)
27608+{
27609+ if (trace_clocks[tr->clock_id].in_ns)
27610+ return true;
27611+
27612+ return false;
1a6e0f06 27613+}
1a6e0f06
JK
27614+
27615 /*
e4b2b4a8 27616 * trace_parser_get_init - gets the buffer for trace parser
1a6e0f06 27617 */
b3bbd485 27618@@ -2127,6 +2135,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
e4b2b4a8
JK
27619 struct task_struct *tsk = current;
27620
27621 entry->preempt_count = pc & 0xff;
27622+ entry->preempt_lazy_count = preempt_lazy_count();
27623 entry->pid = (tsk) ? tsk->pid : 0;
27624 entry->flags =
27625 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
b3bbd485 27626@@ -2137,8 +2146,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
e4b2b4a8
JK
27627 ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
27628 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
27629 ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
27630- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
27631+ (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
27632+ (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
27633 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
27634+
27635+ entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
1a6e0f06 27636 }
e4b2b4a8 27637 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
1a6e0f06 27638
b3bbd485 27639@@ -2275,7 +2287,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
e4b2b4a8
JK
27640
27641 *current_rb = trace_file->tr->trace_buffer.buffer;
27642
27643- if ((trace_file->flags &
27644+ if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
27645 (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
27646 (entry = this_cpu_read(trace_buffered_event))) {
27647 /* Try to use the per cpu buffer first */
b3bbd485 27648@@ -3342,14 +3354,17 @@ get_total_entries(struct trace_buffer *buf,
e4b2b4a8
JK
27649
27650 static void print_lat_help_header(struct seq_file *m)
27651 {
27652- seq_puts(m, "# _------=> CPU# \n"
27653- "# / _-----=> irqs-off \n"
27654- "# | / _----=> need-resched \n"
27655- "# || / _---=> hardirq/softirq \n"
27656- "# ||| / _--=> preempt-depth \n"
27657- "# |||| / delay \n"
27658- "# cmd pid ||||| time | caller \n"
27659- "# \\ / ||||| \\ | / \n");
27660+ seq_puts(m, "# _--------=> CPU# \n"
27661+ "# / _-------=> irqs-off \n"
27662+ "# | / _------=> need-resched \n"
27663+ "# || / _-----=> need-resched_lazy \n"
27664+ "# ||| / _----=> hardirq/softirq \n"
27665+ "# |||| / _---=> preempt-depth \n"
27666+ "# ||||| / _--=> preempt-lazy-depth\n"
27667+ "# |||||| / _-=> migrate-disable \n"
27668+ "# ||||||| / delay \n"
27669+ "# cmd pid |||||||| time | caller \n"
27670+ "# \\ / |||||||| \\ | / \n");
1a6e0f06 27671 }
1a6e0f06 27672
e4b2b4a8 27673 static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
b3bbd485 27674@@ -3385,15 +3400,17 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
e4b2b4a8
JK
27675 tgid ? tgid_space : space);
27676 seq_printf(m, "# %s / _----=> need-resched\n",
27677 tgid ? tgid_space : space);
27678- seq_printf(m, "# %s| / _---=> hardirq/softirq\n",
27679+ seq_printf(m, "# %s| / _----=> need-resched_lazy\n",
b3bbd485
JK
27680+ tgid ? tgid_space : space);
27681+ seq_printf(m, "# %s|| / _---=> hardirq/softirq\n",
e4b2b4a8
JK
27682 tgid ? tgid_space : space);
27683- seq_printf(m, "# %s|| / _--=> preempt-depth\n",
b3bbd485 27684+ seq_printf(m, "# %s||| / _--=> preempt-depth\n",
e4b2b4a8
JK
27685 tgid ? tgid_space : space);
27686- seq_printf(m, "# %s||| / delay\n",
b3bbd485 27687+ seq_printf(m, "# %s|||| / delay\n",
e4b2b4a8
JK
27688 tgid ? tgid_space : space);
27689- seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n",
e4b2b4a8
JK
27690+ seq_printf(m, "# TASK-PID %sCPU# ||||| TIMESTAMP FUNCTION\n",
27691 tgid ? " TGID " : space);
27692- seq_printf(m, "# | | %s | |||| | |\n",
27693+ seq_printf(m, "# | | %s | ||||| | |\n",
27694 tgid ? " | " : space);
27695 }
27696
b3bbd485 27697@@ -4531,6 +4548,9 @@ static const char readme_msg[] =
e4b2b4a8
JK
27698 #ifdef CONFIG_X86_64
27699 " x86-tsc: TSC cycle counter\n"
27700 #endif
27701+ "\n timestamp_mode\t-view the mode used to timestamp events\n"
27702+ " delta: Delta difference against a buffer-wide timestamp\n"
27703+ " absolute: Absolute (standalone) timestamp\n"
27704 "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
27705 "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
27706 " tracing_cpumask\t- Limit which CPUs to trace\n"
b3bbd485 27707@@ -4707,8 +4727,9 @@ static const char readme_msg[] =
e4b2b4a8
JK
27708 "\t .sym display an address as a symbol\n"
27709 "\t .sym-offset display an address as a symbol and offset\n"
27710 "\t .execname display a common_pid as a program name\n"
27711- "\t .syscall display a syscall id as a syscall name\n\n"
27712- "\t .log2 display log2 value rather than raw number\n\n"
27713+ "\t .syscall display a syscall id as a syscall name\n"
27714+ "\t .log2 display log2 value rather than raw number\n"
27715+ "\t .usecs display a common_timestamp in microseconds\n\n"
27716 "\t The 'pause' parameter can be used to pause an existing hist\n"
27717 "\t trigger or to start a hist trigger but not log any events\n"
27718 "\t until told to do so. 'continue' can be used to start or\n"
b3bbd485 27719@@ -6218,7 +6239,7 @@ static int tracing_clock_show(struct seq_file *m, void *v)
e4b2b4a8 27720 return 0;
1a6e0f06 27721 }
1a6e0f06 27722
e4b2b4a8
JK
27723-static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27724+int tracing_set_clock(struct trace_array *tr, const char *clockstr)
27725 {
27726 int i;
27727
b3bbd485 27728@@ -6298,6 +6319,71 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
e4b2b4a8 27729 return ret;
1a6e0f06 27730 }
e4b2b4a8
JK
27731
27732+static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
27733+{
27734+ struct trace_array *tr = m->private;
27735+
27736+ mutex_lock(&trace_types_lock);
27737+
27738+ if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
27739+ seq_puts(m, "delta [absolute]\n");
27740+ else
27741+ seq_puts(m, "[delta] absolute\n");
27742+
27743+ mutex_unlock(&trace_types_lock);
27744+
27745+ return 0;
27746+}
27747+
27748+static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
27749+{
27750+ struct trace_array *tr = inode->i_private;
27751+ int ret;
27752+
27753+ if (tracing_disabled)
27754+ return -ENODEV;
27755+
27756+ if (trace_array_get(tr))
27757+ return -ENODEV;
27758+
27759+ ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
27760+ if (ret < 0)
27761+ trace_array_put(tr);
27762+
27763+ return ret;
27764+}
27765+
27766+int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
27767+{
27768+ int ret = 0;
27769+
27770+ mutex_lock(&trace_types_lock);
27771+
27772+ if (abs && tr->time_stamp_abs_ref++)
27773+ goto out;
27774+
27775+ if (!abs) {
27776+ if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
27777+ ret = -EINVAL;
27778+ goto out;
27779+ }
27780+
27781+ if (--tr->time_stamp_abs_ref)
27782+ goto out;
27783+ }
27784+
27785+ ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
27786+
27787+#ifdef CONFIG_TRACER_MAX_TRACE
27788+ if (tr->max_buffer.buffer)
27789+ ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
1a6e0f06 27790+#endif
e4b2b4a8
JK
27791+ out:
27792+ mutex_unlock(&trace_types_lock);
27793+
27794+ return ret;
27795+}
27796+
27797 struct ftrace_buffer_info {
27798 struct trace_iterator iter;
27799 void *spare;
b3bbd485 27800@@ -6545,6 +6631,13 @@ static const struct file_operations trace_clock_fops = {
e4b2b4a8
JK
27801 .write = tracing_clock_write,
27802 };
1a6e0f06 27803
e4b2b4a8
JK
27804+static const struct file_operations trace_time_stamp_mode_fops = {
27805+ .open = tracing_time_stamp_mode_open,
27806+ .read = seq_read,
27807+ .llseek = seq_lseek,
27808+ .release = tracing_single_release_tr,
27809+};
27810+
27811 #ifdef CONFIG_TRACER_SNAPSHOT
27812 static const struct file_operations snapshot_fops = {
27813 .open = tracing_snapshot_open,
b3bbd485 27814@@ -7684,6 +7777,7 @@ static int instance_mkdir(const char *name)
e4b2b4a8
JK
27815 struct trace_array *tr;
27816 int ret;
1a6e0f06 27817
e4b2b4a8
JK
27818+ mutex_lock(&event_mutex);
27819 mutex_lock(&trace_types_lock);
1a6e0f06 27820
e4b2b4a8 27821 ret = -EEXIST;
b3bbd485 27822@@ -7716,6 +7810,7 @@ static int instance_mkdir(const char *name)
1a6e0f06 27823
e4b2b4a8
JK
27824 INIT_LIST_HEAD(&tr->systems);
27825 INIT_LIST_HEAD(&tr->events);
27826+ INIT_LIST_HEAD(&tr->hist_vars);
1a6e0f06 27827
e4b2b4a8
JK
27828 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
27829 goto out_free_tr;
b3bbd485 27830@@ -7739,6 +7834,7 @@ static int instance_mkdir(const char *name)
e4b2b4a8 27831 list_add(&tr->list, &ftrace_trace_arrays);
1a6e0f06 27832
e4b2b4a8
JK
27833 mutex_unlock(&trace_types_lock);
27834+ mutex_unlock(&event_mutex);
1a6e0f06 27835
e4b2b4a8 27836 return 0;
1a6e0f06 27837
b3bbd485 27838@@ -7750,6 +7846,7 @@ static int instance_mkdir(const char *name)
1a6e0f06 27839
e4b2b4a8
JK
27840 out_unlock:
27841 mutex_unlock(&trace_types_lock);
27842+ mutex_unlock(&event_mutex);
1a6e0f06 27843
e4b2b4a8 27844 return ret;
1a6e0f06 27845
b3bbd485 27846@@ -7762,6 +7859,7 @@ static int instance_rmdir(const char *name)
e4b2b4a8
JK
27847 int ret;
27848 int i;
1a6e0f06 27849
e4b2b4a8
JK
27850+ mutex_lock(&event_mutex);
27851 mutex_lock(&trace_types_lock);
1a6e0f06 27852
e4b2b4a8 27853 ret = -ENODEV;
b3bbd485 27854@@ -7807,6 +7905,7 @@ static int instance_rmdir(const char *name)
1a6e0f06 27855
e4b2b4a8
JK
27856 out_unlock:
27857 mutex_unlock(&trace_types_lock);
27858+ mutex_unlock(&event_mutex);
1a6e0f06 27859
e4b2b4a8
JK
27860 return ret;
27861 }
b3bbd485 27862@@ -7864,6 +7963,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
e4b2b4a8
JK
27863 trace_create_file("tracing_on", 0644, d_tracer,
27864 tr, &rb_simple_fops);
1a6e0f06 27865
e4b2b4a8
JK
27866+ trace_create_file("timestamp_mode", 0444, d_tracer, tr,
27867+ &trace_time_stamp_mode_fops);
1a6e0f06 27868+
e4b2b4a8 27869 create_trace_options_dir(tr);
1a6e0f06 27870
e4b2b4a8 27871 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
b3bbd485 27872@@ -8275,6 +8377,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
1a6e0f06 27873 }
e4b2b4a8 27874 EXPORT_SYMBOL_GPL(ftrace_dump);
1a6e0f06 27875
e4b2b4a8 27876+int trace_run_command(const char *buf, int (*createfn)(int, char **))
1a6e0f06 27877+{
e4b2b4a8
JK
27878+ char **argv;
27879+ int argc, ret;
1a6e0f06 27880+
e4b2b4a8
JK
27881+ argc = 0;
27882+ ret = 0;
27883+ argv = argv_split(GFP_KERNEL, buf, &argc);
27884+ if (!argv)
27885+ return -ENOMEM;
27886+
27887+ if (argc)
27888+ ret = createfn(argc, argv);
27889+
27890+ argv_free(argv);
27891+
27892+ return ret;
1a6e0f06
JK
27893+}
27894+
e4b2b4a8
JK
27895+#define WRITE_BUFSIZE 4096
27896+
27897+ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
27898+ size_t count, loff_t *ppos,
27899+ int (*createfn)(int, char **))
27900+{
27901+ char *kbuf, *buf, *tmp;
27902+ int ret = 0;
27903+ size_t done = 0;
27904+ size_t size;
27905+
27906+ kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
27907+ if (!kbuf)
27908+ return -ENOMEM;
27909+
27910+ while (done < count) {
27911+ size = count - done;
27912+
27913+ if (size >= WRITE_BUFSIZE)
27914+ size = WRITE_BUFSIZE - 1;
27915+
27916+ if (copy_from_user(kbuf, buffer + done, size)) {
27917+ ret = -EFAULT;
27918+ goto out;
27919+ }
27920+ kbuf[size] = '\0';
27921+ buf = kbuf;
27922+ do {
27923+ tmp = strchr(buf, '\n');
27924+ if (tmp) {
27925+ *tmp = '\0';
27926+ size = tmp - buf + 1;
27927+ } else {
27928+ size = strlen(buf);
27929+ if (done + size < count) {
27930+ if (buf != kbuf)
27931+ break;
27932+ /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
27933+ pr_warn("Line length is too long: Should be less than %d\n",
27934+ WRITE_BUFSIZE - 2);
27935+ ret = -EINVAL;
27936+ goto out;
27937+ }
27938+ }
27939+ done += size;
27940+
27941+ /* Remove comments */
27942+ tmp = strchr(buf, '#');
27943+
27944+ if (tmp)
27945+ *tmp = '\0';
27946+
27947+ ret = trace_run_command(buf, createfn);
27948+ if (ret)
27949+ goto out;
27950+ buf += size;
27951+
27952+ } while (done < count);
27953+ }
27954+ ret = done;
27955+
27956+out:
27957+ kfree(kbuf);
27958+
27959+ return ret;
27960+}
27961+
27962 __init static int tracer_alloc_buffers(void)
27963 {
27964 int ring_buf_size;
b3bbd485 27965@@ -8375,6 +8563,7 @@ __init static int tracer_alloc_buffers(void)
e4b2b4a8
JK
27966
27967 INIT_LIST_HEAD(&global_trace.systems);
27968 INIT_LIST_HEAD(&global_trace.events);
27969+ INIT_LIST_HEAD(&global_trace.hist_vars);
27970 list_add(&global_trace.list, &ftrace_trace_arrays);
27971
27972 apply_trace_boot_options();
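tracing_set_time_stamp_abs() above reference-counts users of absolute timestamps: only the first enable and the last disable actually flip the ring buffer's mode, intermediate callers just adjust the count. A stand-alone sketch of that counting logic, with invented demo names rather than the patch's own symbols:

#include <stdbool.h>
#include <stdio.h>

static int  abs_ref;	/* how many users asked for absolute timestamps */
static bool abs_mode;	/* stand-in for ring_buffer_set_time_stamp_abs() */

static int demo_set_abs(bool abs)
{
	if (abs && abs_ref++)		/* not the first user: count only */
		return 0;

	if (!abs) {
		if (abs_ref == 0)	/* unbalanced disable */
			return -1;
		if (--abs_ref)		/* other users remain: count only */
			return 0;
	}

	abs_mode = abs;			/* first enable or last disable */
	printf("timestamp mode: %s\n", abs_mode ? "absolute" : "delta");
	return 0;
}

int main(void)
{
	demo_set_abs(true);		/* flips to absolute */
	demo_set_abs(true);		/* refcount only */
	demo_set_abs(false);		/* refcount only */
	demo_set_abs(false);		/* flips back to delta */
	return 0;
}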
b3bbd485
JK
27973diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
27974index 851cd1605085..18bf383f46e8 100644
27975--- a/kernel/trace/trace.h
27976+++ b/kernel/trace/trace.h
27977@@ -127,6 +127,7 @@ struct kretprobe_trace_entry_head {
27978 * NEED_RESCHED - reschedule is requested
27979 * HARDIRQ - inside an interrupt handler
27980 * SOFTIRQ - inside a softirq handler
27981+ * NEED_RESCHED_LAZY - lazy reschedule is requested
27982 */
27983 enum trace_flag_type {
27984 TRACE_FLAG_IRQS_OFF = 0x01,
27985@@ -136,6 +137,7 @@ enum trace_flag_type {
27986 TRACE_FLAG_SOFTIRQ = 0x10,
27987 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
27988 TRACE_FLAG_NMI = 0x40,
27989+ TRACE_FLAG_NEED_RESCHED_LAZY = 0x80,
27990 };
27991
27992 #define TRACE_BUF_SIZE 1024
27993@@ -273,6 +275,8 @@ struct trace_array {
27994 /* function tracing enabled */
27995 int function_enabled;
27996 #endif
27997+ int time_stamp_abs_ref;
27998+ struct list_head hist_vars;
27999 };
28000
28001 enum {
28002@@ -286,6 +290,11 @@ extern struct mutex trace_types_lock;
28003 extern int trace_array_get(struct trace_array *tr);
28004 extern void trace_array_put(struct trace_array *tr);
28005
28006+extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
28007+extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
28008+
28009+extern bool trace_clock_in_ns(struct trace_array *tr);
28010+
28011 /*
28012 * The global tracer (top) should be the first trace array added,
28013 * but we check the flag anyway.
28014@@ -1293,7 +1302,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
28015 unsigned long eflags = file->flags;
28016
28017 if (eflags & EVENT_FILE_FL_TRIGGER_COND)
28018- *tt = event_triggers_call(file, entry);
28019+ *tt = event_triggers_call(file, entry, event);
28020
28021 if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
28022 (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
28023@@ -1330,7 +1339,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
28024 trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
28025
28026 if (tt)
28027- event_triggers_post_call(file, tt, entry);
28028+ event_triggers_post_call(file, tt, entry, event);
28029 }
28030
28031 /**
28032@@ -1363,7 +1372,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
28033 irq_flags, pc, regs);
28034
28035 if (tt)
28036- event_triggers_post_call(file, tt, entry);
28037+ event_triggers_post_call(file, tt, entry, event);
28038 }
28039
28040 #define FILTER_PRED_INVALID ((unsigned short)-1)
28041@@ -1545,6 +1554,8 @@ extern void pause_named_trigger(struct event_trigger_data *data);
28042 extern void unpause_named_trigger(struct event_trigger_data *data);
28043 extern void set_named_trigger_data(struct event_trigger_data *data,
28044 struct event_trigger_data *named_data);
28045+extern struct event_trigger_data *
28046+get_named_trigger_data(struct event_trigger_data *data);
28047 extern int register_event_command(struct event_command *cmd);
28048 extern int unregister_event_command(struct event_command *cmd);
28049 extern int register_trigger_hist_enable_disable_cmds(void);
28050@@ -1588,7 +1599,8 @@ extern int register_trigger_hist_enable_disable_cmds(void);
28051 */
28052 struct event_trigger_ops {
28053 void (*func)(struct event_trigger_data *data,
28054- void *rec);
28055+ void *rec,
28056+ struct ring_buffer_event *rbe);
28057 int (*init)(struct event_trigger_ops *ops,
28058 struct event_trigger_data *data);
28059 void (*free)(struct event_trigger_ops *ops,
28060@@ -1755,6 +1767,13 @@ void trace_printk_start_comm(void);
28061 int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
28062 int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
28063
28064+#define MAX_EVENT_NAME_LEN 64
28065+
28066+extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
28067+extern ssize_t trace_parse_run_command(struct file *file,
28068+ const char __user *buffer, size_t count, loff_t *ppos,
28069+ int (*createfn)(int, char**));
28070+
28071 /*
28072 * Normal trace_printk() and friends allocates special buffers
28073 * to do the manipulation, as well as saves the print formats
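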
28074diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
28075index d53268a4e167..9ba230a4052f 100644
28076--- a/kernel/trace/trace_events.c
28077+++ b/kernel/trace/trace_events.c
28078@@ -187,6 +187,8 @@ static int trace_define_common_fields(void)
e4b2b4a8
JK
28079 __common_field(unsigned char, flags);
28080 __common_field(unsigned char, preempt_count);
28081 __common_field(int, pid);
28082+ __common_field(unsigned short, migrate_disable);
28083+ __common_field(unsigned short, padding);
28084
28085 return ret;
1a6e0f06 28086 }
b3bbd485 28087@@ -1406,8 +1408,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
e4b2b4a8 28088 return -ENODEV;
1a6e0f06 28089
e4b2b4a8
JK
28090 /* Make sure the system still exists */
28091- mutex_lock(&trace_types_lock);
28092 mutex_lock(&event_mutex);
28093+ mutex_lock(&trace_types_lock);
28094 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
28095 list_for_each_entry(dir, &tr->systems, list) {
28096 if (dir == inode->i_private) {
b3bbd485 28097@@ -1421,8 +1423,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
e4b2b4a8
JK
28098 }
28099 }
28100 exit_loop:
28101- mutex_unlock(&event_mutex);
28102 mutex_unlock(&trace_types_lock);
28103+ mutex_unlock(&event_mutex);
28104
28105 if (!system)
28106 return -ENODEV;
b3bbd485 28107@@ -2308,15 +2310,15 @@ static void __add_event_to_tracers(struct trace_event_call *call);
e4b2b4a8 28108 int trace_add_event_call(struct trace_event_call *call)
1a6e0f06 28109 {
e4b2b4a8
JK
28110 int ret;
28111- mutex_lock(&trace_types_lock);
28112 mutex_lock(&event_mutex);
28113+ mutex_lock(&trace_types_lock);
28114
28115 ret = __register_event(call, NULL);
28116 if (ret >= 0)
28117 __add_event_to_tracers(call);
28118
28119- mutex_unlock(&event_mutex);
28120 mutex_unlock(&trace_types_lock);
28121+ mutex_unlock(&event_mutex);
28122 return ret;
1a6e0f06
JK
28123 }
28124
b3bbd485 28125@@ -2370,13 +2372,13 @@ int trace_remove_event_call(struct trace_event_call *call)
1a6e0f06 28126 {
e4b2b4a8 28127 int ret;
1a6e0f06 28128
e4b2b4a8
JK
28129- mutex_lock(&trace_types_lock);
28130 mutex_lock(&event_mutex);
28131+ mutex_lock(&trace_types_lock);
28132 down_write(&trace_event_sem);
28133 ret = probe_remove_event_call(call);
28134 up_write(&trace_event_sem);
28135- mutex_unlock(&event_mutex);
28136 mutex_unlock(&trace_types_lock);
28137+ mutex_unlock(&event_mutex);
1a6e0f06 28138
e4b2b4a8 28139 return ret;
1a6e0f06 28140 }
b3bbd485 28141@@ -2438,8 +2440,8 @@ static int trace_module_notify(struct notifier_block *self,
e4b2b4a8
JK
28142 {
28143 struct module *mod = data;
1a6e0f06 28144
e4b2b4a8
JK
28145- mutex_lock(&trace_types_lock);
28146 mutex_lock(&event_mutex);
28147+ mutex_lock(&trace_types_lock);
28148 switch (val) {
28149 case MODULE_STATE_COMING:
28150 trace_module_add_events(mod);
b3bbd485 28151@@ -2448,8 +2450,8 @@ static int trace_module_notify(struct notifier_block *self,
e4b2b4a8
JK
28152 trace_module_remove_events(mod);
28153 break;
28154 }
28155- mutex_unlock(&event_mutex);
28156 mutex_unlock(&trace_types_lock);
28157+ mutex_unlock(&event_mutex);
1a6e0f06 28158
1a6e0f06
JK
28159 return 0;
28160 }
b3bbd485 28161@@ -2964,24 +2966,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
e4b2b4a8
JK
28162 * creates the event hierarchy in the @parent/events directory.
28163 *
28164 * Returns 0 on success.
28165+ *
28166+ * Must be called with event_mutex held.
28167 */
28168 int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
28169 {
28170 int ret;
1a6e0f06 28171
e4b2b4a8
JK
28172- mutex_lock(&event_mutex);
28173+ lockdep_assert_held(&event_mutex);
c7c16703 28174
e4b2b4a8
JK
28175 ret = create_event_toplevel_files(parent, tr);
28176 if (ret)
28177- goto out_unlock;
28178+ goto out;
c7c16703 28179
e4b2b4a8
JK
28180 down_write(&trace_event_sem);
28181 __trace_add_event_dirs(tr);
28182 up_write(&trace_event_sem);
c7c16703 28183
e4b2b4a8
JK
28184- out_unlock:
28185- mutex_unlock(&event_mutex);
28186-
28187+ out:
28188 return ret;
1a6e0f06 28189 }
1a6e0f06 28190
b3bbd485 28191@@ -3010,9 +3012,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
e4b2b4a8 28192 return ret;
1a6e0f06 28193 }
1a6e0f06 28194
e4b2b4a8
JK
28195+/* Must be called with event_mutex held */
28196 int event_trace_del_tracer(struct trace_array *tr)
28197 {
28198- mutex_lock(&event_mutex);
28199+ lockdep_assert_held(&event_mutex);
1a6e0f06 28200
e4b2b4a8
JK
28201 /* Disable any event triggers and associated soft-disabled events */
28202 clear_event_triggers(tr);
b3bbd485 28203@@ -3033,8 +3036,6 @@ int event_trace_del_tracer(struct trace_array *tr)
1a6e0f06 28204
e4b2b4a8 28205 tr->event_dir = NULL;
1a6e0f06 28206
e4b2b4a8
JK
28207- mutex_unlock(&event_mutex);
28208-
28209 return 0;
1a6e0f06 28210 }
1a6e0f06 28211
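The reordering in the hunks above establishes one fixed rule: event_mutex is always taken before trace_types_lock, so no two paths can acquire the pair in opposite orders and deadlock. A trivial kernel-context sketch of that rule, with placeholder demo_* locks:

#include <linux/mutex.h>

static DEFINE_MUTEX(demo_event_mutex);	/* outer lock, always taken first */
static DEFINE_MUTEX(demo_types_lock);	/* inner lock, always taken second */

static void demo_locked_op(void (*op)(void))
{
	mutex_lock(&demo_event_mutex);
	mutex_lock(&demo_types_lock);

	op();				/* every path nests the locks the same way */

	mutex_unlock(&demo_types_lock);
	mutex_unlock(&demo_event_mutex);
}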
b3bbd485
JK
28212diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
28213index 7eb975a2d0e1..24bc0769fdd6 100644
28214--- a/kernel/trace/trace_events_hist.c
28215+++ b/kernel/trace/trace_events_hist.c
e4b2b4a8
JK
28216@@ -20,13 +20,39 @@
28217 #include <linux/slab.h>
28218 #include <linux/stacktrace.h>
28219 #include <linux/rculist.h>
28220+#include <linux/tracefs.h>
1a6e0f06 28221
e4b2b4a8
JK
28222 #include "tracing_map.h"
28223 #include "trace.h"
1a6e0f06 28224
e4b2b4a8
JK
28225+#define SYNTH_SYSTEM "synthetic"
28226+#define SYNTH_FIELDS_MAX 16
28227+
28228+#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
28229+
28230 struct hist_field;
1a6e0f06 28231
e4b2b4a8
JK
28232-typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
28233+typedef u64 (*hist_field_fn_t) (struct hist_field *field,
28234+ struct tracing_map_elt *elt,
28235+ struct ring_buffer_event *rbe,
28236+ void *event);
28237+
28238+#define HIST_FIELD_OPERANDS_MAX 2
28239+#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
28240+#define HIST_ACTIONS_MAX 8
28241+
28242+enum field_op_id {
28243+ FIELD_OP_NONE,
28244+ FIELD_OP_PLUS,
28245+ FIELD_OP_MINUS,
28246+ FIELD_OP_UNARY_MINUS,
28247+};
28248+
28249+struct hist_var {
28250+ char *name;
28251+ struct hist_trigger_data *hist_data;
28252+ unsigned int idx;
28253+};
28254
28255 struct hist_field {
28256 struct ftrace_event_field *field;
b3bbd485 28257@@ -34,26 +60,50 @@ struct hist_field {
e4b2b4a8
JK
28258 hist_field_fn_t fn;
28259 unsigned int size;
28260 unsigned int offset;
28261+ unsigned int is_signed;
28262+ const char *type;
28263+ struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
28264+ struct hist_trigger_data *hist_data;
28265+ struct hist_var var;
28266+ enum field_op_id operator;
28267+ char *system;
28268+ char *event_name;
28269+ char *name;
28270+ unsigned int var_idx;
28271+ unsigned int var_ref_idx;
28272+ bool read_once;
28273 };
28274
28275-static u64 hist_field_none(struct hist_field *field, void *event)
28276+static u64 hist_field_none(struct hist_field *field,
28277+ struct tracing_map_elt *elt,
28278+ struct ring_buffer_event *rbe,
28279+ void *event)
1a6e0f06 28280 {
e4b2b4a8
JK
28281 return 0;
28282 }
1a6e0f06 28283
e4b2b4a8
JK
28284-static u64 hist_field_counter(struct hist_field *field, void *event)
28285+static u64 hist_field_counter(struct hist_field *field,
28286+ struct tracing_map_elt *elt,
28287+ struct ring_buffer_event *rbe,
28288+ void *event)
28289 {
28290 return 1;
1a6e0f06
JK
28291 }
28292
e4b2b4a8
JK
28293-static u64 hist_field_string(struct hist_field *hist_field, void *event)
28294+static u64 hist_field_string(struct hist_field *hist_field,
28295+ struct tracing_map_elt *elt,
28296+ struct ring_buffer_event *rbe,
28297+ void *event)
28298 {
28299 char *addr = (char *)(event + hist_field->field->offset);
1a6e0f06 28300
e4b2b4a8 28301 return (u64)(unsigned long)addr;
1a6e0f06 28302 }
e4b2b4a8
JK
28303
28304-static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
28305+static u64 hist_field_dynstring(struct hist_field *hist_field,
28306+ struct tracing_map_elt *elt,
28307+ struct ring_buffer_event *rbe,
28308+ void *event)
28309 {
28310 u32 str_item = *(u32 *)(event + hist_field->field->offset);
28311 int str_loc = str_item & 0xffff;
b3bbd485 28312@@ -62,22 +112,74 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
e4b2b4a8 28313 return (u64)(unsigned long)addr;
1a6e0f06 28314 }
1a6e0f06 28315
e4b2b4a8
JK
28316-static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
28317+static u64 hist_field_pstring(struct hist_field *hist_field,
28318+ struct tracing_map_elt *elt,
28319+ struct ring_buffer_event *rbe,
28320+ void *event)
28321 {
28322 char **addr = (char **)(event + hist_field->field->offset);
1a6e0f06 28323
e4b2b4a8 28324 return (u64)(unsigned long)*addr;
1a6e0f06
JK
28325 }
28326
e4b2b4a8
JK
28327-static u64 hist_field_log2(struct hist_field *hist_field, void *event)
28328+static u64 hist_field_log2(struct hist_field *hist_field,
28329+ struct tracing_map_elt *elt,
28330+ struct ring_buffer_event *rbe,
28331+ void *event)
1a6e0f06 28332 {
e4b2b4a8
JK
28333- u64 val = *(u64 *)(event + hist_field->field->offset);
28334+ struct hist_field *operand = hist_field->operands[0];
28335+
28336+ u64 val = operand->fn(operand, elt, rbe, event);
1a6e0f06 28337
e4b2b4a8 28338 return (u64) ilog2(roundup_pow_of_two(val));
1a6e0f06
JK
28339 }
28340
e4b2b4a8
JK
28341+static u64 hist_field_plus(struct hist_field *hist_field,
28342+ struct tracing_map_elt *elt,
28343+ struct ring_buffer_event *rbe,
28344+ void *event)
1a6e0f06 28345+{
e4b2b4a8
JK
28346+ struct hist_field *operand1 = hist_field->operands[0];
28347+ struct hist_field *operand2 = hist_field->operands[1];
28348+
28349+ u64 val1 = operand1->fn(operand1, elt, rbe, event);
28350+ u64 val2 = operand2->fn(operand2, elt, rbe, event);
28351+
28352+ return val1 + val2;
28353+}
28354+
28355+static u64 hist_field_minus(struct hist_field *hist_field,
28356+ struct tracing_map_elt *elt,
28357+ struct ring_buffer_event *rbe,
28358+ void *event)
28359+{
28360+ struct hist_field *operand1 = hist_field->operands[0];
28361+ struct hist_field *operand2 = hist_field->operands[1];
28362+
28363+ u64 val1 = operand1->fn(operand1, elt, rbe, event);
28364+ u64 val2 = operand2->fn(operand2, elt, rbe, event);
28365+
28366+ return val1 - val2;
28367+}
28368+
28369+static u64 hist_field_unary_minus(struct hist_field *hist_field,
28370+ struct tracing_map_elt *elt,
28371+ struct ring_buffer_event *rbe,
28372+ void *event)
28373+{
28374+ struct hist_field *operand = hist_field->operands[0];
28375+
28376+ s64 sval = (s64)operand->fn(operand, elt, rbe, event);
28377+ u64 val = (u64)-sval;
28378+
28379+ return val;
28380+}
28381+
28382 #define DEFINE_HIST_FIELD_FN(type) \
28383-static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
28384+ static u64 hist_field_##type(struct hist_field *hist_field, \
28385+ struct tracing_map_elt *elt, \
28386+ struct ring_buffer_event *rbe, \
28387+ void *event) \
28388 { \
28389 type *addr = (type *)(event + hist_field->field->offset); \
28390 \
b3bbd485 28391@@ -110,16 +212,29 @@ DEFINE_HIST_FIELD_FN(u8);
e4b2b4a8
JK
28392 #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
28393
28394 enum hist_field_flags {
28395- HIST_FIELD_FL_HITCOUNT = 1,
28396- HIST_FIELD_FL_KEY = 2,
28397- HIST_FIELD_FL_STRING = 4,
28398- HIST_FIELD_FL_HEX = 8,
28399- HIST_FIELD_FL_SYM = 16,
28400- HIST_FIELD_FL_SYM_OFFSET = 32,
28401- HIST_FIELD_FL_EXECNAME = 64,
28402- HIST_FIELD_FL_SYSCALL = 128,
28403- HIST_FIELD_FL_STACKTRACE = 256,
28404- HIST_FIELD_FL_LOG2 = 512,
28405+ HIST_FIELD_FL_HITCOUNT = 1 << 0,
28406+ HIST_FIELD_FL_KEY = 1 << 1,
28407+ HIST_FIELD_FL_STRING = 1 << 2,
28408+ HIST_FIELD_FL_HEX = 1 << 3,
28409+ HIST_FIELD_FL_SYM = 1 << 4,
28410+ HIST_FIELD_FL_SYM_OFFSET = 1 << 5,
28411+ HIST_FIELD_FL_EXECNAME = 1 << 6,
28412+ HIST_FIELD_FL_SYSCALL = 1 << 7,
28413+ HIST_FIELD_FL_STACKTRACE = 1 << 8,
28414+ HIST_FIELD_FL_LOG2 = 1 << 9,
28415+ HIST_FIELD_FL_TIMESTAMP = 1 << 10,
28416+ HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11,
28417+ HIST_FIELD_FL_VAR = 1 << 12,
28418+ HIST_FIELD_FL_EXPR = 1 << 13,
28419+ HIST_FIELD_FL_VAR_REF = 1 << 14,
28420+ HIST_FIELD_FL_CPU = 1 << 15,
28421+ HIST_FIELD_FL_ALIAS = 1 << 16,
28422+};
28423+
28424+struct var_defs {
28425+ unsigned int n_vars;
28426+ char *name[TRACING_MAP_VARS_MAX];
28427+ char *expr[TRACING_MAP_VARS_MAX];
28428 };
28429
28430 struct hist_trigger_attrs {
b3bbd485 28431@@ -127,298 +242,3585 @@ struct hist_trigger_attrs {
e4b2b4a8
JK
28432 char *vals_str;
28433 char *sort_key_str;
28434 char *name;
28435+ char *clock;
28436 bool pause;
28437 bool cont;
28438 bool clear;
28439+ bool ts_in_usecs;
28440 unsigned int map_bits;
28441+
28442+ char *assignment_str[TRACING_MAP_VARS_MAX];
28443+ unsigned int n_assignments;
28444+
28445+ char *action_str[HIST_ACTIONS_MAX];
28446+ unsigned int n_actions;
28447+
28448+ struct var_defs var_defs;
28449+};
28450+
28451+struct field_var {
28452+ struct hist_field *var;
28453+ struct hist_field *val;
28454+};
1a6e0f06 28455+
e4b2b4a8
JK
28456+struct field_var_hist {
28457+ struct hist_trigger_data *hist_data;
28458+ char *cmd;
28459 };
28460
28461 struct hist_trigger_data {
28462- struct hist_field *fields[TRACING_MAP_FIELDS_MAX];
28463+ struct hist_field *fields[HIST_FIELDS_MAX];
28464 unsigned int n_vals;
28465 unsigned int n_keys;
28466 unsigned int n_fields;
28467+ unsigned int n_vars;
28468 unsigned int key_size;
28469 struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
28470 unsigned int n_sort_keys;
28471 struct trace_event_file *event_file;
28472 struct hist_trigger_attrs *attrs;
28473 struct tracing_map *map;
28474+ bool enable_timestamps;
28475+ bool remove;
28476+ struct hist_field *var_refs[TRACING_MAP_VARS_MAX];
28477+ unsigned int n_var_refs;
28478+
28479+ struct action_data *actions[HIST_ACTIONS_MAX];
28480+ unsigned int n_actions;
28481+
28482+ struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX];
28483+ unsigned int n_synth_var_refs;
28484+ struct field_var *field_vars[SYNTH_FIELDS_MAX];
28485+ unsigned int n_field_vars;
28486+ unsigned int n_field_var_str;
28487+ struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX];
28488+ unsigned int n_field_var_hists;
28489+
28490+ struct field_var *max_vars[SYNTH_FIELDS_MAX];
28491+ unsigned int n_max_vars;
28492+ unsigned int n_max_var_str;
b3bbd485
JK
28493 };
28494
28495-static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
28496-{
28497- hist_field_fn_t fn = NULL;
e4b2b4a8
JK
28498+struct synth_field {
28499+ char *type;
28500+ char *name;
28501+ size_t size;
28502+ bool is_signed;
28503+ bool is_string;
28504+};
b3bbd485
JK
28505
28506- switch (field_size) {
28507- case 8:
28508- if (field_is_signed)
28509- fn = hist_field_s64;
28510- else
28511- fn = hist_field_u64;
28512- break;
28513- case 4:
28514- if (field_is_signed)
28515- fn = hist_field_s32;
28516- else
28517- fn = hist_field_u32;
28518- break;
28519- case 2:
28520- if (field_is_signed)
28521- fn = hist_field_s16;
28522- else
28523- fn = hist_field_u16;
28524- break;
28525- case 1:
28526- if (field_is_signed)
28527- fn = hist_field_s8;
28528- else
28529- fn = hist_field_u8;
28530- break;
28531- }
e4b2b4a8
JK
28532+struct synth_event {
28533+ struct list_head list;
28534+ int ref;
28535+ char *name;
28536+ struct synth_field **fields;
28537+ unsigned int n_fields;
28538+ unsigned int n_u64;
28539+ struct trace_event_class class;
28540+ struct trace_event_call call;
28541+ struct tracepoint *tp;
28542+};
b3bbd485
JK
28543
28544- return fn;
e4b2b4a8 28545+struct action_data;
1a6e0f06 28546+
e4b2b4a8
JK
28547+typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
28548+ struct tracing_map_elt *elt, void *rec,
28549+ struct ring_buffer_event *rbe,
28550+ struct action_data *data, u64 *var_ref_vals);
1a6e0f06 28551+
e4b2b4a8
JK
28552+struct action_data {
28553+ action_fn_t fn;
28554+ unsigned int n_params;
28555+ char *params[SYNTH_FIELDS_MAX];
28556+
28557+ union {
28558+ struct {
28559+ unsigned int var_ref_idx;
28560+ char *match_event;
28561+ char *match_event_system;
28562+ char *synth_event_name;
28563+ struct synth_event *synth_event;
28564+ } onmatch;
28565+
28566+ struct {
28567+ char *var_str;
28568+ char *fn_name;
28569+ unsigned int max_var_ref_idx;
28570+ struct hist_field *max_var;
28571+ struct hist_field *var;
28572+ } onmax;
28573+ };
28574+};
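/*
 * Editor's note (illustration only, not part of the patch): action_data
 * carries the parsed 'onmatch()' and 'onmax()' clauses of a hist trigger.
 * The command syntax they correspond to looks roughly like the following
 * (event and field names are examples, not taken from this hunk):
 *
 *   onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid)
 *	- when this trigger fires and its key matches an entry in the
 *	  named event's histogram, generate the 'wakeup_latency'
 *	  synthetic event with the listed variables/fields as
 *	  parameters (backed by the onmatch member above).
 *
 *   onmax($wakeup_lat).save(next_comm,prev_pid)
 *	- track the maximum of a variable and save the listed fields
 *	  whenever a new maximum is seen (backed by the onmax member).
 */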
28575+
28576+
28577+static char last_hist_cmd[MAX_FILTER_STR_VAL];
28578+static char hist_err_str[MAX_FILTER_STR_VAL];
28579+
28580+static void last_cmd_set(char *str)
28581+{
28582+ if (!str)
1a6e0f06
JK
28583+ return;
28584+
e4b2b4a8 28585+ strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
b3bbd485
JK
28586 }
28587
28588-static int parse_map_size(char *str)
e4b2b4a8 28589+static void hist_err(char *str, char *var)
b3bbd485
JK
28590 {
28591- unsigned long size, map_bits;
28592- int ret;
e4b2b4a8 28593+ int maxlen = MAX_FILTER_STR_VAL - 1;
b3bbd485
JK
28594
28595- strsep(&str, "=");
28596- if (!str) {
28597- ret = -EINVAL;
28598- goto out;
28599- }
e4b2b4a8 28600+ if (!str)
1a6e0f06 28601+ return;
b3bbd485
JK
28602
28603- ret = kstrtoul(str, 0, &size);
28604- if (ret)
28605- goto out;
e4b2b4a8
JK
28606+ if (strlen(hist_err_str))
28607+ return;
b3bbd485
JK
28608
28609- map_bits = ilog2(roundup_pow_of_two(size));
28610- if (map_bits < TRACING_MAP_BITS_MIN ||
28611- map_bits > TRACING_MAP_BITS_MAX)
28612- ret = -EINVAL;
28613- else
28614- ret = map_bits;
28615- out:
28616- return ret;
e4b2b4a8
JK
28617+ if (!var)
28618+ var = "";
28619+
28620+ if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen)
28621+ return;
1a6e0f06 28622+
e4b2b4a8
JK
28623+ strcat(hist_err_str, str);
28624+ strcat(hist_err_str, var);
b3bbd485
JK
28625 }
28626
28627-static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
e4b2b4a8 28628+static void hist_err_event(char *str, char *system, char *event, char *var)
b3bbd485
JK
28629 {
28630- if (!attrs)
28631- return;
e4b2b4a8 28632+ char err[MAX_FILTER_STR_VAL];
b3bbd485
JK
28633
28634- kfree(attrs->name);
28635- kfree(attrs->sort_key_str);
28636- kfree(attrs->keys_str);
28637- kfree(attrs->vals_str);
28638- kfree(attrs);
e4b2b4a8
JK
28639+ if (system && var)
28640+ snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
28641+ else if (system)
28642+ snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
28643+ else
28644+ strncpy(err, var, MAX_FILTER_STR_VAL);
28645+
28646+ hist_err(str, err);
b3bbd485
JK
28647 }
28648
28649-static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
e4b2b4a8 28650+static void hist_err_clear(void)
b3bbd485
JK
28651 {
28652- struct hist_trigger_attrs *attrs;
28653- int ret = 0;
e4b2b4a8 28654+ hist_err_str[0] = '\0';
1a6e0f06 28655+}
b3bbd485
JK
28656
28657- attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
28658- if (!attrs)
28659- return ERR_PTR(-ENOMEM);
e4b2b4a8 28660+static bool have_hist_err(void)
1a6e0f06 28661+{
e4b2b4a8
JK
28662+ if (strlen(hist_err_str))
28663+ return true;
b3bbd485
JK
28664
28665- while (trigger_str) {
28666- char *str = strsep(&trigger_str, ":");
e4b2b4a8
JK
28667+ return false;
28668+}
b3bbd485
JK
28669
28670- if ((strncmp(str, "key=", strlen("key=")) == 0) ||
28671- (strncmp(str, "keys=", strlen("keys=")) == 0))
28672- attrs->keys_str = kstrdup(str, GFP_KERNEL);
28673- else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
28674- (strncmp(str, "vals=", strlen("vals=")) == 0) ||
28675- (strncmp(str, "values=", strlen("values=")) == 0))
28676- attrs->vals_str = kstrdup(str, GFP_KERNEL);
28677- else if (strncmp(str, "sort=", strlen("sort=")) == 0)
28678- attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
28679- else if (strncmp(str, "name=", strlen("name=")) == 0)
28680- attrs->name = kstrdup(str, GFP_KERNEL);
28681- else if (strcmp(str, "pause") == 0)
28682- attrs->pause = true;
28683- else if ((strcmp(str, "cont") == 0) ||
28684- (strcmp(str, "continue") == 0))
28685- attrs->cont = true;
28686- else if (strcmp(str, "clear") == 0)
28687- attrs->clear = true;
28688- else if (strncmp(str, "size=", strlen("size=")) == 0) {
28689- int map_bits = parse_map_size(str);
e4b2b4a8
JK
28690+static LIST_HEAD(synth_event_list);
28691+static DEFINE_MUTEX(synth_event_mutex);
b3bbd485
JK
28692
28693- if (map_bits < 0) {
28694- ret = map_bits;
28695- goto free;
28696- }
28697- attrs->map_bits = map_bits;
e4b2b4a8
JK
28698+struct synth_trace_event {
28699+ struct trace_entry ent;
28700+ u64 fields[];
28701+};
1a6e0f06 28702+
e4b2b4a8
JK
28703+static int synth_event_define_fields(struct trace_event_call *call)
28704+{
28705+ struct synth_trace_event trace;
28706+ int offset = offsetof(typeof(trace), fields);
28707+ struct synth_event *event = call->data;
28708+ unsigned int i, size, n_u64;
28709+ char *name, *type;
28710+ bool is_signed;
28711+ int ret = 0;
28712+
28713+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
28714+ size = event->fields[i]->size;
28715+ is_signed = event->fields[i]->is_signed;
28716+ type = event->fields[i]->type;
28717+ name = event->fields[i]->name;
28718+ ret = trace_define_field(call, type, name, offset, size,
28719+ is_signed, FILTER_OTHER);
28720+ if (ret)
28721+ break;
1a6e0f06 28722+
e4b2b4a8
JK
28723+ if (event->fields[i]->is_string) {
28724+ offset += STR_VAR_LEN_MAX;
28725+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
b3bbd485
JK
28726 } else {
28727- ret = -EINVAL;
28728- goto free;
e4b2b4a8
JK
28729+ offset += sizeof(u64);
28730+ n_u64++;
b3bbd485
JK
28731 }
28732 }
28733
28734- if (!attrs->keys_str) {
28735- ret = -EINVAL;
28736- goto free;
28737- }
e4b2b4a8 28738+ event->n_u64 = n_u64;
b3bbd485
JK
28739
28740- return attrs;
28741- free:
28742- destroy_hist_trigger_attrs(attrs);
e4b2b4a8
JK
28743+ return ret;
28744+}
b3bbd485
JK
28745
28746- return ERR_PTR(ret);
e4b2b4a8
JK
28747+static bool synth_field_signed(char *type)
28748+{
28749+ if (strncmp(type, "u", 1) == 0)
28750+ return false;
1a6e0f06 28751+
e4b2b4a8 28752+ return true;
b3bbd485
JK
28753 }
28754
28755-static inline void save_comm(char *comm, struct task_struct *task)
e4b2b4a8 28756+static int synth_field_is_string(char *type)
b3bbd485
JK
28757 {
28758- if (!task->pid) {
28759- strcpy(comm, "<idle>");
28760- return;
28761- }
e4b2b4a8
JK
28762+ if (strstr(type, "char[") != NULL)
28763+ return true;
b3bbd485
JK
28764
28765- if (WARN_ON_ONCE(task->pid < 0)) {
28766- strcpy(comm, "<XXX>");
28767- return;
28768- }
e4b2b4a8 28769+ return false;
1a6e0f06 28770+}
b3bbd485
JK
28771
28772- memcpy(comm, task->comm, TASK_COMM_LEN);
e4b2b4a8 28773+static int synth_field_string_size(char *type)
1a6e0f06 28774+{
e4b2b4a8
JK
28775+ char buf[4], *end, *start;
28776+ unsigned int len;
28777+ int size, err;
1a6e0f06 28778+
e4b2b4a8
JK
28779+ start = strstr(type, "char[");
28780+ if (start == NULL)
28781+ return -EINVAL;
28782+ start += strlen("char[");
1a6e0f06 28783+
e4b2b4a8
JK
28784+ end = strchr(type, ']');
28785+ if (!end || end < start)
28786+ return -EINVAL;
28787+
28788+ len = end - start;
28789+ if (len > 3)
28790+ return -EINVAL;
28791+
28792+ strncpy(buf, start, len);
28793+ buf[len] = '\0';
28794+
28795+ err = kstrtouint(buf, 0, &size);
28796+ if (err)
28797+ return err;
28798+
28799+ if (size > STR_VAR_LEN_MAX)
28800+ return -EINVAL;
28801+
28802+ return size;
b3bbd485
JK
28803 }
28804
28805-static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
e4b2b4a8 28806+static int synth_field_size(char *type)
b3bbd485
JK
28807 {
28808- kfree((char *)elt->private_data);
e4b2b4a8
JK
28809+ int size = 0;
28810+
28811+ if (strcmp(type, "s64") == 0)
28812+ size = sizeof(s64);
28813+ else if (strcmp(type, "u64") == 0)
28814+ size = sizeof(u64);
28815+ else if (strcmp(type, "s32") == 0)
28816+ size = sizeof(s32);
28817+ else if (strcmp(type, "u32") == 0)
28818+ size = sizeof(u32);
28819+ else if (strcmp(type, "s16") == 0)
28820+ size = sizeof(s16);
28821+ else if (strcmp(type, "u16") == 0)
28822+ size = sizeof(u16);
28823+ else if (strcmp(type, "s8") == 0)
28824+ size = sizeof(s8);
28825+ else if (strcmp(type, "u8") == 0)
28826+ size = sizeof(u8);
28827+ else if (strcmp(type, "char") == 0)
28828+ size = sizeof(char);
28829+ else if (strcmp(type, "unsigned char") == 0)
28830+ size = sizeof(unsigned char);
28831+ else if (strcmp(type, "int") == 0)
28832+ size = sizeof(int);
28833+ else if (strcmp(type, "unsigned int") == 0)
28834+ size = sizeof(unsigned int);
28835+ else if (strcmp(type, "long") == 0)
28836+ size = sizeof(long);
28837+ else if (strcmp(type, "unsigned long") == 0)
28838+ size = sizeof(unsigned long);
28839+ else if (strcmp(type, "pid_t") == 0)
28840+ size = sizeof(pid_t);
28841+ else if (synth_field_is_string(type))
28842+ size = synth_field_string_size(type);
1a6e0f06 28843+
e4b2b4a8 28844+ return size;
b3bbd485
JK
28845 }
28846
28847-static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
e4b2b4a8 28848+static const char *synth_field_fmt(char *type)
b3bbd485
JK
28849 {
28850- struct hist_trigger_data *hist_data = elt->map->private_data;
28851- struct hist_field *key_field;
28852- unsigned int i;
e4b2b4a8
JK
28853+ const char *fmt = "%llu";
28854+
28855+ if (strcmp(type, "s64") == 0)
28856+ fmt = "%lld";
28857+ else if (strcmp(type, "u64") == 0)
28858+ fmt = "%llu";
28859+ else if (strcmp(type, "s32") == 0)
28860+ fmt = "%d";
28861+ else if (strcmp(type, "u32") == 0)
28862+ fmt = "%u";
28863+ else if (strcmp(type, "s16") == 0)
28864+ fmt = "%d";
28865+ else if (strcmp(type, "u16") == 0)
28866+ fmt = "%u";
28867+ else if (strcmp(type, "s8") == 0)
28868+ fmt = "%d";
28869+ else if (strcmp(type, "u8") == 0)
28870+ fmt = "%u";
28871+ else if (strcmp(type, "char") == 0)
28872+ fmt = "%d";
28873+ else if (strcmp(type, "unsigned char") == 0)
28874+ fmt = "%u";
28875+ else if (strcmp(type, "int") == 0)
28876+ fmt = "%d";
28877+ else if (strcmp(type, "unsigned int") == 0)
28878+ fmt = "%u";
28879+ else if (strcmp(type, "long") == 0)
28880+ fmt = "%ld";
28881+ else if (strcmp(type, "unsigned long") == 0)
28882+ fmt = "%lu";
28883+ else if (strcmp(type, "pid_t") == 0)
28884+ fmt = "%d";
28885+ else if (synth_field_is_string(type))
28886+ fmt = "%s";
28887+
28888+ return fmt;
28889+}
b3bbd485
JK
28890
28891- for_each_hist_key_field(i, hist_data) {
28892- key_field = hist_data->fields[i];
e4b2b4a8
JK
28893+static enum print_line_t print_synth_event(struct trace_iterator *iter,
28894+ int flags,
28895+ struct trace_event *event)
28896+{
28897+ struct trace_array *tr = iter->tr;
28898+ struct trace_seq *s = &iter->seq;
28899+ struct synth_trace_event *entry;
28900+ struct synth_event *se;
28901+ unsigned int i, n_u64;
28902+ char print_fmt[32];
28903+ const char *fmt;
b3bbd485
JK
28904
28905- if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
28906- unsigned int size = TASK_COMM_LEN + 1;
e4b2b4a8
JK
28907+ entry = (struct synth_trace_event *)iter->ent;
28908+ se = container_of(event, struct synth_event, call.event);
b3bbd485
JK
28909
28910- elt->private_data = kzalloc(size, GFP_KERNEL);
28911- if (!elt->private_data)
28912- return -ENOMEM;
28913- break;
e4b2b4a8
JK
28914+ trace_seq_printf(s, "%s: ", se->name);
28915+
28916+ for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
28917+ if (trace_seq_has_overflowed(s))
28918+ goto end;
28919+
28920+ fmt = synth_field_fmt(se->fields[i]->type);
28921+
28922+ /* parameter types */
28923+ if (tr->trace_flags & TRACE_ITER_VERBOSE)
28924+ trace_seq_printf(s, "%s ", fmt);
28925+
28926+ snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
28927+
28928+ /* parameter values */
28929+ if (se->fields[i]->is_string) {
28930+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
28931+ (char *)&entry->fields[n_u64],
28932+ i == se->n_fields - 1 ? "" : " ");
28933+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
28934+ } else {
28935+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
28936+ entry->fields[n_u64],
28937+ i == se->n_fields - 1 ? "" : " ");
28938+ n_u64++;
b3bbd485
JK
28939 }
28940 }
e4b2b4a8
JK
28941+end:
28942+ trace_seq_putc(s, '\n');
b3bbd485
JK
28943
28944- return 0;
e4b2b4a8 28945+ return trace_handle_return(s);
b3bbd485
JK
28946 }
28947
28948-static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
28949- struct tracing_map_elt *from)
e4b2b4a8
JK
28950+static struct trace_event_functions synth_event_funcs = {
28951+ .trace = print_synth_event
28952+};
1a6e0f06 28953+
e4b2b4a8
JK
28954+static notrace void trace_event_raw_event_synth(void *__data,
28955+ u64 *var_ref_vals,
28956+ unsigned int var_ref_idx)
b3bbd485
JK
28957 {
28958- char *comm_from = from->private_data;
28959- char *comm_to = to->private_data;
e4b2b4a8
JK
28960+ struct trace_event_file *trace_file = __data;
28961+ struct synth_trace_event *entry;
28962+ struct trace_event_buffer fbuffer;
28963+ struct ring_buffer *buffer;
28964+ struct synth_event *event;
28965+ unsigned int i, n_u64;
28966+ int fields_size = 0;
1a6e0f06 28967+
e4b2b4a8
JK
28968+ event = trace_file->event_call->data;
28969+
28970+ if (trace_trigger_soft_disabled(trace_file))
1a6e0f06 28971+ return;
1a6e0f06 28972+
e4b2b4a8 28973+ fields_size = event->n_u64 * sizeof(u64);
1a6e0f06 28974+
e4b2b4a8
JK
28975+ /*
28976+ * Avoid ring buffer recursion detection, as this event
28977+ * is being performed within another event.
28978+ */
28979+ buffer = trace_file->tr->trace_buffer.buffer;
28980+ ring_buffer_nest_start(buffer);
28981+
28982+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
28983+ sizeof(*entry) + fields_size);
28984+ if (!entry)
28985+ goto out;
28986+
28987+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
28988+ if (event->fields[i]->is_string) {
28989+ char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
28990+ char *str_field = (char *)&entry->fields[n_u64];
28991+
28992+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
28993+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
28994+ } else {
28995+ entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
28996+ n_u64++;
28997+ }
1a6e0f06
JK
28998+ }
28999+
e4b2b4a8
JK
29000+ trace_event_buffer_commit(&fbuffer);
29001+out:
29002+ ring_buffer_nest_end(buffer);
1a6e0f06 29003+}
b3bbd485
JK
29004
29005- if (comm_from)
29006- memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
e4b2b4a8 29007+static void free_synth_event_print_fmt(struct trace_event_call *call)
1a6e0f06 29008+{
e4b2b4a8
JK
29009+ if (call) {
29010+ kfree(call->print_fmt);
29011+ call->print_fmt = NULL;
1a6e0f06 29012+ }
b3bbd485
JK
29013 }
29014
29015-static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
e4b2b4a8
JK
29016+static int __set_synth_event_print_fmt(struct synth_event *event,
29017+ char *buf, int len)
b3bbd485
JK
29018 {
29019- char *comm = elt->private_data;
e4b2b4a8
JK
29020+ const char *fmt;
29021+ int pos = 0;
29022+ int i;
29023+
29024+ /* When len=0, we just calculate the needed length */
29025+#define LEN_OR_ZERO (len ? len - pos : 0)
29026+
29027+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
29028+ for (i = 0; i < event->n_fields; i++) {
29029+ fmt = synth_field_fmt(event->fields[i]->type);
29030+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
29031+ event->fields[i]->name, fmt,
29032+ i == event->n_fields - 1 ? "" : ", ");
1a6e0f06 29033+ }
e4b2b4a8 29034+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
1a6e0f06 29035+
e4b2b4a8
JK
29036+ for (i = 0; i < event->n_fields; i++) {
29037+ pos += snprintf(buf + pos, LEN_OR_ZERO,
29038+ ", REC->%s", event->fields[i]->name);
1a6e0f06
JK
29039+ }
29040+
e4b2b4a8 29041+#undef LEN_OR_ZERO
b3bbd485
JK
29042
29043- if (comm)
29044- save_comm(comm, current);
e4b2b4a8
JK
29045+ /* return the length of print_fmt */
29046+ return pos;
b3bbd485
JK
29047 }
29048
29049-static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
29050- .elt_alloc = hist_trigger_elt_comm_alloc,
29051- .elt_copy = hist_trigger_elt_comm_copy,
29052- .elt_free = hist_trigger_elt_comm_free,
29053- .elt_init = hist_trigger_elt_comm_init,
29054-};
e4b2b4a8
JK
29055+static int set_synth_event_print_fmt(struct trace_event_call *call)
29056+{
29057+ struct synth_event *event = call->data;
29058+ char *print_fmt;
29059+ int len;
1a6e0f06 29060+
e4b2b4a8
JK
29061+ /* First: called with 0 length to calculate the needed length */
29062+ len = __set_synth_event_print_fmt(event, NULL, 0);
1a6e0f06 29063+
e4b2b4a8
JK
29064+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
29065+ if (!print_fmt)
29066+ return -ENOMEM;
1a6e0f06 29067+
e4b2b4a8
JK
29068+ /* Second: actually write the @print_fmt */
29069+ __set_synth_event_print_fmt(event, print_fmt, len + 1);
29070+ call->print_fmt = print_fmt;
b3bbd485
JK
29071
29072-static void destroy_hist_field(struct hist_field *hist_field)
e4b2b4a8 29073+ return 0;
1a6e0f06
JK
29074+}
29075+
e4b2b4a8 29076+static void free_synth_field(struct synth_field *field)
b3bbd485
JK
29077 {
29078- kfree(hist_field);
e4b2b4a8
JK
29079+ kfree(field->type);
29080+ kfree(field->name);
29081+ kfree(field);
b3bbd485
JK
29082 }
29083
29084-static struct hist_field *create_hist_field(struct ftrace_event_field *field,
29085- unsigned long flags)
e4b2b4a8
JK
29086+static struct synth_field *parse_synth_field(char *field_type,
29087+ char *field_name)
b3bbd485
JK
29088 {
29089- struct hist_field *hist_field;
e4b2b4a8
JK
29090+ struct synth_field *field;
29091+ int len, ret = 0;
29092+ char *array;
b3bbd485
JK
29093
29094- if (field && is_function_field(field))
29095- return NULL;
e4b2b4a8
JK
29096+ if (field_type[0] == ';')
29097+ field_type++;
b3bbd485
JK
29098
29099- hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
29100- if (!hist_field)
29101- return NULL;
e4b2b4a8
JK
29102+ len = strlen(field_name);
29103+ if (field_name[len - 1] == ';')
29104+ field_name[len - 1] = '\0';
b3bbd485
JK
29105
29106- if (flags & HIST_FIELD_FL_HITCOUNT) {
29107- hist_field->fn = hist_field_counter;
29108- goto out;
e4b2b4a8
JK
29109+ field = kzalloc(sizeof(*field), GFP_KERNEL);
29110+ if (!field)
29111+ return ERR_PTR(-ENOMEM);
1a6e0f06 29112+
e4b2b4a8
JK
29113+ len = strlen(field_type) + 1;
29114+ array = strchr(field_name, '[');
29115+ if (array)
29116+ len += strlen(array);
29117+ field->type = kzalloc(len, GFP_KERNEL);
29118+ if (!field->type) {
29119+ ret = -ENOMEM;
29120+ goto free;
29121+ }
29122+ strcat(field->type, field_type);
29123+ if (array) {
29124+ strcat(field->type, array);
29125+ *array = '\0';
b3bbd485
JK
29126 }
29127
29128- if (flags & HIST_FIELD_FL_STACKTRACE) {
29129- hist_field->fn = hist_field_none;
29130- goto out;
e4b2b4a8
JK
29131+ field->size = synth_field_size(field->type);
29132+ if (!field->size) {
29133+ ret = -EINVAL;
29134+ goto free;
b3bbd485
JK
29135 }
29136
29137- if (flags & HIST_FIELD_FL_LOG2) {
29138- hist_field->fn = hist_field_log2;
29139- goto out;
e4b2b4a8
JK
29140+ if (synth_field_is_string(field->type))
29141+ field->is_string = true;
29142+
29143+ field->is_signed = synth_field_signed(field->type);
29144+
29145+ field->name = kstrdup(field_name, GFP_KERNEL);
29146+ if (!field->name) {
29147+ ret = -ENOMEM;
29148+ goto free;
29149+ }
29150+ out:
29151+ return field;
29152+ free:
29153+ free_synth_field(field);
29154+ field = ERR_PTR(ret);
29155+ goto out;
29156+}
29157+
29158+static void free_synth_tracepoint(struct tracepoint *tp)
1a6e0f06 29159+{
e4b2b4a8
JK
29160+ if (!tp)
29161+ return;
29162+
29163+ kfree(tp->name);
29164+ kfree(tp);
1a6e0f06 29165+}
1a6e0f06 29166+
e4b2b4a8 29167+static struct tracepoint *alloc_synth_tracepoint(char *name)
1a6e0f06 29168+{
e4b2b4a8 29169+ struct tracepoint *tp;
1a6e0f06 29170+
e4b2b4a8
JK
29171+ tp = kzalloc(sizeof(*tp), GFP_KERNEL);
29172+ if (!tp)
29173+ return ERR_PTR(-ENOMEM);
1a6e0f06 29174+
e4b2b4a8
JK
29175+ tp->name = kstrdup(name, GFP_KERNEL);
29176+ if (!tp->name) {
29177+ kfree(tp);
29178+ return ERR_PTR(-ENOMEM);
1a6e0f06 29179+ }
e4b2b4a8
JK
29180+
29181+ return tp;
1a6e0f06 29182+}
1a6e0f06 29183+
e4b2b4a8
JK
29184+typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
29185+ unsigned int var_ref_idx);
1a6e0f06 29186+
e4b2b4a8
JK
29187+static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
29188+ unsigned int var_ref_idx)
29189+{
29190+ struct tracepoint *tp = event->tp;
29191+
29192+ if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
29193+ struct tracepoint_func *probe_func_ptr;
29194+ synth_probe_func_t probe_func;
29195+ void *__data;
29196+
29197+ if (!(cpu_online(raw_smp_processor_id())))
29198+ return;
29199+
29200+ probe_func_ptr = rcu_dereference_sched((tp)->funcs);
29201+ if (probe_func_ptr) {
29202+ do {
29203+ probe_func = probe_func_ptr->func;
29204+ __data = probe_func_ptr->data;
29205+ probe_func(__data, var_ref_vals, var_ref_idx);
29206+ } while ((++probe_func_ptr)->func);
29207+ }
29208+ }
29209+}
29210+
29211+static struct synth_event *find_synth_event(const char *name)
29212+{
29213+ struct synth_event *event;
29214+
29215+ list_for_each_entry(event, &synth_event_list, list) {
29216+ if (strcmp(event->name, name) == 0)
29217+ return event;
29218+ }
29219+
29220+ return NULL;
29221+}
29222+
29223+static int register_synth_event(struct synth_event *event)
29224+{
29225+ struct trace_event_call *call = &event->call;
29226+ int ret = 0;
29227+
29228+ event->call.class = &event->class;
29229+ event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
29230+ if (!event->class.system) {
29231+ ret = -ENOMEM;
29232+ goto out;
29233+ }
29234+
29235+ event->tp = alloc_synth_tracepoint(event->name);
29236+ if (IS_ERR(event->tp)) {
29237+ ret = PTR_ERR(event->tp);
29238+ event->tp = NULL;
29239+ goto out;
29240+ }
29241+
29242+ INIT_LIST_HEAD(&call->class->fields);
29243+ call->event.funcs = &synth_event_funcs;
29244+ call->class->define_fields = synth_event_define_fields;
29245+
29246+ ret = register_trace_event(&call->event);
29247+ if (!ret) {
29248+ ret = -ENODEV;
29249+ goto out;
29250+ }
29251+ call->flags = TRACE_EVENT_FL_TRACEPOINT;
29252+ call->class->reg = trace_event_reg;
29253+ call->class->probe = trace_event_raw_event_synth;
29254+ call->data = event;
29255+ call->tp = event->tp;
29256+
29257+ ret = trace_add_event_call(call);
29258+ if (ret) {
29259+ pr_warn("Failed to register synthetic event: %s\n",
29260+ trace_event_name(call));
29261+ goto err;
29262+ }
29263+
29264+ ret = set_synth_event_print_fmt(call);
29265+ if (ret < 0) {
29266+ trace_remove_event_call(call);
29267+ goto err;
29268+ }
29269+ out:
29270+ return ret;
29271+ err:
29272+ unregister_trace_event(&call->event);
29273+ goto out;
29274+}
29275+
29276+static int unregister_synth_event(struct synth_event *event)
29277+{
29278+ struct trace_event_call *call = &event->call;
29279+ int ret;
29280+
29281+ ret = trace_remove_event_call(call);
29282+
29283+ return ret;
29284+}
29285+
29286+static void free_synth_event(struct synth_event *event)
29287+{
29288+ unsigned int i;
29289+
29290+ if (!event)
29291+ return;
29292+
29293+ for (i = 0; i < event->n_fields; i++)
29294+ free_synth_field(event->fields[i]);
29295+
29296+ kfree(event->fields);
29297+ kfree(event->name);
29298+ kfree(event->class.system);
29299+ free_synth_tracepoint(event->tp);
29300+ free_synth_event_print_fmt(&event->call);
29301+ kfree(event);
29302+}
29303+
29304+static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
29305+ struct synth_field **fields)
29306+{
29307+ struct synth_event *event;
29308+ unsigned int i;
1a6e0f06 29309+
e4b2b4a8
JK
29310+ event = kzalloc(sizeof(*event), GFP_KERNEL);
29311+ if (!event) {
29312+ event = ERR_PTR(-ENOMEM);
29313+ goto out;
29314+ }
1a6e0f06 29315+
e4b2b4a8
JK
29316+ event->name = kstrdup(event_name, GFP_KERNEL);
29317+ if (!event->name) {
29318+ kfree(event);
29319+ event = ERR_PTR(-ENOMEM);
29320+ goto out;
29321+ }
1a6e0f06 29322+
e4b2b4a8
JK
29323+ event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
29324+ if (!event->fields) {
29325+ free_synth_event(event);
29326+ event = ERR_PTR(-ENOMEM);
29327+ goto out;
29328+ }
1a6e0f06 29329+
e4b2b4a8
JK
29330+ for (i = 0; i < n_fields; i++)
29331+ event->fields[i] = fields[i];
1a6e0f06 29332+
e4b2b4a8
JK
29333+ event->n_fields = n_fields;
29334+ out:
29335+ return event;
29336+}
1a6e0f06 29337+
e4b2b4a8
JK
29338+static void action_trace(struct hist_trigger_data *hist_data,
29339+ struct tracing_map_elt *elt, void *rec,
29340+ struct ring_buffer_event *rbe,
29341+ struct action_data *data, u64 *var_ref_vals)
1a6e0f06 29342+{
e4b2b4a8 29343+ struct synth_event *event = data->onmatch.synth_event;
1a6e0f06 29344+
e4b2b4a8
JK
29345+ trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
29346+}
1a6e0f06 29347+
e4b2b4a8
JK
29348+struct hist_var_data {
29349+ struct list_head list;
29350+ struct hist_trigger_data *hist_data;
29351+};
1a6e0f06 29352+
e4b2b4a8
JK
29353+static void add_or_delete_synth_event(struct synth_event *event, int delete)
29354+{
29355+ if (delete)
29356+ free_synth_event(event);
29357+ else {
29358+ mutex_lock(&synth_event_mutex);
29359+ if (!find_synth_event(event->name))
29360+ list_add(&event->list, &synth_event_list);
29361+ else
29362+ free_synth_event(event);
29363+ mutex_unlock(&synth_event_mutex);
29364+ }
1a6e0f06
JK
29365+}
29366+
e4b2b4a8 29367+static int create_synth_event(int argc, char **argv)
1a6e0f06 29368+{
e4b2b4a8
JK
29369+ struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
29370+ struct synth_event *event = NULL;
29371+ bool delete_event = false;
29372+ int i, n_fields = 0, ret = 0;
29373+ char *name;
1a6e0f06 29374+
e4b2b4a8 29375+ mutex_lock(&synth_event_mutex);
1a6e0f06 29376+
e4b2b4a8
JK
29377+ /*
29378+ * Argument syntax:
29379+ * - Add synthetic event: <event_name> field[;field] ...
29380+ * - Remove synthetic event: !<event_name> field[;field] ...
29381+ * where 'field' = type field_name
29382+ */
29383+ if (argc < 1) {
29384+ ret = -EINVAL;
29385+ goto out;
29386+ }
1a6e0f06 29387+
e4b2b4a8
JK
29388+ name = argv[0];
29389+ if (name[0] == '!') {
29390+ delete_event = true;
29391+ name++;
29392+ }
1a6e0f06 29393+
e4b2b4a8
JK
29394+ event = find_synth_event(name);
29395+ if (event) {
29396+ if (delete_event) {
29397+ if (event->ref) {
29398+ event = NULL;
29399+ ret = -EBUSY;
29400+ goto out;
29401+ }
29402+ list_del(&event->list);
29403+ goto out;
1a6e0f06 29404+ }
e4b2b4a8
JK
29405+ event = NULL;
29406+ ret = -EEXIST;
29407+ goto out;
29408+ } else if (delete_event)
29409+ goto out;
29410+
29411+ if (argc < 2) {
29412+ ret = -EINVAL;
29413+ goto out;
1a6e0f06 29414+ }
1a6e0f06 29415+
e4b2b4a8
JK
29416+ for (i = 1; i < argc - 1; i++) {
29417+ if (strcmp(argv[i], ";") == 0)
29418+ continue;
29419+ if (n_fields == SYNTH_FIELDS_MAX) {
29420+ ret = -EINVAL;
29421+ goto err;
29422+ }
1a6e0f06 29423+
e4b2b4a8
JK
29424+ field = parse_synth_field(argv[i], argv[i + 1]);
29425+ if (IS_ERR(field)) {
29426+ ret = PTR_ERR(field);
29427+ goto err;
29428+ }
29429+ fields[n_fields] = field;
29430+ i++; n_fields++;
29431+ }
1a6e0f06 29432+
e4b2b4a8
JK
29433+ if (i < argc) {
29434+ ret = -EINVAL;
29435+ goto err;
29436+ }
1a6e0f06 29437+
e4b2b4a8
JK
29438+ event = alloc_synth_event(name, n_fields, fields);
29439+ if (IS_ERR(event)) {
29440+ ret = PTR_ERR(event);
29441+ event = NULL;
29442+ goto err;
1a6e0f06 29443+ }
e4b2b4a8
JK
29444+ out:
29445+ mutex_unlock(&synth_event_mutex);
1a6e0f06 29446+
e4b2b4a8
JK
29447+ if (event) {
29448+ if (delete_event) {
29449+ ret = unregister_synth_event(event);
29450+ add_or_delete_synth_event(event, !ret);
29451+ } else {
29452+ ret = register_synth_event(event);
29453+ add_or_delete_synth_event(event, ret);
29454+ }
29455+ }
29456+
29457+ return ret;
29458+ err:
29459+ mutex_unlock(&synth_event_mutex);
29460+
29461+ for (i = 0; i < n_fields; i++)
29462+ free_synth_field(fields[i]);
29463+ free_synth_event(event);
29464+
29465+ return ret;
1a6e0f06
JK
29466+}
29467+
e4b2b4a8 29468+static int release_all_synth_events(void)
1a6e0f06 29469+{
e4b2b4a8
JK
29470+ struct list_head release_events;
29471+ struct synth_event *event, *e;
29472+ int ret = 0;
1a6e0f06 29473+
e4b2b4a8
JK
29474+ INIT_LIST_HEAD(&release_events);
29475+
29476+ mutex_lock(&synth_event_mutex);
29477+
29478+ list_for_each_entry(event, &synth_event_list, list) {
29479+ if (event->ref) {
29480+ mutex_unlock(&synth_event_mutex);
29481+ return -EBUSY;
29482+ }
29483+ }
29484+
29485+ list_splice_init(&event->list, &release_events);
29486+
29487+ mutex_unlock(&synth_event_mutex);
29488+
29489+ list_for_each_entry_safe(event, e, &release_events, list) {
29490+ list_del(&event->list);
29491+
29492+ ret = unregister_synth_event(event);
29493+ add_or_delete_synth_event(event, !ret);
29494+ }
29495+
29496+ return ret;
1a6e0f06
JK
29497+}
29498+
e4b2b4a8
JK
29499+
29500+static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
1a6e0f06 29501+{
e4b2b4a8 29502+ mutex_lock(&synth_event_mutex);
1a6e0f06 29503+
e4b2b4a8
JK
29504+ return seq_list_start(&synth_event_list, *pos);
29505+}
1a6e0f06 29506+
e4b2b4a8
JK
29507+static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
29508+{
29509+ return seq_list_next(v, &synth_event_list, pos);
29510+}
1a6e0f06 29511+
e4b2b4a8
JK
29512+static void synth_events_seq_stop(struct seq_file *m, void *v)
29513+{
29514+ mutex_unlock(&synth_event_mutex);
1a6e0f06 29515+}
1a6e0f06 29516+
e4b2b4a8 29517+static int synth_events_seq_show(struct seq_file *m, void *v)
1a6e0f06 29518+{
e4b2b4a8
JK
29519+ struct synth_field *field;
29520+ struct synth_event *event = v;
29521+ unsigned int i;
1a6e0f06 29522+
e4b2b4a8 29523+ seq_printf(m, "%s\t", event->name);
1a6e0f06 29524+
e4b2b4a8
JK
29525+ for (i = 0; i < event->n_fields; i++) {
29526+ field = event->fields[i];
29527+
29528+ /* parameter values */
29529+ seq_printf(m, "%s %s%s", field->type, field->name,
29530+ i == event->n_fields - 1 ? "" : "; ");
1a6e0f06
JK
29531+ }
29532+
e4b2b4a8 29533+ seq_putc(m, '\n');
1a6e0f06
JK
29534+
29535+ return 0;
29536+}
1a6e0f06 29537+
e4b2b4a8
JK
29538+static const struct seq_operations synth_events_seq_op = {
29539+ .start = synth_events_seq_start,
29540+ .next = synth_events_seq_next,
29541+ .stop = synth_events_seq_stop,
29542+ .show = synth_events_seq_show
29543+};
29544+
29545+static int synth_events_open(struct inode *inode, struct file *file)
1a6e0f06 29546+{
e4b2b4a8 29547+ int ret;
1a6e0f06 29548+
e4b2b4a8
JK
29549+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
29550+ ret = release_all_synth_events();
29551+ if (ret < 0)
29552+ return ret;
29553+ }
1a6e0f06 29554+
e4b2b4a8 29555+ return seq_open(file, &synth_events_seq_op);
1a6e0f06 29556+}
e4b2b4a8
JK
29557+
29558+static ssize_t synth_events_write(struct file *file,
29559+ const char __user *buffer,
29560+ size_t count, loff_t *ppos)
1a6e0f06 29561+{
e4b2b4a8
JK
29562+ return trace_parse_run_command(file, buffer, count, ppos,
29563+ create_synth_event);
29564+}
1a6e0f06 29565+
e4b2b4a8
JK
29566+static const struct file_operations synth_events_fops = {
29567+ .open = synth_events_open,
29568+ .write = synth_events_write,
29569+ .read = seq_read,
29570+ .llseek = seq_lseek,
29571+ .release = seq_release,
29572+};
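/*
 * Editor's note (illustration only, not part of the patch): these fops
 * back the synthetic-event control file created elsewhere in this patch
 * (conventionally 'synthetic_events' under the tracefs mount; the path
 * below is the usual one, not taken from this hunk).  Each written line
 * is handed to create_synth_event() via trace_parse_run_command():
 *
 *   # define a synthetic event with a u64, a pid_t and a string field
 *   echo 'wakeup_latency u64 lat; pid_t pid; char comm[16]' \
 *       >> /sys/kernel/debug/tracing/synthetic_events
 *
 *   # remove it again ('!' prefix; only allowed while its ref count is 0)
 *   echo '!wakeup_latency' >> /sys/kernel/debug/tracing/synthetic_events
 *
 * Opening the file with O_TRUNC (e.g. '>' instead of '>>') releases all
 * synthetic events that are not currently in use, via
 * release_all_synth_events() above.
 */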
29573+
29574+static u64 hist_field_timestamp(struct hist_field *hist_field,
29575+ struct tracing_map_elt *elt,
29576+ struct ring_buffer_event *rbe,
29577+ void *event)
29578+{
29579+ struct hist_trigger_data *hist_data = hist_field->hist_data;
29580+ struct trace_array *tr = hist_data->event_file->tr;
29581+
29582+ u64 ts = ring_buffer_event_time_stamp(rbe);
29583+
29584+ if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
29585+ ts = ns2usecs(ts);
29586+
29587+ return ts;
1a6e0f06
JK
29588+}
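/*
 * Editor's note (illustration only, not part of the patch): this helper
 * is what a 'common_timestamp' reference in a hist trigger resolves to;
 * with the '.usecs' modifier (attrs->ts_in_usecs) the ring-buffer
 * timestamp is converted from ns to us.  The typical use is latency
 * measurement between two events (names below are examples):
 *
 *   # save a per-key timestamp variable on wakeup
 *   echo 'hist:keys=pid:ts0=common_timestamp.usecs' \
 *       >> events/sched/sched_wakeup/trigger
 *
 *   # compute the latency from it on the matching sched_switch
 *   echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' \
 *       >> events/sched/sched_switch/trigger
 */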
29589+
e4b2b4a8
JK
29590+static u64 hist_field_cpu(struct hist_field *hist_field,
29591+ struct tracing_map_elt *elt,
29592+ struct ring_buffer_event *rbe,
29593+ void *event)
1a6e0f06 29594+{
e4b2b4a8
JK
29595+ int cpu = smp_processor_id();
29596+
29597+ return cpu;
1a6e0f06
JK
29598+}
29599+
e4b2b4a8
JK
29600+static struct hist_field *
29601+check_field_for_var_ref(struct hist_field *hist_field,
29602+ struct hist_trigger_data *var_data,
29603+ unsigned int var_idx)
1a6e0f06 29604+{
e4b2b4a8
JK
29605+ struct hist_field *found = NULL;
29606+
29607+ if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
29608+ if (hist_field->var.idx == var_idx &&
29609+ hist_field->var.hist_data == var_data) {
29610+ found = hist_field;
29611+ }
29612+ }
29613+
29614+ return found;
1a6e0f06
JK
29615+}
29616+
e4b2b4a8
JK
29617+static struct hist_field *
29618+check_field_for_var_refs(struct hist_trigger_data *hist_data,
29619+ struct hist_field *hist_field,
29620+ struct hist_trigger_data *var_data,
29621+ unsigned int var_idx,
29622+ unsigned int level)
29623+{
29624+ struct hist_field *found = NULL;
29625+ unsigned int i;
29626+
29627+ if (level > 3)
29628+ return found;
29629+
29630+ if (!hist_field)
29631+ return found;
29632+
29633+ found = check_field_for_var_ref(hist_field, var_data, var_idx);
29634+ if (found)
29635+ return found;
29636+
29637+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
29638+ struct hist_field *operand;
29639+
29640+ operand = hist_field->operands[i];
29641+ found = check_field_for_var_refs(hist_data, operand, var_data,
29642+ var_idx, level + 1);
29643+ if (found)
29644+ return found;
29645+ }
29646+
29647+ return found;
29648+}
29649+
29650+static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
29651+ struct hist_trigger_data *var_data,
29652+ unsigned int var_idx)
29653+{
29654+ struct hist_field *hist_field, *found = NULL;
29655+ unsigned int i;
29656+
29657+ for_each_hist_field(i, hist_data) {
29658+ hist_field = hist_data->fields[i];
29659+ found = check_field_for_var_refs(hist_data, hist_field,
29660+ var_data, var_idx, 0);
29661+ if (found)
29662+ return found;
29663+ }
29664+
29665+ for (i = 0; i < hist_data->n_synth_var_refs; i++) {
29666+ hist_field = hist_data->synth_var_refs[i];
29667+ found = check_field_for_var_refs(hist_data, hist_field,
29668+ var_data, var_idx, 0);
29669+ if (found)
29670+ return found;
29671+ }
29672+
29673+ return found;
29674+}
29675+
29676+static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
29677+ unsigned int var_idx)
1a6e0f06 29678+{
e4b2b4a8
JK
29679+ struct trace_array *tr = hist_data->event_file->tr;
29680+ struct hist_field *found = NULL;
29681+ struct hist_var_data *var_data;
1a6e0f06 29682+
e4b2b4a8
JK
29683+ list_for_each_entry(var_data, &tr->hist_vars, list) {
29684+ if (var_data->hist_data == hist_data)
29685+ continue;
29686+ found = find_var_ref(var_data->hist_data, hist_data, var_idx);
29687+ if (found)
29688+ break;
29689+ }
1a6e0f06 29690+
e4b2b4a8 29691+ return found;
1a6e0f06
JK
29692+}
29693+
e4b2b4a8 29694+static bool check_var_refs(struct hist_trigger_data *hist_data)
1a6e0f06 29695+{
e4b2b4a8
JK
29696+ struct hist_field *field;
29697+ bool found = false;
29698+ int i;
1a6e0f06 29699+
e4b2b4a8
JK
29700+ for_each_hist_field(i, hist_data) {
29701+ field = hist_data->fields[i];
29702+ if (field && field->flags & HIST_FIELD_FL_VAR) {
29703+ if (find_any_var_ref(hist_data, field->var.idx)) {
29704+ found = true;
29705+ break;
29706+ }
29707+ }
29708+ }
1a6e0f06 29709+
e4b2b4a8 29710+ return found;
1a6e0f06
JK
29711+}
29712+
e4b2b4a8 29713+static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data)
1a6e0f06 29714+{
e4b2b4a8
JK
29715+ struct trace_array *tr = hist_data->event_file->tr;
29716+ struct hist_var_data *var_data, *found = NULL;
1a6e0f06 29717+
e4b2b4a8
JK
29718+ list_for_each_entry(var_data, &tr->hist_vars, list) {
29719+ if (var_data->hist_data == hist_data) {
29720+ found = var_data;
29721+ break;
1a6e0f06 29722+ }
e4b2b4a8 29723+ }
1a6e0f06 29724+
e4b2b4a8
JK
29725+ return found;
29726+}
29727+
29728+static bool field_has_hist_vars(struct hist_field *hist_field,
29729+ unsigned int level)
29730+{
29731+ int i;
29732+
29733+ if (level > 3)
29734+ return false;
29735+
29736+ if (!hist_field)
29737+ return false;
29738+
29739+ if (hist_field->flags & HIST_FIELD_FL_VAR ||
29740+ hist_field->flags & HIST_FIELD_FL_VAR_REF)
29741+ return true;
29742+
29743+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
29744+ struct hist_field *operand;
29745+
29746+ operand = hist_field->operands[i];
29747+ if (field_has_hist_vars(operand, level + 1))
29748+ return true;
1a6e0f06 29749+ }
e4b2b4a8
JK
29750+
29751+ return false;
1a6e0f06
JK
29752+}
29753+
e4b2b4a8
JK
29754+static bool has_hist_vars(struct hist_trigger_data *hist_data)
29755+{
29756+ struct hist_field *hist_field;
29757+ int i;
1a6e0f06 29758+
e4b2b4a8
JK
29759+ for_each_hist_field(i, hist_data) {
29760+ hist_field = hist_data->fields[i];
29761+ if (field_has_hist_vars(hist_field, 0))
29762+ return true;
29763+ }
1a6e0f06 29764+
e4b2b4a8
JK
29765+ return false;
29766+}
1a6e0f06 29767+
e4b2b4a8 29768+static int save_hist_vars(struct hist_trigger_data *hist_data)
1a6e0f06 29769+{
e4b2b4a8
JK
29770+ struct trace_array *tr = hist_data->event_file->tr;
29771+ struct hist_var_data *var_data;
1a6e0f06 29772+
e4b2b4a8
JK
29773+ var_data = find_hist_vars(hist_data);
29774+ if (var_data)
29775+ return 0;
29776+
29777+ if (trace_array_get(tr) < 0)
29778+ return -ENODEV;
29779+
29780+ var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
29781+ if (!var_data) {
29782+ trace_array_put(tr);
29783+ return -ENOMEM;
29784+ }
29785+
29786+ var_data->hist_data = hist_data;
29787+ list_add(&var_data->list, &tr->hist_vars);
29788+
29789+ return 0;
1a6e0f06
JK
29790+}
29791+
e4b2b4a8 29792+static void remove_hist_vars(struct hist_trigger_data *hist_data)
1a6e0f06 29793+{
e4b2b4a8
JK
29794+ struct trace_array *tr = hist_data->event_file->tr;
29795+ struct hist_var_data *var_data;
1a6e0f06 29796+
e4b2b4a8
JK
29797+ var_data = find_hist_vars(hist_data);
29798+ if (!var_data)
29799+ return;
29800+
29801+ if (WARN_ON(check_var_refs(hist_data)))
29802+ return;
29803+
29804+ list_del(&var_data->list);
29805+
29806+ kfree(var_data);
29807+
29808+ trace_array_put(tr);
1a6e0f06
JK
29809+}
29810+
e4b2b4a8
JK
29811+static struct hist_field *find_var_field(struct hist_trigger_data *hist_data,
29812+ const char *var_name)
1a6e0f06 29813+{
e4b2b4a8 29814+ struct hist_field *hist_field, *found = NULL;
1a6e0f06
JK
29815+ int i;
29816+
e4b2b4a8
JK
29817+ for_each_hist_field(i, hist_data) {
29818+ hist_field = hist_data->fields[i];
29819+ if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR &&
29820+ strcmp(hist_field->var.name, var_name) == 0) {
29821+ found = hist_field;
29822+ break;
29823+ }
29824+ }
1a6e0f06 29825+
e4b2b4a8
JK
29826+ return found;
29827+}
1a6e0f06 29828+
e4b2b4a8
JK
29829+static struct hist_field *find_var(struct hist_trigger_data *hist_data,
29830+ struct trace_event_file *file,
29831+ const char *var_name)
29832+{
29833+ struct hist_trigger_data *test_data;
29834+ struct event_trigger_data *test;
29835+ struct hist_field *hist_field;
29836+
29837+ hist_field = find_var_field(hist_data, var_name);
29838+ if (hist_field)
29839+ return hist_field;
29840+
29841+ list_for_each_entry_rcu(test, &file->triggers, list) {
29842+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
29843+ test_data = test->private_data;
29844+ hist_field = find_var_field(test_data, var_name);
29845+ if (hist_field)
29846+ return hist_field;
29847+ }
29848+ }
29849+
29850+ return NULL;
29851+}
29852+
29853+static struct trace_event_file *find_var_file(struct trace_array *tr,
29854+ char *system,
29855+ char *event_name,
29856+ char *var_name)
29857+{
29858+ struct hist_trigger_data *var_hist_data;
29859+ struct hist_var_data *var_data;
29860+ struct trace_event_file *file, *found = NULL;
29861+
29862+ if (system)
29863+ return find_event_file(tr, system, event_name);
29864+
29865+ list_for_each_entry(var_data, &tr->hist_vars, list) {
29866+ var_hist_data = var_data->hist_data;
29867+ file = var_hist_data->event_file;
29868+ if (file == found)
29869+ continue;
29870+
29871+ if (find_var_field(var_hist_data, var_name)) {
29872+ if (found) {
29873+ hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
29874+ return NULL;
1a6e0f06 29875+ }
e4b2b4a8
JK
29876+
29877+ found = file;
1a6e0f06
JK
29878+ }
29879+ }
29880+
e4b2b4a8
JK
29881+ return found;
29882+}
29883+
29884+static struct hist_field *find_file_var(struct trace_event_file *file,
29885+ const char *var_name)
29886+{
29887+ struct hist_trigger_data *test_data;
29888+ struct event_trigger_data *test;
29889+ struct hist_field *hist_field;
29890+
29891+ list_for_each_entry_rcu(test, &file->triggers, list) {
29892+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
29893+ test_data = test->private_data;
29894+ hist_field = find_var_field(test_data, var_name);
29895+ if (hist_field)
29896+ return hist_field;
29897+ }
1a6e0f06 29898+ }
e4b2b4a8
JK
29899+
29900+ return NULL;
1a6e0f06 29901+}
e4b2b4a8
JK
29902+
29903+static struct hist_field *
29904+find_match_var(struct hist_trigger_data *hist_data, char *var_name)
1a6e0f06 29905+{
e4b2b4a8
JK
29906+ struct trace_array *tr = hist_data->event_file->tr;
29907+ struct hist_field *hist_field, *found = NULL;
29908+ struct trace_event_file *file;
29909+ unsigned int i;
1a6e0f06 29910+
e4b2b4a8
JK
29911+ for (i = 0; i < hist_data->n_actions; i++) {
29912+ struct action_data *data = hist_data->actions[i];
29913+
29914+ if (data->fn == action_trace) {
29915+ char *system = data->onmatch.match_event_system;
29916+ char *event_name = data->onmatch.match_event;
29917+
29918+ file = find_var_file(tr, system, event_name, var_name);
29919+ if (!file)
29920+ continue;
29921+ hist_field = find_file_var(file, var_name);
29922+ if (hist_field) {
29923+ if (found) {
29924+ hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
29925+ return ERR_PTR(-EINVAL);
29926+ }
29927+
29928+ found = hist_field;
29929+ }
29930+ }
1a6e0f06 29931+ }
e4b2b4a8 29932+ return found;
1a6e0f06 29933+}
1a6e0f06 29934+
e4b2b4a8
JK
29935+static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
29936+ char *system,
29937+ char *event_name,
29938+ char *var_name)
29939+{
29940+ struct trace_array *tr = hist_data->event_file->tr;
29941+ struct hist_field *hist_field = NULL;
29942+ struct trace_event_file *file;
1a6e0f06 29943+
e4b2b4a8
JK
29944+ if (!system || !event_name) {
29945+ hist_field = find_match_var(hist_data, var_name);
29946+ if (IS_ERR(hist_field))
29947+ return NULL;
29948+ if (hist_field)
29949+ return hist_field;
29950+ }
29951+
29952+ file = find_var_file(tr, system, event_name, var_name);
29953+ if (!file)
29954+ return NULL;
29955+
29956+ hist_field = find_file_var(file, var_name);
29957+
29958+ return hist_field;
29959+}
29960+
29961+struct hist_elt_data {
29962+ char *comm;
29963+ u64 *var_ref_vals;
29964+ char *field_var_str[SYNTH_FIELDS_MAX];
b3bbd485
JK
29965+};
29966+
e4b2b4a8
JK
29967+static u64 hist_field_var_ref(struct hist_field *hist_field,
29968+ struct tracing_map_elt *elt,
29969+ struct ring_buffer_event *rbe,
29970+ void *event)
1a6e0f06 29971+{
e4b2b4a8
JK
29972+ struct hist_elt_data *elt_data;
29973+ u64 var_val = 0;
1a6e0f06 29974+
e4b2b4a8
JK
29975+ elt_data = elt->private_data;
29976+ var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
29977+
29978+ return var_val;
1a6e0f06 29979+}
1a6e0f06 29980+
e4b2b4a8
JK
29981+static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key,
29982+ u64 *var_ref_vals, bool self)
1a6e0f06 29983+{
e4b2b4a8
JK
29984+ struct hist_trigger_data *var_data;
29985+ struct tracing_map_elt *var_elt;
29986+ struct hist_field *hist_field;
29987+ unsigned int i, var_idx;
29988+ bool resolved = true;
29989+ u64 var_val = 0;
1a6e0f06 29990+
e4b2b4a8
JK
29991+ for (i = 0; i < hist_data->n_var_refs; i++) {
29992+ hist_field = hist_data->var_refs[i];
29993+ var_idx = hist_field->var.idx;
29994+ var_data = hist_field->var.hist_data;
1a6e0f06 29995+
e4b2b4a8
JK
29996+ if (var_data == NULL) {
29997+ resolved = false;
29998+ break;
29999+ }
1a6e0f06 30000+
e4b2b4a8
JK
30001+ if ((self && var_data != hist_data) ||
30002+ (!self && var_data == hist_data))
30003+ continue;
30004+
30005+ var_elt = tracing_map_lookup(var_data->map, key);
30006+ if (!var_elt) {
30007+ resolved = false;
30008+ break;
30009+ }
30010+
30011+ if (!tracing_map_var_set(var_elt, var_idx)) {
30012+ resolved = false;
30013+ break;
30014+ }
30015+
30016+ if (self || !hist_field->read_once)
30017+ var_val = tracing_map_read_var(var_elt, var_idx);
30018+ else
30019+ var_val = tracing_map_read_var_once(var_elt, var_idx);
30020+
30021+ var_ref_vals[i] = var_val;
1a6e0f06 30022+ }
e4b2b4a8
JK
30023+
30024+ return resolved;
1a6e0f06
JK
30025+}
30026+
e4b2b4a8
JK
30027+static const char *hist_field_name(struct hist_field *field,
30028+ unsigned int level)
1a6e0f06 30029+{
e4b2b4a8
JK
30030+ const char *field_name = "";
30031+
30032+ if (level > 1)
30033+ return field_name;
30034+
30035+ if (field->field)
30036+ field_name = field->field->name;
30037+ else if (field->flags & HIST_FIELD_FL_LOG2 ||
30038+ field->flags & HIST_FIELD_FL_ALIAS)
30039+ field_name = hist_field_name(field->operands[0], ++level);
30040+ else if (field->flags & HIST_FIELD_FL_CPU)
30041+ field_name = "cpu";
30042+ else if (field->flags & HIST_FIELD_FL_EXPR ||
30043+ field->flags & HIST_FIELD_FL_VAR_REF) {
30044+ if (field->system) {
30045+ static char full_name[MAX_FILTER_STR_VAL];
30046+
30047+ strcat(full_name, field->system);
30048+ strcat(full_name, ".");
30049+ strcat(full_name, field->event_name);
30050+ strcat(full_name, ".");
30051+ strcat(full_name, field->name);
30052+ field_name = full_name;
30053+ } else
30054+ field_name = field->name;
30055+ } else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
30056+ field_name = "common_timestamp";
30057+
30058+ if (field_name == NULL)
30059+ field_name = "";
30060+
30061+ return field_name;
1a6e0f06
JK
30062+}
30063+
b3bbd485
JK
30064+static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
30065+{
30066+ hist_field_fn_t fn = NULL;
30067+
30068+ switch (field_size) {
30069+ case 8:
30070+ if (field_is_signed)
30071+ fn = hist_field_s64;
30072+ else
30073+ fn = hist_field_u64;
30074+ break;
30075+ case 4:
30076+ if (field_is_signed)
30077+ fn = hist_field_s32;
30078+ else
30079+ fn = hist_field_u32;
30080+ break;
30081+ case 2:
30082+ if (field_is_signed)
30083+ fn = hist_field_s16;
30084+ else
30085+ fn = hist_field_u16;
30086+ break;
30087+ case 1:
30088+ if (field_is_signed)
30089+ fn = hist_field_s8;
30090+ else
30091+ fn = hist_field_u8;
30092+ break;
30093+ }
30094+
30095+ return fn;
30096+}
30097+
30098+static int parse_map_size(char *str)
30099+{
30100+ unsigned long size, map_bits;
30101+ int ret;
30102+
30103+ strsep(&str, "=");
30104+ if (!str) {
30105+ ret = -EINVAL;
30106+ goto out;
30107+ }
30108+
30109+ ret = kstrtoul(str, 0, &size);
30110+ if (ret)
30111+ goto out;
30112+
30113+ map_bits = ilog2(roundup_pow_of_two(size));
30114+ if (map_bits < TRACING_MAP_BITS_MIN ||
30115+ map_bits > TRACING_MAP_BITS_MAX)
30116+ ret = -EINVAL;
30117+ else
30118+ ret = map_bits;
30119+ out:
30120+ return ret;
30121+}
30122+
30123+static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
30124+{
e4b2b4a8 30125+ unsigned int i;
1a6e0f06 30126+
b3bbd485
JK
30127+ if (!attrs)
30128+ return;
30129+
e4b2b4a8
JK
30130+ for (i = 0; i < attrs->n_assignments; i++)
30131+ kfree(attrs->assignment_str[i]);
1a6e0f06 30132+
e4b2b4a8
JK
30133+ for (i = 0; i < attrs->n_actions; i++)
30134+ kfree(attrs->action_str[i]);
1a6e0f06 30135+
b3bbd485
JK
30136+ kfree(attrs->name);
30137+ kfree(attrs->sort_key_str);
30138+ kfree(attrs->keys_str);
30139+ kfree(attrs->vals_str);
e4b2b4a8 30140+ kfree(attrs->clock);
b3bbd485
JK
30141+ kfree(attrs);
30142+}
30143+
e4b2b4a8
JK
30144+static int parse_action(char *str, struct hist_trigger_attrs *attrs)
30145+{
30146+ int ret = -EINVAL;
1a6e0f06 30147+
e4b2b4a8
JK
30148+ if (attrs->n_actions >= HIST_ACTIONS_MAX)
30149+ return ret;
1a6e0f06 30150+
e4b2b4a8
JK
30151+ if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
30152+ (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
30153+ attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
30154+ if (!attrs->action_str[attrs->n_actions]) {
30155+ ret = -ENOMEM;
30156+ return ret;
30157+ }
30158+ attrs->n_actions++;
30159+ ret = 0;
1a6e0f06
JK
30160+ }
30161+
e4b2b4a8 30162+ return ret;
1a6e0f06
JK
30163+}
30164+
e4b2b4a8 30165+static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
1a6e0f06 30166+{
e4b2b4a8
JK
30167+ int ret = 0;
30168+
30169+ if ((strncmp(str, "key=", strlen("key=")) == 0) ||
30170+ (strncmp(str, "keys=", strlen("keys=")) == 0)) {
30171+ attrs->keys_str = kstrdup(str, GFP_KERNEL);
30172+ if (!attrs->keys_str) {
30173+ ret = -ENOMEM;
30174+ goto out;
30175+ }
30176+ } else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
30177+ (strncmp(str, "vals=", strlen("vals=")) == 0) ||
30178+ (strncmp(str, "values=", strlen("values=")) == 0)) {
30179+ attrs->vals_str = kstrdup(str, GFP_KERNEL);
30180+ if (!attrs->vals_str) {
30181+ ret = -ENOMEM;
30182+ goto out;
30183+ }
30184+ } else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
30185+ attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
30186+ if (!attrs->sort_key_str) {
30187+ ret = -ENOMEM;
30188+ goto out;
30189+ }
30190+ } else if (strncmp(str, "name=", strlen("name=")) == 0) {
30191+ attrs->name = kstrdup(str, GFP_KERNEL);
30192+ if (!attrs->name) {
30193+ ret = -ENOMEM;
30194+ goto out;
30195+ }
30196+ } else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
30197+ strsep(&str, "=");
30198+ if (!str) {
30199+ ret = -EINVAL;
30200+ goto out;
30201+ }
30202+
30203+ str = strstrip(str);
30204+ attrs->clock = kstrdup(str, GFP_KERNEL);
30205+ if (!attrs->clock) {
30206+ ret = -ENOMEM;
30207+ goto out;
30208+ }
30209+ } else if (strncmp(str, "size=", strlen("size=")) == 0) {
30210+ int map_bits = parse_map_size(str);
30211+
30212+ if (map_bits < 0) {
30213+ ret = map_bits;
30214+ goto out;
30215+ }
30216+ attrs->map_bits = map_bits;
30217+ } else {
30218+ char *assignment;
30219+
30220+ if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
30221+ hist_err("Too many variables defined: ", str);
30222+ ret = -EINVAL;
30223+ goto out;
30224+ }
30225+
30226+ assignment = kstrdup(str, GFP_KERNEL);
30227+ if (!assignment) {
30228+ ret = -ENOMEM;
30229+ goto out;
30230+ }
30231+
30232+ attrs->assignment_str[attrs->n_assignments++] = assignment;
1a6e0f06 30233+ }
e4b2b4a8
JK
30234+ out:
30235+ return ret;
1a6e0f06
JK
30236+}
30237+
b3bbd485
JK
30238+static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
30239+{
30240+ struct hist_trigger_attrs *attrs;
30241+ int ret = 0;
30242+
30243+ attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
30244+ if (!attrs)
30245+ return ERR_PTR(-ENOMEM);
30246+
30247+ while (trigger_str) {
30248+ char *str = strsep(&trigger_str, ":");
30249+
e4b2b4a8
JK
30250+ if (strchr(str, '=')) {
30251+ ret = parse_assignment(str, attrs);
30252+ if (ret)
30253+ goto free;
30254+ } else if (strcmp(str, "pause") == 0)
b3bbd485
JK
30255+ attrs->pause = true;
30256+ else if ((strcmp(str, "cont") == 0) ||
30257+ (strcmp(str, "continue") == 0))
30258+ attrs->cont = true;
30259+ else if (strcmp(str, "clear") == 0)
30260+ attrs->clear = true;
e4b2b4a8
JK
30261+ else {
30262+ ret = parse_action(str, attrs);
30263+ if (ret)
b3bbd485
JK
30264+ goto free;
30265+ }
30266+ }
30267+
30268+ if (!attrs->keys_str) {
30269+ ret = -EINVAL;
30270+ goto free;
30271+ }
30272+
e4b2b4a8
JK
30273+ if (!attrs->clock) {
30274+ attrs->clock = kstrdup("global", GFP_KERNEL);
30275+ if (!attrs->clock) {
30276+ ret = -ENOMEM;
30277+ goto free;
30278+ }
30279+ }
30280+
b3bbd485
JK
30281+ return attrs;
30282+ free:
30283+ destroy_hist_trigger_attrs(attrs);
30284+
30285+ return ERR_PTR(ret);
30286+}
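/*
 * Editor's note (illustration only, not part of the patch): this parser
 * splits the trigger string on ':' and routes each token: anything
 * containing '=' goes to parse_assignment() (keys=, vals=, sort=, name=,
 * clock=, size=, or a variable definition such as
 * ts0=common_timestamp.usecs), the bare keywords pause/cont/clear set
 * the corresponding flags, and everything else is tried as an action
 * via parse_action().  A representative command (field names are
 * examples):
 *
 *   echo 'hist:keys=pid:vals=hitcount:sort=hitcount:size=2048:clock=global:name=foo' \
 *       >> events/sched/sched_wakeup/trigger
 */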
30287+
30288+static inline void save_comm(char *comm, struct task_struct *task)
30289+{
30290+ if (!task->pid) {
30291+ strcpy(comm, "<idle>");
30292+ return;
30293+ }
30294+
30295+ if (WARN_ON_ONCE(task->pid < 0)) {
30296+ strcpy(comm, "<XXX>");
30297+ return;
30298+ }
30299+
30300+ memcpy(comm, task->comm, TASK_COMM_LEN);
30301+}
30302+
e4b2b4a8 30303+static void hist_elt_data_free(struct hist_elt_data *elt_data)
b3bbd485 30304+{
e4b2b4a8
JK
30305+ unsigned int i;
30306+
30307+ for (i = 0; i < SYNTH_FIELDS_MAX; i++)
30308+ kfree(elt_data->field_var_str[i]);
30309+
30310+ kfree(elt_data->comm);
30311+ kfree(elt_data);
b3bbd485
JK
30312+}
30313+
e4b2b4a8 30314+static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
1a6e0f06 30315+{
e4b2b4a8 30316+ struct hist_elt_data *elt_data = elt->private_data;
1a6e0f06 30317+
e4b2b4a8 30318+ hist_elt_data_free(elt_data);
1a6e0f06
JK
30319+}
30320+
e4b2b4a8 30321+static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
b3bbd485
JK
30322+{
30323+ struct hist_trigger_data *hist_data = elt->map->private_data;
e4b2b4a8
JK
30324+ unsigned int size = TASK_COMM_LEN;
30325+ struct hist_elt_data *elt_data;
b3bbd485 30326+ struct hist_field *key_field;
e4b2b4a8 30327+ unsigned int i, n_str;
1a6e0f06 30328+
e4b2b4a8
JK
30329+ elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
30330+ if (!elt_data)
30331+ return -ENOMEM;
b3bbd485
JK
30332+
30333+ for_each_hist_key_field(i, hist_data) {
30334+ key_field = hist_data->fields[i];
30335+
30336+ if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
e4b2b4a8
JK
30337+ elt_data->comm = kzalloc(size, GFP_KERNEL);
30338+ if (!elt_data->comm) {
30339+ kfree(elt_data);
b3bbd485 30340+ return -ENOMEM;
e4b2b4a8 30341+ }
b3bbd485
JK
30342+ break;
30343+ }
30344+ }
30345+
e4b2b4a8
JK
30346+ n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
30347+
30348+ size = STR_VAR_LEN_MAX;
1a6e0f06 30349+
e4b2b4a8
JK
30350+ for (i = 0; i < n_str; i++) {
30351+ elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
30352+ if (!elt_data->field_var_str[i]) {
30353+ hist_elt_data_free(elt_data);
30354+ return -ENOMEM;
30355+ }
30356+ }
1a6e0f06 30357+
e4b2b4a8 30358+ elt->private_data = elt_data;
1a6e0f06 30359+
b3bbd485
JK
30360+ return 0;
30361+}
30362+
e4b2b4a8 30363+static void hist_trigger_elt_data_init(struct tracing_map_elt *elt)
b3bbd485 30364+{
e4b2b4a8 30365+ struct hist_elt_data *elt_data = elt->private_data;
b3bbd485 30366+
e4b2b4a8
JK
30367+ if (elt_data->comm)
30368+ save_comm(elt_data->comm, current);
b3bbd485
JK
30369+}
30370+
e4b2b4a8
JK
30371+static const struct tracing_map_ops hist_trigger_elt_data_ops = {
30372+ .elt_alloc = hist_trigger_elt_data_alloc,
30373+ .elt_free = hist_trigger_elt_data_free,
30374+ .elt_init = hist_trigger_elt_data_init,
30375+};
30376+
30377+static const char *get_hist_field_flags(struct hist_field *hist_field)
b3bbd485 30378+{
e4b2b4a8 30379+ const char *flags_str = NULL;
b3bbd485 30380+
e4b2b4a8
JK
30381+ if (hist_field->flags & HIST_FIELD_FL_HEX)
30382+ flags_str = "hex";
30383+ else if (hist_field->flags & HIST_FIELD_FL_SYM)
30384+ flags_str = "sym";
30385+ else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
30386+ flags_str = "sym-offset";
30387+ else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
30388+ flags_str = "execname";
30389+ else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
30390+ flags_str = "syscall";
30391+ else if (hist_field->flags & HIST_FIELD_FL_LOG2)
30392+ flags_str = "log2";
30393+ else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
30394+ flags_str = "usecs";
30395+
30396+ return flags_str;
b3bbd485
JK
30397+}
30398+
e4b2b4a8 30399+static void expr_field_str(struct hist_field *field, char *expr)
1a6e0f06 30400+{
e4b2b4a8
JK
30401+ if (field->flags & HIST_FIELD_FL_VAR_REF)
30402+ strcat(expr, "$");
b3bbd485 30403+
e4b2b4a8 30404+ strcat(expr, hist_field_name(field, 0));
1a6e0f06 30405+
e4b2b4a8
JK
30406+ if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
30407+ const char *flags_str = get_hist_field_flags(field);
1a6e0f06 30408+
e4b2b4a8
JK
30409+ if (flags_str) {
30410+ strcat(expr, ".");
30411+ strcat(expr, flags_str);
30412+ }
30413+ }
1a6e0f06
JK
30414+}
30415+
e4b2b4a8 30416+static char *expr_str(struct hist_field *field, unsigned int level)
1a6e0f06 30417+{
e4b2b4a8 30418+ char *expr;
1a6e0f06 30419+
e4b2b4a8
JK
30420+ if (level > 1)
30421+ return NULL;
1a6e0f06 30422+
e4b2b4a8
JK
30423+ expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
30424+ if (!expr)
30425+ return NULL;
1a6e0f06 30426+
e4b2b4a8
JK
30427+ if (!field->operands[0]) {
30428+ expr_field_str(field, expr);
30429+ return expr;
30430+ }
1a6e0f06 30431+
e4b2b4a8
JK
30432+ if (field->operator == FIELD_OP_UNARY_MINUS) {
30433+ char *subexpr;
1a6e0f06 30434+
e4b2b4a8
JK
30435+ strcat(expr, "-(");
30436+ subexpr = expr_str(field->operands[0], ++level);
30437+ if (!subexpr) {
30438+ kfree(expr);
30439+ return NULL;
1a6e0f06 30440+ }
e4b2b4a8
JK
30441+ strcat(expr, subexpr);
30442+ strcat(expr, ")");
1a6e0f06 30443+
e4b2b4a8 30444+ kfree(subexpr);
1a6e0f06 30445+
e4b2b4a8
JK
30446+ return expr;
30447+ }
1a6e0f06 30448+
e4b2b4a8 30449+ expr_field_str(field->operands[0], expr);
1a6e0f06 30450+
e4b2b4a8
JK
30451+ switch (field->operator) {
30452+ case FIELD_OP_MINUS:
30453+ strcat(expr, "-");
30454+ break;
30455+ case FIELD_OP_PLUS:
30456+ strcat(expr, "+");
30457+ break;
30458+ default:
30459+ kfree(expr);
30460+ return NULL;
30461+ }
1a6e0f06 30462+
e4b2b4a8 30463+ expr_field_str(field->operands[1], expr);
1a6e0f06 30464+
e4b2b4a8 30465+ return expr;
1a6e0f06 30466+}
1a6e0f06 30467+
e4b2b4a8 30468+static int contains_operator(char *str)
1a6e0f06 30469+{
e4b2b4a8
JK
30470+ enum field_op_id field_op = FIELD_OP_NONE;
30471+ char *op;
1a6e0f06 30472+
e4b2b4a8
JK
30473+ op = strpbrk(str, "+-");
30474+ if (!op)
30475+ return FIELD_OP_NONE;
1a6e0f06 30476+
e4b2b4a8
JK
30477+ switch (*op) {
30478+ case '-':
30479+ if (*str == '-')
30480+ field_op = FIELD_OP_UNARY_MINUS;
30481+ else
30482+ field_op = FIELD_OP_MINUS;
30483+ break;
30484+ case '+':
30485+ field_op = FIELD_OP_PLUS;
30486+ break;
30487+ default:
30488+ break;
1a6e0f06 30489+ }
1a6e0f06 30490+
e4b2b4a8
JK
30491+ return field_op;
30492+}
1a6e0f06 30493+
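contains_operator() only distinguishes three shapes: no '+'/'-' at all, a string that starts with '-' (unary minus), and a binary '+' or '-'. The same strpbrk()-based check can be exercised standalone (illustrative sketch, made-up inputs):

#include <stdio.h>
#include <string.h>

enum op { OP_NONE, OP_PLUS, OP_MINUS, OP_UNARY_MINUS };

static enum op which_op(const char *str)
{
	const char *op = strpbrk(str, "+-");   /* first '+' or '-' in the string */

	if (!op)
		return OP_NONE;
	if (*op == '-')
		return (*str == '-') ? OP_UNARY_MINUS : OP_MINUS;
	return OP_PLUS;
}

int main(void)
{
	printf("%d\n", which_op("common_timestamp"));   /* OP_NONE        */
	printf("%d\n", which_op("-($lat)"));            /* OP_UNARY_MINUS */
	printf("%d\n", which_op("ts1-ts0"));            /* OP_MINUS       */
	printf("%d\n", which_op("a+b"));                /* OP_PLUS        */
	return 0;
}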
e4b2b4a8
JK
30494+static void destroy_hist_field(struct hist_field *hist_field,
30495+ unsigned int level)
b3bbd485 30496+{
e4b2b4a8 30497+ unsigned int i;
1a6e0f06 30498+
e4b2b4a8
JK
30499+ if (level > 3)
30500+ return;
1a6e0f06 30501+
e4b2b4a8 30502+ if (!hist_field)
1a6e0f06
JK
30503+ return;
30504+
e4b2b4a8
JK
30505+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
30506+ destroy_hist_field(hist_field->operands[i], level + 1);
1a6e0f06 30507+
e4b2b4a8
JK
30508+ kfree(hist_field->var.name);
30509+ kfree(hist_field->name);
30510+ kfree(hist_field->type);
1a6e0f06 30511+
b3bbd485
JK
30512+ kfree(hist_field);
30513+}
30514+
e4b2b4a8
JK
30515+static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
30516+ struct ftrace_event_field *field,
30517+ unsigned long flags,
30518+ char *var_name)
b3bbd485
JK
30519+{
30520+ struct hist_field *hist_field;
30521+
30522+ if (field && is_function_field(field))
30523+ return NULL;
30524+
30525+ hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
30526+ if (!hist_field)
30527+ return NULL;
30528+
e4b2b4a8 30529+ hist_field->hist_data = hist_data;
1a6e0f06 30530+
e4b2b4a8
JK
30531+ if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
30532+ goto out; /* caller will populate */
1a6e0f06 30533+
e4b2b4a8
JK
30534+ if (flags & HIST_FIELD_FL_VAR_REF) {
30535+ hist_field->fn = hist_field_var_ref;
30536+ goto out;
30537+ }
1a6e0f06 30538+
b3bbd485
JK
30539+ if (flags & HIST_FIELD_FL_HITCOUNT) {
30540+ hist_field->fn = hist_field_counter;
e4b2b4a8
JK
30541+ hist_field->size = sizeof(u64);
30542+ hist_field->type = kstrdup("u64", GFP_KERNEL);
30543+ if (!hist_field->type)
30544+ goto free;
b3bbd485
JK
30545+ goto out;
30546+ }
30547+
30548+ if (flags & HIST_FIELD_FL_STACKTRACE) {
30549+ hist_field->fn = hist_field_none;
30550+ goto out;
30551+ }
30552+
30553+ if (flags & HIST_FIELD_FL_LOG2) {
e4b2b4a8 30554+ unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
b3bbd485 30555+ hist_field->fn = hist_field_log2;
e4b2b4a8
JK
30556+ hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
30557+ hist_field->size = hist_field->operands[0]->size;
30558+ hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
30559+ if (!hist_field->type)
30560+ goto free;
30561+ goto out;
30562+ }
1a6e0f06 30563+
e4b2b4a8
JK
30564+ if (flags & HIST_FIELD_FL_TIMESTAMP) {
30565+ hist_field->fn = hist_field_timestamp;
30566+ hist_field->size = sizeof(u64);
30567+ hist_field->type = kstrdup("u64", GFP_KERNEL);
30568+ if (!hist_field->type)
30569+ goto free;
30570+ goto out;
30571+ }
1a6e0f06 30572+
e4b2b4a8
JK
30573+ if (flags & HIST_FIELD_FL_CPU) {
30574+ hist_field->fn = hist_field_cpu;
30575+ hist_field->size = sizeof(int);
30576+ hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
30577+ if (!hist_field->type)
30578+ goto free;
b3bbd485
JK
30579+ goto out;
30580+ }
30581+
30582+ if (WARN_ON_ONCE(!field))
30583+ goto out;
30584+
30585+ if (is_string_field(field)) {
30586+ flags |= HIST_FIELD_FL_STRING;
30587+
e4b2b4a8
JK
30588+ hist_field->size = MAX_FILTER_STR_VAL;
30589+ hist_field->type = kstrdup(field->type, GFP_KERNEL);
30590+ if (!hist_field->type)
30591+ goto free;
30592+
b3bbd485
JK
30593+ if (field->filter_type == FILTER_STATIC_STRING)
30594+ hist_field->fn = hist_field_string;
30595+ else if (field->filter_type == FILTER_DYN_STRING)
30596+ hist_field->fn = hist_field_dynstring;
30597+ else
30598+ hist_field->fn = hist_field_pstring;
30599+ } else {
e4b2b4a8
JK
30600+ hist_field->size = field->size;
30601+ hist_field->is_signed = field->is_signed;
30602+ hist_field->type = kstrdup(field->type, GFP_KERNEL);
30603+ if (!hist_field->type)
30604+ goto free;
30605+
b3bbd485
JK
30606+ hist_field->fn = select_value_fn(field->size,
30607+ field->is_signed);
30608+ if (!hist_field->fn) {
e4b2b4a8 30609+ destroy_hist_field(hist_field, 0);
b3bbd485
JK
30610+ return NULL;
30611+ }
30612+ }
30613+ out:
30614+ hist_field->field = field;
30615+ hist_field->flags = flags;
30616+
e4b2b4a8
JK
30617+ if (var_name) {
30618+ hist_field->var.name = kstrdup(var_name, GFP_KERNEL);
30619+ if (!hist_field->var.name)
30620+ goto free;
30621+ }
30622+
b3bbd485 30623+ return hist_field;
e4b2b4a8
JK
30624+ free:
30625+ destroy_hist_field(hist_field, 0);
30626+ return NULL;
b3bbd485
JK
30627+}
30628+
30629+static void destroy_hist_fields(struct hist_trigger_data *hist_data)
30630+{
30631+ unsigned int i;
30632+
e4b2b4a8 30633+ for (i = 0; i < HIST_FIELDS_MAX; i++) {
b3bbd485 30634+ if (hist_data->fields[i]) {
e4b2b4a8 30635+ destroy_hist_field(hist_data->fields[i], 0);
b3bbd485
JK
30636+ hist_data->fields[i] = NULL;
30637+ }
30638+ }
30639+}
30640+
e4b2b4a8
JK
30641+static int init_var_ref(struct hist_field *ref_field,
30642+ struct hist_field *var_field,
30643+ char *system, char *event_name)
b3bbd485 30644+{
e4b2b4a8 30645+ int err = 0;
b3bbd485 30646+
e4b2b4a8
JK
30647+ ref_field->var.idx = var_field->var.idx;
30648+ ref_field->var.hist_data = var_field->hist_data;
30649+ ref_field->size = var_field->size;
30650+ ref_field->is_signed = var_field->is_signed;
30651+ ref_field->flags |= var_field->flags &
30652+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
b3bbd485 30653+
e4b2b4a8
JK
30654+ if (system) {
30655+ ref_field->system = kstrdup(system, GFP_KERNEL);
30656+ if (!ref_field->system)
30657+ return -ENOMEM;
30658+ }
1a6e0f06 30659+
e4b2b4a8
JK
30660+ if (event_name) {
30661+ ref_field->event_name = kstrdup(event_name, GFP_KERNEL);
30662+ if (!ref_field->event_name) {
30663+ err = -ENOMEM;
30664+ goto free;
30665+ }
30666+ }
1a6e0f06 30667+
e4b2b4a8
JK
30668+ if (var_field->var.name) {
30669+ ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL);
30670+ if (!ref_field->name) {
30671+ err = -ENOMEM;
30672+ goto free;
30673+ }
30674+ } else if (var_field->name) {
30675+ ref_field->name = kstrdup(var_field->name, GFP_KERNEL);
30676+ if (!ref_field->name) {
30677+ err = -ENOMEM;
30678+ goto free;
30679+ }
30680+ }
1a6e0f06 30681+
e4b2b4a8
JK
30682+ ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
30683+ if (!ref_field->type) {
30684+ err = -ENOMEM;
30685+ goto free;
30686+ }
30687+ out:
30688+ return err;
30689+ free:
30690+ kfree(ref_field->system);
30691+ kfree(ref_field->event_name);
30692+ kfree(ref_field->name);
30693+
30694+ goto out;
1a6e0f06
JK
30695+}
30696+
e4b2b4a8
JK
30697+static struct hist_field *create_var_ref(struct hist_field *var_field,
30698+ char *system, char *event_name)
1a6e0f06 30699+{
e4b2b4a8
JK
30700+ unsigned long flags = HIST_FIELD_FL_VAR_REF;
30701+ struct hist_field *ref_field;
1a6e0f06 30702+
e4b2b4a8
JK
30703+ ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
30704+ if (ref_field) {
30705+ if (init_var_ref(ref_field, var_field, system, event_name)) {
30706+ destroy_hist_field(ref_field, 0);
30707+ return NULL;
30708+ }
30709+ }
1a6e0f06 30710+
e4b2b4a8 30711+ return ref_field;
1a6e0f06
JK
30712+}
30713+
e4b2b4a8 30714+static bool is_var_ref(char *var_name)
1a6e0f06 30715+{
e4b2b4a8
JK
30716+ if (!var_name || strlen(var_name) < 2 || var_name[0] != '$')
30717+ return false;
1a6e0f06 30718+
e4b2b4a8 30719+ return true;
1a6e0f06
JK
30720+}
30721+
e4b2b4a8
JK
30722+static char *field_name_from_var(struct hist_trigger_data *hist_data,
30723+ char *var_name)
1a6e0f06 30724+{
e4b2b4a8
JK
30725+ char *name, *field;
30726+ unsigned int i;
1a6e0f06 30727+
e4b2b4a8
JK
30728+ for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
30729+ name = hist_data->attrs->var_defs.name[i];
1a6e0f06 30730+
e4b2b4a8
JK
30731+ if (strcmp(var_name, name) == 0) {
30732+ field = hist_data->attrs->var_defs.expr[i];
30733+ if (contains_operator(field) || is_var_ref(field))
30734+ continue;
30735+ return field;
1a6e0f06
JK
30736+ }
30737+ }
e4b2b4a8
JK
30738+
30739+ return NULL;
1a6e0f06
JK
30740+}
30741+
e4b2b4a8
JK
30742+static char *local_field_var_ref(struct hist_trigger_data *hist_data,
30743+ char *system, char *event_name,
30744+ char *var_name)
30745+{
30746+ struct trace_event_call *call;
30747+
30748+ if (system && event_name) {
30749+ call = hist_data->event_file->event_call;
30750+
30751+ if (strcmp(system, call->class->system) != 0)
30752+ return NULL;
30753+
30754+ if (strcmp(event_name, trace_event_name(call)) != 0)
30755+ return NULL;
30756+ }
30757+
30758+ if (!!system != !!event_name)
30759+ return NULL;
30760+
30761+ if (!is_var_ref(var_name))
30762+ return NULL;
30763+
30764+ var_name++;
30765+
30766+ return field_name_from_var(hist_data, var_name);
1a6e0f06 30767+}
e4b2b4a8
JK
30768+
30769+static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
30770+ char *system, char *event_name,
30771+ char *var_name)
1a6e0f06 30772+{
e4b2b4a8
JK
30773+ struct hist_field *var_field = NULL, *ref_field = NULL;
30774+
30775+ if (!is_var_ref(var_name))
30776+ return NULL;
30777+
30778+ var_name++;
30779+
30780+ var_field = find_event_var(hist_data, system, event_name, var_name);
30781+ if (var_field)
30782+ ref_field = create_var_ref(var_field, system, event_name);
30783+
30784+ if (!ref_field)
30785+ hist_err_event("Couldn't find variable: $",
30786+ system, event_name, var_name);
30787+
30788+ return ref_field;
30789+}
30790+
30791+static struct ftrace_event_field *
30792+parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
30793+ char *field_str, unsigned long *flags)
30794+{
30795+ struct ftrace_event_field *field = NULL;
30796+ char *field_name, *modifier, *str;
30797+
30798+ modifier = str = kstrdup(field_str, GFP_KERNEL);
30799+ if (!modifier)
30800+ return ERR_PTR(-ENOMEM);
1a6e0f06 30801+
e4b2b4a8
JK
30802+ field_name = strsep(&modifier, ".");
30803+ if (modifier) {
30804+ if (strcmp(modifier, "hex") == 0)
30805+ *flags |= HIST_FIELD_FL_HEX;
30806+ else if (strcmp(modifier, "sym") == 0)
30807+ *flags |= HIST_FIELD_FL_SYM;
30808+ else if (strcmp(modifier, "sym-offset") == 0)
30809+ *flags |= HIST_FIELD_FL_SYM_OFFSET;
30810+ else if ((strcmp(modifier, "execname") == 0) &&
30811+ (strcmp(field_name, "common_pid") == 0))
30812+ *flags |= HIST_FIELD_FL_EXECNAME;
30813+ else if (strcmp(modifier, "syscall") == 0)
30814+ *flags |= HIST_FIELD_FL_SYSCALL;
30815+ else if (strcmp(modifier, "log2") == 0)
30816+ *flags |= HIST_FIELD_FL_LOG2;
30817+ else if (strcmp(modifier, "usecs") == 0)
30818+ *flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
30819+ else {
30820+ hist_err("Invalid field modifier: ", modifier);
30821+ field = ERR_PTR(-EINVAL);
30822+ goto out;
1a6e0f06 30823+ }
e4b2b4a8 30824+ }
1a6e0f06 30825+
e4b2b4a8
JK
30826+ if (strcmp(field_name, "common_timestamp") == 0) {
30827+ *flags |= HIST_FIELD_FL_TIMESTAMP;
30828+ hist_data->enable_timestamps = true;
30829+ if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
30830+ hist_data->attrs->ts_in_usecs = true;
30831+ } else if (strcmp(field_name, "cpu") == 0)
30832+ *flags |= HIST_FIELD_FL_CPU;
30833+ else {
30834+ field = trace_find_event_field(file->event_call, field_name);
30835+ if (!field || !field->size) {
30836+ hist_err("Couldn't find field: ", field_name);
30837+ field = ERR_PTR(-EINVAL);
30838+ goto out;
30839+ }
30840+ }
30841+ out:
30842+ kfree(str);
1a6e0f06 30843+
e4b2b4a8
JK
30844+ return field;
30845+}
1a6e0f06 30846+
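parse_field() peels an optional '.modifier' suffix off the field name with strsep() before translating the modifier into a HIST_FIELD_FL_* flag. The split itself, lifted into a small standalone program (hypothetical helper name, standard C plus glibc's strsep()):

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Split "name.modifier" into its two halves; *modifier is NULL if absent. */
static void split_field(const char *spec, char **name, char **modifier)
{
	char *str = strdup(spec);

	*modifier = str;
	*name = strsep(modifier, ".");   /* leaves *modifier NULL if no '.' */
}

int main(void)
{
	char *name, *mod;

	split_field("common_pid.execname", &name, &mod);
	printf("field=%s modifier=%s\n", name, mod);
	free(name);                      /* name points at the strdup()ed buffer */

	split_field("prio", &name, &mod);
	printf("field=%s modifier=%s\n", name, mod ? mod : "(none)");
	free(name);
	return 0;
}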
e4b2b4a8
JK
30847+static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
30848+ struct hist_field *var_ref,
30849+ char *var_name)
1a6e0f06 30850+{
e4b2b4a8
JK
30851+ struct hist_field *alias = NULL;
30852+ unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR;
1a6e0f06 30853+
e4b2b4a8
JK
30854+ alias = create_hist_field(hist_data, NULL, flags, var_name);
30855+ if (!alias)
30856+ return NULL;
1a6e0f06 30857+
e4b2b4a8
JK
30858+ alias->fn = var_ref->fn;
30859+ alias->operands[0] = var_ref;
1a6e0f06 30860+
e4b2b4a8
JK
30861+ if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
30862+ destroy_hist_field(alias, 0);
30863+ return NULL;
30864+ }
1a6e0f06 30865+
e4b2b4a8 30866+ return alias;
1a6e0f06
JK
30867+}
30868+
e4b2b4a8
JK
30869+static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
30870+ struct trace_event_file *file, char *str,
30871+ unsigned long *flags, char *var_name)
30872+{
30873+ char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
30874+ struct ftrace_event_field *field = NULL;
30875+ struct hist_field *hist_field = NULL;
30876+ int ret = 0;
1a6e0f06 30877+
e4b2b4a8
JK
30878+ s = strchr(str, '.');
30879+ if (s) {
30880+ s = strchr(++s, '.');
30881+ if (s) {
30882+ ref_system = strsep(&str, ".");
30883+ if (!str) {
30884+ ret = -EINVAL;
30885+ goto out;
30886+ }
30887+ ref_event = strsep(&str, ".");
30888+ if (!str) {
30889+ ret = -EINVAL;
30890+ goto out;
30891+ }
30892+ ref_var = str;
30893+ }
30894+ }
1a6e0f06 30895+
e4b2b4a8
JK
30896+ s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
30897+ if (!s) {
30898+ hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
30899+ if (hist_field) {
30900+ hist_data->var_refs[hist_data->n_var_refs] = hist_field;
30901+ hist_field->var_ref_idx = hist_data->n_var_refs++;
30902+ if (var_name) {
30903+ hist_field = create_alias(hist_data, hist_field, var_name);
30904+ if (!hist_field) {
30905+ ret = -ENOMEM;
30906+ goto out;
30907+ }
30908+ }
30909+ return hist_field;
30910+ }
30911+ } else
30912+ str = s;
30913+
30914+ field = parse_field(hist_data, file, str, flags);
30915+ if (IS_ERR(field)) {
30916+ ret = PTR_ERR(field);
30917+ goto out;
30918+ }
30919+
30920+ hist_field = create_hist_field(hist_data, field, *flags, var_name);
30921+ if (!hist_field) {
30922+ ret = -ENOMEM;
30923+ goto out;
30924+ }
30925+
30926+ return hist_field;
30927+ out:
30928+ return ERR_PTR(ret);
30929+}
30930+
30931+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
30932+ struct trace_event_file *file,
30933+ char *str, unsigned long flags,
30934+ char *var_name, unsigned int level);
30935+
30936+static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
30937+ struct trace_event_file *file,
30938+ char *str, unsigned long flags,
30939+ char *var_name, unsigned int level)
1a6e0f06 30940+{
e4b2b4a8
JK
30941+ struct hist_field *operand1, *expr = NULL;
30942+ unsigned long operand_flags;
30943+ int ret = 0;
30944+ char *s;
30945+
 30946+ /* we support only -(xxx) i.e. explicit parens required */
30947+
30948+ if (level > 3) {
30949+ hist_err("Too many subexpressions (3 max): ", str);
30950+ ret = -EINVAL;
30951+ goto free;
1a6e0f06 30952+ }
e4b2b4a8
JK
30953+
 30954+ str++; /* skip leading '-' */
30955+
30956+ s = strchr(str, '(');
30957+ if (s)
30958+ str++;
30959+ else {
30960+ ret = -EINVAL;
30961+ goto free;
30962+ }
30963+
30964+ s = strrchr(str, ')');
30965+ if (s)
30966+ *s = '\0';
30967+ else {
 30968+ ret = -EINVAL; /* no closing ')' */
30969+ goto free;
30970+ }
30971+
30972+ flags |= HIST_FIELD_FL_EXPR;
30973+ expr = create_hist_field(hist_data, NULL, flags, var_name);
30974+ if (!expr) {
30975+ ret = -ENOMEM;
30976+ goto free;
30977+ }
30978+
30979+ operand_flags = 0;
30980+ operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
30981+ if (IS_ERR(operand1)) {
30982+ ret = PTR_ERR(operand1);
30983+ goto free;
30984+ }
30985+
30986+ expr->flags |= operand1->flags &
30987+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
30988+ expr->fn = hist_field_unary_minus;
30989+ expr->operands[0] = operand1;
30990+ expr->operator = FIELD_OP_UNARY_MINUS;
30991+ expr->name = expr_str(expr, 0);
30992+ expr->type = kstrdup(operand1->type, GFP_KERNEL);
30993+ if (!expr->type) {
30994+ ret = -ENOMEM;
30995+ goto free;
30996+ }
30997+
30998+ return expr;
30999+ free:
31000+ destroy_hist_field(expr, 0);
31001+ return ERR_PTR(ret);
1a6e0f06 31002+}
1a6e0f06 31003+
e4b2b4a8
JK
31004+static int check_expr_operands(struct hist_field *operand1,
31005+ struct hist_field *operand2)
31006+{
31007+ unsigned long operand1_flags = operand1->flags;
31008+ unsigned long operand2_flags = operand2->flags;
1a6e0f06 31009+
e4b2b4a8
JK
31010+ if ((operand1_flags & HIST_FIELD_FL_VAR_REF) ||
31011+ (operand1_flags & HIST_FIELD_FL_ALIAS)) {
31012+ struct hist_field *var;
31013+
31014+ var = find_var_field(operand1->var.hist_data, operand1->name);
31015+ if (!var)
31016+ return -EINVAL;
31017+ operand1_flags = var->flags;
31018+ }
31019+
31020+ if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
31021+ (operand2_flags & HIST_FIELD_FL_ALIAS)) {
31022+ struct hist_field *var;
31023+
31024+ var = find_var_field(operand2->var.hist_data, operand2->name);
31025+ if (!var)
31026+ return -EINVAL;
31027+ operand2_flags = var->flags;
31028+ }
31029+
31030+ if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
31031+ (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
31032+ hist_err("Timestamp units in expression don't match", NULL);
b3bbd485 31033+ return -EINVAL;
e4b2b4a8 31034+ }
b3bbd485
JK
31035+
31036+ return 0;
31037+}
31038+
e4b2b4a8
JK
31039+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
31040+ struct trace_event_file *file,
31041+ char *str, unsigned long flags,
31042+ char *var_name, unsigned int level)
b3bbd485 31043+{
e4b2b4a8
JK
31044+ struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
31045+ unsigned long operand_flags;
31046+ int field_op, ret = -EINVAL;
31047+ char *sep, *operand1_str;
31048+
31049+ if (level > 3) {
31050+ hist_err("Too many subexpressions (3 max): ", str);
31051+ return ERR_PTR(-EINVAL);
31052+ }
31053+
31054+ field_op = contains_operator(str);
31055+
31056+ if (field_op == FIELD_OP_NONE)
31057+ return parse_atom(hist_data, file, str, &flags, var_name);
31058+
31059+ if (field_op == FIELD_OP_UNARY_MINUS)
31060+ return parse_unary(hist_data, file, str, flags, var_name, ++level);
31061+
31062+ switch (field_op) {
31063+ case FIELD_OP_MINUS:
31064+ sep = "-";
31065+ break;
31066+ case FIELD_OP_PLUS:
31067+ sep = "+";
31068+ break;
31069+ default:
31070+ goto free;
31071+ }
31072+
31073+ operand1_str = strsep(&str, sep);
31074+ if (!operand1_str || !str)
31075+ goto free;
31076+
31077+ operand_flags = 0;
31078+ operand1 = parse_atom(hist_data, file, operand1_str,
31079+ &operand_flags, NULL);
31080+ if (IS_ERR(operand1)) {
31081+ ret = PTR_ERR(operand1);
31082+ operand1 = NULL;
31083+ goto free;
31084+ }
31085+
 31086+ /* rest of string could be another expression e.g. b+c in a+b+c */
31087+ operand_flags = 0;
31088+ operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
31089+ if (IS_ERR(operand2)) {
31090+ ret = PTR_ERR(operand2);
31091+ operand2 = NULL;
31092+ goto free;
31093+ }
31094+
31095+ ret = check_expr_operands(operand1, operand2);
31096+ if (ret)
31097+ goto free;
31098+
31099+ flags |= HIST_FIELD_FL_EXPR;
31100+
31101+ flags |= operand1->flags &
31102+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
1a6e0f06 31103+
e4b2b4a8
JK
31104+ expr = create_hist_field(hist_data, NULL, flags, var_name);
31105+ if (!expr) {
31106+ ret = -ENOMEM;
31107+ goto free;
31108+ }
1a6e0f06 31109+
e4b2b4a8
JK
31110+ operand1->read_once = true;
31111+ operand2->read_once = true;
31112+
31113+ expr->operands[0] = operand1;
31114+ expr->operands[1] = operand2;
31115+ expr->operator = field_op;
31116+ expr->name = expr_str(expr, 0);
31117+ expr->type = kstrdup(operand1->type, GFP_KERNEL);
31118+ if (!expr->type) {
31119+ ret = -ENOMEM;
31120+ goto free;
31121+ }
1a6e0f06 31122+
e4b2b4a8
JK
31123+ switch (field_op) {
31124+ case FIELD_OP_MINUS:
31125+ expr->fn = hist_field_minus;
31126+ break;
31127+ case FIELD_OP_PLUS:
31128+ expr->fn = hist_field_plus;
31129+ break;
31130+ default:
31131+ ret = -EINVAL;
31132+ goto free;
31133+ }
31134+
31135+ return expr;
31136+ free:
31137+ destroy_hist_field(operand1, 0);
31138+ destroy_hist_field(operand2, 0);
31139+ destroy_hist_field(expr, 0);
31140+
31141+ return ERR_PTR(ret);
31142+}
31143+
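parse_expr() is a small recursive-descent parser: it cuts the string at the first binary operator, parses the left side as an atom and feeds the remainder back into itself, so 'a+b+c' ends up as a + (b + c), with 'level' capping the nesting at three. A hedged sketch of just that splitting step, for '+' only:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

/* Print the parse tree of a '+'-only expression, right-associatively,
 * mirroring how parse_expr() recurses on the remainder of the string. */
static void parse_plus(char *str, int level)
{
	char *lhs;

	if (level > 3) {
		fprintf(stderr, "too many subexpressions\n");
		return;
	}

	if (!strchr(str, '+')) {            /* no operator left: an atom */
		printf("%*satom: %s\n", level * 2, "", str);
		return;
	}

	lhs = strsep(&str, "+");            /* left operand              */
	printf("%*satom: %s\n", level * 2, "", lhs);
	printf("%*s+\n", level * 2, "");
	parse_plus(str, level + 1);         /* rest may itself be b+c    */
}

int main(void)
{
	char expr[] = "a+b+c";

	parse_plus(expr, 0);
	return 0;
}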
31144+static char *find_trigger_filter(struct hist_trigger_data *hist_data,
31145+ struct trace_event_file *file)
1a6e0f06 31146+{
e4b2b4a8
JK
31147+ struct event_trigger_data *test;
31148+
31149+ list_for_each_entry_rcu(test, &file->triggers, list) {
31150+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
31151+ if (test->private_data == hist_data)
31152+ return test->filter_str;
31153+ }
31154+ }
31155+
31156+ return NULL;
1a6e0f06
JK
31157+}
31158+
e4b2b4a8
JK
31159+static struct event_command trigger_hist_cmd;
31160+static int event_hist_trigger_func(struct event_command *cmd_ops,
31161+ struct trace_event_file *file,
31162+ char *glob, char *cmd, char *param);
31163+
31164+static bool compatible_keys(struct hist_trigger_data *target_hist_data,
31165+ struct hist_trigger_data *hist_data,
31166+ unsigned int n_keys)
1a6e0f06 31167+{
e4b2b4a8
JK
31168+ struct hist_field *target_hist_field, *hist_field;
31169+ unsigned int n, i, j;
31170+
31171+ if (hist_data->n_fields - hist_data->n_vals != n_keys)
31172+ return false;
31173+
31174+ i = hist_data->n_vals;
31175+ j = target_hist_data->n_vals;
31176+
31177+ for (n = 0; n < n_keys; n++) {
31178+ hist_field = hist_data->fields[i + n];
31179+ target_hist_field = target_hist_data->fields[j + n];
31180+
31181+ if (strcmp(hist_field->type, target_hist_field->type) != 0)
31182+ return false;
31183+ if (hist_field->size != target_hist_field->size)
31184+ return false;
31185+ if (hist_field->is_signed != target_hist_field->is_signed)
31186+ return false;
31187+ }
31188+
31189+ return true;
1a6e0f06
JK
31190+}
31191+
e4b2b4a8
JK
31192+static struct hist_trigger_data *
31193+find_compatible_hist(struct hist_trigger_data *target_hist_data,
31194+ struct trace_event_file *file)
1a6e0f06 31195+{
e4b2b4a8
JK
31196+ struct hist_trigger_data *hist_data;
31197+ struct event_trigger_data *test;
31198+ unsigned int n_keys;
31199+
31200+ n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
31201+
31202+ list_for_each_entry_rcu(test, &file->triggers, list) {
31203+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
31204+ hist_data = test->private_data;
31205+
31206+ if (compatible_keys(target_hist_data, hist_data, n_keys))
31207+ return hist_data;
31208+ }
31209+ }
31210+
31211+ return NULL;
1a6e0f06 31212+}
1a6e0f06 31213+
e4b2b4a8
JK
31214+static struct trace_event_file *event_file(struct trace_array *tr,
31215+ char *system, char *event_name)
31216+{
31217+ struct trace_event_file *file;
31218+
31219+ file = find_event_file(tr, system, event_name);
31220+ if (!file)
31221+ return ERR_PTR(-EINVAL);
31222+
31223+ return file;
31224+}
31225+
31226+static struct hist_field *
31227+find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
31228+ char *system, char *event_name, char *field_name)
31229+{
31230+ struct hist_field *event_var;
31231+ char *synthetic_name;
31232+
31233+ synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
31234+ if (!synthetic_name)
31235+ return ERR_PTR(-ENOMEM);
31236+
31237+ strcpy(synthetic_name, "synthetic_");
31238+ strcat(synthetic_name, field_name);
31239+
31240+ event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
31241+
31242+ kfree(synthetic_name);
31243+
31244+ return event_var;
31245+}
1a6e0f06
JK
31246+
31247+/**
e4b2b4a8
JK
31248+ * create_field_var_hist - Automatically create a histogram and var for a field
31249+ * @target_hist_data: The target hist trigger
31250+ * @subsys_name: Optional subsystem name
31251+ * @event_name: Optional event name
31252+ * @field_name: The name of the field (and the resulting variable)
1a6e0f06 31253+ *
e4b2b4a8
JK
31254+ * Hist trigger actions fetch data from variables, not directly from
31255+ * events. However, for convenience, users are allowed to directly
31256+ * specify an event field in an action, which will be automatically
31257+ * converted into a variable on their behalf.
 31258+ *
31259+ * If a user specifies a field on an event that isn't the event the
 31260+ * histogram is currently being defined on (the target event histogram), the
31261+ * only way that can be accomplished is if a new hist trigger is
31262+ * created and the field variable defined on that.
1a6e0f06 31263+ *
e4b2b4a8
JK
31264+ * This function creates a new histogram compatible with the target
31265+ * event (meaning a histogram with the same key as the target
31266+ * histogram), and creates a variable for the specified field, but
31267+ * with 'synthetic_' prepended to the variable name in order to avoid
31268+ * collision with normal field variables.
31269+ *
31270+ * Return: The variable created for the field.
1a6e0f06 31271+ */
e4b2b4a8
JK
31272+static struct hist_field *
31273+create_field_var_hist(struct hist_trigger_data *target_hist_data,
31274+ char *subsys_name, char *event_name, char *field_name)
31275+{
31276+ struct trace_array *tr = target_hist_data->event_file->tr;
31277+ struct hist_field *event_var = ERR_PTR(-EINVAL);
31278+ struct hist_trigger_data *hist_data;
31279+ unsigned int i, n, first = true;
31280+ struct field_var_hist *var_hist;
31281+ struct trace_event_file *file;
31282+ struct hist_field *key_field;
31283+ char *saved_filter;
31284+ char *cmd;
31285+ int ret;
1a6e0f06 31286+
e4b2b4a8
JK
31287+ if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
31288+ hist_err_event("onmatch: Too many field variables defined: ",
31289+ subsys_name, event_name, field_name);
31290+ return ERR_PTR(-EINVAL);
31291+ }
1a6e0f06 31292+
e4b2b4a8 31293+ file = event_file(tr, subsys_name, event_name);
1a6e0f06 31294+
e4b2b4a8
JK
31295+ if (IS_ERR(file)) {
31296+ hist_err_event("onmatch: Event file not found: ",
31297+ subsys_name, event_name, field_name);
31298+ ret = PTR_ERR(file);
31299+ return ERR_PTR(ret);
1a6e0f06
JK
31300+ }
31301+
e4b2b4a8
JK
31302+ /*
31303+ * Look for a histogram compatible with target. We'll use the
31304+ * found histogram specification to create a new matching
31305+ * histogram with our variable on it. target_hist_data is not
31306+ * yet a registered histogram so we can't use that.
31307+ */
31308+ hist_data = find_compatible_hist(target_hist_data, file);
31309+ if (!hist_data) {
31310+ hist_err_event("onmatch: Matching event histogram not found: ",
31311+ subsys_name, event_name, field_name);
31312+ return ERR_PTR(-EINVAL);
1a6e0f06 31313+ }
1a6e0f06 31314+
e4b2b4a8
JK
31315+ /* See if a synthetic field variable has already been created */
31316+ event_var = find_synthetic_field_var(target_hist_data, subsys_name,
31317+ event_name, field_name);
31318+ if (!IS_ERR_OR_NULL(event_var))
31319+ return event_var;
1a6e0f06 31320+
e4b2b4a8
JK
31321+ var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL);
31322+ if (!var_hist)
31323+ return ERR_PTR(-ENOMEM);
1a6e0f06 31324+
e4b2b4a8
JK
31325+ cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
31326+ if (!cmd) {
31327+ kfree(var_hist);
31328+ return ERR_PTR(-ENOMEM);
31329+ }
31330+
31331+ /* Use the same keys as the compatible histogram */
31332+ strcat(cmd, "keys=");
31333+
31334+ for_each_hist_key_field(i, hist_data) {
31335+ key_field = hist_data->fields[i];
31336+ if (!first)
31337+ strcat(cmd, ",");
31338+ strcat(cmd, key_field->field->name);
31339+ first = false;
31340+ }
31341+
31342+ /* Create the synthetic field variable specification */
31343+ strcat(cmd, ":synthetic_");
31344+ strcat(cmd, field_name);
31345+ strcat(cmd, "=");
31346+ strcat(cmd, field_name);
31347+
31348+ /* Use the same filter as the compatible histogram */
31349+ saved_filter = find_trigger_filter(hist_data, file);
31350+ if (saved_filter) {
31351+ strcat(cmd, " if ");
31352+ strcat(cmd, saved_filter);
31353+ }
31354+
31355+ var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
31356+ if (!var_hist->cmd) {
31357+ kfree(cmd);
31358+ kfree(var_hist);
31359+ return ERR_PTR(-ENOMEM);
31360+ }
31361+
31362+ /* Save the compatible histogram information */
31363+ var_hist->hist_data = hist_data;
31364+
31365+ /* Create the new histogram with our variable */
31366+ ret = event_hist_trigger_func(&trigger_hist_cmd, file,
31367+ "", "hist", cmd);
31368+ if (ret) {
31369+ kfree(cmd);
31370+ kfree(var_hist->cmd);
31371+ kfree(var_hist);
31372+ hist_err_event("onmatch: Couldn't create histogram for field: ",
31373+ subsys_name, event_name, field_name);
31374+ return ERR_PTR(ret);
31375+ }
31376+
31377+ kfree(cmd);
31378+
31379+ /* If we can't find the variable, something went wrong */
31380+ event_var = find_synthetic_field_var(target_hist_data, subsys_name,
31381+ event_name, field_name);
31382+ if (IS_ERR_OR_NULL(event_var)) {
31383+ kfree(var_hist->cmd);
31384+ kfree(var_hist);
31385+ hist_err_event("onmatch: Couldn't find synthetic variable: ",
31386+ subsys_name, event_name, field_name);
31387+ return ERR_PTR(-EINVAL);
1a6e0f06 31388+ }
e4b2b4a8
JK
31389+
31390+ n = target_hist_data->n_field_var_hists;
31391+ target_hist_data->field_var_hists[n] = var_hist;
31392+ target_hist_data->n_field_var_hists++;
31393+
31394+ return event_var;
1a6e0f06
JK
31395+}
31396+
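Everything create_field_var_hist() does on the remote event boils down to pasting together an ordinary hist-trigger command string -- the same keys as the compatible histogram, a 'synthetic_<field>=<field>' variable definition and, if present, the saved filter -- and handing it to event_hist_trigger_func(). The string assembly on its own, with made-up key, field and filter names (illustrative only):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *keys[] = { "pid", "prio" };   /* hypothetical key fields   */
	const char *field = "prio";               /* field to expose as a var  */
	const char *filter = "prio < 100";        /* hypothetical saved filter */
	char cmd[256] = "keys=";
	unsigned int i;

	for (i = 0; i < 2; i++) {
		if (i)
			strcat(cmd, ",");
		strcat(cmd, keys[i]);
	}

	strcat(cmd, ":synthetic_");
	strcat(cmd, field);
	strcat(cmd, "=");
	strcat(cmd, field);

	if (filter) {
		strcat(cmd, " if ");
		strcat(cmd, filter);
	}

	/* prints: keys=pid,prio:synthetic_prio=prio if prio < 100 */
	printf("%s\n", cmd);
	return 0;
}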
e4b2b4a8
JK
31397+static struct hist_field *
31398+find_target_event_var(struct hist_trigger_data *hist_data,
31399+ char *subsys_name, char *event_name, char *var_name)
1a6e0f06 31400+{
e4b2b4a8
JK
31401+ struct trace_event_file *file = hist_data->event_file;
31402+ struct hist_field *hist_field = NULL;
1a6e0f06 31403+
e4b2b4a8
JK
31404+ if (subsys_name) {
31405+ struct trace_event_call *call;
1a6e0f06 31406+
e4b2b4a8
JK
31407+ if (!event_name)
31408+ return NULL;
1a6e0f06 31409+
e4b2b4a8 31410+ call = file->event_call;
1a6e0f06 31411+
e4b2b4a8
JK
31412+ if (strcmp(subsys_name, call->class->system) != 0)
31413+ return NULL;
1a6e0f06 31414+
e4b2b4a8
JK
31415+ if (strcmp(event_name, trace_event_name(call)) != 0)
31416+ return NULL;
31417+ }
31418+
31419+ hist_field = find_var_field(hist_data, var_name);
31420+
31421+ return hist_field;
31422+}
31423+
31424+static inline void __update_field_vars(struct tracing_map_elt *elt,
31425+ struct ring_buffer_event *rbe,
31426+ void *rec,
31427+ struct field_var **field_vars,
31428+ unsigned int n_field_vars,
31429+ unsigned int field_var_str_start)
31430+{
31431+ struct hist_elt_data *elt_data = elt->private_data;
31432+ unsigned int i, j, var_idx;
31433+ u64 var_val;
1a6e0f06 31434+
e4b2b4a8
JK
31435+ for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
31436+ struct field_var *field_var = field_vars[i];
31437+ struct hist_field *var = field_var->var;
31438+ struct hist_field *val = field_var->val;
1a6e0f06 31439+
e4b2b4a8
JK
31440+ var_val = val->fn(val, elt, rbe, rec);
31441+ var_idx = var->var.idx;
1a6e0f06 31442+
e4b2b4a8
JK
31443+ if (val->flags & HIST_FIELD_FL_STRING) {
31444+ char *str = elt_data->field_var_str[j++];
31445+ char *val_str = (char *)(uintptr_t)var_val;
1a6e0f06 31446+
e4b2b4a8
JK
31447+ strscpy(str, val_str, STR_VAR_LEN_MAX);
31448+ var_val = (u64)(uintptr_t)str;
1a6e0f06 31449+ }
e4b2b4a8 31450+ tracing_map_set_var(elt, var_idx, var_val);
1a6e0f06 31451+ }
1a6e0f06
JK
31452+}
31453+
e4b2b4a8
JK
31454+static void update_field_vars(struct hist_trigger_data *hist_data,
31455+ struct tracing_map_elt *elt,
31456+ struct ring_buffer_event *rbe,
31457+ void *rec)
1a6e0f06 31458+{
e4b2b4a8
JK
31459+ __update_field_vars(elt, rbe, rec, hist_data->field_vars,
31460+ hist_data->n_field_vars, 0);
31461+}
1a6e0f06 31462+
e4b2b4a8
JK
31463+static void update_max_vars(struct hist_trigger_data *hist_data,
31464+ struct tracing_map_elt *elt,
31465+ struct ring_buffer_event *rbe,
31466+ void *rec)
31467+{
31468+ __update_field_vars(elt, rbe, rec, hist_data->max_vars,
31469+ hist_data->n_max_vars, hist_data->n_field_var_str);
1a6e0f06
JK
31470+}
31471+
e4b2b4a8
JK
31472+static struct hist_field *create_var(struct hist_trigger_data *hist_data,
31473+ struct trace_event_file *file,
31474+ char *name, int size, const char *type)
31475+{
31476+ struct hist_field *var;
31477+ int idx;
1a6e0f06 31478+
e4b2b4a8
JK
31479+ if (find_var(hist_data, file, name) && !hist_data->remove) {
31480+ var = ERR_PTR(-EINVAL);
31481+ goto out;
31482+ }
1a6e0f06 31483+
e4b2b4a8
JK
31484+ var = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
31485+ if (!var) {
31486+ var = ERR_PTR(-ENOMEM);
31487+ goto out;
31488+ }
1a6e0f06 31489+
e4b2b4a8
JK
31490+ idx = tracing_map_add_var(hist_data->map);
31491+ if (idx < 0) {
31492+ kfree(var);
31493+ var = ERR_PTR(-EINVAL);
31494+ goto out;
31495+ }
1a6e0f06 31496+
e4b2b4a8
JK
31497+ var->flags = HIST_FIELD_FL_VAR;
31498+ var->var.idx = idx;
31499+ var->var.hist_data = var->hist_data = hist_data;
31500+ var->size = size;
31501+ var->var.name = kstrdup(name, GFP_KERNEL);
31502+ var->type = kstrdup(type, GFP_KERNEL);
31503+ if (!var->var.name || !var->type) {
31504+ kfree(var->var.name);
31505+ kfree(var->type);
31506+ kfree(var);
31507+ var = ERR_PTR(-ENOMEM);
31508+ }
31509+ out:
31510+ return var;
31511+}
1a6e0f06 31512+
e4b2b4a8
JK
31513+static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
31514+ struct trace_event_file *file,
31515+ char *field_name)
1a6e0f06 31516+{
e4b2b4a8
JK
31517+ struct hist_field *val = NULL, *var = NULL;
31518+ unsigned long flags = HIST_FIELD_FL_VAR;
31519+ struct field_var *field_var;
b3bbd485
JK
31520+ int ret = 0;
31521+
e4b2b4a8
JK
31522+ if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
31523+ hist_err("Too many field variables defined: ", field_name);
31524+ ret = -EINVAL;
31525+ goto err;
31526+ }
31527+
31528+ val = parse_atom(hist_data, file, field_name, &flags, NULL);
31529+ if (IS_ERR(val)) {
31530+ hist_err("Couldn't parse field variable: ", field_name);
31531+ ret = PTR_ERR(val);
31532+ goto err;
31533+ }
31534+
31535+ var = create_var(hist_data, file, field_name, val->size, val->type);
31536+ if (IS_ERR(var)) {
31537+ hist_err("Couldn't create or find variable: ", field_name);
31538+ kfree(val);
31539+ ret = PTR_ERR(var);
31540+ goto err;
31541+ }
31542+
31543+ field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
31544+ if (!field_var) {
31545+ kfree(val);
31546+ kfree(var);
31547+ ret = -ENOMEM;
31548+ goto err;
31549+ }
31550+
31551+ field_var->var = var;
31552+ field_var->val = val;
31553+ out:
31554+ return field_var;
31555+ err:
31556+ field_var = ERR_PTR(ret);
31557+ goto out;
1a6e0f06
JK
31558+}
31559+
e4b2b4a8
JK
31560+/**
31561+ * create_target_field_var - Automatically create a variable for a field
31562+ * @target_hist_data: The target hist trigger
31563+ * @subsys_name: Optional subsystem name
31564+ * @event_name: Optional event name
31565+ * @var_name: The name of the field (and the resulting variable)
31566+ *
31567+ * Hist trigger actions fetch data from variables, not directly from
31568+ * events. However, for convenience, users are allowed to directly
31569+ * specify an event field in an action, which will be automatically
31570+ * converted into a variable on their behalf.
 31571+ *
31572+ * This function creates a field variable with the name var_name on
31573+ * the hist trigger currently being defined on the target event. If
31574+ * subsys_name and event_name are specified, this function simply
31575+ * verifies that they do in fact match the target event subsystem and
31576+ * event name.
31577+ *
31578+ * Return: The variable created for the field.
1a6e0f06 31579+ */
e4b2b4a8
JK
31580+static struct field_var *
31581+create_target_field_var(struct hist_trigger_data *target_hist_data,
31582+ char *subsys_name, char *event_name, char *var_name)
1a6e0f06 31583+{
e4b2b4a8 31584+ struct trace_event_file *file = target_hist_data->event_file;
1a6e0f06 31585+
e4b2b4a8
JK
31586+ if (subsys_name) {
31587+ struct trace_event_call *call;
1a6e0f06 31588+
e4b2b4a8
JK
31589+ if (!event_name)
31590+ return NULL;
1a6e0f06 31591+
e4b2b4a8
JK
31592+ call = file->event_call;
31593+
31594+ if (strcmp(subsys_name, call->class->system) != 0)
31595+ return NULL;
31596+
31597+ if (strcmp(event_name, trace_event_name(call)) != 0)
31598+ return NULL;
31599+ }
31600+
31601+ return create_field_var(target_hist_data, file, var_name);
1a6e0f06
JK
31602+}
31603+
e4b2b4a8
JK
31604+static void onmax_print(struct seq_file *m,
31605+ struct hist_trigger_data *hist_data,
31606+ struct tracing_map_elt *elt,
31607+ struct action_data *data)
1a6e0f06 31608+{
e4b2b4a8 31609+ unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;
1a6e0f06 31610+
e4b2b4a8 31611+ seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));
1a6e0f06 31612+
e4b2b4a8
JK
31613+ for (i = 0; i < hist_data->n_max_vars; i++) {
31614+ struct hist_field *save_val = hist_data->max_vars[i]->val;
31615+ struct hist_field *save_var = hist_data->max_vars[i]->var;
31616+ u64 val;
1a6e0f06 31617+
e4b2b4a8 31618+ save_var_idx = save_var->var.idx;
1a6e0f06 31619+
e4b2b4a8 31620+ val = tracing_map_read_var(elt, save_var_idx);
1a6e0f06 31621+
e4b2b4a8
JK
31622+ if (save_val->flags & HIST_FIELD_FL_STRING) {
31623+ seq_printf(m, " %s: %-32s", save_var->var.name,
31624+ (char *)(uintptr_t)(val));
31625+ } else
31626+ seq_printf(m, " %s: %10llu", save_var->var.name, val);
31627+ }
1a6e0f06
JK
31628+}
31629+
e4b2b4a8
JK
31630+static void onmax_save(struct hist_trigger_data *hist_data,
31631+ struct tracing_map_elt *elt, void *rec,
31632+ struct ring_buffer_event *rbe,
31633+ struct action_data *data, u64 *var_ref_vals)
1a6e0f06 31634+{
e4b2b4a8
JK
31635+ unsigned int max_idx = data->onmax.max_var->var.idx;
31636+ unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
1a6e0f06 31637+
e4b2b4a8 31638+ u64 var_val, max_val;
1a6e0f06 31639+
e4b2b4a8
JK
31640+ var_val = var_ref_vals[max_var_ref_idx];
31641+ max_val = tracing_map_read_var(elt, max_idx);
31642+
31643+ if (var_val <= max_val)
31644+ return;
31645+
31646+ tracing_map_set_var(elt, max_idx, var_val);
31647+
31648+ update_max_vars(hist_data, elt, rbe, rec);
31649+}
1a6e0f06 31650+
e4b2b4a8 31651+static void onmax_destroy(struct action_data *data)
1a6e0f06 31652+{
e4b2b4a8 31653+ unsigned int i;
1a6e0f06 31654+
e4b2b4a8
JK
31655+ destroy_hist_field(data->onmax.max_var, 0);
31656+ destroy_hist_field(data->onmax.var, 0);
1a6e0f06 31657+
e4b2b4a8
JK
31658+ kfree(data->onmax.var_str);
31659+ kfree(data->onmax.fn_name);
1a6e0f06 31660+
e4b2b4a8
JK
31661+ for (i = 0; i < data->n_params; i++)
31662+ kfree(data->params[i]);
1a6e0f06 31663+
e4b2b4a8
JK
31664+ kfree(data);
31665+}
1a6e0f06 31666+
e4b2b4a8
JK
31667+static int onmax_create(struct hist_trigger_data *hist_data,
31668+ struct action_data *data)
31669+{
31670+ struct trace_event_file *file = hist_data->event_file;
31671+ struct hist_field *var_field, *ref_field, *max_var;
31672+ unsigned int var_ref_idx = hist_data->n_var_refs;
31673+ struct field_var *field_var;
31674+ char *onmax_var_str, *param;
31675+ unsigned long flags;
31676+ unsigned int i;
31677+ int ret = 0;
1a6e0f06 31678+
e4b2b4a8
JK
31679+ onmax_var_str = data->onmax.var_str;
31680+ if (onmax_var_str[0] != '$') {
31681+ hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
b3bbd485 31682+ return -EINVAL;
e4b2b4a8
JK
31683+ }
31684+ onmax_var_str++;
b3bbd485 31685+
e4b2b4a8
JK
31686+ var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
31687+ if (!var_field) {
31688+ hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
31689+ return -EINVAL;
31690+ }
1a6e0f06 31691+
e4b2b4a8
JK
31692+ flags = HIST_FIELD_FL_VAR_REF;
31693+ ref_field = create_hist_field(hist_data, NULL, flags, NULL);
31694+ if (!ref_field)
31695+ return -ENOMEM;
1a6e0f06 31696+
e4b2b4a8
JK
31697+ if (init_var_ref(ref_field, var_field, NULL, NULL)) {
31698+ destroy_hist_field(ref_field, 0);
31699+ ret = -ENOMEM;
31700+ goto out;
1a6e0f06 31701+ }
e4b2b4a8
JK
31702+ hist_data->var_refs[hist_data->n_var_refs] = ref_field;
31703+ ref_field->var_ref_idx = hist_data->n_var_refs++;
31704+ data->onmax.var = ref_field;
31705+
31706+ data->fn = onmax_save;
31707+ data->onmax.max_var_ref_idx = var_ref_idx;
31708+ max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
31709+ if (IS_ERR(max_var)) {
31710+ hist_err("onmax: Couldn't create onmax variable: ", "max");
31711+ ret = PTR_ERR(max_var);
31712+ goto out;
31713+ }
31714+ data->onmax.max_var = max_var;
1a6e0f06 31715+
e4b2b4a8
JK
31716+ for (i = 0; i < data->n_params; i++) {
31717+ param = kstrdup(data->params[i], GFP_KERNEL);
31718+ if (!param) {
31719+ ret = -ENOMEM;
31720+ goto out;
31721+ }
31722+
31723+ field_var = create_target_field_var(hist_data, NULL, NULL, param);
31724+ if (IS_ERR(field_var)) {
31725+ hist_err("onmax: Couldn't create field variable: ", param);
31726+ ret = PTR_ERR(field_var);
31727+ kfree(param);
31728+ goto out;
31729+ }
31730+
31731+ hist_data->max_vars[hist_data->n_max_vars++] = field_var;
31732+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
31733+ hist_data->n_max_var_str++;
31734+
31735+ kfree(param);
1a6e0f06 31736+ }
e4b2b4a8
JK
31737+ out:
31738+ return ret;
1a6e0f06
JK
31739+}
31740+
e4b2b4a8 31741+static int parse_action_params(char *params, struct action_data *data)
1a6e0f06 31742+{
e4b2b4a8
JK
31743+ char *param, *saved_param;
31744+ int ret = 0;
1a6e0f06 31745+
e4b2b4a8
JK
31746+ while (params) {
31747+ if (data->n_params >= SYNTH_FIELDS_MAX)
31748+ goto out;
1a6e0f06 31749+
e4b2b4a8
JK
31750+ param = strsep(&params, ",");
31751+ if (!param) {
31752+ ret = -EINVAL;
31753+ goto out;
31754+ }
1a6e0f06 31755+
e4b2b4a8
JK
31756+ param = strstrip(param);
31757+ if (strlen(param) < 2) {
31758+ hist_err("Invalid action param: ", param);
b3bbd485
JK
31759+ ret = -EINVAL;
31760+ goto out;
31761+ }
1a6e0f06 31762+
e4b2b4a8
JK
31763+ saved_param = kstrdup(param, GFP_KERNEL);
31764+ if (!saved_param) {
31765+ ret = -ENOMEM;
31766+ goto out;
31767+ }
31768+
31769+ data->params[data->n_params++] = saved_param;
b3bbd485 31770+ }
e4b2b4a8
JK
31771+ out:
31772+ return ret;
31773+}
b3bbd485 31774+
e4b2b4a8 31775+static struct action_data *onmax_parse(char *str)
1a6e0f06 31776+{
e4b2b4a8
JK
31777+ char *onmax_fn_name, *onmax_var_str;
31778+ struct action_data *data;
31779+ int ret = -EINVAL;
1a6e0f06 31780+
e4b2b4a8
JK
31781+ data = kzalloc(sizeof(*data), GFP_KERNEL);
31782+ if (!data)
31783+ return ERR_PTR(-ENOMEM);
1a6e0f06 31784+
e4b2b4a8
JK
31785+ onmax_var_str = strsep(&str, ")");
31786+ if (!onmax_var_str || !str) {
b3bbd485 31787+ ret = -EINVAL;
e4b2b4a8 31788+ goto free;
b3bbd485
JK
31789+ }
31790+
e4b2b4a8
JK
31791+ data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
31792+ if (!data->onmax.var_str) {
31793+ ret = -ENOMEM;
31794+ goto free;
31795+ }
31796+
31797+ strsep(&str, ".");
31798+ if (!str)
31799+ goto free;
31800+
31801+ onmax_fn_name = strsep(&str, "(");
31802+ if (!onmax_fn_name || !str)
31803+ goto free;
31804+
31805+ if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
31806+ char *params = strsep(&str, ")");
31807+
31808+ if (!params) {
31809+ ret = -EINVAL;
31810+ goto free;
1a6e0f06 31811+ }
1a6e0f06 31812+
e4b2b4a8
JK
31813+ ret = parse_action_params(params, data);
31814+ if (ret)
31815+ goto free;
31816+ } else
31817+ goto free;
31818+
31819+ data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
31820+ if (!data->onmax.fn_name) {
31821+ ret = -ENOMEM;
31822+ goto free;
1a6e0f06 31823+ }
e4b2b4a8
JK
31824+ out:
31825+ return data;
31826+ free:
31827+ onmax_destroy(data);
31828+ data = ERR_PTR(ret);
31829+ goto out;
1a6e0f06
JK
31830+}
31831+
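onmax_parse() apparently receives the text after 'onmax(' -- the first strsep() at ')' suggests the caller has already skipped that prefix -- and carves it up with successive strsep() calls at ')', '.' and '('. The tokenisation on its own, using hypothetical variable and field names:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* The text after "onmax(" in a trigger such as
	 * onmax($wakeup_lat).save(next_comm,prev_pid)  (made-up values) */
	char action[] = "$wakeup_lat).save(next_comm,prev_pid)";
	char *str = action;
	char *var, *fn, *params, *param;

	var = strsep(&str, ")");        /* "$wakeup_lat"            */
	strsep(&str, ".");              /* discard the empty token  */
	fn = strsep(&str, "(");         /* "save"                   */
	params = strsep(&str, ")");     /* "next_comm,prev_pid"     */

	printf("var=%s fn=%s\n", var, fn);
	while ((param = strsep(&params, ",")))
		printf("param=%s\n", param);
	return 0;
}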
e4b2b4a8
JK
31832+static void onmatch_destroy(struct action_data *data)
31833+{
31834+ unsigned int i;
31835+
31836+ mutex_lock(&synth_event_mutex);
31837+
31838+ kfree(data->onmatch.match_event);
31839+ kfree(data->onmatch.match_event_system);
31840+ kfree(data->onmatch.synth_event_name);
31841+
31842+ for (i = 0; i < data->n_params; i++)
31843+ kfree(data->params[i]);
31844+
31845+ if (data->onmatch.synth_event)
31846+ data->onmatch.synth_event->ref--;
31847+
31848+ kfree(data);
31849+
31850+ mutex_unlock(&synth_event_mutex);
31851+}
31852+
31853+static void destroy_field_var(struct field_var *field_var)
31854+{
31855+ if (!field_var)
31856+ return;
31857+
31858+ destroy_hist_field(field_var->var, 0);
31859+ destroy_hist_field(field_var->val, 0);
31860+
31861+ kfree(field_var);
1a6e0f06
JK
31862+}
31863+
e4b2b4a8
JK
31864+static void destroy_field_vars(struct hist_trigger_data *hist_data)
31865+{
31866+ unsigned int i;
1a6e0f06 31867+
e4b2b4a8
JK
31868+ for (i = 0; i < hist_data->n_field_vars; i++)
31869+ destroy_field_var(hist_data->field_vars[i]);
31870+}
31871+
31872+static void save_field_var(struct hist_trigger_data *hist_data,
31873+ struct field_var *field_var)
1a6e0f06 31874+{
e4b2b4a8 31875+ hist_data->field_vars[hist_data->n_field_vars++] = field_var;
1a6e0f06 31876+
e4b2b4a8
JK
31877+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
31878+ hist_data->n_field_var_str++;
31879+}
1a6e0f06 31880+
e4b2b4a8
JK
31881+
31882+static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
31883+{
31884+ unsigned int i;
31885+
31886+ for (i = 0; i < hist_data->n_synth_var_refs; i++)
31887+ destroy_hist_field(hist_data->synth_var_refs[i], 0);
1a6e0f06 31888+}
e4b2b4a8
JK
31889+
31890+static void save_synth_var_ref(struct hist_trigger_data *hist_data,
31891+ struct hist_field *var_ref)
1a6e0f06 31892+{
e4b2b4a8
JK
31893+ hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
31894+
31895+ hist_data->var_refs[hist_data->n_var_refs] = var_ref;
31896+ var_ref->var_ref_idx = hist_data->n_var_refs++;
1a6e0f06 31897+}
1a6e0f06 31898+
e4b2b4a8
JK
31899+static int check_synth_field(struct synth_event *event,
31900+ struct hist_field *hist_field,
31901+ unsigned int field_pos)
1a6e0f06 31902+{
e4b2b4a8
JK
31903+ struct synth_field *field;
31904+
31905+ if (field_pos >= event->n_fields)
31906+ return -EINVAL;
31907+
31908+ field = event->fields[field_pos];
31909+
31910+ if (strcmp(field->type, hist_field->type) != 0)
31911+ return -EINVAL;
31912+
31913+ return 0;
1a6e0f06
JK
31914+}
31915+
e4b2b4a8
JK
31916+static struct hist_field *
31917+onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
31918+ char *system, char *event, char *var)
31919+{
31920+ struct hist_field *hist_field;
31921+
31922+ var++; /* skip '$' */
31923+
31924+ hist_field = find_target_event_var(hist_data, system, event, var);
31925+ if (!hist_field) {
31926+ if (!system) {
31927+ system = data->onmatch.match_event_system;
31928+ event = data->onmatch.match_event;
31929+ }
31930+
31931+ hist_field = find_event_var(hist_data, system, event, var);
1a6e0f06
JK
31932+ }
31933+
e4b2b4a8
JK
31934+ if (!hist_field)
31935+ hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);
31936+
31937+ return hist_field;
31938+}
31939+
31940+static struct hist_field *
31941+onmatch_create_field_var(struct hist_trigger_data *hist_data,
31942+ struct action_data *data, char *system,
31943+ char *event, char *var)
31944+{
31945+ struct hist_field *hist_field = NULL;
31946+ struct field_var *field_var;
31947+
31948+ /*
31949+ * First try to create a field var on the target event (the
 31950+ * one currently being defined). This will create a variable for
31951+ * unqualified fields on the target event, or if qualified,
31952+ * target fields that have qualified names matching the target.
31953+ */
31954+ field_var = create_target_field_var(hist_data, system, event, var);
31955+
31956+ if (field_var && !IS_ERR(field_var)) {
31957+ save_field_var(hist_data, field_var);
31958+ hist_field = field_var->var;
31959+ } else {
31960+ field_var = NULL;
31961+ /*
 31962+ * If no explicit system.event is specified, default to
31963+ * looking for fields on the onmatch(system.event.xxx)
31964+ * event.
31965+ */
31966+ if (!system) {
31967+ system = data->onmatch.match_event_system;
31968+ event = data->onmatch.match_event;
31969+ }
31970+
31971+ /*
31972+ * At this point, we're looking at a field on another
31973+ * event. Because we can't modify a hist trigger on
31974+ * another event to add a variable for a field, we need
31975+ * to create a new trigger on that event and create the
31976+ * variable at the same time.
31977+ */
31978+ hist_field = create_field_var_hist(hist_data, system, event, var);
31979+ if (IS_ERR(hist_field))
31980+ goto free;
31981+ }
31982+ out:
31983+ return hist_field;
31984+ free:
31985+ destroy_field_var(field_var);
31986+ hist_field = NULL;
31987+ goto out;
31988+}
31989+
31990+static int onmatch_create(struct hist_trigger_data *hist_data,
31991+ struct trace_event_file *file,
31992+ struct action_data *data)
31993+{
31994+ char *event_name, *param, *system = NULL;
31995+ struct hist_field *hist_field, *var_ref;
31996+ unsigned int i, var_ref_idx;
31997+ unsigned int field_pos = 0;
31998+ struct synth_event *event;
31999+ int ret = 0;
32000+
32001+ mutex_lock(&synth_event_mutex);
32002+ event = find_synth_event(data->onmatch.synth_event_name);
32003+ if (!event) {
32004+ hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
32005+ mutex_unlock(&synth_event_mutex);
32006+ return -EINVAL;
32007+ }
32008+ event->ref++;
32009+ mutex_unlock(&synth_event_mutex);
32010+
32011+ var_ref_idx = hist_data->n_var_refs;
32012+
32013+ for (i = 0; i < data->n_params; i++) {
32014+ char *p;
32015+
32016+ p = param = kstrdup(data->params[i], GFP_KERNEL);
32017+ if (!param) {
32018+ ret = -ENOMEM;
32019+ goto err;
32020+ }
32021+
32022+ system = strsep(&param, ".");
32023+ if (!param) {
32024+ param = (char *)system;
32025+ system = event_name = NULL;
32026+ } else {
32027+ event_name = strsep(&param, ".");
32028+ if (!param) {
32029+ kfree(p);
32030+ ret = -EINVAL;
32031+ goto err;
32032+ }
32033+ }
32034+
32035+ if (param[0] == '$')
32036+ hist_field = onmatch_find_var(hist_data, data, system,
32037+ event_name, param);
32038+ else
32039+ hist_field = onmatch_create_field_var(hist_data, data,
32040+ system,
32041+ event_name,
32042+ param);
32043+
32044+ if (!hist_field) {
32045+ kfree(p);
32046+ ret = -EINVAL;
32047+ goto err;
32048+ }
32049+
32050+ if (check_synth_field(event, hist_field, field_pos) == 0) {
32051+ var_ref = create_var_ref(hist_field, system, event_name);
32052+ if (!var_ref) {
32053+ kfree(p);
32054+ ret = -ENOMEM;
32055+ goto err;
32056+ }
32057+
32058+ save_synth_var_ref(hist_data, var_ref);
32059+ field_pos++;
32060+ kfree(p);
32061+ continue;
32062+ }
32063+
32064+ hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
32065+ system, event_name, param);
32066+ kfree(p);
32067+ ret = -EINVAL;
32068+ goto err;
32069+ }
32070+
32071+ if (field_pos != event->n_fields) {
32072+ hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
32073+ ret = -EINVAL;
32074+ goto err;
32075+ }
32076+
32077+ data->fn = action_trace;
32078+ data->onmatch.synth_event = event;
32079+ data->onmatch.var_ref_idx = var_ref_idx;
32080+ out:
32081+ return ret;
32082+ err:
32083+ mutex_lock(&synth_event_mutex);
32084+ event->ref--;
32085+ mutex_unlock(&synth_event_mutex);
32086+
32087+ goto out;
32088+}
32089+
32090+static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
32091+{
32092+ char *match_event, *match_event_system;
32093+ char *synth_event_name, *params;
32094+ struct action_data *data;
32095+ int ret = -EINVAL;
32096+
32097+ data = kzalloc(sizeof(*data), GFP_KERNEL);
32098+ if (!data)
32099+ return ERR_PTR(-ENOMEM);
32100+
32101+ match_event = strsep(&str, ")");
32102+ if (!match_event || !str) {
32103+ hist_err("onmatch: Missing closing paren: ", match_event);
32104+ goto free;
32105+ }
32106+
32107+ match_event_system = strsep(&match_event, ".");
32108+ if (!match_event) {
32109+ hist_err("onmatch: Missing subsystem for match event: ", match_event_system);
32110+ goto free;
32111+ }
32112+
32113+ if (IS_ERR(event_file(tr, match_event_system, match_event))) {
32114+ hist_err_event("onmatch: Invalid subsystem or event name: ",
32115+ match_event_system, match_event, NULL);
32116+ goto free;
b3bbd485
JK
32117 }
32118
32119- if (WARN_ON_ONCE(!field))
32120- goto out;
e4b2b4a8
JK
32121+ data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
32122+ if (!data->onmatch.match_event) {
32123+ ret = -ENOMEM;
32124+ goto free;
32125+ }
b3bbd485
JK
32126
32127- if (is_string_field(field)) {
32128- flags |= HIST_FIELD_FL_STRING;
e4b2b4a8
JK
32129+ data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
32130+ if (!data->onmatch.match_event_system) {
32131+ ret = -ENOMEM;
32132+ goto free;
32133+ }
b3bbd485
JK
32134
32135- if (field->filter_type == FILTER_STATIC_STRING)
32136- hist_field->fn = hist_field_string;
32137- else if (field->filter_type == FILTER_DYN_STRING)
32138- hist_field->fn = hist_field_dynstring;
32139- else
32140- hist_field->fn = hist_field_pstring;
32141- } else {
32142- hist_field->fn = select_value_fn(field->size,
32143- field->is_signed);
32144- if (!hist_field->fn) {
32145- destroy_hist_field(hist_field);
32146- return NULL;
32147- }
e4b2b4a8
JK
32148+ strsep(&str, ".");
32149+ if (!str) {
32150+ hist_err("onmatch: Missing . after onmatch(): ", str);
32151+ goto free;
b3bbd485
JK
32152 }
32153- out:
32154- hist_field->field = field;
32155- hist_field->flags = flags;
32156
32157- return hist_field;
32158-}
e4b2b4a8
JK
32159+ synth_event_name = strsep(&str, "(");
32160+ if (!synth_event_name || !str) {
32161+ hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
32162+ goto free;
32163+ }
b3bbd485
JK
32164
32165-static void destroy_hist_fields(struct hist_trigger_data *hist_data)
32166-{
32167- unsigned int i;
e4b2b4a8
JK
32168+ data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
32169+ if (!data->onmatch.synth_event_name) {
b3bbd485 32170+ ret = -ENOMEM;
e4b2b4a8
JK
32171+ goto free;
32172+ }
b3bbd485
JK
32173
32174- for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
32175- if (hist_data->fields[i]) {
32176- destroy_hist_field(hist_data->fields[i]);
32177- hist_data->fields[i] = NULL;
32178- }
e4b2b4a8
JK
32179+ params = strsep(&str, ")");
32180+ if (!params || !str || (str && strlen(str))) {
32181+ hist_err("onmatch: Missing closing paramlist paren: ", params);
32182+ goto free;
b3bbd485 32183 }
e4b2b4a8
JK
32184+
32185+ ret = parse_action_params(params, data);
32186+ if (ret)
32187+ goto free;
32188+ out:
32189+ return data;
32190+ free:
32191+ onmatch_destroy(data);
32192+ data = ERR_PTR(ret);
32193+ goto out;
b3bbd485
JK
32194 }
32195
32196 static int create_hitcount_val(struct hist_trigger_data *hist_data)
32197 {
32198 hist_data->fields[HITCOUNT_IDX] =
32199- create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
32200+ create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL);
32201 if (!hist_data->fields[HITCOUNT_IDX])
32202 return -ENOMEM;
32203
32204 hist_data->n_vals++;
32205+ hist_data->n_fields++;
32206
32207 if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
32208 return -EINVAL;
32209@@ -426,54 +3828,71 @@ static int create_hitcount_val(struct hist_trigger_data *hist_data)
32210 return 0;
32211 }
32212
32213+static int __create_val_field(struct hist_trigger_data *hist_data,
32214+ unsigned int val_idx,
32215+ struct trace_event_file *file,
32216+ char *var_name, char *field_str,
32217+ unsigned long flags)
32218+{
32219+ struct hist_field *hist_field;
32220+ int ret = 0;
32221+
32222+ hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
32223+ if (IS_ERR(hist_field)) {
32224+ ret = PTR_ERR(hist_field);
32225+ goto out;
32226+ }
32227+
32228+ hist_data->fields[val_idx] = hist_field;
32229+
32230+ ++hist_data->n_vals;
32231+ ++hist_data->n_fields;
32232+
32233+ if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
32234+ ret = -EINVAL;
32235+ out:
32236+ return ret;
32237+}
32238+
32239 static int create_val_field(struct hist_trigger_data *hist_data,
32240 unsigned int val_idx,
32241 struct trace_event_file *file,
32242 char *field_str)
32243 {
32244- struct ftrace_event_field *field = NULL;
32245- unsigned long flags = 0;
32246- char *field_name;
32247- int ret = 0;
32248-
32249 if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
32250 return -EINVAL;
32251
32252- field_name = strsep(&field_str, ".");
32253- if (field_str) {
32254- if (strcmp(field_str, "hex") == 0)
32255- flags |= HIST_FIELD_FL_HEX;
32256- else {
32257- ret = -EINVAL;
32258- goto out;
32259- }
32260- }
32261+ return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
32262+}
32263
32264- field = trace_find_event_field(file->event_call, field_name);
32265- if (!field || !field->size) {
32266- ret = -EINVAL;
32267- goto out;
32268- }
32269+static int create_var_field(struct hist_trigger_data *hist_data,
32270+ unsigned int val_idx,
32271+ struct trace_event_file *file,
32272+ char *var_name, char *expr_str)
32273+{
32274+ unsigned long flags = 0;
32275
32276- hist_data->fields[val_idx] = create_hist_field(field, flags);
32277- if (!hist_data->fields[val_idx]) {
32278- ret = -ENOMEM;
32279- goto out;
32280+ if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
32281+ return -EINVAL;
32282+
32283+ if (find_var(hist_data, file, var_name) && !hist_data->remove) {
32284+ hist_err("Variable already defined: ", var_name);
32285+ return -EINVAL;
32286 }
32287
32288- ++hist_data->n_vals;
32289+ flags |= HIST_FIELD_FL_VAR;
32290+ hist_data->n_vars++;
32291+ if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
32292+ return -EINVAL;
32293
32294- if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
32295- ret = -EINVAL;
32296- out:
32297- return ret;
32298+ return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
32299 }
32300
32301 static int create_val_fields(struct hist_trigger_data *hist_data,
32302 struct trace_event_file *file)
32303 {
32304 char *fields_str, *field_str;
32305- unsigned int i, j;
32306+ unsigned int i, j = 1;
32307 int ret;
32308
32309 ret = create_hitcount_val(hist_data);
32310@@ -493,12 +3912,15 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
32311 field_str = strsep(&fields_str, ",");
32312 if (!field_str)
32313 break;
32314+
32315 if (strcmp(field_str, "hitcount") == 0)
32316 continue;
32317+
32318 ret = create_val_field(hist_data, j++, file, field_str);
32319 if (ret)
32320 goto out;
32321 }
32322+
32323 if (fields_str && (strcmp(fields_str, "hitcount") != 0))
32324 ret = -EINVAL;
32325 out:
32326@@ -511,12 +3933,13 @@ static int create_key_field(struct hist_trigger_data *hist_data,
32327 struct trace_event_file *file,
32328 char *field_str)
32329 {
32330- struct ftrace_event_field *field = NULL;
32331+ struct hist_field *hist_field = NULL;
32332+
32333 unsigned long flags = 0;
32334 unsigned int key_size;
32335 int ret = 0;
32336
32337- if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
32338+ if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
32339 return -EINVAL;
32340
32341 flags |= HIST_FIELD_FL_KEY;
32342@@ -524,57 +3947,40 @@ static int create_key_field(struct hist_trigger_data *hist_data,
32343 if (strcmp(field_str, "stacktrace") == 0) {
32344 flags |= HIST_FIELD_FL_STACKTRACE;
32345 key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
32346+ hist_field = create_hist_field(hist_data, NULL, flags, NULL);
32347 } else {
32348- char *field_name = strsep(&field_str, ".");
32349-
32350- if (field_str) {
32351- if (strcmp(field_str, "hex") == 0)
32352- flags |= HIST_FIELD_FL_HEX;
32353- else if (strcmp(field_str, "sym") == 0)
32354- flags |= HIST_FIELD_FL_SYM;
32355- else if (strcmp(field_str, "sym-offset") == 0)
32356- flags |= HIST_FIELD_FL_SYM_OFFSET;
32357- else if ((strcmp(field_str, "execname") == 0) &&
32358- (strcmp(field_name, "common_pid") == 0))
32359- flags |= HIST_FIELD_FL_EXECNAME;
32360- else if (strcmp(field_str, "syscall") == 0)
32361- flags |= HIST_FIELD_FL_SYSCALL;
32362- else if (strcmp(field_str, "log2") == 0)
32363- flags |= HIST_FIELD_FL_LOG2;
32364- else {
32365- ret = -EINVAL;
32366- goto out;
32367- }
32368+ hist_field = parse_expr(hist_data, file, field_str, flags,
32369+ NULL, 0);
32370+ if (IS_ERR(hist_field)) {
32371+ ret = PTR_ERR(hist_field);
32372+ goto out;
32373 }
32374
32375- field = trace_find_event_field(file->event_call, field_name);
32376- if (!field || !field->size) {
32377+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
32378+ hist_err("Using variable references as keys not supported: ", field_str);
32379+ destroy_hist_field(hist_field, 0);
32380 ret = -EINVAL;
32381 goto out;
32382 }
32383
32384- if (is_string_field(field))
32385- key_size = MAX_FILTER_STR_VAL;
32386- else
32387- key_size = field->size;
32388+ key_size = hist_field->size;
32389 }
32390
32391- hist_data->fields[key_idx] = create_hist_field(field, flags);
32392- if (!hist_data->fields[key_idx]) {
32393- ret = -ENOMEM;
32394- goto out;
32395- }
32396+ hist_data->fields[key_idx] = hist_field;
32397
32398 key_size = ALIGN(key_size, sizeof(u64));
32399 hist_data->fields[key_idx]->size = key_size;
32400 hist_data->fields[key_idx]->offset = key_offset;
32401+
32402 hist_data->key_size += key_size;
32403+
32404 if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
32405 ret = -EINVAL;
32406 goto out;
32407 }
32408
32409 hist_data->n_keys++;
32410+ hist_data->n_fields++;
32411
32412 if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
32413 return -EINVAL;
32414@@ -618,21 +4024,113 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
32415 return ret;
32416 }
32417
32418+static int create_var_fields(struct hist_trigger_data *hist_data,
32419+ struct trace_event_file *file)
32420+{
32421+ unsigned int i, j = hist_data->n_vals;
32422+ int ret = 0;
32423+
32424+ unsigned int n_vars = hist_data->attrs->var_defs.n_vars;
32425+
32426+ for (i = 0; i < n_vars; i++) {
32427+ char *var_name = hist_data->attrs->var_defs.name[i];
32428+ char *expr = hist_data->attrs->var_defs.expr[i];
32429+
32430+ ret = create_var_field(hist_data, j++, file, var_name, expr);
32431+ if (ret)
32432+ goto out;
32433+ }
32434+ out:
32435+ return ret;
32436+}
32437+
32438+static void free_var_defs(struct hist_trigger_data *hist_data)
32439+{
32440+ unsigned int i;
32441+
32442+ for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
32443+ kfree(hist_data->attrs->var_defs.name[i]);
32444+ kfree(hist_data->attrs->var_defs.expr[i]);
32445+ }
32446+
32447+ hist_data->attrs->var_defs.n_vars = 0;
32448+}
32449+
32450+static int parse_var_defs(struct hist_trigger_data *hist_data)
32451+{
32452+ char *s, *str, *var_name, *field_str;
32453+ unsigned int i, j, n_vars = 0;
32454+ int ret = 0;
32455+
32456+ for (i = 0; i < hist_data->attrs->n_assignments; i++) {
32457+ str = hist_data->attrs->assignment_str[i];
32458+ for (j = 0; j < TRACING_MAP_VARS_MAX; j++) {
32459+ field_str = strsep(&str, ",");
32460+ if (!field_str)
32461+ break;
32462+
32463+ var_name = strsep(&field_str, "=");
32464+ if (!var_name || !field_str) {
32465+ hist_err("Malformed assignment: ", var_name);
32466+ ret = -EINVAL;
32467+ goto free;
32468+ }
32469+
32470+ if (n_vars == TRACING_MAP_VARS_MAX) {
32471+ hist_err("Too many variables defined: ", var_name);
32472+ ret = -EINVAL;
32473+ goto free;
32474+ }
32475+
32476+ s = kstrdup(var_name, GFP_KERNEL);
32477+ if (!s) {
32478+ ret = -ENOMEM;
32479+ goto free;
32480+ }
32481+ hist_data->attrs->var_defs.name[n_vars] = s;
32482+
32483+ s = kstrdup(field_str, GFP_KERNEL);
32484+ if (!s) {
32485+ kfree(hist_data->attrs->var_defs.name[n_vars]);
32486+ ret = -ENOMEM;
32487+ goto free;
32488+ }
32489+ hist_data->attrs->var_defs.expr[n_vars++] = s;
32490+
32491+ hist_data->attrs->var_defs.n_vars = n_vars;
32492+ }
32493+ }
32494+
32495+ return ret;
32496+ free:
32497+ free_var_defs(hist_data);
32498+
32499+ return ret;
32500+}
32501+
32502 static int create_hist_fields(struct hist_trigger_data *hist_data,
32503 struct trace_event_file *file)
32504 {
32505 int ret;
32506
32507+ ret = parse_var_defs(hist_data);
32508+ if (ret)
32509+ goto out;
32510+
32511 ret = create_val_fields(hist_data, file);
32512 if (ret)
32513 goto out;
32514
32515- ret = create_key_fields(hist_data, file);
32516+ ret = create_var_fields(hist_data, file);
32517 if (ret)
32518 goto out;
32519
32520- hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
32521+ ret = create_key_fields(hist_data, file);
32522+ if (ret)
32523+ goto out;
32524 out:
32525+ free_var_defs(hist_data);
32526+
32527 return ret;
32528 }
32529
32530@@ -653,10 +4151,9 @@ static int is_descending(const char *str)
32531 static int create_sort_keys(struct hist_trigger_data *hist_data)
32532 {
32533 char *fields_str = hist_data->attrs->sort_key_str;
32534- struct ftrace_event_field *field = NULL;
32535 struct tracing_map_sort_key *sort_key;
32536 int descending, ret = 0;
32537- unsigned int i, j;
32538+ unsigned int i, j, k;
32539
32540 hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
32541
32542@@ -670,7 +4167,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
32543 }
32544
32545 for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
32546+ struct hist_field *hist_field;
32547 char *field_str, *field_name;
32548+ const char *test_name;
32549
32550 sort_key = &hist_data->sort_keys[i];
32551
32552@@ -702,10 +4201,19 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
32553 continue;
32554 }
32555
32556- for (j = 1; j < hist_data->n_fields; j++) {
32557- field = hist_data->fields[j]->field;
32558- if (field && (strcmp(field_name, field->name) == 0)) {
32559- sort_key->field_idx = j;
32560+ for (j = 1, k = 1; j < hist_data->n_fields; j++) {
32561+ unsigned int idx;
32562+
32563+ hist_field = hist_data->fields[j];
32564+ if (hist_field->flags & HIST_FIELD_FL_VAR)
32565+ continue;
32566+
32567+ idx = k++;
32568+
32569+ test_name = hist_field_name(hist_field, 0);
32570+
32571+ if (strcmp(field_name, test_name) == 0) {
32572+ sort_key->field_idx = idx;
32573 descending = is_descending(field_str);
32574 if (descending < 0) {
32575 ret = descending;
32576@@ -720,16 +4228,230 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
32577 break;
32578 }
32579 }
32580- hist_data->n_sort_keys = i;
32581- out:
32582- return ret;
32583+
32584+ hist_data->n_sort_keys = i;
32585+ out:
32586+ return ret;
32587+}
32588+
32589+static void destroy_actions(struct hist_trigger_data *hist_data)
32590+{
32591+ unsigned int i;
32592+
32593+ for (i = 0; i < hist_data->n_actions; i++) {
32594+ struct action_data *data = hist_data->actions[i];
32595+
32596+ if (data->fn == action_trace)
32597+ onmatch_destroy(data);
32598+ else if (data->fn == onmax_save)
32599+ onmax_destroy(data);
32600+ else
32601+ kfree(data);
32602+ }
32603+}
32604+
32605+static int parse_actions(struct hist_trigger_data *hist_data)
32606+{
32607+ struct trace_array *tr = hist_data->event_file->tr;
32608+ struct action_data *data;
32609+ unsigned int i;
32610+ int ret = 0;
32611+ char *str;
32612+
32613+ for (i = 0; i < hist_data->attrs->n_actions; i++) {
32614+ str = hist_data->attrs->action_str[i];
32615+
32616+ if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
32617+ char *action_str = str + strlen("onmatch(");
32618+
32619+ data = onmatch_parse(tr, action_str);
32620+ if (IS_ERR(data)) {
32621+ ret = PTR_ERR(data);
32622+ break;
32623+ }
32624+ data->fn = action_trace;
32625+ } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
32626+ char *action_str = str + strlen("onmax(");
32627+
32628+ data = onmax_parse(action_str);
32629+ if (IS_ERR(data)) {
32630+ ret = PTR_ERR(data);
32631+ break;
32632+ }
32633+ data->fn = onmax_save;
32634+ } else {
32635+ ret = -EINVAL;
32636+ break;
32637+ }
32638+
32639+ hist_data->actions[hist_data->n_actions++] = data;
32640+ }
32641+
32642+ return ret;
32643+}
32644+
32645+static int create_actions(struct hist_trigger_data *hist_data,
32646+ struct trace_event_file *file)
32647+{
32648+ struct action_data *data;
32649+ unsigned int i;
32650+ int ret = 0;
32651+
32652+ for (i = 0; i < hist_data->attrs->n_actions; i++) {
32653+ data = hist_data->actions[i];
32654+
32655+ if (data->fn == action_trace) {
32656+ ret = onmatch_create(hist_data, file, data);
32657+ if (ret)
32658+ return ret;
32659+ } else if (data->fn == onmax_save) {
32660+ ret = onmax_create(hist_data, data);
32661+ if (ret)
32662+ return ret;
32663+ }
32664+ }
1a6e0f06 32665+
e4b2b4a8
JK
32666+ return ret;
32667+}
1a6e0f06 32668+
e4b2b4a8
JK
32669+static void print_actions(struct seq_file *m,
32670+ struct hist_trigger_data *hist_data,
32671+ struct tracing_map_elt *elt)
32672+{
32673+ unsigned int i;
1a6e0f06 32674+
e4b2b4a8
JK
32675+ for (i = 0; i < hist_data->n_actions; i++) {
32676+ struct action_data *data = hist_data->actions[i];
1a6e0f06 32677+
e4b2b4a8
JK
32678+ if (data->fn == onmax_save)
32679+ onmax_print(m, hist_data, elt, data);
32680+ }
32681+}
1a6e0f06 32682+
e4b2b4a8
JK
32683+static void print_onmax_spec(struct seq_file *m,
32684+ struct hist_trigger_data *hist_data,
32685+ struct action_data *data)
32686+{
32687+ unsigned int i;
1a6e0f06 32688+
e4b2b4a8
JK
32689+ seq_puts(m, ":onmax(");
32690+ seq_printf(m, "%s", data->onmax.var_str);
32691+ seq_printf(m, ").%s(", data->onmax.fn_name);
1a6e0f06 32692+
e4b2b4a8
JK
32693+ for (i = 0; i < hist_data->n_max_vars; i++) {
32694+ seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
32695+ if (i < hist_data->n_max_vars - 1)
32696+ seq_puts(m, ",");
32697+ }
32698+ seq_puts(m, ")");
32699+}
1a6e0f06 32700+
e4b2b4a8
JK
32701+static void print_onmatch_spec(struct seq_file *m,
32702+ struct hist_trigger_data *hist_data,
32703+ struct action_data *data)
32704+{
32705+ unsigned int i;
1a6e0f06 32706+
e4b2b4a8
JK
32707+ seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
32708+ data->onmatch.match_event);
1a6e0f06 32709+
e4b2b4a8 32710+ seq_printf(m, "%s(", data->onmatch.synth_event->name);
1a6e0f06 32711+
e4b2b4a8
JK
32712+ for (i = 0; i < data->n_params; i++) {
32713+ if (i)
32714+ seq_puts(m, ",");
32715+ seq_printf(m, "%s", data->params[i]);
32716+ }
1a6e0f06 32717+
e4b2b4a8
JK
32718+ seq_puts(m, ")");
32719+}
1a6e0f06 32720+
e4b2b4a8
JK
32721+static bool actions_match(struct hist_trigger_data *hist_data,
32722+ struct hist_trigger_data *hist_data_test)
1a6e0f06 32723+{
e4b2b4a8 32724+ unsigned int i, j;
1a6e0f06 32725+
e4b2b4a8
JK
32726+ if (hist_data->n_actions != hist_data_test->n_actions)
32727+ return false;
1a6e0f06 32728+
e4b2b4a8
JK
32729+ for (i = 0; i < hist_data->n_actions; i++) {
32730+ struct action_data *data = hist_data->actions[i];
32731+ struct action_data *data_test = hist_data_test->actions[i];
1a6e0f06 32732+
e4b2b4a8
JK
32733+ if (data->fn != data_test->fn)
32734+ return false;
1a6e0f06 32735+
e4b2b4a8
JK
32736+ if (data->n_params != data_test->n_params)
32737+ return false;
1a6e0f06 32738+
e4b2b4a8
JK
32739+ for (j = 0; j < data->n_params; j++) {
32740+ if (strcmp(data->params[j], data_test->params[j]) != 0)
32741+ return false;
32742+ }
1a6e0f06 32743+
e4b2b4a8
JK
32744+ if (data->fn == action_trace) {
32745+ if (strcmp(data->onmatch.synth_event_name,
32746+ data_test->onmatch.synth_event_name) != 0)
32747+ return false;
32748+ if (strcmp(data->onmatch.match_event_system,
32749+ data_test->onmatch.match_event_system) != 0)
32750+ return false;
32751+ if (strcmp(data->onmatch.match_event,
32752+ data_test->onmatch.match_event) != 0)
32753+ return false;
32754+ } else if (data->fn == onmax_save) {
32755+ if (strcmp(data->onmax.var_str,
32756+ data_test->onmax.var_str) != 0)
32757+ return false;
32758+ if (strcmp(data->onmax.fn_name,
32759+ data_test->onmax.fn_name) != 0)
32760+ return false;
1a6e0f06 32761+ }
1a6e0f06 32762+ }
1a6e0f06 32763+
e4b2b4a8
JK
32764+ return true;
32765+}
1a6e0f06 32766+
1a6e0f06 32767+
e4b2b4a8
JK
32768+static void print_actions_spec(struct seq_file *m,
32769+ struct hist_trigger_data *hist_data)
32770+{
32771+ unsigned int i;
1a6e0f06 32772+
e4b2b4a8
JK
32773+ for (i = 0; i < hist_data->n_actions; i++) {
32774+ struct action_data *data = hist_data->actions[i];
1a6e0f06 32775+
e4b2b4a8
JK
32776+ if (data->fn == action_trace)
32777+ print_onmatch_spec(m, hist_data, data);
32778+ else if (data->fn == onmax_save)
32779+ print_onmax_spec(m, hist_data, data);
1a6e0f06 32780+ }
1a6e0f06
JK
32781+}
32782+
e4b2b4a8 32783+static void destroy_field_var_hists(struct hist_trigger_data *hist_data)
1a6e0f06 32784+{
e4b2b4a8 32785+ unsigned int i;
1a6e0f06 32786+
e4b2b4a8
JK
32787+ for (i = 0; i < hist_data->n_field_var_hists; i++) {
32788+ kfree(hist_data->field_var_hists[i]->cmd);
32789+ kfree(hist_data->field_var_hists[i]);
1a6e0f06 32790+ }
b3bbd485
JK
32791 }
32792
e4b2b4a8
JK
32793 static void destroy_hist_data(struct hist_trigger_data *hist_data)
32794 {
32795+ if (!hist_data)
32796+ return;
1a6e0f06 32797+
e4b2b4a8
JK
32798 destroy_hist_trigger_attrs(hist_data->attrs);
32799 destroy_hist_fields(hist_data);
32800 tracing_map_destroy(hist_data->map);
1a6e0f06 32801+
e4b2b4a8
JK
32802+ destroy_actions(hist_data);
32803+ destroy_field_vars(hist_data);
32804+ destroy_field_var_hists(hist_data);
32805+ destroy_synth_var_refs(hist_data);
32806+
32807 kfree(hist_data);
32808 }
32809
b3bbd485 32810@@ -738,7 +4460,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
e4b2b4a8
JK
32811 struct tracing_map *map = hist_data->map;
32812 struct ftrace_event_field *field;
32813 struct hist_field *hist_field;
32814- int i, idx;
32815+ int i, idx = 0;
32816
32817 for_each_hist_field(i, hist_data) {
32818 hist_field = hist_data->fields[i];
b3bbd485 32819@@ -749,6 +4471,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
e4b2b4a8
JK
32820
32821 if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
32822 cmp_fn = tracing_map_cmp_none;
32823+ else if (!field)
32824+ cmp_fn = tracing_map_cmp_num(hist_field->size,
32825+ hist_field->is_signed);
32826 else if (is_string_field(field))
32827 cmp_fn = tracing_map_cmp_string;
32828 else
b3bbd485 32829@@ -757,36 +4482,29 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
e4b2b4a8
JK
32830 idx = tracing_map_add_key_field(map,
32831 hist_field->offset,
32832 cmp_fn);
32833-
32834- } else
32835+ } else if (!(hist_field->flags & HIST_FIELD_FL_VAR))
32836 idx = tracing_map_add_sum_field(map);
32837
32838 if (idx < 0)
32839 return idx;
32840- }
32841-
32842- return 0;
32843-}
32844-
32845-static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
32846-{
32847- struct hist_field *key_field;
32848- unsigned int i;
32849-
32850- for_each_hist_key_field(i, hist_data) {
32851- key_field = hist_data->fields[i];
32852
32853- if (key_field->flags & HIST_FIELD_FL_EXECNAME)
32854- return true;
32855+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
32856+ idx = tracing_map_add_var(map);
32857+ if (idx < 0)
32858+ return idx;
32859+ hist_field->var.idx = idx;
32860+ hist_field->var.hist_data = hist_data;
32861+ }
32862 }
32863
32864- return false;
1a6e0f06 32865+ return 0;
e4b2b4a8
JK
32866 }
32867
32868 static struct hist_trigger_data *
32869 create_hist_data(unsigned int map_bits,
32870 struct hist_trigger_attrs *attrs,
32871- struct trace_event_file *file)
32872+ struct trace_event_file *file,
32873+ bool remove)
32874 {
32875 const struct tracing_map_ops *map_ops = NULL;
32876 struct hist_trigger_data *hist_data;
b3bbd485 32877@@ -797,6 +4515,12 @@ create_hist_data(unsigned int map_bits,
e4b2b4a8
JK
32878 return ERR_PTR(-ENOMEM);
32879
32880 hist_data->attrs = attrs;
32881+ hist_data->remove = remove;
32882+ hist_data->event_file = file;
1a6e0f06 32883+
e4b2b4a8
JK
32884+ ret = parse_actions(hist_data);
32885+ if (ret)
32886+ goto free;
32887
32888 ret = create_hist_fields(hist_data, file);
32889 if (ret)
b3bbd485 32890@@ -806,8 +4530,7 @@ create_hist_data(unsigned int map_bits,
e4b2b4a8
JK
32891 if (ret)
32892 goto free;
32893
32894- if (need_tracing_map_ops(hist_data))
32895- map_ops = &hist_trigger_elt_comm_ops;
32896+ map_ops = &hist_trigger_elt_data_ops;
32897
32898 hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
32899 map_ops, hist_data);
b3bbd485 32900@@ -820,12 +4543,6 @@ create_hist_data(unsigned int map_bits,
e4b2b4a8
JK
32901 ret = create_tracing_map_fields(hist_data);
32902 if (ret)
32903 goto free;
32904-
32905- ret = tracing_map_init(hist_data->map);
32906- if (ret)
32907- goto free;
32908-
32909- hist_data->event_file = file;
32910 out:
32911 return hist_data;
32912 free:
b3bbd485 32913@@ -839,18 +4556,39 @@ create_hist_data(unsigned int map_bits,
e4b2b4a8
JK
32914 }
32915
32916 static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
32917- struct tracing_map_elt *elt,
32918- void *rec)
32919+ struct tracing_map_elt *elt, void *rec,
32920+ struct ring_buffer_event *rbe,
32921+ u64 *var_ref_vals)
32922 {
32923+ struct hist_elt_data *elt_data;
32924 struct hist_field *hist_field;
32925- unsigned int i;
32926+ unsigned int i, var_idx;
32927 u64 hist_val;
32928
32929+ elt_data = elt->private_data;
32930+ elt_data->var_ref_vals = var_ref_vals;
32931+
32932 for_each_hist_val_field(i, hist_data) {
32933 hist_field = hist_data->fields[i];
32934- hist_val = hist_field->fn(hist_field, rec);
32935+ hist_val = hist_field->fn(hist_field, elt, rbe, rec);
32936+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
32937+ var_idx = hist_field->var.idx;
32938+ tracing_map_set_var(elt, var_idx, hist_val);
32939+ continue;
32940+ }
32941 tracing_map_update_sum(elt, i, hist_val);
32942 }
32943+
32944+ for_each_hist_key_field(i, hist_data) {
32945+ hist_field = hist_data->fields[i];
32946+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
32947+ hist_val = hist_field->fn(hist_field, elt, rbe, rec);
32948+ var_idx = hist_field->var.idx;
32949+ tracing_map_set_var(elt, var_idx, hist_val);
32950+ }
32951+ }
1a6e0f06 32952+
e4b2b4a8
JK
32953+ update_field_vars(hist_data, elt, rbe, rec);
32954 }
32955
32956 static inline void add_to_key(char *compound_key, void *key,
b3bbd485 32957@@ -877,15 +4615,31 @@ static inline void add_to_key(char *compound_key, void *key,
e4b2b4a8
JK
32958 memcpy(compound_key + key_field->offset, key, size);
32959 }
32960
32961-static void event_hist_trigger(struct event_trigger_data *data, void *rec)
32962+static void
32963+hist_trigger_actions(struct hist_trigger_data *hist_data,
32964+ struct tracing_map_elt *elt, void *rec,
32965+ struct ring_buffer_event *rbe, u64 *var_ref_vals)
1a6e0f06 32966+{
e4b2b4a8
JK
32967+ struct action_data *data;
32968+ unsigned int i;
1a6e0f06 32969+
e4b2b4a8
JK
32970+ for (i = 0; i < hist_data->n_actions; i++) {
32971+ data = hist_data->actions[i];
32972+ data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
1a6e0f06 32973+ }
1a6e0f06
JK
32974+}
32975+
e4b2b4a8
JK
32976+static void event_hist_trigger(struct event_trigger_data *data, void *rec,
32977+ struct ring_buffer_event *rbe)
32978 {
32979 struct hist_trigger_data *hist_data = data->private_data;
32980 bool use_compound_key = (hist_data->n_keys > 1);
32981 unsigned long entries[HIST_STACKTRACE_DEPTH];
32982+ u64 var_ref_vals[TRACING_MAP_VARS_MAX];
32983 char compound_key[HIST_KEY_SIZE_MAX];
32984+ struct tracing_map_elt *elt = NULL;
32985 struct stack_trace stacktrace;
32986 struct hist_field *key_field;
32987- struct tracing_map_elt *elt;
32988 u64 field_contents;
32989 void *key = NULL;
32990 unsigned int i;
b3bbd485 32991@@ -906,7 +4660,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
e4b2b4a8
JK
32992
32993 key = entries;
32994 } else {
32995- field_contents = key_field->fn(key_field, rec);
32996+ field_contents = key_field->fn(key_field, elt, rbe, rec);
32997 if (key_field->flags & HIST_FIELD_FL_STRING) {
32998 key = (void *)(unsigned long)field_contents;
32999 use_compound_key = true;
b3bbd485 33000@@ -921,9 +4675,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
e4b2b4a8
JK
33001 if (use_compound_key)
33002 key = compound_key;
33003
33004+ if (hist_data->n_var_refs &&
33005+ !resolve_var_refs(hist_data, key, var_ref_vals, false))
33006+ return;
1a6e0f06 33007+
e4b2b4a8
JK
33008 elt = tracing_map_insert(hist_data->map, key);
33009- if (elt)
33010- hist_trigger_elt_update(hist_data, elt, rec);
33011+ if (!elt)
33012+ return;
1a6e0f06 33013+
e4b2b4a8
JK
33014+ hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
33015+
33016+ if (resolve_var_refs(hist_data, key, var_ref_vals, true))
33017+ hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
33018 }
33019
33020 static void hist_trigger_stacktrace_print(struct seq_file *m,
b3bbd485 33021@@ -952,6 +4715,7 @@ hist_trigger_entry_print(struct seq_file *m,
e4b2b4a8
JK
33022 struct hist_field *key_field;
33023 char str[KSYM_SYMBOL_LEN];
33024 bool multiline = false;
33025+ const char *field_name;
33026 unsigned int i;
33027 u64 uval;
33028
b3bbd485 33029@@ -963,26 +4727,33 @@ hist_trigger_entry_print(struct seq_file *m,
e4b2b4a8
JK
33030 if (i > hist_data->n_vals)
33031 seq_puts(m, ", ");
33032
33033+ field_name = hist_field_name(key_field, 0);
33034+
33035 if (key_field->flags & HIST_FIELD_FL_HEX) {
33036 uval = *(u64 *)(key + key_field->offset);
33037- seq_printf(m, "%s: %llx",
33038- key_field->field->name, uval);
33039+ seq_printf(m, "%s: %llx", field_name, uval);
33040 } else if (key_field->flags & HIST_FIELD_FL_SYM) {
33041 uval = *(u64 *)(key + key_field->offset);
33042 sprint_symbol_no_offset(str, uval);
33043- seq_printf(m, "%s: [%llx] %-45s",
33044- key_field->field->name, uval, str);
33045+ seq_printf(m, "%s: [%llx] %-45s", field_name,
33046+ uval, str);
33047 } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
33048 uval = *(u64 *)(key + key_field->offset);
33049 sprint_symbol(str, uval);
33050- seq_printf(m, "%s: [%llx] %-55s",
33051- key_field->field->name, uval, str);
33052+ seq_printf(m, "%s: [%llx] %-55s", field_name,
33053+ uval, str);
33054 } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
33055- char *comm = elt->private_data;
33056+ struct hist_elt_data *elt_data = elt->private_data;
33057+ char *comm;
33058+
33059+ if (WARN_ON_ONCE(!elt_data))
33060+ return;
1a6e0f06 33061+
e4b2b4a8
JK
33062+ comm = elt_data->comm;
33063
33064 uval = *(u64 *)(key + key_field->offset);
33065- seq_printf(m, "%s: %-16s[%10llu]",
33066- key_field->field->name, comm, uval);
33067+ seq_printf(m, "%s: %-16s[%10llu]", field_name,
33068+ comm, uval);
33069 } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
33070 const char *syscall_name;
33071
b3bbd485 33072@@ -991,8 +4762,8 @@ hist_trigger_entry_print(struct seq_file *m,
e4b2b4a8
JK
33073 if (!syscall_name)
33074 syscall_name = "unknown_syscall";
33075
33076- seq_printf(m, "%s: %-30s[%3llu]",
33077- key_field->field->name, syscall_name, uval);
33078+ seq_printf(m, "%s: %-30s[%3llu]", field_name,
33079+ syscall_name, uval);
33080 } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
33081 seq_puts(m, "stacktrace:\n");
33082 hist_trigger_stacktrace_print(m,
b3bbd485 33083@@ -1000,15 +4771,14 @@ hist_trigger_entry_print(struct seq_file *m,
e4b2b4a8
JK
33084 HIST_STACKTRACE_DEPTH);
33085 multiline = true;
33086 } else if (key_field->flags & HIST_FIELD_FL_LOG2) {
33087- seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
33088+ seq_printf(m, "%s: ~ 2^%-2llu", field_name,
33089 *(u64 *)(key + key_field->offset));
33090 } else if (key_field->flags & HIST_FIELD_FL_STRING) {
33091- seq_printf(m, "%s: %-50s", key_field->field->name,
33092+ seq_printf(m, "%s: %-50s", field_name,
33093 (char *)(key + key_field->offset));
33094 } else {
33095 uval = *(u64 *)(key + key_field->offset);
33096- seq_printf(m, "%s: %10llu", key_field->field->name,
33097- uval);
33098+ seq_printf(m, "%s: %10llu", field_name, uval);
33099 }
33100 }
33101
b3bbd485 33102@@ -1021,17 +4791,23 @@ hist_trigger_entry_print(struct seq_file *m,
e4b2b4a8
JK
33103 tracing_map_read_sum(elt, HITCOUNT_IDX));
33104
33105 for (i = 1; i < hist_data->n_vals; i++) {
33106+ field_name = hist_field_name(hist_data->fields[i], 0);
1a6e0f06 33107+
e4b2b4a8
JK
33108+ if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
33109+ hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
33110+ continue;
1a6e0f06 33111+
e4b2b4a8
JK
33112 if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
33113- seq_printf(m, " %s: %10llx",
33114- hist_data->fields[i]->field->name,
33115+ seq_printf(m, " %s: %10llx", field_name,
33116 tracing_map_read_sum(elt, i));
33117 } else {
33118- seq_printf(m, " %s: %10llu",
33119- hist_data->fields[i]->field->name,
33120+ seq_printf(m, " %s: %10llu", field_name,
33121 tracing_map_read_sum(elt, i));
33122 }
33123 }
33124
33125+ print_actions(m, hist_data, elt);
1a6e0f06 33126+
e4b2b4a8
JK
33127 seq_puts(m, "\n");
33128 }
33129
b3bbd485 33130@@ -1102,6 +4878,11 @@ static int hist_show(struct seq_file *m, void *v)
e4b2b4a8
JK
33131 hist_trigger_show(m, data, n++);
33132 }
33133
33134+ if (have_hist_err()) {
33135+ seq_printf(m, "\nERROR: %s\n", hist_err_str);
33136+ seq_printf(m, " Last command: %s\n", last_hist_cmd);
33137+ }
1a6e0f06 33138+
e4b2b4a8
JK
33139 out_unlock:
33140 mutex_unlock(&event_mutex);
33141
b3bbd485 33142@@ -1120,34 +4901,31 @@ const struct file_operations event_hist_fops = {
e4b2b4a8
JK
33143 .release = single_release,
33144 };
33145
33146-static const char *get_hist_field_flags(struct hist_field *hist_field)
33147+static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
33148 {
33149- const char *flags_str = NULL;
33150+ const char *field_name = hist_field_name(hist_field, 0);
33151
33152- if (hist_field->flags & HIST_FIELD_FL_HEX)
33153- flags_str = "hex";
33154- else if (hist_field->flags & HIST_FIELD_FL_SYM)
33155- flags_str = "sym";
33156- else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
33157- flags_str = "sym-offset";
33158- else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
33159- flags_str = "execname";
33160- else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
33161- flags_str = "syscall";
33162- else if (hist_field->flags & HIST_FIELD_FL_LOG2)
33163- flags_str = "log2";
33164+ if (hist_field->var.name)
33165+ seq_printf(m, "%s=", hist_field->var.name);
33166
33167- return flags_str;
33168-}
33169+ if (hist_field->flags & HIST_FIELD_FL_CPU)
33170+ seq_puts(m, "cpu");
33171+ else if (field_name) {
33172+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
33173+ hist_field->flags & HIST_FIELD_FL_ALIAS)
33174+ seq_putc(m, '$');
33175+ seq_printf(m, "%s", field_name);
33176+ } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
33177+ seq_puts(m, "common_timestamp");
33178
33179-static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
33180-{
33181- seq_printf(m, "%s", hist_field->field->name);
33182 if (hist_field->flags) {
33183- const char *flags_str = get_hist_field_flags(hist_field);
33184+ if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
33185+ !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
33186+ const char *flags = get_hist_field_flags(hist_field);
33187
33188- if (flags_str)
33189- seq_printf(m, ".%s", flags_str);
33190+ if (flags)
33191+ seq_printf(m, ".%s", flags);
33192+ }
33193 }
33194 }
33195
b3bbd485 33196@@ -1156,7 +4934,8 @@ static int event_hist_trigger_print(struct seq_file *m,
e4b2b4a8
JK
33197 struct event_trigger_data *data)
33198 {
33199 struct hist_trigger_data *hist_data = data->private_data;
33200- struct hist_field *key_field;
33201+ struct hist_field *field;
33202+ bool have_var = false;
33203 unsigned int i;
33204
33205 seq_puts(m, "hist:");
b3bbd485 33206@@ -1167,25 +4946,47 @@ static int event_hist_trigger_print(struct seq_file *m,
e4b2b4a8
JK
33207 seq_puts(m, "keys=");
33208
33209 for_each_hist_key_field(i, hist_data) {
33210- key_field = hist_data->fields[i];
33211+ field = hist_data->fields[i];
33212
33213 if (i > hist_data->n_vals)
33214 seq_puts(m, ",");
33215
33216- if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
33217+ if (field->flags & HIST_FIELD_FL_STACKTRACE)
33218 seq_puts(m, "stacktrace");
33219 else
33220- hist_field_print(m, key_field);
33221+ hist_field_print(m, field);
33222 }
33223
33224 seq_puts(m, ":vals=");
33225
33226 for_each_hist_val_field(i, hist_data) {
33227+ field = hist_data->fields[i];
33228+ if (field->flags & HIST_FIELD_FL_VAR) {
33229+ have_var = true;
33230+ continue;
1a6e0f06
JK
33231+ }
33232+
e4b2b4a8
JK
33233 if (i == HITCOUNT_IDX)
33234 seq_puts(m, "hitcount");
33235 else {
33236 seq_puts(m, ",");
33237- hist_field_print(m, hist_data->fields[i]);
33238+ hist_field_print(m, field);
33239+ }
1a6e0f06
JK
33240+ }
33241+
e4b2b4a8
JK
33242+ if (have_var) {
33243+ unsigned int n = 0;
33244+
33245+ seq_puts(m, ":");
33246+
33247+ for_each_hist_val_field(i, hist_data) {
33248+ field = hist_data->fields[i];
33249+
33250+ if (field->flags & HIST_FIELD_FL_VAR) {
33251+ if (n++)
33252+ seq_puts(m, ",");
33253+ hist_field_print(m, field);
33254+ }
33255 }
33256 }
33257
b3bbd485 33258@@ -1193,28 +4994,36 @@ static int event_hist_trigger_print(struct seq_file *m,
e4b2b4a8
JK
33259
33260 for (i = 0; i < hist_data->n_sort_keys; i++) {
33261 struct tracing_map_sort_key *sort_key;
33262+ unsigned int idx, first_key_idx;
33263+
33264+ /* skip VAR vals */
33265+ first_key_idx = hist_data->n_vals - hist_data->n_vars;
33266
33267 sort_key = &hist_data->sort_keys[i];
33268+ idx = sort_key->field_idx;
33269+
33270+ if (WARN_ON(idx >= HIST_FIELDS_MAX))
33271+ return -EINVAL;
33272
33273 if (i > 0)
33274 seq_puts(m, ",");
33275
33276- if (sort_key->field_idx == HITCOUNT_IDX)
33277+ if (idx == HITCOUNT_IDX)
33278 seq_puts(m, "hitcount");
33279 else {
33280- unsigned int idx = sort_key->field_idx;
33281-
33282- if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
33283- return -EINVAL;
33284-
33285+ if (idx >= first_key_idx)
33286+ idx += hist_data->n_vars;
33287 hist_field_print(m, hist_data->fields[idx]);
33288 }
33289
33290 if (sort_key->descending)
33291 seq_puts(m, ".descending");
33292 }
33293-
33294 seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
33295+ if (hist_data->enable_timestamps)
33296+ seq_printf(m, ":clock=%s", hist_data->attrs->clock);
1a6e0f06 33297+
e4b2b4a8
JK
33298+ print_actions_spec(m, hist_data);
33299
33300 if (data->filter_str)
33301 seq_printf(m, " if %s", data->filter_str);
b3bbd485 33302@@ -1242,6 +5051,21 @@ static int event_hist_trigger_init(struct event_trigger_ops *ops,
e4b2b4a8
JK
33303 return 0;
33304 }
33305
33306+static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
1a6e0f06 33307+{
e4b2b4a8
JK
33308+ struct trace_event_file *file;
33309+ unsigned int i;
33310+ char *cmd;
33311+ int ret;
1a6e0f06 33312+
e4b2b4a8
JK
33313+ for (i = 0; i < hist_data->n_field_var_hists; i++) {
33314+ file = hist_data->field_var_hists[i]->hist_data->event_file;
33315+ cmd = hist_data->field_var_hists[i]->cmd;
33316+ ret = event_hist_trigger_func(&trigger_hist_cmd, file,
33317+ "!hist", "hist", cmd);
33318+ }
1a6e0f06
JK
33319+}
33320+
e4b2b4a8
JK
33321 static void event_hist_trigger_free(struct event_trigger_ops *ops,
33322 struct event_trigger_data *data)
33323 {
b3bbd485 33324@@ -1254,7 +5078,13 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
e4b2b4a8
JK
33325 if (!data->ref) {
33326 if (data->name)
33327 del_named_trigger(data);
1a6e0f06 33328+
e4b2b4a8 33329 trigger_data_free(data);
1a6e0f06 33330+
e4b2b4a8 33331+ remove_hist_vars(hist_data);
1a6e0f06 33332+
e4b2b4a8 33333+ unregister_field_var_hists(hist_data);
1a6e0f06 33334+
e4b2b4a8
JK
33335 destroy_hist_data(hist_data);
33336 }
33337 }
b3bbd485 33338@@ -1381,6 +5211,15 @@ static bool hist_trigger_match(struct event_trigger_data *data,
e4b2b4a8
JK
33339 return false;
33340 if (key_field->offset != key_field_test->offset)
33341 return false;
33342+ if (key_field->size != key_field_test->size)
33343+ return false;
33344+ if (key_field->is_signed != key_field_test->is_signed)
33345+ return false;
33346+ if (!!key_field->var.name != !!key_field_test->var.name)
33347+ return false;
33348+ if (key_field->var.name &&
33349+ strcmp(key_field->var.name, key_field_test->var.name) != 0)
33350+ return false;
33351 }
33352
33353 for (i = 0; i < hist_data->n_sort_keys; i++) {
b3bbd485 33354@@ -1396,6 +5235,9 @@ static bool hist_trigger_match(struct event_trigger_data *data,
e4b2b4a8
JK
33355 (strcmp(data->filter_str, data_test->filter_str) != 0))
33356 return false;
33357
33358+ if (!actions_match(hist_data, hist_data_test))
33359+ return false;
1a6e0f06 33360+
e4b2b4a8
JK
33361 return true;
33362 }
33363
b3bbd485 33364@@ -1412,6 +5254,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
e4b2b4a8
JK
33365 if (named_data) {
33366 if (!hist_trigger_match(data, named_data, named_data,
33367 true)) {
33368+ hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name);
33369 ret = -EINVAL;
33370 goto out;
33371 }
b3bbd485 33372@@ -1431,13 +5274,16 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
e4b2b4a8
JK
33373 test->paused = false;
33374 else if (hist_data->attrs->clear)
33375 hist_clear(test);
33376- else
33377+ else {
33378+ hist_err("Hist trigger already exists", NULL);
33379 ret = -EEXIST;
33380+ }
33381 goto out;
33382 }
33383 }
33384 new:
33385 if (hist_data->attrs->cont || hist_data->attrs->clear) {
33386+ hist_err("Can't clear or continue a nonexistent hist trigger", NULL);
33387 ret = -ENOENT;
33388 goto out;
33389 }
b3bbd485 33390@@ -1446,7 +5292,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
e4b2b4a8
JK
33391 data->paused = true;
33392
33393 if (named_data) {
33394- destroy_hist_data(data->private_data);
33395 data->private_data = named_data->private_data;
33396 set_named_trigger_data(data, named_data);
33397 data->ops = &event_hist_trigger_named_ops;
b3bbd485 33398@@ -1458,8 +5303,32 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
e4b2b4a8
JK
33399 goto out;
33400 }
33401
33402- list_add_rcu(&data->list, &file->triggers);
33403+ if (hist_data->enable_timestamps) {
33404+ char *clock = hist_data->attrs->clock;
1a6e0f06 33405+
e4b2b4a8
JK
33406+ ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
33407+ if (ret) {
33408+ hist_err("Couldn't set trace_clock: ", clock);
33409+ goto out;
33410+ }
1a6e0f06 33411+
e4b2b4a8 33412+ tracing_set_time_stamp_abs(file->tr, true);
1a6e0f06
JK
33413+ }
33414+
e4b2b4a8
JK
33415+ if (named_data)
33416+ destroy_hist_data(hist_data);
1a6e0f06 33417+
e4b2b4a8
JK
33418 ret++;
33419+ out:
33420+ return ret;
1a6e0f06 33421+}
1a6e0f06 33422+
e4b2b4a8
JK
33423+static int hist_trigger_enable(struct event_trigger_data *data,
33424+ struct trace_event_file *file)
1a6e0f06 33425+{
e4b2b4a8 33426+ int ret = 0;
1a6e0f06 33427+
e4b2b4a8
JK
33428+ list_add_tail_rcu(&data->list, &file->triggers);
33429
33430 update_cond_flag(file);
33431
b3bbd485 33432@@ -1468,10 +5337,55 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
e4b2b4a8
JK
33433 update_cond_flag(file);
33434 ret--;
33435 }
33436- out:
1a6e0f06 33437+
e4b2b4a8
JK
33438 return ret;
33439 }
33440
33441+static bool have_hist_trigger_match(struct event_trigger_data *data,
33442+ struct trace_event_file *file)
1a6e0f06 33443+{
e4b2b4a8
JK
33444+ struct hist_trigger_data *hist_data = data->private_data;
33445+ struct event_trigger_data *test, *named_data = NULL;
33446+ bool match = false;
1a6e0f06 33447+
e4b2b4a8
JK
33448+ if (hist_data->attrs->name)
33449+ named_data = find_named_trigger(hist_data->attrs->name);
1a6e0f06 33450+
e4b2b4a8
JK
33451+ list_for_each_entry_rcu(test, &file->triggers, list) {
33452+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33453+ if (hist_trigger_match(data, test, named_data, false)) {
33454+ match = true;
33455+ break;
1a6e0f06 33456+ }
1a6e0f06
JK
33457+ }
33458+ }
1a6e0f06 33459+
e4b2b4a8
JK
33460+ return match;
33461+}
1a6e0f06 33462+
e4b2b4a8
JK
33463+static bool hist_trigger_check_refs(struct event_trigger_data *data,
33464+ struct trace_event_file *file)
1a6e0f06 33465+{
e4b2b4a8
JK
33466+ struct hist_trigger_data *hist_data = data->private_data;
33467+ struct event_trigger_data *test, *named_data = NULL;
1a6e0f06 33468+
e4b2b4a8
JK
33469+ if (hist_data->attrs->name)
33470+ named_data = find_named_trigger(hist_data->attrs->name);
1a6e0f06 33471+
e4b2b4a8
JK
33472+ list_for_each_entry_rcu(test, &file->triggers, list) {
33473+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33474+ if (!hist_trigger_match(data, test, named_data, false))
33475+ continue;
33476+ hist_data = test->private_data;
33477+ if (check_var_refs(hist_data))
33478+ return true;
33479+ break;
1a6e0f06 33480+ }
e4b2b4a8 33481+ }
1a6e0f06 33482+
e4b2b4a8
JK
33483+ return false;
33484+}
1a6e0f06 33485+
e4b2b4a8
JK
33486 static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
33487 struct event_trigger_data *data,
33488 struct trace_event_file *file)
b3bbd485 33489@@ -1497,17 +5411,55 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
e4b2b4a8
JK
33490
33491 if (unregistered && test->ops->free)
33492 test->ops->free(test->ops, test);
1a6e0f06 33493+
e4b2b4a8
JK
33494+ if (hist_data->enable_timestamps) {
33495+ if (!hist_data->remove || unregistered)
33496+ tracing_set_time_stamp_abs(file->tr, false);
33497+ }
33498+}
1a6e0f06 33499+
e4b2b4a8
JK
33500+static bool hist_file_check_refs(struct trace_event_file *file)
33501+{
33502+ struct hist_trigger_data *hist_data;
33503+ struct event_trigger_data *test;
1a6e0f06 33504+
e4b2b4a8
JK
33505+ list_for_each_entry_rcu(test, &file->triggers, list) {
33506+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33507+ hist_data = test->private_data;
33508+ if (check_var_refs(hist_data))
33509+ return true;
1a6e0f06 33510+ }
e4b2b4a8 33511+ }
1a6e0f06 33512+
e4b2b4a8
JK
33513+ return false;
33514 }
33515
33516 static void hist_unreg_all(struct trace_event_file *file)
33517 {
33518 struct event_trigger_data *test, *n;
33519+ struct hist_trigger_data *hist_data;
33520+ struct synth_event *se;
33521+ const char *se_name;
1a6e0f06 33522+
e4b2b4a8
JK
33523+ if (hist_file_check_refs(file))
33524+ return;
33525
33526 list_for_each_entry_safe(test, n, &file->triggers, list) {
33527 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
33528+ hist_data = test->private_data;
33529 list_del_rcu(&test->list);
33530 trace_event_trigger_enable_disable(file, 0);
33531+
33532+ mutex_lock(&synth_event_mutex);
33533+ se_name = trace_event_name(file->event_call);
33534+ se = find_synth_event(se_name);
33535+ if (se)
33536+ se->ref--;
33537+ mutex_unlock(&synth_event_mutex);
33538+
33539 update_cond_flag(file);
33540+ if (hist_data->enable_timestamps)
33541+ tracing_set_time_stamp_abs(file->tr, false);
33542 if (test->ops->free)
33543 test->ops->free(test->ops, test);
33544 }
b3bbd485 33545@@ -1523,16 +5475,54 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
e4b2b4a8
JK
33546 struct hist_trigger_attrs *attrs;
33547 struct event_trigger_ops *trigger_ops;
33548 struct hist_trigger_data *hist_data;
33549- char *trigger;
33550+ struct synth_event *se;
33551+ const char *se_name;
33552+ bool remove = false;
33553+ char *trigger, *p;
33554 int ret = 0;
33555
33556+ if (glob && strlen(glob)) {
33557+ last_cmd_set(param);
33558+ hist_err_clear();
33559+ }
1a6e0f06 33560+
e4b2b4a8
JK
33561 if (!param)
33562 return -EINVAL;
33563
33564- /* separate the trigger from the filter (k:v [if filter]) */
33565- trigger = strsep(&param, " \t");
33566- if (!trigger)
33567- return -EINVAL;
33568+ if (glob[0] == '!')
33569+ remove = true;
1a6e0f06 33570+
e4b2b4a8
JK
33571+ /*
33572+ * separate the trigger from the filter (k:v [if filter])
33573+ * allowing for whitespace in the trigger
33574+ */
33575+ p = trigger = param;
33576+ do {
33577+ p = strstr(p, "if");
33578+ if (!p)
33579+ break;
33580+ if (p == param)
33581+ return -EINVAL;
33582+ if (*(p - 1) != ' ' && *(p - 1) != '\t') {
33583+ p++;
33584+ continue;
1a6e0f06 33585+ }
e4b2b4a8
JK
33586+ if (p >= param + strlen(param) - strlen("if") - 1)
33587+ return -EINVAL;
33588+ if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
33589+ p++;
33590+ continue;
33591+ }
33592+ break;
33593+ } while (p);
33594+
33595+ if (!p)
33596+ param = NULL;
33597+ else {
33598+ *(p - 1) = '\0';
33599+ param = strstrip(p);
33600+ trigger = strstrip(trigger);
1a6e0f06 33601+ }
e4b2b4a8
JK
33602
33603 attrs = parse_hist_trigger_attrs(trigger);
33604 if (IS_ERR(attrs))
b3bbd485 33605@@ -1541,7 +5531,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
e4b2b4a8
JK
33606 if (attrs->map_bits)
33607 hist_trigger_bits = attrs->map_bits;
33608
33609- hist_data = create_hist_data(hist_trigger_bits, attrs, file);
33610+ hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove);
33611 if (IS_ERR(hist_data)) {
33612 destroy_hist_trigger_attrs(attrs);
33613 return PTR_ERR(hist_data);
b3bbd485 33614@@ -1549,10 +5539,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
e4b2b4a8
JK
33615
33616 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
33617
33618- ret = -ENOMEM;
33619 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
33620- if (!trigger_data)
33621+ if (!trigger_data) {
33622+ ret = -ENOMEM;
33623 goto out_free;
33624+ }
33625
33626 trigger_data->count = -1;
33627 trigger_data->ops = trigger_ops;
b3bbd485 33628@@ -1570,8 +5561,24 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
e4b2b4a8
JK
33629 goto out_free;
33630 }
33631
33632- if (glob[0] == '!') {
33633+ if (remove) {
33634+ if (!have_hist_trigger_match(trigger_data, file))
33635+ goto out_free;
1a6e0f06 33636+
e4b2b4a8
JK
33637+ if (hist_trigger_check_refs(trigger_data, file)) {
33638+ ret = -EBUSY;
33639+ goto out_free;
1a6e0f06
JK
33640+ }
33641+
e4b2b4a8 33642 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
1a6e0f06 33643+
e4b2b4a8
JK
33644+ mutex_lock(&synth_event_mutex);
33645+ se_name = trace_event_name(file->event_call);
33646+ se = find_synth_event(se_name);
33647+ if (se)
33648+ se->ref--;
33649+ mutex_unlock(&synth_event_mutex);
1a6e0f06 33650+
e4b2b4a8
JK
33651 ret = 0;
33652 goto out_free;
33653 }
b3bbd485 33654@@ -1588,14 +5595,47 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
e4b2b4a8
JK
33655 goto out_free;
33656 } else if (ret < 0)
33657 goto out_free;
1a6e0f06 33658+
e4b2b4a8
JK
33659+ if (get_named_trigger_data(trigger_data))
33660+ goto enable;
1a6e0f06 33661+
e4b2b4a8
JK
33662+ if (has_hist_vars(hist_data))
33663+ save_hist_vars(hist_data);
1a6e0f06 33664+
e4b2b4a8
JK
33665+ ret = create_actions(hist_data, file);
33666+ if (ret)
33667+ goto out_unreg;
1a6e0f06 33668+
e4b2b4a8
JK
33669+ ret = tracing_map_init(hist_data->map);
33670+ if (ret)
33671+ goto out_unreg;
33672+enable:
33673+ ret = hist_trigger_enable(trigger_data, file);
33674+ if (ret)
33675+ goto out_unreg;
1a6e0f06 33676+
e4b2b4a8
JK
33677+ mutex_lock(&synth_event_mutex);
33678+ se_name = trace_event_name(file->event_call);
33679+ se = find_synth_event(se_name);
33680+ if (se)
33681+ se->ref++;
33682+ mutex_unlock(&synth_event_mutex);
1a6e0f06 33683+
e4b2b4a8
JK
33684 /* Just return zero, not the number of registered triggers */
33685 ret = 0;
33686 out:
33687+ if (ret == 0)
33688+ hist_err_clear();
1a6e0f06 33689+
e4b2b4a8
JK
33690 return ret;
33691+ out_unreg:
33692+ cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
33693 out_free:
33694 if (cmd_ops->set_filter)
33695 cmd_ops->set_filter(NULL, trigger_data, NULL);
33696
33697+ remove_hist_vars(hist_data);
1a6e0f06 33698+
e4b2b4a8
JK
33699 kfree(trigger_data);
33700
33701 destroy_hist_data(hist_data);
b3bbd485 33702@@ -1625,7 +5665,8 @@ __init int register_trigger_hist_cmd(void)
e4b2b4a8
JK
33703 }
33704
33705 static void
33706-hist_enable_trigger(struct event_trigger_data *data, void *rec)
33707+hist_enable_trigger(struct event_trigger_data *data, void *rec,
33708+ struct ring_buffer_event *event)
33709 {
33710 struct enable_trigger_data *enable_data = data->private_data;
33711 struct event_trigger_data *test;
b3bbd485 33712@@ -1641,7 +5682,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec)
e4b2b4a8
JK
33713 }
33714
33715 static void
33716-hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
33717+hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
33718+ struct ring_buffer_event *event)
33719 {
33720 if (!data->count)
33721 return;
b3bbd485 33722@@ -1649,7 +5691,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
e4b2b4a8
JK
33723 if (data->count != -1)
33724 (data->count)--;
33725
33726- hist_enable_trigger(data, rec);
33727+ hist_enable_trigger(data, rec, event);
33728 }
33729
33730 static struct event_trigger_ops hist_enable_trigger_ops = {
b3bbd485 33731@@ -1754,3 +5796,31 @@ __init int register_trigger_hist_enable_disable_cmds(void)
e4b2b4a8
JK
33732
33733 return ret;
33734 }
1a6e0f06 33735+
e4b2b4a8
JK
33736+static __init int trace_events_hist_init(void)
33737+{
33738+ struct dentry *entry = NULL;
33739+ struct dentry *d_tracer;
33740+ int err = 0;
1a6e0f06 33741+
e4b2b4a8
JK
33742+ d_tracer = tracing_init_dentry();
33743+ if (IS_ERR(d_tracer)) {
33744+ err = PTR_ERR(d_tracer);
33745+ goto err;
1a6e0f06
JK
33746+ }
33747+
e4b2b4a8
JK
33748+ entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
33749+ NULL, &synth_events_fops);
33750+ if (!entry) {
33751+ err = -ENODEV;
33752+ goto err;
1a6e0f06
JK
33753+ }
33754+
e4b2b4a8
JK
33755+ return err;
33756+ err:
33757+ pr_warn("Could not create tracefs 'synthetic_events' entry\n");
33758+
33759+ return err;
1a6e0f06 33760+}
1a6e0f06 33761+
e4b2b4a8 33762+fs_initcall(trace_events_hist_init);
33763diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
33764index 43254c5e7e16..24d42350d738 100644
33765--- a/kernel/trace/trace_events_trigger.c
33766+++ b/kernel/trace/trace_events_trigger.c
33767@@ -63,7 +63,8 @@ void trigger_data_free(struct event_trigger_data *data)
33768 * any trigger that should be deferred, ETT_NONE if nothing to defer.
33769 */
33770 enum event_trigger_type
33771-event_triggers_call(struct trace_event_file *file, void *rec)
33772+event_triggers_call(struct trace_event_file *file, void *rec,
33773+ struct ring_buffer_event *event)
33774 {
33775 struct event_trigger_data *data;
33776 enum event_trigger_type tt = ETT_NONE;
33777@@ -76,7 +77,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
33778 if (data->paused)
33779 continue;
33780 if (!rec) {
33781- data->ops->func(data, rec);
33782+ data->ops->func(data, rec, event);
33783 continue;
33784 }
33785 filter = rcu_dereference_sched(data->filter);
33786@@ -86,7 +87,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
33787 tt |= data->cmd_ops->trigger_type;
33788 continue;
33789 }
33790- data->ops->func(data, rec);
33791+ data->ops->func(data, rec, event);
33792 }
33793 return tt;
33794 }
33795@@ -108,7 +109,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
33796 void
33797 event_triggers_post_call(struct trace_event_file *file,
33798 enum event_trigger_type tt,
33799- void *rec)
33800+ void *rec, struct ring_buffer_event *event)
33801 {
33802 struct event_trigger_data *data;
33803
33804@@ -116,7 +117,7 @@ event_triggers_post_call(struct trace_event_file *file,
33805 if (data->paused)
33806 continue;
33807 if (data->cmd_ops->trigger_type & tt)
33808- data->ops->func(data, rec);
33809+ data->ops->func(data, rec, event);
33810 }
33811 }
33812 EXPORT_SYMBOL_GPL(event_triggers_post_call);
33813@@ -914,8 +915,15 @@ void set_named_trigger_data(struct event_trigger_data *data,
33814 data->named_data = named_data;
33815 }
33816
33817+struct event_trigger_data *
33818+get_named_trigger_data(struct event_trigger_data *data)
33819+{
33820+ return data->named_data;
33821+}
33822+
33823 static void
33824-traceon_trigger(struct event_trigger_data *data, void *rec)
33825+traceon_trigger(struct event_trigger_data *data, void *rec,
33826+ struct ring_buffer_event *event)
33827 {
33828 if (tracing_is_on())
33829 return;
33830@@ -924,7 +932,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec)
33831 }
33832
33833 static void
33834-traceon_count_trigger(struct event_trigger_data *data, void *rec)
33835+traceon_count_trigger(struct event_trigger_data *data, void *rec,
33836+ struct ring_buffer_event *event)
33837 {
33838 if (tracing_is_on())
33839 return;
33840@@ -939,7 +948,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec)
33841 }
33842
33843 static void
33844-traceoff_trigger(struct event_trigger_data *data, void *rec)
33845+traceoff_trigger(struct event_trigger_data *data, void *rec,
33846+ struct ring_buffer_event *event)
33847 {
33848 if (!tracing_is_on())
33849 return;
33850@@ -948,7 +958,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec)
33851 }
33852
33853 static void
33854-traceoff_count_trigger(struct event_trigger_data *data, void *rec)
33855+traceoff_count_trigger(struct event_trigger_data *data, void *rec,
33856+ struct ring_buffer_event *event)
33857 {
33858 if (!tracing_is_on())
33859 return;
33860@@ -1045,7 +1056,8 @@ static struct event_command trigger_traceoff_cmd = {
33861
33862 #ifdef CONFIG_TRACER_SNAPSHOT
33863 static void
33864-snapshot_trigger(struct event_trigger_data *data, void *rec)
33865+snapshot_trigger(struct event_trigger_data *data, void *rec,
33866+ struct ring_buffer_event *event)
33867 {
33868 struct trace_event_file *file = data->private_data;
33869
33870@@ -1056,7 +1068,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec)
33871 }
33872
33873 static void
33874-snapshot_count_trigger(struct event_trigger_data *data, void *rec)
33875+snapshot_count_trigger(struct event_trigger_data *data, void *rec,
33876+ struct ring_buffer_event *event)
33877 {
33878 if (!data->count)
33879 return;
33880@@ -1064,7 +1077,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec)
33881 if (data->count != -1)
33882 (data->count)--;
33883
33884- snapshot_trigger(data, rec);
33885+ snapshot_trigger(data, rec, event);
33886 }
33887
33888 static int
33889@@ -1143,13 +1156,15 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
33890 #define STACK_SKIP 3
33891
33892 static void
33893-stacktrace_trigger(struct event_trigger_data *data, void *rec)
33894+stacktrace_trigger(struct event_trigger_data *data, void *rec,
33895+ struct ring_buffer_event *event)
33896 {
33897 trace_dump_stack(STACK_SKIP);
33898 }
33899
33900 static void
33901-stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
33902+stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
33903+ struct ring_buffer_event *event)
33904 {
33905 if (!data->count)
33906 return;
33907@@ -1157,7 +1172,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
33908 if (data->count != -1)
33909 (data->count)--;
33910
33911- stacktrace_trigger(data, rec);
33912+ stacktrace_trigger(data, rec, event);
33913 }
33914
33915 static int
33916@@ -1219,7 +1234,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
33917 }
33918
33919 static void
33920-event_enable_trigger(struct event_trigger_data *data, void *rec)
33921+event_enable_trigger(struct event_trigger_data *data, void *rec,
33922+ struct ring_buffer_event *event)
33923 {
33924 struct enable_trigger_data *enable_data = data->private_data;
33925
33926@@ -1230,7 +1246,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec)
33927 }
33928
33929 static void
33930-event_enable_count_trigger(struct event_trigger_data *data, void *rec)
33931+event_enable_count_trigger(struct event_trigger_data *data, void *rec,
33932+ struct ring_buffer_event *event)
33933 {
33934 struct enable_trigger_data *enable_data = data->private_data;
33935
33936@@ -1244,7 +1261,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec)
33937 if (data->count != -1)
33938 (data->count)--;
33939
33940- event_enable_trigger(data, rec);
33941+ event_enable_trigger(data, rec, event);
33942 }
33943
33944 int event_enable_trigger_print(struct seq_file *m,
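For reference, every trigger callback in this file now follows the widened prototype that passes the ring-buffer event alongside the raw record. A minimal sketch of a conforming callback (the name and body are illustrative, not part of the patch):

static void example_trigger(struct event_trigger_data *data, void *rec,
			    struct ring_buffer_event *event)
{
	/* 'rec' is the raw trace record; 'event' is the ring-buffer entry
	 * it was written into, now visible to trigger handlers as well. */
}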
33945diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
33946index d7c8e4ec3d9d..518c61a1bceb 100644
33947--- a/kernel/trace/trace_hwlat.c
33948+++ b/kernel/trace/trace_hwlat.c
33949@@ -279,7 +279,7 @@ static void move_to_next_cpu(void)
33950 * of this thread, than stop migrating for the duration
33951 * of the current test.
33952 */
33953- if (!cpumask_equal(current_mask, &current->cpus_allowed))
33954+ if (!cpumask_equal(current_mask, current->cpus_ptr))
33955 goto disable;
33956
33957 get_online_cpus();
33958diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
33959index ea20274a105a..3c40d4174052 100644
33960--- a/kernel/trace/trace_kprobe.c
33961+++ b/kernel/trace/trace_kprobe.c
33962@@ -918,8 +918,8 @@ static int probes_open(struct inode *inode, struct file *file)
33963 static ssize_t probes_write(struct file *file, const char __user *buffer,
33964 size_t count, loff_t *ppos)
33965 {
33966- return traceprobe_probes_write(file, buffer, count, ppos,
33967- create_trace_kprobe);
33968+ return trace_parse_run_command(file, buffer, count, ppos,
33969+ create_trace_kprobe);
33970 }
33971
33972 static const struct file_operations kprobe_events_ops = {
33973@@ -1444,9 +1444,9 @@ static __init int kprobe_trace_self_tests_init(void)
33974
33975 pr_info("Testing kprobe tracing: ");
33976
33977- ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
33978- "$stack $stack0 +0($stack)",
33979- create_trace_kprobe);
33980+ ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
33981+ "$stack $stack0 +0($stack)",
33982+ create_trace_kprobe);
33983 if (WARN_ON_ONCE(ret)) {
33984 pr_warn("error on probing function entry.\n");
33985 warn++;
33986@@ -1466,8 +1466,8 @@ static __init int kprobe_trace_self_tests_init(void)
33987 }
33988 }
33989
33990- ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
33991- "$retval", create_trace_kprobe);
33992+ ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
33993+ "$retval", create_trace_kprobe);
33994 if (WARN_ON_ONCE(ret)) {
33995 pr_warn("error on probing function return.\n");
33996 warn++;
33997@@ -1537,13 +1537,13 @@ static __init int kprobe_trace_self_tests_init(void)
33998 disable_trace_kprobe(tk, file);
33999 }
34000
34001- ret = traceprobe_command("-:testprobe", create_trace_kprobe);
34002+ ret = trace_run_command("-:testprobe", create_trace_kprobe);
34003 if (WARN_ON_ONCE(ret)) {
34004 pr_warn("error on deleting a probe.\n");
34005 warn++;
34006 }
34007
34008- ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
34009+ ret = trace_run_command("-:testprobe2", create_trace_kprobe);
34010 if (WARN_ON_ONCE(ret)) {
34011 pr_warn("error on deleting a probe.\n");
34012 warn++;
34013diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
34014index 4500b00e4e36..74a4bfc2c6b7 100644
34015--- a/kernel/trace/trace_output.c
34016+++ b/kernel/trace/trace_output.c
34017@@ -447,6 +447,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
34018 {
34019 char hardsoft_irq;
34020 char need_resched;
34021+ char need_resched_lazy;
34022 char irqs_off;
34023 int hardirq;
34024 int softirq;
34025@@ -477,6 +478,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
34026 break;
34027 }
34028
34029+ need_resched_lazy =
34030+ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
34031+
34032 hardsoft_irq =
34033 (nmi && hardirq) ? 'Z' :
34034 nmi ? 'z' :
34035@@ -485,14 +489,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
34036 softirq ? 's' :
34037 '.' ;
34038
34039- trace_seq_printf(s, "%c%c%c",
34040- irqs_off, need_resched, hardsoft_irq);
34041+ trace_seq_printf(s, "%c%c%c%c",
34042+ irqs_off, need_resched, need_resched_lazy,
34043+ hardsoft_irq);
34044
34045 if (entry->preempt_count)
34046 trace_seq_printf(s, "%x", entry->preempt_count);
34047 else
34048 trace_seq_putc(s, '.');
34049
34050+ if (entry->preempt_lazy_count)
34051+ trace_seq_printf(s, "%x", entry->preempt_lazy_count);
34052+ else
34053+ trace_seq_putc(s, '.');
34054+
34055+ if (entry->migrate_disable)
34056+ trace_seq_printf(s, "%x", entry->migrate_disable);
34057+ else
34058+ trace_seq_putc(s, '.');
34059+
34060 return !trace_seq_has_overflowed(s);
34061 }
34062
34063diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
34064index fe4513330412..daf54bda4dc8 100644
34065--- a/kernel/trace/trace_probe.c
34066+++ b/kernel/trace/trace_probe.c
34067@@ -621,92 +621,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
34068 kfree(arg->comm);
34069 }
34070
34071-int traceprobe_command(const char *buf, int (*createfn)(int, char **))
34072-{
34073- char **argv;
34074- int argc, ret;
34075-
34076- argc = 0;
34077- ret = 0;
34078- argv = argv_split(GFP_KERNEL, buf, &argc);
34079- if (!argv)
34080- return -ENOMEM;
34081-
34082- if (argc)
34083- ret = createfn(argc, argv);
34084-
34085- argv_free(argv);
34086-
34087- return ret;
34088-}
34089-
34090-#define WRITE_BUFSIZE 4096
34091-
34092-ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
34093- size_t count, loff_t *ppos,
34094- int (*createfn)(int, char **))
34095-{
34096- char *kbuf, *buf, *tmp;
34097- int ret = 0;
34098- size_t done = 0;
34099- size_t size;
34100-
34101- kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
34102- if (!kbuf)
34103- return -ENOMEM;
34104-
34105- while (done < count) {
34106- size = count - done;
34107-
34108- if (size >= WRITE_BUFSIZE)
34109- size = WRITE_BUFSIZE - 1;
34110-
34111- if (copy_from_user(kbuf, buffer + done, size)) {
34112- ret = -EFAULT;
34113- goto out;
34114- }
34115- kbuf[size] = '\0';
34116- buf = kbuf;
34117- do {
34118- tmp = strchr(buf, '\n');
34119- if (tmp) {
34120- *tmp = '\0';
34121- size = tmp - buf + 1;
34122- } else {
34123- size = strlen(buf);
34124- if (done + size < count) {
34125- if (buf != kbuf)
34126- break;
34127- /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
34128- pr_warn("Line length is too long: Should be less than %d\n",
34129- WRITE_BUFSIZE - 2);
34130- ret = -EINVAL;
34131- goto out;
34132- }
34133- }
34134- done += size;
34135-
34136- /* Remove comments */
34137- tmp = strchr(buf, '#');
34138-
34139- if (tmp)
34140- *tmp = '\0';
34141-
34142- ret = traceprobe_command(buf, createfn);
34143- if (ret)
34144- goto out;
34145- buf += size;
34146-
34147- } while (done < count);
34148- }
34149- ret = done;
34150-
34151-out:
34152- kfree(kbuf);
34153-
34154- return ret;
34155-}
34156-
34157 static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
34158 bool is_return)
34159 {
b3bbd485
JK
34160diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
34161index dc39472ca9e4..a0d750e3d17c 100644
34162--- a/kernel/trace/trace_probe.h
34163+++ b/kernel/trace/trace_probe.h
34164@@ -42,7 +42,6 @@
34165
34166 #define MAX_TRACE_ARGS 128
34167 #define MAX_ARGSTR_LEN 63
34168-#define MAX_EVENT_NAME_LEN 64
34169 #define MAX_STRING_SIZE PATH_MAX
34170
34171 /* Reserved field names */
34172@@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
34173
34174 extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
34175
34176-extern ssize_t traceprobe_probes_write(struct file *file,
34177- const char __user *buffer, size_t count, loff_t *ppos,
34178- int (*createfn)(int, char**));
34179-
34180-extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
34181-
34182 /* Sum up total data length for dynamic arraies (strings) */
34183 static nokprobe_inline int
34184 __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
34185diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
34186index ea0d90a31fc9..2ccfbb8efeb2 100644
34187--- a/kernel/trace/trace_uprobe.c
34188+++ b/kernel/trace/trace_uprobe.c
34189@@ -647,7 +647,7 @@ static int probes_open(struct inode *inode, struct file *file)
34190 static ssize_t probes_write(struct file *file, const char __user *buffer,
34191 size_t count, loff_t *ppos)
34192 {
34193- return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
34194+ return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
34195 }
34196
34197 static const struct file_operations uprobe_events_ops = {
34198diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
34199index 305039b122fa..5cadb1b8b5fe 100644
34200--- a/kernel/trace/tracing_map.c
34201+++ b/kernel/trace/tracing_map.c
34202@@ -66,6 +66,73 @@ u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i)
34203 return (u64)atomic64_read(&elt->fields[i].sum);
34204 }
34205
34206+/**
34207+ * tracing_map_set_var - Assign a tracing_map_elt's variable field
34208+ * @elt: The tracing_map_elt
34209+ * @i: The index of the given variable associated with the tracing_map_elt
34210+ * @n: The value to assign
34211+ *
34212+ * Assign n to variable i associated with the specified tracing_map_elt
34213+ * instance. The index i is the index returned by the call to
34214+ * tracing_map_add_var() when the tracing map was set up.
34215+ */
34216+void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n)
34217+{
34218+ atomic64_set(&elt->vars[i], n);
34219+ elt->var_set[i] = true;
34220+}
34221+
34222+/**
34223+ * tracing_map_var_set - Return whether or not a variable has been set
34224+ * @elt: The tracing_map_elt
34225+ * @i: The index of the given variable associated with the tracing_map_elt
34226+ *
34227+ * Return true if the variable has been set, false otherwise. The
34228+ * index i is the index returned by the call to tracing_map_add_var()
34229+ * when the tracing map was set up.
34230+ */
34231+bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i)
34232+{
34233+ return elt->var_set[i];
34234+}
34235+
34236+/**
34237+ * tracing_map_read_var - Return the value of a tracing_map_elt's variable field
34238+ * @elt: The tracing_map_elt
34239+ * @i: The index of the given variable associated with the tracing_map_elt
34240+ *
34241+ * Retrieve the value of the variable i associated with the specified
34242+ * tracing_map_elt instance. The index i is the index returned by the
34243+ * call to tracing_map_add_var() when the tracing map was set
34244+ * up.
34245+ *
34246+ * Return: The variable value associated with field i for elt.
34247+ */
34248+u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i)
34249+{
34250+ return (u64)atomic64_read(&elt->vars[i]);
34251+}
34252+
34253+/**
34254+ * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field
34255+ * @elt: The tracing_map_elt
34256+ * @i: The index of the given variable associated with the tracing_map_elt
34257+ *
34258+ * Retrieve the value of the variable i associated with the specified
34259+ * tracing_map_elt instance, and reset the variable to the 'not set'
34260+ * state. The index i is the index returned by the call to
34261+ * tracing_map_add_var() when the tracing map was set up. The reset
34262+ * essentially makes the variable a read-once variable if it's only
34263+ * accessed using this function.
34264+ *
34265+ * Return: The variable value associated with field i for elt.
34266+ */
34267+u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i)
34268+{
34269+ elt->var_set[i] = false;
34270+ return (u64)atomic64_read(&elt->vars[i]);
34271+}
34272+
34273 int tracing_map_cmp_string(void *val_a, void *val_b)
34274 {
34275 char *a = val_a;
34276@@ -170,6 +237,28 @@ int tracing_map_add_sum_field(struct tracing_map *map)
34277 return tracing_map_add_field(map, tracing_map_cmp_atomic64);
34278 }
34279
34280+/**
34281+ * tracing_map_add_var - Add a field describing a tracing_map var
34282+ * @map: The tracing_map
34283+ *
34284+ * Add a var to the map and return the index identifying it in the map
34285+ * and associated tracing_map_elts. This is the index used for
34286+ * instance to update a var for a particular tracing_map_elt using
34287+ * tracing_map_update_var() or reading it via tracing_map_read_var().
34288+ *
34289+ * Return: The index identifying the var in the map and associated
34290+ * tracing_map_elts, or -EINVAL on error.
34291+ */
34292+int tracing_map_add_var(struct tracing_map *map)
34293+{
34294+ int ret = -EINVAL;
34295+
34296+ if (map->n_vars < TRACING_MAP_VARS_MAX)
34297+ ret = map->n_vars++;
34298+
34299+ return ret;
34300+}
34301+
34302 /**
34303 * tracing_map_add_key_field - Add a field describing a tracing_map key
34304 * @map: The tracing_map
34305@@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct tracing_map_elt *elt)
34306 if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
34307 atomic64_set(&elt->fields[i].sum, 0);
34308
34309+ for (i = 0; i < elt->map->n_vars; i++) {
34310+ atomic64_set(&elt->vars[i], 0);
34311+ elt->var_set[i] = false;
34312+ }
34313+
34314 if (elt->map->ops && elt->map->ops->elt_clear)
34315 elt->map->ops->elt_clear(elt);
34316 }
34317@@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
34318 if (elt->map->ops && elt->map->ops->elt_free)
34319 elt->map->ops->elt_free(elt);
34320 kfree(elt->fields);
34321+ kfree(elt->vars);
34322+ kfree(elt->var_set);
34323 kfree(elt->key);
34324 kfree(elt);
34325 }
34326@@ -333,6 +429,18 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
34327 goto free;
34328 }
34329
34330+ elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
34331+ if (!elt->vars) {
34332+ err = -ENOMEM;
34333+ goto free;
34334+ }
34335+
34336+ elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL);
34337+ if (!elt->var_set) {
34338+ err = -ENOMEM;
34339+ goto free;
34340+ }
34341+
34342 tracing_map_elt_init_fields(elt);
34343
34344 if (map->ops && map->ops->elt_alloc) {
34345@@ -414,7 +522,9 @@ static inline struct tracing_map_elt *
34346 __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
34347 {
34348 u32 idx, key_hash, test_key;
34349+ int dup_try = 0;
34350 struct tracing_map_entry *entry;
34351+ struct tracing_map_elt *val;
34352
34353 key_hash = jhash(key, map->key_size, 0);
34354 if (key_hash == 0)
34355@@ -426,10 +536,33 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
34356 entry = TRACING_MAP_ENTRY(map->map, idx);
34357 test_key = entry->key;
34358
34359- if (test_key && test_key == key_hash && entry->val &&
34360- keys_match(key, entry->val->key, map->key_size)) {
34361- atomic64_inc(&map->hits);
34362- return entry->val;
34363+ if (test_key && test_key == key_hash) {
34364+ val = READ_ONCE(entry->val);
34365+ if (val &&
34366+ keys_match(key, val->key, map->key_size)) {
34367+ if (!lookup_only)
34368+ atomic64_inc(&map->hits);
34369+ return val;
34370+ } else if (unlikely(!val)) {
34371+ /*
34372+ * The key is present. But, val (pointer to elt
34373+ * struct) is still NULL, which means some other
34374+ * thread is in the process of inserting an
34375+ * element.
34376+ *
34377+ * On top of that, its key_hash is the same as the
34378+ * one being inserted right now. So, it's
34379+ * possible that the element has the same
34380+ * key as well.
34381+ */
34382+
34383+ dup_try++;
34384+ if (dup_try > map->map_size) {
34385+ atomic64_inc(&map->drops);
34386+ break;
34387+ }
34388+ continue;
34389+ }
34390 }
34391
34392 if (!test_key) {
34393@@ -451,6 +584,13 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
34394 atomic64_inc(&map->hits);
34395
34396 return entry->val;
34397+ } else {
34398+ /*
34399+ * cmpxchg() failed. Loop around once
34400+ * more to check what key was inserted.
34401+ */
34402+ dup_try++;
34403+ continue;
34404 }
34405 }
34406
34407@@ -815,67 +955,15 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
34408 return sort_entry;
34409 }
34410
34411-static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
34412-{
34413- struct tracing_map_elt *dup_elt;
34414- unsigned int i;
34415-
34416- dup_elt = tracing_map_elt_alloc(elt->map);
34417- if (IS_ERR(dup_elt))
34418- return NULL;
34419-
34420- if (elt->map->ops && elt->map->ops->elt_copy)
34421- elt->map->ops->elt_copy(dup_elt, elt);
34422-
34423- dup_elt->private_data = elt->private_data;
34424- memcpy(dup_elt->key, elt->key, elt->map->key_size);
34425-
34426- for (i = 0; i < elt->map->n_fields; i++) {
34427- atomic64_set(&dup_elt->fields[i].sum,
34428- atomic64_read(&elt->fields[i].sum));
34429- dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
34430- }
34431-
34432- return dup_elt;
34433-}
34434-
34435-static int merge_dup(struct tracing_map_sort_entry **sort_entries,
34436- unsigned int target, unsigned int dup)
34437-{
34438- struct tracing_map_elt *target_elt, *elt;
34439- bool first_dup = (target - dup) == 1;
34440- int i;
34441-
34442- if (first_dup) {
34443- elt = sort_entries[target]->elt;
34444- target_elt = copy_elt(elt);
34445- if (!target_elt)
34446- return -ENOMEM;
34447- sort_entries[target]->elt = target_elt;
34448- sort_entries[target]->elt_copied = true;
34449- } else
34450- target_elt = sort_entries[target]->elt;
34451-
34452- elt = sort_entries[dup]->elt;
34453-
34454- for (i = 0; i < elt->map->n_fields; i++)
34455- atomic64_add(atomic64_read(&elt->fields[i].sum),
34456- &target_elt->fields[i].sum);
34457-
34458- sort_entries[dup]->dup = true;
34459-
34460- return 0;
34461-}
34462-
34463-static int merge_dups(struct tracing_map_sort_entry **sort_entries,
34464+static void detect_dups(struct tracing_map_sort_entry **sort_entries,
34465 int n_entries, unsigned int key_size)
34466 {
34467 unsigned int dups = 0, total_dups = 0;
34468- int err, i, j;
34469+ int i;
34470 void *key;
34471
34472 if (n_entries < 2)
34473- return total_dups;
34474+ return;
34475
34476 sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
34477 (int (*)(const void *, const void *))cmp_entries_dup, NULL);
34478@@ -884,30 +972,14 @@ static int merge_dups(struct tracing_map_sort_entry **sort_entries,
34479 for (i = 1; i < n_entries; i++) {
34480 if (!memcmp(sort_entries[i]->key, key, key_size)) {
34481 dups++; total_dups++;
34482- err = merge_dup(sort_entries, i - dups, i);
34483- if (err)
34484- return err;
34485 continue;
34486 }
34487 key = sort_entries[i]->key;
34488 dups = 0;
34489 }
34490
34491- if (!total_dups)
34492- return total_dups;
34493-
34494- for (i = 0, j = 0; i < n_entries; i++) {
34495- if (!sort_entries[i]->dup) {
34496- sort_entries[j] = sort_entries[i];
34497- if (j++ != i)
34498- sort_entries[i] = NULL;
34499- } else {
34500- destroy_sort_entry(sort_entries[i]);
34501- sort_entries[i] = NULL;
34502- }
34503- }
34504-
34505- return total_dups;
34506+ WARN_ONCE(total_dups > 0,
34507+ "Duplicates detected: %d\n", total_dups);
34508 }
34509
34510 static bool is_key(struct tracing_map *map, unsigned int field_idx)
34511@@ -1033,10 +1105,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
34512 return 1;
34513 }
34514
34515- ret = merge_dups(entries, n_entries, map->key_size);
34516- if (ret < 0)
34517- goto free;
34518- n_entries -= ret;
34519+ detect_dups(entries, n_entries, map->key_size);
34520
34521 if (is_key(map, sort_keys[0].field_idx))
34522 cmp_entries_fn = cmp_entries_key;
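A minimal usage sketch of the per-element variable API added above, assuming 'map' was set up earlier and 'elt' was returned by tracing_map_insert() for the current key; 'ts', 'now' and 'delta' are illustrative names, not part of the patch:

	int var_idx;
	u64 delta;

	var_idx = tracing_map_add_var(map);		/* once, at map-setup time */

	tracing_map_set_var(elt, var_idx, ts);		/* on a "start" event */

	if (tracing_map_var_set(elt, var_idx))		/* on a matching "end" event */
		delta = now - tracing_map_read_var_once(elt, var_idx);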
34523diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
34524index ab0ca77331d0..053eb92b2d31 100644
34525--- a/kernel/trace/tracing_map.h
34526+++ b/kernel/trace/tracing_map.h
34527@@ -6,10 +6,11 @@
34528 #define TRACING_MAP_BITS_MAX 17
34529 #define TRACING_MAP_BITS_MIN 7
34530
34531-#define TRACING_MAP_KEYS_MAX 2
34532+#define TRACING_MAP_KEYS_MAX 3
34533 #define TRACING_MAP_VALS_MAX 3
34534 #define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
34535 TRACING_MAP_VALS_MAX)
34536+#define TRACING_MAP_VARS_MAX 16
34537 #define TRACING_MAP_SORT_KEYS_MAX 2
34538
34539 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
34540@@ -137,6 +138,8 @@ struct tracing_map_field {
34541 struct tracing_map_elt {
34542 struct tracing_map *map;
34543 struct tracing_map_field *fields;
34544+ atomic64_t *vars;
34545+ bool *var_set;
34546 void *key;
34547 void *private_data;
34548 };
34549@@ -192,6 +195,7 @@ struct tracing_map {
34550 int key_idx[TRACING_MAP_KEYS_MAX];
34551 unsigned int n_keys;
34552 struct tracing_map_sort_key sort_key;
34553+ unsigned int n_vars;
34554 atomic64_t hits;
34555 atomic64_t drops;
34556 };
34557@@ -215,11 +219,6 @@ struct tracing_map {
34558 * Element allocation occurs before tracing begins, when the
34559 * tracing_map_init() call is made by client code.
34560 *
34561- * @elt_copy: At certain points in the lifetime of an element, it may
34562- * need to be copied. The copy should include a copy of the
34563- * client-allocated data, which can be copied into the 'to'
34564- * element from the 'from' element.
34565- *
34566 * @elt_free: When a tracing_map_elt is freed, this function is called
34567 * and allows client-allocated per-element data to be freed.
34568 *
34569@@ -233,8 +232,6 @@ struct tracing_map {
34570 */
34571 struct tracing_map_ops {
34572 int (*elt_alloc)(struct tracing_map_elt *elt);
34573- void (*elt_copy)(struct tracing_map_elt *to,
34574- struct tracing_map_elt *from);
34575 void (*elt_free)(struct tracing_map_elt *elt);
34576 void (*elt_clear)(struct tracing_map_elt *elt);
34577 void (*elt_init)(struct tracing_map_elt *elt);
34578@@ -248,6 +245,7 @@ tracing_map_create(unsigned int map_bits,
34579 extern int tracing_map_init(struct tracing_map *map);
34580
34581 extern int tracing_map_add_sum_field(struct tracing_map *map);
34582+extern int tracing_map_add_var(struct tracing_map *map);
34583 extern int tracing_map_add_key_field(struct tracing_map *map,
34584 unsigned int offset,
34585 tracing_map_cmp_fn_t cmp_fn);
34586@@ -267,7 +265,13 @@ extern int tracing_map_cmp_none(void *val_a, void *val_b);
34587
34588 extern void tracing_map_update_sum(struct tracing_map_elt *elt,
34589 unsigned int i, u64 n);
34590+extern void tracing_map_set_var(struct tracing_map_elt *elt,
34591+ unsigned int i, u64 n);
34592+extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i);
34593 extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
34594+extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
34595+extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
34596+
34597 extern void tracing_map_set_field_descr(struct tracing_map *map,
34598 unsigned int i,
34599 unsigned int key_offset,
34600diff --git a/kernel/user.c b/kernel/user.c
34601index 00281add65b2..f4cf1841f2fd 100644
34602--- a/kernel/user.c
34603+++ b/kernel/user.c
34604@@ -162,11 +162,11 @@ void free_uid(struct user_struct *up)
34605 if (!up)
34606 return;
34607
34608- local_irq_save(flags);
34609+ local_irq_save_nort(flags);
34610 if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
34611 free_user(up, flags);
34612 else
34613- local_irq_restore(flags);
34614+ local_irq_restore_nort(flags);
34615 }
34616
34617 struct user_struct *alloc_uid(kuid_t uid)
34618diff --git a/kernel/watchdog.c b/kernel/watchdog.c
34619index 087994b23f8b..ea4c09109ce4 100644
34620--- a/kernel/watchdog.c
34621+++ b/kernel/watchdog.c
34622@@ -462,7 +462,7 @@ static void watchdog_enable(unsigned int cpu)
34623 * Start the timer first to prevent the NMI watchdog triggering
34624 * before the timer has a chance to fire.
34625 */
34626- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
34627+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
34628 hrtimer->function = watchdog_timer_fn;
34629 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
34630 HRTIMER_MODE_REL_PINNED);
34631diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
34632index 4ece6028007a..210dccc57c04 100644
34633--- a/kernel/watchdog_hld.c
34634+++ b/kernel/watchdog_hld.c
34635@@ -24,6 +24,8 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
34636 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
34637 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
34638 static DEFINE_PER_CPU(struct perf_event *, dead_event);
34639+static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
34640+
34641 static struct cpumask dead_events_mask;
34642
34643 static unsigned long hardlockup_allcpu_dumped;
34644@@ -134,6 +136,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
34645 /* only print hardlockups once */
34646 if (__this_cpu_read(hard_watchdog_warn) == true)
34647 return;
34648+ /*
34649+ * If early-printk is enabled then make sure we do not
34650+ * lock up in printk() and kill console logging:
34651+ */
34652+ printk_kill();
34653+
34654+ raw_spin_lock(&watchdog_output_lock);
34655
34656 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
34657 print_modules();
34658@@ -151,6 +160,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
34659 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
34660 trigger_allbutself_cpu_backtrace();
34661
34662+ raw_spin_unlock(&watchdog_output_lock);
34663 if (hardlockup_panic)
34664 nmi_panic(regs, "Hard LOCKUP");
34665
34666diff --git a/kernel/workqueue.c b/kernel/workqueue.c
34667index 08bc551976b2..76297cce5602 100644
34668--- a/kernel/workqueue.c
34669+++ b/kernel/workqueue.c
34670@@ -49,6 +49,8 @@
34671 #include <linux/moduleparam.h>
34672 #include <linux/uaccess.h>
34673 #include <linux/nmi.h>
34674+#include <linux/locallock.h>
34675+#include <linux/delay.h>
34676
34677 #include "workqueue_internal.h"
34678
34679@@ -123,11 +125,16 @@ enum {
34680 * cpu or grabbing pool->lock is enough for read access. If
34681 * POOL_DISASSOCIATED is set, it's identical to L.
34682 *
34683+ * On RT we need the extra protection via rt_lock_idle_list() for
34684+ * the list manipulations against read access from
34685+ * wq_worker_sleeping(). All other places are nicely serialized via
34686+ * pool->lock.
34687+ *
34688 * A: pool->attach_mutex protected.
34689 *
34690 * PL: wq_pool_mutex protected.
34691 *
34692- * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
34693+ * PR: wq_pool_mutex protected for writes. RCU protected for reads.
34694 *
34695 * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
34696 *
34697@@ -136,7 +143,7 @@ enum {
34698 *
34699 * WQ: wq->mutex protected.
34700 *
34701- * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
34702+ * WR: wq->mutex protected for writes. RCU protected for reads.
34703 *
34704 * MD: wq_mayday_lock protected.
34705 */
34706@@ -186,7 +193,7 @@ struct worker_pool {
34707 atomic_t nr_running ____cacheline_aligned_in_smp;
34708
34709 /*
34710- * Destruction of pool is sched-RCU protected to allow dereferences
34711+ * Destruction of pool is RCU protected to allow dereferences
34712 * from get_work_pool().
34713 */
34714 struct rcu_head rcu;
34715@@ -215,7 +222,7 @@ struct pool_workqueue {
34716 /*
34717 * Release of unbound pwq is punted to system_wq. See put_pwq()
34718 * and pwq_unbound_release_workfn() for details. pool_workqueue
34719- * itself is also sched-RCU protected so that the first pwq can be
34720+ * itself is also RCU protected so that the first pwq can be
34721 * determined without grabbing wq->mutex.
34722 */
34723 struct work_struct unbound_release_work;
34724@@ -352,6 +359,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
34725 struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
34726 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
34727
34728+static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
34729+
34730 static int worker_thread(void *__worker);
34731 static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34732
34733@@ -359,20 +368,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34734 #include <trace/events/workqueue.h>
34735
34736 #define assert_rcu_or_pool_mutex() \
34737- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
34738+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
34739 !lockdep_is_held(&wq_pool_mutex), \
34740- "sched RCU or wq_pool_mutex should be held")
34741+ "RCU or wq_pool_mutex should be held")
34742
34743 #define assert_rcu_or_wq_mutex(wq) \
34744- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
34745+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
34746 !lockdep_is_held(&wq->mutex), \
34747- "sched RCU or wq->mutex should be held")
34748+ "RCU or wq->mutex should be held")
34749
34750 #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
34751- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
34752+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
34753 !lockdep_is_held(&wq->mutex) && \
34754 !lockdep_is_held(&wq_pool_mutex), \
34755- "sched RCU, wq->mutex or wq_pool_mutex should be held")
34756+ "RCU, wq->mutex or wq_pool_mutex should be held")
34757
34758 #define for_each_cpu_worker_pool(pool, cpu) \
34759 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
34760@@ -384,7 +393,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34761 * @pool: iteration cursor
34762 * @pi: integer used for iteration
34763 *
34764- * This must be called either with wq_pool_mutex held or sched RCU read
34765+ * This must be called either with wq_pool_mutex held or RCU read
34766 * locked. If the pool needs to be used beyond the locking in effect, the
34767 * caller is responsible for guaranteeing that the pool stays online.
34768 *
34769@@ -416,7 +425,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34770 * @pwq: iteration cursor
34771 * @wq: the target workqueue
34772 *
34773- * This must be called either with wq->mutex held or sched RCU read locked.
34774+ * This must be called either with wq->mutex held or RCU read locked.
34775 * If the pwq needs to be used beyond the locking in effect, the caller is
34776 * responsible for guaranteeing that the pwq stays online.
34777 *
34778@@ -428,6 +437,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
34779 if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
34780 else
34781
34782+#ifdef CONFIG_PREEMPT_RT_BASE
34783+static inline void rt_lock_idle_list(struct worker_pool *pool)
34784+{
34785+ preempt_disable();
34786+}
34787+static inline void rt_unlock_idle_list(struct worker_pool *pool)
34788+{
34789+ preempt_enable();
34790+}
34791+static inline void sched_lock_idle_list(struct worker_pool *pool) { }
34792+static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
34793+#else
34794+static inline void rt_lock_idle_list(struct worker_pool *pool) { }
34795+static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
34796+static inline void sched_lock_idle_list(struct worker_pool *pool)
34797+{
34798+ spin_lock_irq(&pool->lock);
34799+}
34800+static inline void sched_unlock_idle_list(struct worker_pool *pool)
34801+{
34802+ spin_unlock_irq(&pool->lock);
34803+}
34804+#endif
34805+
34806+
34807 #ifdef CONFIG_DEBUG_OBJECTS_WORK
34808
34809 static struct debug_obj_descr work_debug_descr;
34810@@ -552,7 +586,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
34811 * @wq: the target workqueue
34812 * @node: the node ID
34813 *
34814- * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
34815+ * This must be called with any of wq_pool_mutex, wq->mutex or RCU
34816 * read locked.
34817 * If the pwq needs to be used beyond the locking in effect, the caller is
34818 * responsible for guaranteeing that the pwq stays online.
34819@@ -696,8 +730,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
34820 * @work: the work item of interest
34821 *
34822 * Pools are created and destroyed under wq_pool_mutex, and allows read
34823- * access under sched-RCU read lock. As such, this function should be
34824- * called under wq_pool_mutex or with preemption disabled.
34825+ * access under RCU read lock. As such, this function should be
34826+ * called under wq_pool_mutex or inside of a rcu_read_lock() region.
34827 *
34828 * All fields of the returned pool are accessible as long as the above
34829 * mentioned locking is in effect. If the returned pool needs to be used
34830@@ -834,50 +868,45 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
34831 */
34832 static void wake_up_worker(struct worker_pool *pool)
34833 {
34834- struct worker *worker = first_idle_worker(pool);
34835+ struct worker *worker;
34836+
34837+ rt_lock_idle_list(pool);
34838+
34839+ worker = first_idle_worker(pool);
34840
34841 if (likely(worker))
34842 wake_up_process(worker->task);
34843+
34844+ rt_unlock_idle_list(pool);
34845 }
34846
34847 /**
34848- * wq_worker_waking_up - a worker is waking up
34849+ * wq_worker_running - a worker is running again
34850 * @task: task waking up
34851- * @cpu: CPU @task is waking up to
34852 *
34853- * This function is called during try_to_wake_up() when a worker is
34854- * being awoken.
34855- *
34856- * CONTEXT:
34857- * spin_lock_irq(rq->lock)
34858+ * This function is called when a worker returns from schedule()
34859 */
34860-void wq_worker_waking_up(struct task_struct *task, int cpu)
34861+void wq_worker_running(struct task_struct *task)
34862 {
34863 struct worker *worker = kthread_data(task);
34864
34865- if (!(worker->flags & WORKER_NOT_RUNNING)) {
34866- WARN_ON_ONCE(worker->pool->cpu != cpu);
34867+ if (!worker->sleeping)
34868+ return;
34869+ if (!(worker->flags & WORKER_NOT_RUNNING))
34870 atomic_inc(&worker->pool->nr_running);
34871- }
34872+ worker->sleeping = 0;
34873 }
34874
34875 /**
34876 * wq_worker_sleeping - a worker is going to sleep
34877 * @task: task going to sleep
34878 *
34879- * This function is called during schedule() when a busy worker is
34880- * going to sleep. Worker on the same cpu can be woken up by
34881- * returning pointer to its task.
34882- *
34883- * CONTEXT:
34884- * spin_lock_irq(rq->lock)
34885- *
34886- * Return:
34887- * Worker task on @cpu to wake up, %NULL if none.
34888+ * This function is called from schedule() when a busy worker is
34889+ * going to sleep.
34890 */
34891-struct task_struct *wq_worker_sleeping(struct task_struct *task)
34892+void wq_worker_sleeping(struct task_struct *task)
34893 {
34894- struct worker *worker = kthread_data(task), *to_wakeup = NULL;
34895+ struct worker *worker = kthread_data(task);
34896 struct worker_pool *pool;
34897
34898 /*
34899@@ -886,29 +915,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
34900 * checking NOT_RUNNING.
34901 */
34902 if (worker->flags & WORKER_NOT_RUNNING)
34903- return NULL;
34904+ return;
34905
34906 pool = worker->pool;
34907
34908- /* this can only happen on the local cpu */
34909- if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
34910- return NULL;
34911+ if (WARN_ON_ONCE(worker->sleeping))
34912+ return;
34913+
34914+ worker->sleeping = 1;
34915
34916 /*
34917 * The counterpart of the following dec_and_test, implied mb,
34918 * worklist not empty test sequence is in insert_work().
34919 * Please read comment there.
34920- *
34921- * NOT_RUNNING is clear. This means that we're bound to and
34922- * running on the local cpu w/ rq lock held and preemption
34923- * disabled, which in turn means that none else could be
34924- * manipulating idle_list, so dereferencing idle_list without pool
34925- * lock is safe.
34926 */
34927 if (atomic_dec_and_test(&pool->nr_running) &&
34928- !list_empty(&pool->worklist))
34929- to_wakeup = first_idle_worker(pool);
34930- return to_wakeup ? to_wakeup->task : NULL;
34931+ !list_empty(&pool->worklist)) {
34932+ sched_lock_idle_list(pool);
34933+ wake_up_worker(pool);
34934+ sched_unlock_idle_list(pool);
34935+ }
34936 }
34937
34938 /**
34939@@ -1102,12 +1128,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
34940 {
34941 if (pwq) {
34942 /*
34943- * As both pwqs and pools are sched-RCU protected, the
34944+ * As both pwqs and pools are RCU protected, the
34945 * following lock operations are safe.
34946 */
34947- spin_lock_irq(&pwq->pool->lock);
34948+ rcu_read_lock();
34949+ local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
34950 put_pwq(pwq);
34951- spin_unlock_irq(&pwq->pool->lock);
34952+ local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
34953+ rcu_read_unlock();
34954 }
34955 }
34956
34957@@ -1211,7 +1239,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
34958 struct worker_pool *pool;
34959 struct pool_workqueue *pwq;
34960
34961- local_irq_save(*flags);
34962+ local_lock_irqsave(pendingb_lock, *flags);
34963
34964 /* try to steal the timer if it exists */
34965 if (is_dwork) {
34966@@ -1230,6 +1258,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
34967 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
34968 return 0;
34969
34970+ rcu_read_lock();
34971 /*
34972 * The queueing is in progress, or it is already queued. Try to
34973 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
34974@@ -1268,14 +1297,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
34975 set_work_pool_and_keep_pending(work, pool->id);
34976
34977 spin_unlock(&pool->lock);
34978+ rcu_read_unlock();
34979 return 1;
34980 }
34981 spin_unlock(&pool->lock);
34982 fail:
34983- local_irq_restore(*flags);
34984+ rcu_read_unlock();
34985+ local_unlock_irqrestore(pendingb_lock, *flags);
34986 if (work_is_canceling(work))
34987 return -ENOENT;
34988- cpu_relax();
34989+ cpu_chill();
34990 return -EAGAIN;
34991 }
34992
34993@@ -1377,7 +1408,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
34994 * queued or lose PENDING. Grabbing PENDING and queueing should
34995 * happen with IRQ disabled.
34996 */
34997- WARN_ON_ONCE(!irqs_disabled());
34998+ WARN_ON_ONCE_NONRT(!irqs_disabled());
34999
35000 debug_work_activate(work);
35001
35002@@ -1385,6 +1416,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
35003 if (unlikely(wq->flags & __WQ_DRAINING) &&
35004 WARN_ON_ONCE(!is_chained_work(wq)))
35005 return;
35006+ rcu_read_lock();
35007 retry:
35008 if (req_cpu == WORK_CPU_UNBOUND)
35009 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
35010@@ -1441,10 +1473,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
35011 /* pwq determined, queue */
35012 trace_workqueue_queue_work(req_cpu, pwq, work);
35013
35014- if (WARN_ON(!list_empty(&work->entry))) {
35015- spin_unlock(&pwq->pool->lock);
35016- return;
35017- }
35018+ if (WARN_ON(!list_empty(&work->entry)))
35019+ goto out;
35020
35021 pwq->nr_in_flight[pwq->work_color]++;
35022 work_flags = work_color_to_flags(pwq->work_color);
35023@@ -1462,7 +1492,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
35024
35025 insert_work(pwq, work, worklist, work_flags);
35026
35027+out:
35028 spin_unlock(&pwq->pool->lock);
35029+ rcu_read_unlock();
35030 }
35031
35032 /**
35033@@ -1482,14 +1514,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
35034 bool ret = false;
35035 unsigned long flags;
35036
35037- local_irq_save(flags);
35038+ local_lock_irqsave(pendingb_lock,flags);
35039
35040 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
35041 __queue_work(cpu, wq, work);
35042 ret = true;
35043 }
35044
35045- local_irq_restore(flags);
35046+ local_unlock_irqrestore(pendingb_lock, flags);
35047 return ret;
35048 }
35049 EXPORT_SYMBOL(queue_work_on);
35050@@ -1498,8 +1530,11 @@ void delayed_work_timer_fn(unsigned long __data)
35051 {
35052 struct delayed_work *dwork = (struct delayed_work *)__data;
35053
35054+ /* XXX */
35055+ /* local_lock(pendingb_lock); */
35056 /* should have been called from irqsafe timer with irq already off */
35057 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
35058+ /* local_unlock(pendingb_lock); */
35059 }
35060 EXPORT_SYMBOL(delayed_work_timer_fn);
35061
35062@@ -1555,14 +1590,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
35063 unsigned long flags;
35064
35065 /* read the comment in __queue_work() */
35066- local_irq_save(flags);
35067+ local_lock_irqsave(pendingb_lock, flags);
35068
35069 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
35070 __queue_delayed_work(cpu, wq, dwork, delay);
35071 ret = true;
35072 }
35073
35074- local_irq_restore(flags);
35075+ local_unlock_irqrestore(pendingb_lock, flags);
35076 return ret;
35077 }
35078 EXPORT_SYMBOL(queue_delayed_work_on);
35079@@ -1597,7 +1632,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
35080
35081 if (likely(ret >= 0)) {
35082 __queue_delayed_work(cpu, wq, dwork, delay);
35083- local_irq_restore(flags);
35084+ local_unlock_irqrestore(pendingb_lock, flags);
35085 }
35086
35087 /* -ENOENT from try_to_grab_pending() becomes %true */
35088@@ -1630,7 +1665,9 @@ static void worker_enter_idle(struct worker *worker)
35089 worker->last_active = jiffies;
35090
35091 /* idle_list is LIFO */
35092+ rt_lock_idle_list(pool);
35093 list_add(&worker->entry, &pool->idle_list);
35094+ rt_unlock_idle_list(pool);
35095
35096 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
35097 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
35098@@ -1663,7 +1700,9 @@ static void worker_leave_idle(struct worker *worker)
35099 return;
35100 worker_clr_flags(worker, WORKER_IDLE);
35101 pool->nr_idle--;
35102+ rt_lock_idle_list(pool);
35103 list_del_init(&worker->entry);
35104+ rt_unlock_idle_list(pool);
35105 }
35106
35107 static struct worker *alloc_worker(int node)
35108@@ -1829,7 +1868,9 @@ static void destroy_worker(struct worker *worker)
35109 pool->nr_workers--;
35110 pool->nr_idle--;
35111
35112+ rt_lock_idle_list(pool);
35113 list_del_init(&worker->entry);
35114+ rt_unlock_idle_list(pool);
35115 worker->flags |= WORKER_DIE;
35116 wake_up_process(worker->task);
35117 }
35118@@ -2815,14 +2856,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
35119
35120 might_sleep();
35121
35122- local_irq_disable();
35123+ rcu_read_lock();
35124 pool = get_work_pool(work);
35125 if (!pool) {
35126- local_irq_enable();
35127+ rcu_read_unlock();
35128 return false;
35129 }
35130
35131- spin_lock(&pool->lock);
35132+ spin_lock_irq(&pool->lock);
35133 /* see the comment in try_to_grab_pending() with the same code */
35134 pwq = get_work_pwq(work);
35135 if (pwq) {
35136@@ -2853,10 +2894,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
35137 lock_map_acquire(&pwq->wq->lockdep_map);
35138 lock_map_release(&pwq->wq->lockdep_map);
35139 }
35140-
35141+ rcu_read_unlock();
35142 return true;
35143 already_gone:
35144 spin_unlock_irq(&pool->lock);
35145+ rcu_read_unlock();
35146 return false;
35147 }
35148
35149@@ -2946,7 +2988,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
35150
35151 /* tell other tasks trying to grab @work to back off */
35152 mark_work_canceling(work);
35153- local_irq_restore(flags);
35154+ local_unlock_irqrestore(pendingb_lock, flags);
35155
35156 /*
35157 * This allows canceling during early boot. We know that @work
35158@@ -3007,10 +3049,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
35159 */
35160 bool flush_delayed_work(struct delayed_work *dwork)
35161 {
35162- local_irq_disable();
35163+ local_lock_irq(pendingb_lock);
35164 if (del_timer_sync(&dwork->timer))
35165 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
35166- local_irq_enable();
35167+ local_unlock_irq(pendingb_lock);
35168 return flush_work(&dwork->work);
35169 }
35170 EXPORT_SYMBOL(flush_delayed_work);
35171@@ -3028,7 +3070,7 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
35172 return false;
35173
35174 set_work_pool_and_clear_pending(work, get_work_pool_id(work));
35175- local_irq_restore(flags);
35176+ local_unlock_irqrestore(pendingb_lock, flags);
35177 return ret;
35178 }
35179
35180@@ -3284,7 +3326,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
35181 * put_unbound_pool - put a worker_pool
35182 * @pool: worker_pool to put
35183 *
35184- * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
35185+ * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
35186 * safe manner. get_unbound_pool() calls this function on its failure path
35187 * and this function should be able to release pools which went through,
35188 * successfully or not, init_worker_pool().
35189@@ -3338,8 +3380,8 @@ static void put_unbound_pool(struct worker_pool *pool)
35190 del_timer_sync(&pool->idle_timer);
35191 del_timer_sync(&pool->mayday_timer);
35192
35193- /* sched-RCU protected to allow dereferences from get_work_pool() */
35194- call_rcu_sched(&pool->rcu, rcu_free_pool);
35195+ /* RCU protected to allow dereferences from get_work_pool() */
35196+ call_rcu(&pool->rcu, rcu_free_pool);
35197 }
35198
35199 /**
35200@@ -3446,14 +3488,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
35201 put_unbound_pool(pool);
35202 mutex_unlock(&wq_pool_mutex);
35203
35204- call_rcu_sched(&pwq->rcu, rcu_free_pwq);
35205+ call_rcu(&pwq->rcu, rcu_free_pwq);
35206
35207 /*
35208 * If we're the last pwq going away, @wq is already dead and no one
35209 * is gonna access it anymore. Schedule RCU free.
35210 */
35211 if (is_last)
35212- call_rcu_sched(&wq->rcu, rcu_free_wq);
35213+ call_rcu(&wq->rcu, rcu_free_wq);
35214 }
35215
35216 /**
35217@@ -4128,7 +4170,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
35218 * The base ref is never dropped on per-cpu pwqs. Directly
35219 * schedule RCU free.
35220 */
35221- call_rcu_sched(&wq->rcu, rcu_free_wq);
35222+ call_rcu(&wq->rcu, rcu_free_wq);
35223 } else {
35224 /*
35225 * We're the sole accessor of @wq at this point. Directly
35226@@ -4238,7 +4280,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
35227 struct pool_workqueue *pwq;
35228 bool ret;
35229
35230- rcu_read_lock_sched();
35231+ rcu_read_lock();
35232+ preempt_disable();
35233
35234 if (cpu == WORK_CPU_UNBOUND)
35235 cpu = smp_processor_id();
35236@@ -4249,7 +4292,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
35237 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
35238
35239 ret = !list_empty(&pwq->delayed_works);
35240- rcu_read_unlock_sched();
35241+ preempt_enable();
35242+ rcu_read_unlock();
35243
35244 return ret;
35245 }
35246@@ -4275,15 +4319,15 @@ unsigned int work_busy(struct work_struct *work)
35247 if (work_pending(work))
35248 ret |= WORK_BUSY_PENDING;
35249
35250- local_irq_save(flags);
35251+ rcu_read_lock();
35252 pool = get_work_pool(work);
35253 if (pool) {
35254- spin_lock(&pool->lock);
35255+ spin_lock_irqsave(&pool->lock, flags);
35256 if (find_worker_executing_work(pool, work))
35257 ret |= WORK_BUSY_RUNNING;
35258- spin_unlock(&pool->lock);
35259+ spin_unlock_irqrestore(&pool->lock, flags);
35260 }
35261- local_irq_restore(flags);
35262+ rcu_read_unlock();
35263
35264 return ret;
35265 }
35266@@ -4472,7 +4516,7 @@ void show_workqueue_state(void)
35267 unsigned long flags;
35268 int pi;
35269
35270- rcu_read_lock_sched();
35271+ rcu_read_lock();
35272
35273 pr_info("Showing busy workqueues and worker pools:\n");
35274
35275@@ -4537,7 +4581,7 @@ void show_workqueue_state(void)
35276 touch_nmi_watchdog();
35277 }
35278
35279- rcu_read_unlock_sched();
35280+ rcu_read_unlock();
35281 }
35282
35283 /*
35284@@ -4898,16 +4942,16 @@ bool freeze_workqueues_busy(void)
35285 * nr_active is monotonically decreasing. It's safe
35286 * to peek without lock.
35287 */
35288- rcu_read_lock_sched();
35289+ rcu_read_lock();
35290 for_each_pwq(pwq, wq) {
35291 WARN_ON_ONCE(pwq->nr_active < 0);
35292 if (pwq->nr_active) {
35293 busy = true;
35294- rcu_read_unlock_sched();
35295+ rcu_read_unlock();
35296 goto out_unlock;
35297 }
35298 }
35299- rcu_read_unlock_sched();
35300+ rcu_read_unlock();
35301 }
35302 out_unlock:
35303 mutex_unlock(&wq_pool_mutex);
35304@@ -5097,7 +5141,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
35305 const char *delim = "";
35306 int node, written = 0;
35307
35308- rcu_read_lock_sched();
35309+ get_online_cpus();
35310+ rcu_read_lock();
35311 for_each_node(node) {
35312 written += scnprintf(buf + written, PAGE_SIZE - written,
35313 "%s%d:%d", delim, node,
35314@@ -5105,7 +5150,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
35315 delim = " ";
35316 }
35317 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
35318- rcu_read_unlock_sched();
35319+ rcu_read_unlock();
35320+ put_online_cpus();
35321
35322 return written;
35323 }
35324diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
35325index d390d1be3748..2dbcfe9bc364 100644
35326--- a/kernel/workqueue_internal.h
35327+++ b/kernel/workqueue_internal.h
35328@@ -45,6 +45,7 @@ struct worker {
35329 unsigned long last_active; /* L: last active timestamp */
35330 unsigned int flags; /* X: flags */
35331 int id; /* I: worker id */
35332+ int sleeping; /* None */
35333
35334 /*
35335 * Opaque string set with work_set_desc(). Printed out with task
35336@@ -70,7 +71,7 @@ static inline struct worker *current_wq_worker(void)
35337 * Scheduler hooks for concurrency managed workqueue. Only to be used from
35338 * sched/core.c and workqueue.c.
35339 */
35340-void wq_worker_waking_up(struct task_struct *task, int cpu);
35341-struct task_struct *wq_worker_sleeping(struct task_struct *task);
35342+void wq_worker_running(struct task_struct *task);
35343+void wq_worker_sleeping(struct task_struct *task);
35344
35345 #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
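A sketch of how the renamed hooks are expected to be driven from the scheduler side: wq_worker_sleeping() runs before the worker blocks and wq_worker_running() after it resumes. The helper names below are illustrative, not taken from the patch:

static void sched_submit_work(struct task_struct *tsk)
{
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_sleeping(tsk);	/* before __schedule() */
}

static void sched_update_worker(struct task_struct *tsk)
{
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_running(tsk);		/* after schedule() returns */
}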
35346diff --git a/lib/Kconfig b/lib/Kconfig
35347index b1445b22a6de..9ab51b78991a 100644
35348--- a/lib/Kconfig
35349+++ b/lib/Kconfig
35350@@ -428,6 +428,7 @@ config CHECK_SIGNATURE
35351
35352 config CPUMASK_OFFSTACK
35353 bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
35354+ depends on !PREEMPT_RT_FULL
35355 help
35356 Use dynamic allocation for cpumask_var_t, instead of putting
35357 them on the stack. This is a bit more expensive, but avoids
35358diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
35359index 62d0e25c054c..401b7ed164b5 100644
35360--- a/lib/Kconfig.debug
35361+++ b/lib/Kconfig.debug
35362@@ -1197,7 +1197,7 @@ config DEBUG_ATOMIC_SLEEP
35363
35364 config DEBUG_LOCKING_API_SELFTESTS
35365 bool "Locking API boot-time self-tests"
35366- depends on DEBUG_KERNEL
35367+ depends on DEBUG_KERNEL && !PREEMPT_RT_FULL
35368 help
35369 Say Y here if you want the kernel to run a short self-test during
35370 bootup. The self-test checks whether common types of locking bugs
35371diff --git a/lib/debugobjects.c b/lib/debugobjects.c
35372index 99308479b1c8..161da6c6e173 100644
35373--- a/lib/debugobjects.c
35374+++ b/lib/debugobjects.c
35375@@ -339,7 +339,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
35376 struct debug_obj *obj;
35377 unsigned long flags;
35378
35379- fill_pool();
35380+#ifdef CONFIG_PREEMPT_RT_FULL
35381+ if (preempt_count() == 0 && !irqs_disabled())
35382+#endif
35383+ fill_pool();
35384
35385 db = get_bucket((unsigned long) addr);
35386
35387diff --git a/lib/irq_poll.c b/lib/irq_poll.c
35388index 86a709954f5a..9c069ef83d6d 100644
35389--- a/lib/irq_poll.c
35390+++ b/lib/irq_poll.c
35391@@ -37,6 +37,7 @@ void irq_poll_sched(struct irq_poll *iop)
35392 list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
35393 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
35394 local_irq_restore(flags);
35395+ preempt_check_resched_rt();
35396 }
35397 EXPORT_SYMBOL(irq_poll_sched);
35398
35399@@ -72,6 +73,7 @@ void irq_poll_complete(struct irq_poll *iop)
35400 local_irq_save(flags);
35401 __irq_poll_complete(iop);
35402 local_irq_restore(flags);
35403+ preempt_check_resched_rt();
35404 }
35405 EXPORT_SYMBOL(irq_poll_complete);
35406
35407@@ -96,6 +98,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
35408 }
35409
35410 local_irq_enable();
35411+ preempt_check_resched_rt();
35412
35413 /* Even though interrupts have been re-enabled, this
35414 * access is safe because interrupts can only add new
35415@@ -133,6 +136,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
35416 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
35417
35418 local_irq_enable();
35419+ preempt_check_resched_rt();
35420 }
35421
35422 /**
b3bbd485 35423@@ -196,6 +200,7 @@ static int irq_poll_cpu_dead(unsigned int cpu)
c7c16703
JK
35424 this_cpu_ptr(&blk_cpu_iopoll));
35425 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
35426 local_irq_enable();
35427+ preempt_check_resched_rt();
1a6e0f06 35428
c7c16703
JK
35429 return 0;
35430 }
b3bbd485
JK
35431diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
35432index b5c1293ce147..075e225f4111 100644
35433--- a/lib/locking-selftest.c
35434+++ b/lib/locking-selftest.c
35435@@ -742,6 +742,8 @@ GENERATE_TESTCASE(init_held_rtmutex);
1a6e0f06
JK
35436 #include "locking-selftest-spin-hardirq.h"
35437 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
35438
35439+#ifndef CONFIG_PREEMPT_RT_FULL
35440+
35441 #include "locking-selftest-rlock-hardirq.h"
35442 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
35443
b3bbd485 35444@@ -757,9 +759,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
1a6e0f06
JK
35445 #include "locking-selftest-wlock-softirq.h"
35446 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
35447
35448+#endif
35449+
35450 #undef E1
35451 #undef E2
35452
35453+#ifndef CONFIG_PREEMPT_RT_FULL
35454 /*
35455 * Enabling hardirqs with a softirq-safe lock held:
35456 */
b3bbd485 35457@@ -792,6 +797,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
1a6e0f06
JK
35458 #undef E1
35459 #undef E2
35460
35461+#endif
35462+
35463 /*
35464 * Enabling irqs with an irq-safe lock held:
35465 */
b3bbd485 35466@@ -815,6 +822,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
1a6e0f06
JK
35467 #include "locking-selftest-spin-hardirq.h"
35468 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
35469
35470+#ifndef CONFIG_PREEMPT_RT_FULL
35471+
35472 #include "locking-selftest-rlock-hardirq.h"
35473 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
35474
b3bbd485 35475@@ -830,6 +839,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
1a6e0f06
JK
35476 #include "locking-selftest-wlock-softirq.h"
35477 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
35478
35479+#endif
35480+
35481 #undef E1
35482 #undef E2
35483
b3bbd485 35484@@ -861,6 +872,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
1a6e0f06
JK
35485 #include "locking-selftest-spin-hardirq.h"
35486 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
35487
35488+#ifndef CONFIG_PREEMPT_RT_FULL
35489+
35490 #include "locking-selftest-rlock-hardirq.h"
35491 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
35492
b3bbd485 35493@@ -876,6 +889,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
1a6e0f06
JK
35494 #include "locking-selftest-wlock-softirq.h"
35495 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
35496
35497+#endif
35498+
35499 #undef E1
35500 #undef E2
35501 #undef E3
b3bbd485 35502@@ -909,6 +924,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
1a6e0f06
JK
35503 #include "locking-selftest-spin-hardirq.h"
35504 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
35505
35506+#ifndef CONFIG_PREEMPT_RT_FULL
35507+
35508 #include "locking-selftest-rlock-hardirq.h"
35509 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
35510
b3bbd485 35511@@ -924,10 +941,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
1a6e0f06
JK
35512 #include "locking-selftest-wlock-softirq.h"
35513 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
35514
35515+#endif
35516+
35517 #undef E1
35518 #undef E2
35519 #undef E3
35520
35521+#ifndef CONFIG_PREEMPT_RT_FULL
35522+
35523 /*
35524 * read-lock / write-lock irq inversion.
35525 *
b3bbd485 35526@@ -990,6 +1011,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
1a6e0f06
JK
35527 #undef E2
35528 #undef E3
35529
35530+#endif
35531+
35532+#ifndef CONFIG_PREEMPT_RT_FULL
35533+
35534 /*
35535 * read-lock / write-lock recursion that is actually safe.
35536 */
b3bbd485 35537@@ -1028,6 +1053,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
1a6e0f06
JK
35538 #undef E2
35539 #undef E3
35540
35541+#endif
35542+
35543 /*
35544 * read-lock / write-lock recursion that is unsafe.
35545 */
b3bbd485 35546@@ -2057,6 +2084,7 @@ void locking_selftest(void)
1a6e0f06
JK
35547
35548 printk(" --------------------------------------------------------------------------\n");
35549
35550+#ifndef CONFIG_PREEMPT_RT_FULL
35551 /*
35552 * irq-context testcases:
35553 */
b3bbd485 35554@@ -2069,6 +2097,28 @@ void locking_selftest(void)
1a6e0f06
JK
35555
35556 DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
35557 // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
35558+#else
35559+ /* On -rt, we only do hardirq context test for raw spinlock */
35560+ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
35561+ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
35562+
35563+ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
35564+ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
35565+
35566+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
35567+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
35568+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
35569+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
35570+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
35571+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
35572+
35573+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
35574+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
35575+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
35576+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
35577+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
35578+ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
35579+#endif
35580
35581 ww_tests();
35582
b3bbd485
JK
35583diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
35584index 6016f1deb1f5..cdd43086b55b 100644
35585--- a/lib/percpu_ida.c
35586+++ b/lib/percpu_ida.c
e4b2b4a8 35587@@ -27,6 +27,9 @@
1a6e0f06
JK
35588 #include <linux/string.h>
35589 #include <linux/spinlock.h>
35590 #include <linux/percpu_ida.h>
35591+#include <linux/locallock.h>
35592+
35593+static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
35594
35595 struct percpu_ida_cpu {
35596 /*
b3bbd485 35597@@ -149,13 +152,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
1a6e0f06
JK
35598 unsigned long flags;
35599 int tag;
35600
35601- local_irq_save(flags);
35602+ local_lock_irqsave(irq_off_lock, flags);
35603 tags = this_cpu_ptr(pool->tag_cpu);
35604
35605 /* Fastpath */
35606 tag = alloc_local_tag(tags);
35607 if (likely(tag >= 0)) {
35608- local_irq_restore(flags);
35609+ local_unlock_irqrestore(irq_off_lock, flags);
35610 return tag;
35611 }
35612
b3bbd485 35613@@ -174,6 +177,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
1a6e0f06
JK
35614
35615 if (!tags->nr_free)
35616 alloc_global_tags(pool, tags);
35617+
35618 if (!tags->nr_free)
35619 steal_tags(pool, tags);
35620
b3bbd485 35621@@ -185,7 +189,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
1a6e0f06
JK
35622 }
35623
35624 spin_unlock(&pool->lock);
35625- local_irq_restore(flags);
35626+ local_unlock_irqrestore(irq_off_lock, flags);
35627
35628 if (tag >= 0 || state == TASK_RUNNING)
35629 break;
b3bbd485 35630@@ -197,7 +201,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
1a6e0f06
JK
35631
35632 schedule();
35633
35634- local_irq_save(flags);
35635+ local_lock_irqsave(irq_off_lock, flags);
35636 tags = this_cpu_ptr(pool->tag_cpu);
35637 }
35638 if (state != TASK_RUNNING)
b3bbd485 35639@@ -222,7 +226,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
1a6e0f06
JK
35640
35641 BUG_ON(tag >= pool->nr_tags);
35642
35643- local_irq_save(flags);
35644+ local_lock_irqsave(irq_off_lock, flags);
35645 tags = this_cpu_ptr(pool->tag_cpu);
35646
35647 spin_lock(&tags->lock);
b3bbd485 35648@@ -254,7 +258,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
1a6e0f06
JK
35649 spin_unlock(&pool->lock);
35650 }
35651
35652- local_irq_restore(flags);
35653+ local_unlock_irqrestore(irq_off_lock, flags);
35654 }
35655 EXPORT_SYMBOL_GPL(percpu_ida_free);
35656
b3bbd485 35657@@ -346,7 +350,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
1a6e0f06
JK
35658 struct percpu_ida_cpu *remote;
35659 unsigned cpu, i, err = 0;
35660
35661- local_irq_save(flags);
35662+ local_lock_irqsave(irq_off_lock, flags);
35663 for_each_possible_cpu(cpu) {
35664 remote = per_cpu_ptr(pool->tag_cpu, cpu);
35665 spin_lock(&remote->lock);
b3bbd485 35666@@ -368,7 +372,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
1a6e0f06
JK
35667 }
35668 spin_unlock(&pool->lock);
35669 out:
35670- local_irq_restore(flags);
35671+ local_unlock_irqrestore(irq_off_lock, flags);
35672 return err;
35673 }
35674 EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
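The percpu_ida hunks above follow the local-lock conversion used throughout this series: a named per-CPU lock replaces bare local_irq_save()/restore(), so on PREEMPT_RT_FULL the section becomes a sleeping per-CPU lock instead of a hard interrupt-off region. A minimal sketch of the pattern, assuming the locallock.h API introduced earlier in this patch; the lock and counter names below are invented for illustration:

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_LOCAL_IRQ_LOCK(example_lock);   /* plain IRQ-off section on !RT */
static DEFINE_PER_CPU(int, example_counter);

static void example_update(void)
{
	unsigned long flags;

	/* was: local_irq_save(flags); */
	local_lock_irqsave(example_lock, flags);
	this_cpu_inc(example_counter);          /* per-CPU state guarded by the lock */
	/* was: local_irq_restore(flags); */
	local_unlock_irqrestore(example_lock, flags);
}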
b3bbd485
JK
35675diff --git a/lib/radix-tree.c b/lib/radix-tree.c
35676index d172f0341b80..c1da1109a107 100644
35677--- a/lib/radix-tree.c
35678+++ b/lib/radix-tree.c
e4b2b4a8 35679@@ -37,7 +37,7 @@
1f39f580 35680 #include <linux/rcupdate.h>
e4b2b4a8
JK
35681 #include <linux/slab.h>
35682 #include <linux/string.h>
1f39f580
JK
35683-
35684+#include <linux/locallock.h>
35685
35686 /* Number of nodes in fully populated tree of given height */
35687 static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
b3bbd485 35688@@ -86,6 +86,7 @@ struct radix_tree_preload {
1f39f580
JK
35689 struct radix_tree_node *nodes;
35690 };
35691 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
35692+static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
35693
e4b2b4a8 35694 static inline struct radix_tree_node *entry_to_node(void *ptr)
1f39f580 35695 {
b3bbd485 35696@@ -404,12 +405,13 @@ radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
1a6e0f06
JK
35697 * succeed in getting a node here (and never reach
35698 * kmem_cache_alloc)
35699 */
35700- rtp = this_cpu_ptr(&radix_tree_preloads);
1f39f580 35701+ rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
1a6e0f06
JK
35702 if (rtp->nr) {
35703 ret = rtp->nodes;
e4b2b4a8 35704 rtp->nodes = ret->parent;
1a6e0f06
JK
35705 rtp->nr--;
35706 }
1f39f580 35707+ put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
1a6e0f06
JK
35708 /*
35709 * Update the allocation stack trace as this is more useful
35710 * for debugging.
b3bbd485 35711@@ -475,14 +477,14 @@ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
1f39f580
JK
35712 */
35713 gfp_mask &= ~__GFP_ACCOUNT;
35714
35715- preempt_disable();
35716+ local_lock(radix_tree_preloads_lock);
35717 rtp = this_cpu_ptr(&radix_tree_preloads);
35718 while (rtp->nr < nr) {
35719- preempt_enable();
35720+ local_unlock(radix_tree_preloads_lock);
35721 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
35722 if (node == NULL)
35723 goto out;
35724- preempt_disable();
35725+ local_lock(radix_tree_preloads_lock);
35726 rtp = this_cpu_ptr(&radix_tree_preloads);
35727 if (rtp->nr < nr) {
e4b2b4a8 35728 node->parent = rtp->nodes;
b3bbd485 35729@@ -524,7 +526,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
1f39f580
JK
35730 if (gfpflags_allow_blocking(gfp_mask))
35731 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
35732 /* Preloading doesn't help anything with this gfp mask, skip it */
35733- preempt_disable();
35734+ local_lock(radix_tree_preloads_lock);
35735 return 0;
1a6e0f06 35736 }
1f39f580 35737 EXPORT_SYMBOL(radix_tree_maybe_preload);
b3bbd485 35738@@ -562,7 +564,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
1a6e0f06 35739
1f39f580
JK
35740 /* Preloading doesn't help anything with this gfp mask, skip it */
35741 if (!gfpflags_allow_blocking(gfp_mask)) {
35742- preempt_disable();
35743+ local_lock(radix_tree_preloads_lock);
35744 return 0;
35745 }
1a6e0f06 35746
b3bbd485 35747@@ -596,6 +598,12 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
1a6e0f06
JK
35748 return __radix_tree_preload(gfp_mask, nr_nodes);
35749 }
1a6e0f06 35750
1f39f580
JK
35751+void radix_tree_preload_end(void)
35752+{
35753+ local_unlock(radix_tree_preloads_lock);
35754+}
35755+EXPORT_SYMBOL(radix_tree_preload_end);
35756+
e4b2b4a8
JK
35757 static unsigned radix_tree_load_root(const struct radix_tree_root *root,
35758 struct radix_tree_node **nodep, unsigned long *maxindex)
35759 {
b3bbd485 35760@@ -2105,10 +2113,16 @@ EXPORT_SYMBOL(radix_tree_tagged);
e4b2b4a8
JK
35761 void idr_preload(gfp_t gfp_mask)
35762 {
35763 if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
35764- preempt_disable();
35765+ local_lock(radix_tree_preloads_lock);
35766 }
35767 EXPORT_SYMBOL(idr_preload);
35768
35769+void idr_preload_end(void)
35770+{
35771+ local_unlock(radix_tree_preloads_lock);
35772+}
35773+EXPORT_SYMBOL(idr_preload_end);
35774+
35775 /**
35776 * ida_pre_get - reserve resources for ida allocation
35777 * @ida: ida handle
b3bbd485 35778@@ -2125,7 +2139,7 @@ int ida_pre_get(struct ida *ida, gfp_t gfp)
e4b2b4a8
JK
35779 * to return to the ida_pre_get() step.
35780 */
35781 if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE))
35782- preempt_enable();
35783+ local_unlock(radix_tree_preloads_lock);
35784
35785 if (!this_cpu_read(ida_bitmap)) {
35786 struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
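The radix-tree hunks replace preempt_disable()/preempt_enable() in the preload paths with the same radix_tree_preloads_lock and add out-of-line radix_tree_preload_end()/idr_preload_end() helpers. The caller-side preload pattern is unchanged; a sketch of it (the idr, lock and object here are placeholders):

#include <linux/idr.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

static int assign_id(struct idr *idr, spinlock_t *lock, void *obj)
{
	int id;

	idr_preload(GFP_KERNEL);   /* now takes the local preload lock */
	spin_lock(lock);
	id = idr_alloc(idr, obj, 0, 0, GFP_NOWAIT);
	spin_unlock(lock);
	idr_preload_end();         /* drops radix_tree_preloads_lock instead of preempt_enable() */

	return id;
}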
b3bbd485
JK
35787diff --git a/lib/scatterlist.c b/lib/scatterlist.c
35788index be7b4dd6b68d..d06c15d3d186 100644
35789--- a/lib/scatterlist.c
35790+++ b/lib/scatterlist.c
35791@@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
1a6e0f06
JK
35792 flush_kernel_dcache_page(miter->page);
35793
35794 if (miter->__flags & SG_MITER_ATOMIC) {
35795- WARN_ON_ONCE(preemptible());
35796+ WARN_ON_ONCE(!pagefault_disabled());
35797 kunmap_atomic(miter->addr);
35798 } else
35799 kunmap(miter->page);
b3bbd485
JK
35800diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
35801index 835cc6df2776..6f4a4ae881c8 100644
35802--- a/lib/smp_processor_id.c
35803+++ b/lib/smp_processor_id.c
35804@@ -23,7 +23,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
e4b2b4a8
JK
35805 * Kernel threads bound to a single CPU can safely use
35806 * smp_processor_id():
35807 */
35808- if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu)))
35809+ if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
35810 goto out;
1a6e0f06 35811
e4b2b4a8 35812 /*
b3bbd485
JK
35813diff --git a/lib/timerqueue.c b/lib/timerqueue.c
35814index 4a720ed4fdaf..0d54bcbc8170 100644
35815--- a/lib/timerqueue.c
35816+++ b/lib/timerqueue.c
e4b2b4a8
JK
35817@@ -33,8 +33,9 @@
35818 * @head: head of timerqueue
35819 * @node: timer node to be added
35820 *
35821- * Adds the timer node to the timerqueue, sorted by the
35822- * node's expires value.
35823+ * Adds the timer node to the timerqueue, sorted by the node's expires
35824+ * value. Returns true if the newly added timer is the first expiring timer in
35825+ * the queue.
35826 */
35827 bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
35828 {
b3bbd485 35829@@ -70,7 +71,8 @@ EXPORT_SYMBOL_GPL(timerqueue_add);
e4b2b4a8
JK
35830 * @head: head of timerqueue
35831 * @node: timer node to be removed
35832 *
35833- * Removes the timer node from the timerqueue.
35834+ * Removes the timer node from the timerqueue. Returns true if the queue is
35835+ * not empty after the remove.
35836 */
35837 bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
35838 {
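The timerqueue.c hunks only sharpen the kerneldoc: timerqueue_add() returns true when the new node becomes the earliest-expiring entry, and timerqueue_del() returns true while the queue remains non-empty. A hedged sketch of why a caller cares about the add return value; hw_timer_program() and struct my_timer_base are hypothetical names for the example:

#include <linux/timerqueue.h>
#include <linux/ktime.h>

struct my_timer_base {
	struct timerqueue_head queue;
};

extern void hw_timer_program(ktime_t expires);  /* hypothetical hardware hook */

static void my_timer_enqueue(struct my_timer_base *base,
			     struct timerqueue_node *node)
{
	/* Only reprogram the hardware when the queue head actually changed. */
	if (timerqueue_add(&base->queue, node))
		hw_timer_program(node->expires);
}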
b3bbd485
JK
35839diff --git a/localversion-rt b/localversion-rt
35840new file mode 100644
5dd41b01 35841index 000000000000..8a777ac42aab
b3bbd485
JK
35842--- /dev/null
35843+++ b/localversion-rt
1a6e0f06 35844@@ -0,0 +1 @@
5dd41b01 35845+-rt47
b3bbd485
JK
35846diff --git a/mm/Kconfig b/mm/Kconfig
35847index 59efbd3337e0..3df123c0bc3f 100644
35848--- a/mm/Kconfig
35849+++ b/mm/Kconfig
35850@@ -385,7 +385,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
35851
35852 config TRANSPARENT_HUGEPAGE
35853 bool "Transparent Hugepage Support"
35854- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
35855+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
35856 select COMPACTION
35857 select RADIX_TREE_MULTIORDER
35858 help
35859diff --git a/mm/backing-dev.c b/mm/backing-dev.c
35860index 9386c98dac12..5e9d804c37cb 100644
35861--- a/mm/backing-dev.c
35862+++ b/mm/backing-dev.c
35863@@ -470,9 +470,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
1a6e0f06
JK
35864 {
35865 unsigned long flags;
35866
35867- local_irq_save(flags);
35868+ local_irq_save_nort(flags);
35869 if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
35870- local_irq_restore(flags);
35871+ local_irq_restore_nort(flags);
35872 return;
35873 }
35874
b3bbd485
JK
35875diff --git a/mm/compaction.c b/mm/compaction.c
35876index 85395dc6eb13..d6c8ed009e93 100644
35877--- a/mm/compaction.c
35878+++ b/mm/compaction.c
35879@@ -1634,10 +1634,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
1a6e0f06
JK
35880 block_start_pfn(cc->migrate_pfn, cc->order);
35881
35882 if (cc->last_migrated_pfn < current_block_start) {
35883- cpu = get_cpu();
35884+ cpu = get_cpu_light();
35885+ local_lock_irq(swapvec_lock);
35886 lru_add_drain_cpu(cpu);
35887+ local_unlock_irq(swapvec_lock);
35888 drain_local_pages(zone);
35889- put_cpu();
35890+ put_cpu_light();
35891 /* No more flushing until we migrate again */
35892 cc->last_migrated_pfn = 0;
35893 }
b3bbd485
JK
35894diff --git a/mm/filemap.c b/mm/filemap.c
35895index e2e738cc08b1..c47070dae8b9 100644
35896--- a/mm/filemap.c
35897+++ b/mm/filemap.c
e4b2b4a8
JK
35898@@ -110,6 +110,7 @@
35899 * ->i_mmap_rwsem
35900 * ->tasklist_lock (memory_failure, collect_procs_ao)
35901 */
35902+DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
35903
35904 static int page_cache_tree_insert(struct address_space *mapping,
35905 struct page *page, void **shadowp)
b3bbd485 35906@@ -133,8 +134,10 @@ static int page_cache_tree_insert(struct address_space *mapping,
e4b2b4a8
JK
35907 if (shadowp)
35908 *shadowp = p;
1a6e0f06 35909 }
e4b2b4a8
JK
35910+ local_lock(shadow_nodes_lock);
35911 __radix_tree_replace(&mapping->page_tree, node, slot, page,
35912- workingset_update_node, mapping);
35913+ __workingset_update_node, mapping);
35914+ local_unlock(shadow_nodes_lock);
35915 mapping->nrpages++;
1a6e0f06
JK
35916 return 0;
35917 }
b3bbd485 35918@@ -151,6 +154,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
e4b2b4a8
JK
35919 VM_BUG_ON_PAGE(PageTail(page), page);
35920 VM_BUG_ON_PAGE(nr != 1 && shadow, page);
35921
35922+ local_lock(shadow_nodes_lock);
35923 for (i = 0; i < nr; i++) {
35924 struct radix_tree_node *node;
35925 void **slot;
b3bbd485 35926@@ -162,8 +166,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
1a6e0f06 35927
e4b2b4a8
JK
35928 radix_tree_clear_tags(&mapping->page_tree, node, slot);
35929 __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
35930- workingset_update_node, mapping);
35931+ __workingset_update_node, mapping);
35932 }
35933+ local_unlock(shadow_nodes_lock);
35934
35935 if (shadow) {
35936 mapping->nrexceptional += nr;
b3bbd485
JK
35937diff --git a/mm/highmem.c b/mm/highmem.c
35938index 59db3223a5d6..22aa3ddbd87b 100644
35939--- a/mm/highmem.c
35940+++ b/mm/highmem.c
e4b2b4a8 35941@@ -30,10 +30,11 @@
1a6e0f06
JK
35942 #include <linux/kgdb.h>
35943 #include <asm/tlbflush.h>
35944
35945-
35946+#ifndef CONFIG_PREEMPT_RT_FULL
35947 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
35948 DEFINE_PER_CPU(int, __kmap_atomic_idx);
35949 #endif
35950+#endif
35951
35952 /*
35953 * Virtual_count is not a pure "count".
b3bbd485 35954@@ -108,8 +109,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
1a6e0f06
JK
35955 unsigned long totalhigh_pages __read_mostly;
35956 EXPORT_SYMBOL(totalhigh_pages);
35957
35958-
35959+#ifndef CONFIG_PREEMPT_RT_FULL
35960 EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
35961+#endif
35962
35963 unsigned int nr_free_highpages (void)
35964 {
b3bbd485
JK
35965diff --git a/mm/memcontrol.c b/mm/memcontrol.c
35966index 6a9a7e1066ef..3cc297730103 100644
35967--- a/mm/memcontrol.c
35968+++ b/mm/memcontrol.c
e4b2b4a8 35969@@ -69,6 +69,7 @@
1a6e0f06
JK
35970 #include <net/sock.h>
35971 #include <net/ip.h>
35972 #include "slab.h"
35973+#include <linux/locallock.h>
35974
e4b2b4a8 35975 #include <linux/uaccess.h>
1a6e0f06 35976
b3bbd485 35977@@ -94,6 +95,8 @@ int do_swap_account __read_mostly;
1a6e0f06
JK
35978 #define do_swap_account 0
35979 #endif
35980
35981+static DEFINE_LOCAL_IRQ_LOCK(event_lock);
35982+
35983 /* Whether legacy memory+swap accounting is active */
35984 static bool do_memsw_account(void)
35985 {
b3bbd485 35986@@ -1831,7 +1834,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
e4b2b4a8
JK
35987 * as well as workers from this path always operate on the local
35988 * per-cpu data. CPU up doesn't touch memcg_stock at all.
35989 */
1a6e0f06
JK
35990- curcpu = get_cpu();
35991+ curcpu = get_cpu_light();
35992 for_each_online_cpu(cpu) {
35993 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
35994 struct mem_cgroup *memcg;
b3bbd485 35995@@ -1851,7 +1854,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
1a6e0f06 35996 }
e4b2b4a8 35997 css_put(&memcg->css);
1a6e0f06
JK
35998 }
35999- put_cpu();
36000+ put_cpu_light();
1a6e0f06
JK
36001 mutex_unlock(&percpu_charge_mutex);
36002 }
e4b2b4a8 36003
b3bbd485 36004@@ -4631,12 +4634,12 @@ static int mem_cgroup_move_account(struct page *page,
1a6e0f06
JK
36005
36006 ret = 0;
36007
36008- local_irq_disable();
36009+ local_lock_irq(event_lock);
36010 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
36011 memcg_check_events(to, page);
36012 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
36013 memcg_check_events(from, page);
36014- local_irq_enable();
36015+ local_unlock_irq(event_lock);
36016 out_unlock:
36017 unlock_page(page);
36018 out:
b3bbd485 36019@@ -5579,10 +5582,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
1a6e0f06
JK
36020
36021 commit_charge(page, memcg, lrucare);
36022
36023- local_irq_disable();
36024+ local_lock_irq(event_lock);
36025 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
36026 memcg_check_events(memcg, page);
36027- local_irq_enable();
36028+ local_unlock_irq(event_lock);
36029
36030 if (do_memsw_account() && PageSwapCache(page)) {
36031 swp_entry_t entry = { .val = page_private(page) };
b3bbd485 36032@@ -5651,7 +5654,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
e4b2b4a8 36033 memcg_oom_recover(ug->memcg);
1a6e0f06
JK
36034 }
36035
36036- local_irq_save(flags);
36037+ local_lock_irqsave(event_lock, flags);
e4b2b4a8
JK
36038 __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
36039 __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
36040 __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
b3bbd485 36041@@ -5659,7 +5662,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
e4b2b4a8
JK
36042 __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
36043 __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
36044 memcg_check_events(ug->memcg, ug->dummy_page);
1a6e0f06
JK
36045- local_irq_restore(flags);
36046+ local_unlock_irqrestore(event_lock, flags);
36047
e4b2b4a8
JK
36048 if (!mem_cgroup_is_root(ug->memcg))
36049 css_put_many(&ug->memcg->css, nr_pages);
b3bbd485 36050@@ -5822,10 +5825,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
1a6e0f06
JK
36051
36052 commit_charge(newpage, memcg, false);
36053
36054- local_irq_save(flags);
36055+ local_lock_irqsave(event_lock, flags);
36056 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
36057 memcg_check_events(memcg, newpage);
36058- local_irq_restore(flags);
36059+ local_unlock_irqrestore(event_lock, flags);
36060 }
36061
36062 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
b3bbd485 36063@@ -6017,6 +6020,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
1a6e0f06 36064 struct mem_cgroup *memcg, *swap_memcg;
e4b2b4a8 36065 unsigned int nr_entries;
1a6e0f06
JK
36066 unsigned short oldid;
36067+ unsigned long flags;
36068
36069 VM_BUG_ON_PAGE(PageLRU(page), page);
36070 VM_BUG_ON_PAGE(page_count(page), page);
b3bbd485 36071@@ -6062,13 +6066,17 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
1a6e0f06
JK
36072 * important here to have the interrupts disabled because it is the
36073 * only synchronisation we have for udpating the per-CPU variables.
36074 */
36075+ local_lock_irqsave(event_lock, flags);
36076+#ifndef CONFIG_PREEMPT_RT_BASE
36077 VM_BUG_ON(!irqs_disabled());
36078+#endif
e4b2b4a8
JK
36079 mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
36080 -nr_entries);
1a6e0f06
JK
36081 memcg_check_events(memcg, page);
36082
36083 if (!mem_cgroup_is_root(memcg))
e4b2b4a8 36084 css_put_many(&memcg->css, nr_entries);
1a6e0f06
JK
36085+ local_unlock_irqrestore(event_lock, flags);
36086 }
36087
e4b2b4a8 36088 /**
b3bbd485
JK
36089diff --git a/mm/mmu_context.c b/mm/mmu_context.c
36090index 3e612ae748e9..d0ccc070979f 100644
36091--- a/mm/mmu_context.c
36092+++ b/mm/mmu_context.c
36093@@ -25,6 +25,7 @@ void use_mm(struct mm_struct *mm)
1a6e0f06
JK
36094 struct task_struct *tsk = current;
36095
36096 task_lock(tsk);
36097+ preempt_disable_rt();
36098 active_mm = tsk->active_mm;
36099 if (active_mm != mm) {
e4b2b4a8 36100 mmgrab(mm);
b3bbd485 36101@@ -32,6 +33,7 @@ void use_mm(struct mm_struct *mm)
1a6e0f06
JK
36102 }
36103 tsk->mm = mm;
36104 switch_mm(active_mm, mm, tsk);
36105+ preempt_enable_rt();
36106 task_unlock(tsk);
36107 #ifdef finish_arch_post_lock_switch
36108 finish_arch_post_lock_switch();
b3bbd485 36109diff --git a/mm/page_alloc.c b/mm/page_alloc.c
5dd41b01 36110index a604b5da6755..525a6f2d5144 100644
b3bbd485
JK
36111--- a/mm/page_alloc.c
36112+++ b/mm/page_alloc.c
1a6e0f06 36113@@ -61,6 +61,7 @@
1a6e0f06
JK
36114 #include <linux/hugetlb.h>
36115 #include <linux/sched/rt.h>
e4b2b4a8 36116 #include <linux/sched/mm.h>
1a6e0f06
JK
36117+#include <linux/locallock.h>
36118 #include <linux/page_owner.h>
36119 #include <linux/kthread.h>
36120 #include <linux/memcontrol.h>
b3bbd485 36121@@ -286,6 +287,18 @@ EXPORT_SYMBOL(nr_node_ids);
1a6e0f06
JK
36122 EXPORT_SYMBOL(nr_online_nodes);
36123 #endif
36124
36125+static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
36126+
36127+#ifdef CONFIG_PREEMPT_RT_BASE
36128+# define cpu_lock_irqsave(cpu, flags) \
36129+ local_lock_irqsave_on(pa_lock, flags, cpu)
36130+# define cpu_unlock_irqrestore(cpu, flags) \
36131+ local_unlock_irqrestore_on(pa_lock, flags, cpu)
36132+#else
36133+# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
36134+# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
36135+#endif
36136+
36137 int page_group_by_mobility_disabled __read_mostly;
36138
36139 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
b3bbd485 36140@@ -1094,7 +1107,7 @@ static bool bulkfree_pcp_prepare(struct page *page)
1a6e0f06
JK
36141 #endif /* CONFIG_DEBUG_VM */
36142
36143 /*
36144- * Frees a number of pages from the PCP lists
36145+ * Frees a number of pages which have been collected from the pcp lists.
36146 * Assumes all pages on list are in same zone, and of same order.
36147 * count is the number of pages to free.
36148 *
b3bbd485 36149@@ -1105,15 +1118,53 @@ static bool bulkfree_pcp_prepare(struct page *page)
1a6e0f06
JK
36150 * pinned" detection logic.
36151 */
36152 static void free_pcppages_bulk(struct zone *zone, int count,
36153- struct per_cpu_pages *pcp)
36154+ struct list_head *list)
36155 {
36156- int migratetype = 0;
36157- int batch_free = 0;
1a6e0f06
JK
36158 bool isolated_pageblocks;
36159+ unsigned long flags;
1a6e0f06
JK
36160
36161- spin_lock(&zone->lock);
e4b2b4a8 36162+ spin_lock_irqsave(&zone->lock, flags);
1a6e0f06 36163 isolated_pageblocks = has_isolate_pageblock(zone);
1a6e0f06
JK
36164
36165+ while (!list_empty(list)) {
36166+ struct page *page;
e4b2b4a8 36167+ int mt; /* migratetype of the to-be-freed page */
1a6e0f06
JK
36168+
36169+ page = list_first_entry(list, struct page, lru);
36170+ /* must delete as __free_one_page list manipulates */
36171+ list_del(&page->lru);
36172+
36173+ mt = get_pcppage_migratetype(page);
36174+ /* MIGRATE_ISOLATE page should not go to pcplists */
36175+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
36176+ /* Pageblock could have been isolated meanwhile */
36177+ if (unlikely(isolated_pageblocks))
36178+ mt = get_pageblock_migratetype(page);
36179+
36180+ if (bulkfree_pcp_prepare(page))
36181+ continue;
36182+
36183+ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
36184+ trace_mm_page_pcpu_drain(page, 0, mt);
36185+ count--;
36186+ }
36187+ WARN_ON(count != 0);
36188+ spin_unlock_irqrestore(&zone->lock, flags);
36189+}
36190+
36191+/*
36192+ * Moves a number of pages from the PCP lists to free list which
36193+ * is freed outside of the locked region.
36194+ *
36195+ * Assumes all pages on list are in same zone, and of same order.
36196+ * count is the number of pages to free.
36197+ */
36198+static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
36199+ struct list_head *dst)
36200+{
36201+ int migratetype = 0;
36202+ int batch_free = 0;
36203+
36204 while (count) {
36205 struct page *page;
36206 struct list_head *list;
b3bbd485 36207@@ -1129,7 +1180,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
1a6e0f06
JK
36208 batch_free++;
36209 if (++migratetype == MIGRATE_PCPTYPES)
36210 migratetype = 0;
36211- list = &pcp->lists[migratetype];
36212+ list = &src->lists[migratetype];
36213 } while (list_empty(list));
36214
36215 /* This is the only non-empty list. Free them all. */
b3bbd485 36216@@ -1137,27 +1188,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
1a6e0f06
JK
36217 batch_free = count;
36218
36219 do {
36220- int mt; /* migratetype of the to-be-freed page */
36221-
36222 page = list_last_entry(list, struct page, lru);
36223- /* must delete as __free_one_page list manipulates */
36224 list_del(&page->lru);
36225
36226- mt = get_pcppage_migratetype(page);
36227- /* MIGRATE_ISOLATE page should not go to pcplists */
36228- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
36229- /* Pageblock could have been isolated meanwhile */
36230- if (unlikely(isolated_pageblocks))
36231- mt = get_pageblock_migratetype(page);
36232-
36233- if (bulkfree_pcp_prepare(page))
36234- continue;
36235-
36236- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
36237- trace_mm_page_pcpu_drain(page, 0, mt);
36238+ list_add(&page->lru, dst);
36239 } while (--count && --batch_free && !list_empty(list));
36240 }
36241- spin_unlock(&zone->lock);
36242 }
36243
36244 static void free_one_page(struct zone *zone,
b3bbd485 36245@@ -1165,13 +1201,15 @@ static void free_one_page(struct zone *zone,
e4b2b4a8 36246 unsigned int order,
1a6e0f06
JK
36247 int migratetype)
36248 {
1a6e0f06
JK
36249- spin_lock(&zone->lock);
36250+ unsigned long flags;
36251+
36252+ spin_lock_irqsave(&zone->lock, flags);
e4b2b4a8
JK
36253 if (unlikely(has_isolate_pageblock(zone) ||
36254 is_migrate_isolate(migratetype))) {
1a6e0f06
JK
36255 migratetype = get_pfnblock_migratetype(page, pfn);
36256 }
36257 __free_one_page(page, pfn, zone, order, migratetype);
36258- spin_unlock(&zone->lock);
36259+ spin_unlock_irqrestore(&zone->lock, flags);
36260 }
36261
36262 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
b3bbd485 36263@@ -1257,10 +1295,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
1a6e0f06
JK
36264 return;
36265
36266 migratetype = get_pfnblock_migratetype(page, pfn);
36267- local_irq_save(flags);
36268+ local_lock_irqsave(pa_lock, flags);
36269 __count_vm_events(PGFREE, 1 << order);
36270 free_one_page(page_zone(page), page, pfn, order, migratetype);
36271- local_irq_restore(flags);
36272+ local_unlock_irqrestore(pa_lock, flags);
36273 }
36274
36275 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
b3bbd485 36276@@ -2378,16 +2416,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1a6e0f06
JK
36277 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
36278 {
36279 unsigned long flags;
36280+ LIST_HEAD(dst);
36281 int to_drain, batch;
36282
36283- local_irq_save(flags);
36284+ local_lock_irqsave(pa_lock, flags);
36285 batch = READ_ONCE(pcp->batch);
36286 to_drain = min(pcp->count, batch);
36287 if (to_drain > 0) {
36288- free_pcppages_bulk(zone, to_drain, pcp);
36289+ isolate_pcp_pages(to_drain, pcp, &dst);
36290 pcp->count -= to_drain;
36291 }
36292- local_irq_restore(flags);
36293+ local_unlock_irqrestore(pa_lock, flags);
36294+ free_pcppages_bulk(zone, to_drain, &dst);
36295 }
36296 #endif
36297
b3bbd485 36298@@ -2403,16 +2443,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
1a6e0f06
JK
36299 unsigned long flags;
36300 struct per_cpu_pageset *pset;
36301 struct per_cpu_pages *pcp;
36302+ LIST_HEAD(dst);
36303+ int count;
36304
36305- local_irq_save(flags);
36306+ cpu_lock_irqsave(cpu, flags);
36307 pset = per_cpu_ptr(zone->pageset, cpu);
36308
36309 pcp = &pset->pcp;
36310- if (pcp->count) {
36311- free_pcppages_bulk(zone, pcp->count, pcp);
36312+ count = pcp->count;
36313+ if (count) {
36314+ isolate_pcp_pages(count, pcp, &dst);
36315 pcp->count = 0;
36316 }
36317- local_irq_restore(flags);
36318+ cpu_unlock_irqrestore(cpu, flags);
36319+ if (count)
36320+ free_pcppages_bulk(zone, count, &dst);
36321 }
36322
36323 /*
b3bbd485 36324@@ -2447,6 +2492,7 @@ void drain_local_pages(struct zone *zone)
e4b2b4a8
JK
36325 drain_pages(cpu);
36326 }
36327
36328+#ifndef CONFIG_PREEMPT_RT_BASE
36329 static void drain_local_pages_wq(struct work_struct *work)
36330 {
36331 /*
b3bbd485 36332@@ -2460,6 +2506,7 @@ static void drain_local_pages_wq(struct work_struct *work)
e4b2b4a8
JK
36333 drain_local_pages(NULL);
36334 preempt_enable();
36335 }
36336+#endif
36337
36338 /*
36339 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
b3bbd485 36340@@ -2526,7 +2573,14 @@ void drain_all_pages(struct zone *zone)
1a6e0f06
JK
36341 else
36342 cpumask_clear_cpu(cpu, &cpus_with_pcps);
36343 }
e4b2b4a8
JK
36344-
36345+#ifdef CONFIG_PREEMPT_RT_BASE
1a6e0f06
JK
36346+ for_each_cpu(cpu, &cpus_with_pcps) {
36347+ if (zone)
36348+ drain_pages_zone(cpu, zone);
36349+ else
36350+ drain_pages(cpu);
36351+ }
e4b2b4a8
JK
36352+#else
36353 for_each_cpu(cpu, &cpus_with_pcps) {
36354 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
36355 INIT_WORK(work, drain_local_pages_wq);
b3bbd485 36356@@ -2534,6 +2588,7 @@ void drain_all_pages(struct zone *zone)
e4b2b4a8
JK
36357 }
36358 for_each_cpu(cpu, &cpus_with_pcps)
36359 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
1a6e0f06 36360+#endif
1a6e0f06 36361
e4b2b4a8
JK
36362 mutex_unlock(&pcpu_drain_mutex);
36363 }
b3bbd485 36364@@ -2610,7 +2665,7 @@ void free_hot_cold_page(struct page *page, bool cold)
1a6e0f06
JK
36365
36366 migratetype = get_pfnblock_migratetype(page, pfn);
36367 set_pcppage_migratetype(page, migratetype);
36368- local_irq_save(flags);
36369+ local_lock_irqsave(pa_lock, flags);
36370 __count_vm_event(PGFREE);
36371
36372 /*
b3bbd485 36373@@ -2636,12 +2691,17 @@ void free_hot_cold_page(struct page *page, bool cold)
1a6e0f06
JK
36374 pcp->count++;
36375 if (pcp->count >= pcp->high) {
36376 unsigned long batch = READ_ONCE(pcp->batch);
36377- free_pcppages_bulk(zone, batch, pcp);
36378+ LIST_HEAD(dst);
36379+
36380+ isolate_pcp_pages(batch, pcp, &dst);
36381 pcp->count -= batch;
36382+ local_unlock_irqrestore(pa_lock, flags);
36383+ free_pcppages_bulk(zone, batch, &dst);
36384+ return;
36385 }
36386
36387 out:
36388- local_irq_restore(flags);
36389+ local_unlock_irqrestore(pa_lock, flags);
36390 }
36391
36392 /*
b3bbd485 36393@@ -2789,7 +2849,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
e4b2b4a8
JK
36394 struct page *page;
36395 unsigned long flags;
1a6e0f06 36396
e4b2b4a8
JK
36397- local_irq_save(flags);
36398+ local_lock_irqsave(pa_lock, flags);
36399 pcp = &this_cpu_ptr(zone->pageset)->pcp;
36400 list = &pcp->lists[migratetype];
36401 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
b3bbd485 36402@@ -2797,7 +2857,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
e4b2b4a8
JK
36403 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
36404 zone_statistics(preferred_zone, zone);
1a6e0f06 36405 }
e4b2b4a8
JK
36406- local_irq_restore(flags);
36407+ local_unlock_irqrestore(pa_lock, flags);
36408 return page;
36409 }
36410
b3bbd485 36411@@ -2824,7 +2884,7 @@ struct page *rmqueue(struct zone *preferred_zone,
e4b2b4a8
JK
36412 * allocate greater than order-1 page units with __GFP_NOFAIL.
36413 */
36414 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
36415- spin_lock_irqsave(&zone->lock, flags);
36416+ local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
36417
36418 do {
36419 page = NULL;
b3bbd485 36420@@ -2844,14 +2904,14 @@ struct page *rmqueue(struct zone *preferred_zone,
1a6e0f06
JK
36421
36422 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
e4b2b4a8 36423 zone_statistics(preferred_zone, zone);
1a6e0f06
JK
36424- local_irq_restore(flags);
36425+ local_unlock_irqrestore(pa_lock, flags);
36426
e4b2b4a8
JK
36427 out:
36428 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
1a6e0f06
JK
36429 return page;
36430
36431 failed:
36432- local_irq_restore(flags);
36433+ local_unlock_irqrestore(pa_lock, flags);
36434 return NULL;
36435 }
36436
5dd41b01 36437@@ -6785,8 +6845,9 @@ void __init free_area_init(unsigned long *zones_size)
1a6e0f06 36438
e4b2b4a8 36439 static int page_alloc_cpu_dead(unsigned int cpu)
1a6e0f06 36440 {
e4b2b4a8
JK
36441-
36442+ local_lock_irq_on(swapvec_lock, cpu);
36443 lru_add_drain_cpu(cpu);
36444+ local_unlock_irq_on(swapvec_lock, cpu);
36445 drain_pages(cpu);
1a6e0f06 36446
e4b2b4a8 36447 /*
5dd41b01 36448@@ -7690,7 +7751,7 @@ void zone_pcp_reset(struct zone *zone)
1a6e0f06
JK
36449 struct per_cpu_pageset *pset;
36450
36451 /* avoid races with drain_pages() */
36452- local_irq_save(flags);
36453+ local_lock_irqsave(pa_lock, flags);
36454 if (zone->pageset != &boot_pageset) {
36455 for_each_online_cpu(cpu) {
36456 pset = per_cpu_ptr(zone->pageset, cpu);
5dd41b01 36457@@ -7699,7 +7760,7 @@ void zone_pcp_reset(struct zone *zone)
1a6e0f06
JK
36458 free_percpu(zone->pageset);
36459 zone->pageset = &boot_pageset;
36460 }
36461- local_irq_restore(flags);
36462+ local_unlock_irqrestore(pa_lock, flags);
36463 }
36464
36465 #ifdef CONFIG_MEMORY_HOTREMOVE
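The page_alloc.c hunks above split the per-CPU page drain into two phases so the per-CPU pa_lock and zone->lock are not held together across the whole free: isolate_pcp_pages() unlinks pages under the per-CPU lock, then free_pcppages_bulk() returns them to the buddy lists under zone->lock on its own. A condensed sketch of that two-phase shape, using the patch's own helpers (it only builds inside mm/page_alloc.c, where they are static):

static void drain_pcp_two_phase(struct zone *zone, struct per_cpu_pages *pcp)
{
	LIST_HEAD(dst);
	unsigned long flags;
	int count;

	/* Phase 1: detach pages while only the per-CPU lock is held. */
	local_lock_irqsave(pa_lock, flags);
	count = pcp->count;
	if (count) {
		isolate_pcp_pages(count, pcp, &dst);
		pcp->count = 0;
	}
	local_unlock_irqrestore(pa_lock, flags);

	/* Phase 2: free_pcppages_bulk() takes zone->lock itself; the per-CPU lock is already dropped. */
	if (count)
		free_pcppages_bulk(zone, count, &dst);
}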
b3bbd485
JK
36466diff --git a/mm/slab.h b/mm/slab.h
36467index 485d9fbb8802..f3b06c48bf39 100644
36468--- a/mm/slab.h
36469+++ b/mm/slab.h
36470@@ -451,7 +451,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1a6e0f06
JK
36471 * The slab lists for all objects.
36472 */
36473 struct kmem_cache_node {
36474+#ifdef CONFIG_SLUB
36475+ raw_spinlock_t list_lock;
36476+#else
36477 spinlock_t list_lock;
36478+#endif
36479
36480 #ifdef CONFIG_SLAB
36481 struct list_head slabs_partial; /* partial list first, better asm code */
b3bbd485 36482diff --git a/mm/slub.c b/mm/slub.c
5dd41b01 36483index 220d42e592ef..9b337c28dd1f 100644
b3bbd485
JK
36484--- a/mm/slub.c
36485+++ b/mm/slub.c
36486@@ -1179,7 +1179,7 @@ static noinline int free_debug_processing(
1a6e0f06
JK
36487 unsigned long uninitialized_var(flags);
36488 int ret = 0;
36489
36490- spin_lock_irqsave(&n->list_lock, flags);
36491+ raw_spin_lock_irqsave(&n->list_lock, flags);
36492 slab_lock(page);
36493
36494 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
b3bbd485 36495@@ -1214,7 +1214,7 @@ static noinline int free_debug_processing(
1a6e0f06
JK
36496 bulk_cnt, cnt);
36497
36498 slab_unlock(page);
36499- spin_unlock_irqrestore(&n->list_lock, flags);
36500+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36501 if (!ret)
36502 slab_fix(s, "Object at 0x%p not freed", object);
36503 return ret;
b3bbd485 36504@@ -1342,6 +1342,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
1a6e0f06
JK
36505
36506 #endif /* CONFIG_SLUB_DEBUG */
36507
36508+struct slub_free_list {
36509+ raw_spinlock_t lock;
36510+ struct list_head list;
36511+};
36512+static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
36513+
36514 /*
36515 * Hooks for other subsystems that check memory allocations. In a typical
36516 * production configuration these hooks all should produce no code at all.
b3bbd485 36517@@ -1561,10 +1567,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1a6e0f06
JK
36518 void *start, *p;
36519 int idx, order;
36520 bool shuffle;
36521+ bool enableirqs = false;
36522
36523 flags &= gfp_allowed_mask;
36524
36525 if (gfpflags_allow_blocking(flags))
36526+ enableirqs = true;
36527+#ifdef CONFIG_PREEMPT_RT_FULL
e4b2b4a8 36528+ if (system_state > SYSTEM_BOOTING)
1a6e0f06
JK
36529+ enableirqs = true;
36530+#endif
36531+ if (enableirqs)
36532 local_irq_enable();
36533
36534 flags |= s->allocflags;
b3bbd485 36535@@ -1623,7 +1636,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1a6e0f06
JK
36536 page->frozen = 1;
36537
36538 out:
36539- if (gfpflags_allow_blocking(flags))
36540+ if (enableirqs)
36541 local_irq_disable();
36542 if (!page)
36543 return NULL;
b3bbd485 36544@@ -1681,6 +1694,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1a6e0f06
JK
36545 __free_pages(page, order);
36546 }
36547
36548+static void free_delayed(struct list_head *h)
36549+{
36550+ while(!list_empty(h)) {
36551+ struct page *page = list_first_entry(h, struct page, lru);
36552+
36553+ list_del(&page->lru);
36554+ __free_slab(page->slab_cache, page);
36555+ }
36556+}
36557+
36558 #define need_reserve_slab_rcu \
36559 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
36560
b3bbd485 36561@@ -1712,6 +1735,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
1a6e0f06
JK
36562 }
36563
36564 call_rcu(head, rcu_free_slab);
36565+ } else if (irqs_disabled()) {
36566+ struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
36567+
36568+ raw_spin_lock(&f->lock);
36569+ list_add(&page->lru, &f->list);
36570+ raw_spin_unlock(&f->lock);
36571 } else
36572 __free_slab(s, page);
36573 }
b3bbd485 36574@@ -1819,7 +1848,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1a6e0f06
JK
36575 if (!n || !n->nr_partial)
36576 return NULL;
36577
36578- spin_lock(&n->list_lock);
36579+ raw_spin_lock(&n->list_lock);
36580 list_for_each_entry_safe(page, page2, &n->partial, lru) {
36581 void *t;
36582
b3bbd485 36583@@ -1844,7 +1873,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1a6e0f06
JK
36584 break;
36585
36586 }
36587- spin_unlock(&n->list_lock);
36588+ raw_spin_unlock(&n->list_lock);
36589 return object;
36590 }
36591
b3bbd485 36592@@ -2090,7 +2119,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
1a6e0f06
JK
36593 * that acquire_slab() will see a slab page that
36594 * is frozen
36595 */
36596- spin_lock(&n->list_lock);
36597+ raw_spin_lock(&n->list_lock);
36598 }
36599 } else {
36600 m = M_FULL;
b3bbd485 36601@@ -2101,7 +2130,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
1a6e0f06
JK
36602 * slabs from diagnostic functions will not see
36603 * any frozen slabs.
36604 */
36605- spin_lock(&n->list_lock);
36606+ raw_spin_lock(&n->list_lock);
36607 }
36608 }
36609
b3bbd485 36610@@ -2136,7 +2165,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
1a6e0f06
JK
36611 goto redo;
36612
36613 if (lock)
36614- spin_unlock(&n->list_lock);
36615+ raw_spin_unlock(&n->list_lock);
36616
36617 if (m == M_FREE) {
36618 stat(s, DEACTIVATE_EMPTY);
b3bbd485 36619@@ -2171,10 +2200,10 @@ static void unfreeze_partials(struct kmem_cache *s,
1a6e0f06
JK
36620 n2 = get_node(s, page_to_nid(page));
36621 if (n != n2) {
36622 if (n)
36623- spin_unlock(&n->list_lock);
36624+ raw_spin_unlock(&n->list_lock);
36625
36626 n = n2;
36627- spin_lock(&n->list_lock);
36628+ raw_spin_lock(&n->list_lock);
36629 }
36630
36631 do {
b3bbd485 36632@@ -2203,7 +2232,7 @@ static void unfreeze_partials(struct kmem_cache *s,
1a6e0f06
JK
36633 }
36634
36635 if (n)
36636- spin_unlock(&n->list_lock);
36637+ raw_spin_unlock(&n->list_lock);
36638
36639 while (discard_page) {
36640 page = discard_page;
b3bbd485 36641@@ -2242,14 +2271,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1a6e0f06
JK
36642 pobjects = oldpage->pobjects;
36643 pages = oldpage->pages;
36644 if (drain && pobjects > s->cpu_partial) {
36645+ struct slub_free_list *f;
36646 unsigned long flags;
36647+ LIST_HEAD(tofree);
36648 /*
36649 * partial array is full. Move the existing
36650 * set to the per node partial list.
36651 */
36652 local_irq_save(flags);
36653 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
36654+ f = this_cpu_ptr(&slub_free_list);
36655+ raw_spin_lock(&f->lock);
36656+ list_splice_init(&f->list, &tofree);
36657+ raw_spin_unlock(&f->lock);
36658 local_irq_restore(flags);
36659+ free_delayed(&tofree);
36660 oldpage = NULL;
36661 pobjects = 0;
36662 pages = 0;
b3bbd485 36663@@ -2319,7 +2355,22 @@ static bool has_cpu_slab(int cpu, void *info)
1a6e0f06
JK
36664
36665 static void flush_all(struct kmem_cache *s)
36666 {
36667+ LIST_HEAD(tofree);
36668+ int cpu;
36669+
36670 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
36671+ for_each_online_cpu(cpu) {
36672+ struct slub_free_list *f;
36673+
36674+ if (!has_cpu_slab(cpu, s))
36675+ continue;
36676+
36677+ f = &per_cpu(slub_free_list, cpu);
36678+ raw_spin_lock_irq(&f->lock);
36679+ list_splice_init(&f->list, &tofree);
36680+ raw_spin_unlock_irq(&f->lock);
36681+ free_delayed(&tofree);
36682+ }
36683 }
36684
36685 /*
b3bbd485 36686@@ -2374,10 +2425,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
1a6e0f06
JK
36687 unsigned long x = 0;
36688 struct page *page;
36689
36690- spin_lock_irqsave(&n->list_lock, flags);
36691+ raw_spin_lock_irqsave(&n->list_lock, flags);
36692 list_for_each_entry(page, &n->partial, lru)
36693 x += get_count(page);
36694- spin_unlock_irqrestore(&n->list_lock, flags);
36695+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36696 return x;
36697 }
36698 #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
b3bbd485 36699@@ -2515,8 +2566,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
1a6e0f06
JK
36700 * already disabled (which is the case for bulk allocation).
36701 */
36702 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
36703- unsigned long addr, struct kmem_cache_cpu *c)
36704+ unsigned long addr, struct kmem_cache_cpu *c,
36705+ struct list_head *to_free)
36706 {
36707+ struct slub_free_list *f;
36708 void *freelist;
36709 struct page *page;
36710
b3bbd485 36711@@ -2572,6 +2625,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1a6e0f06
JK
36712 VM_BUG_ON(!c->page->frozen);
36713 c->freelist = get_freepointer(s, freelist);
36714 c->tid = next_tid(c->tid);
36715+
36716+out:
36717+ f = this_cpu_ptr(&slub_free_list);
36718+ raw_spin_lock(&f->lock);
36719+ list_splice_init(&f->list, to_free);
36720+ raw_spin_unlock(&f->lock);
36721+
36722 return freelist;
36723
36724 new_slab:
b3bbd485 36725@@ -2587,7 +2647,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
e4b2b4a8
JK
36726
36727 if (unlikely(!freelist)) {
36728 slab_out_of_memory(s, gfpflags, node);
36729- return NULL;
36730+ goto out;
36731 }
36732
36733 page = c->page;
b3bbd485 36734@@ -2600,7 +2660,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
e4b2b4a8
JK
36735 goto new_slab; /* Slab failed checks. Next slab needed */
36736
36737 deactivate_slab(s, page, get_freepointer(s, freelist), c);
1a6e0f06
JK
36738- return freelist;
36739+ goto out;
36740 }
36741
36742 /*
b3bbd485 36743@@ -2612,6 +2672,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1a6e0f06
JK
36744 {
36745 void *p;
36746 unsigned long flags;
36747+ LIST_HEAD(tofree);
36748
36749 local_irq_save(flags);
36750 #ifdef CONFIG_PREEMPT
b3bbd485 36751@@ -2623,8 +2684,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1a6e0f06
JK
36752 c = this_cpu_ptr(s->cpu_slab);
36753 #endif
36754
36755- p = ___slab_alloc(s, gfpflags, node, addr, c);
36756+ p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
36757 local_irq_restore(flags);
36758+ free_delayed(&tofree);
36759 return p;
36760 }
36761
b3bbd485 36762@@ -2810,7 +2872,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1a6e0f06
JK
36763
36764 do {
36765 if (unlikely(n)) {
36766- spin_unlock_irqrestore(&n->list_lock, flags);
36767+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36768 n = NULL;
36769 }
36770 prior = page->freelist;
b3bbd485 36771@@ -2842,7 +2904,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1a6e0f06
JK
36772 * Otherwise the list_lock will synchronize with
36773 * other processors updating the list of slabs.
36774 */
36775- spin_lock_irqsave(&n->list_lock, flags);
36776+ raw_spin_lock_irqsave(&n->list_lock, flags);
36777
36778 }
36779 }
b3bbd485 36780@@ -2884,7 +2946,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1a6e0f06
JK
36781 add_partial(n, page, DEACTIVATE_TO_TAIL);
36782 stat(s, FREE_ADD_PARTIAL);
36783 }
36784- spin_unlock_irqrestore(&n->list_lock, flags);
36785+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36786 return;
36787
36788 slab_empty:
b3bbd485 36789@@ -2899,7 +2961,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1a6e0f06
JK
36790 remove_full(s, n, page);
36791 }
36792
36793- spin_unlock_irqrestore(&n->list_lock, flags);
36794+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36795 stat(s, FREE_SLAB);
36796 discard_slab(s, page);
36797 }
b3bbd485 36798@@ -3104,6 +3166,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
1a6e0f06
JK
36799 void **p)
36800 {
36801 struct kmem_cache_cpu *c;
36802+ LIST_HEAD(to_free);
36803 int i;
36804
36805 /* memcg and kmem_cache debug support */
b3bbd485 36806@@ -3127,7 +3190,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
1a6e0f06
JK
36807 * of re-populating per CPU c->freelist
36808 */
36809 p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
36810- _RET_IP_, c);
36811+ _RET_IP_, c, &to_free);
36812 if (unlikely(!p[i]))
36813 goto error;
36814
b3bbd485 36815@@ -3139,6 +3202,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
1a6e0f06
JK
36816 }
36817 c->tid = next_tid(c->tid);
36818 local_irq_enable();
36819+ free_delayed(&to_free);
36820
36821 /* Clear memory outside IRQ disabled fastpath loop */
36822 if (unlikely(flags & __GFP_ZERO)) {
b3bbd485 36823@@ -3153,6 +3217,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
e4b2b4a8
JK
36824 return i;
36825 error:
36826 local_irq_enable();
36827+ free_delayed(&to_free);
36828 slab_post_alloc_hook(s, flags, i, p);
36829 __kmem_cache_free_bulk(s, i, p);
36830 return 0;
b3bbd485 36831@@ -3286,7 +3351,7 @@ static void
1a6e0f06
JK
36832 init_kmem_cache_node(struct kmem_cache_node *n)
36833 {
36834 n->nr_partial = 0;
36835- spin_lock_init(&n->list_lock);
36836+ raw_spin_lock_init(&n->list_lock);
36837 INIT_LIST_HEAD(&n->partial);
36838 #ifdef CONFIG_SLUB_DEBUG
36839 atomic_long_set(&n->nr_slabs, 0);
b3bbd485 36840@@ -3640,6 +3705,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
1a6e0f06
JK
36841 const char *text)
36842 {
36843 #ifdef CONFIG_SLUB_DEBUG
36844+#ifdef CONFIG_PREEMPT_RT_BASE
36845+ /* XXX move out of irq-off section */
36846+ slab_err(s, page, text, s->name);
36847+#else
36848 void *addr = page_address(page);
36849 void *p;
36850 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
b3bbd485 36851@@ -3660,6 +3729,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
1a6e0f06
JK
36852 slab_unlock(page);
36853 kfree(map);
36854 #endif
36855+#endif
36856 }
36857
36858 /*
b3bbd485 36859@@ -3673,7 +3743,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
1a6e0f06
JK
36860 struct page *page, *h;
36861
36862 BUG_ON(irqs_disabled());
36863- spin_lock_irq(&n->list_lock);
36864+ raw_spin_lock_irq(&n->list_lock);
36865 list_for_each_entry_safe(page, h, &n->partial, lru) {
36866 if (!page->inuse) {
36867 remove_partial(n, page);
b3bbd485 36868@@ -3683,7 +3753,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
1a6e0f06
JK
36869 "Objects remaining in %s on __kmem_cache_shutdown()");
36870 }
36871 }
36872- spin_unlock_irq(&n->list_lock);
36873+ raw_spin_unlock_irq(&n->list_lock);
36874
36875 list_for_each_entry_safe(page, h, &discard, lru)
36876 discard_slab(s, page);
b3bbd485 36877@@ -3927,7 +3997,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
1a6e0f06
JK
36878 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
36879 INIT_LIST_HEAD(promote + i);
36880
36881- spin_lock_irqsave(&n->list_lock, flags);
36882+ raw_spin_lock_irqsave(&n->list_lock, flags);
36883
36884 /*
36885 * Build lists of slabs to discard or promote.
b3bbd485 36886@@ -3958,7 +4028,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
1a6e0f06
JK
36887 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
36888 list_splice(promote + i, &n->partial);
36889
36890- spin_unlock_irqrestore(&n->list_lock, flags);
36891+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36892
36893 /* Release empty slabs */
36894 list_for_each_entry_safe(page, t, &discard, lru)
b3bbd485 36895@@ -4171,6 +4241,12 @@ void __init kmem_cache_init(void)
1a6e0f06
JK
36896 {
36897 static __initdata struct kmem_cache boot_kmem_cache,
36898 boot_kmem_cache_node;
36899+ int cpu;
36900+
36901+ for_each_possible_cpu(cpu) {
36902+ raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
36903+ INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
36904+ }
36905
36906 if (debug_guardpage_minorder())
36907 slub_max_order = 0;
b3bbd485 36908@@ -4379,7 +4455,7 @@ static int validate_slab_node(struct kmem_cache *s,
1a6e0f06
JK
36909 struct page *page;
36910 unsigned long flags;
36911
36912- spin_lock_irqsave(&n->list_lock, flags);
36913+ raw_spin_lock_irqsave(&n->list_lock, flags);
36914
36915 list_for_each_entry(page, &n->partial, lru) {
36916 validate_slab_slab(s, page, map);
b3bbd485 36917@@ -4401,7 +4477,7 @@ static int validate_slab_node(struct kmem_cache *s,
1a6e0f06
JK
36918 s->name, count, atomic_long_read(&n->nr_slabs));
36919
36920 out:
36921- spin_unlock_irqrestore(&n->list_lock, flags);
36922+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36923 return count;
36924 }
36925
b3bbd485 36926@@ -4589,12 +4665,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
1a6e0f06
JK
36927 if (!atomic_long_read(&n->nr_slabs))
36928 continue;
36929
36930- spin_lock_irqsave(&n->list_lock, flags);
36931+ raw_spin_lock_irqsave(&n->list_lock, flags);
36932 list_for_each_entry(page, &n->partial, lru)
36933 process_slab(&t, s, page, alloc, map);
36934 list_for_each_entry(page, &n->full, lru)
36935 process_slab(&t, s, page, alloc, map);
36936- spin_unlock_irqrestore(&n->list_lock, flags);
36937+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
36938 }
36939
36940 for (i = 0; i < t.count; i++) {
b3bbd485
JK
36941diff --git a/mm/swap.c b/mm/swap.c
36942index a77d68f2c1b6..30d62efe001b 100644
36943--- a/mm/swap.c
36944+++ b/mm/swap.c
1a6e0f06
JK
36945@@ -32,6 +32,7 @@
36946 #include <linux/memcontrol.h>
36947 #include <linux/gfp.h>
36948 #include <linux/uio.h>
36949+#include <linux/locallock.h>
36950 #include <linux/hugetlb.h>
36951 #include <linux/page_idle.h>
36952
b3bbd485 36953@@ -50,6 +51,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
1a6e0f06
JK
36954 #ifdef CONFIG_SMP
36955 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
36956 #endif
36957+static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
36958+DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
36959
36960 /*
36961 * This path almost never happens for VM activity - pages are normally
b3bbd485 36962@@ -252,11 +255,11 @@ void rotate_reclaimable_page(struct page *page)
1a6e0f06
JK
36963 unsigned long flags;
36964
36965 get_page(page);
36966- local_irq_save(flags);
36967+ local_lock_irqsave(rotate_lock, flags);
36968 pvec = this_cpu_ptr(&lru_rotate_pvecs);
36969 if (!pagevec_add(pvec, page) || PageCompound(page))
36970 pagevec_move_tail(pvec);
36971- local_irq_restore(flags);
36972+ local_unlock_irqrestore(rotate_lock, flags);
36973 }
36974 }
36975
36976@@ -306,12 +309,13 @@ void activate_page(struct page *page)
36977 {
36978 page = compound_head(page);
36979 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
36980- struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
36981+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
36982+ activate_page_pvecs);
36983
36984 get_page(page);
36985 if (!pagevec_add(pvec, page) || PageCompound(page))
36986 pagevec_lru_move_fn(pvec, __activate_page, NULL);
36987- put_cpu_var(activate_page_pvecs);
36988+ put_locked_var(swapvec_lock, activate_page_pvecs);
36989 }
36990 }
36991
36992@@ -338,7 +342,7 @@ void activate_page(struct page *page)
36993
36994 static void __lru_cache_activate_page(struct page *page)
36995 {
36996- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
36997+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
36998 int i;
36999
37000 /*
37001@@ -360,7 +364,7 @@ static void __lru_cache_activate_page(struct page *page)
37002 }
37003 }
37004
37005- put_cpu_var(lru_add_pvec);
37006+ put_locked_var(swapvec_lock, lru_add_pvec);
37007 }
37008
37009 /*
37010@@ -402,12 +406,12 @@ EXPORT_SYMBOL(mark_page_accessed);
37011
37012 static void __lru_cache_add(struct page *page)
37013 {
37014- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
37015+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
37016
37017 get_page(page);
37018 if (!pagevec_add(pvec, page) || PageCompound(page))
37019 __pagevec_lru_add(pvec);
37020- put_cpu_var(lru_add_pvec);
37021+ put_locked_var(swapvec_lock, lru_add_pvec);
37022 }
37023
37024 /**
37025@@ -613,9 +617,15 @@ void lru_add_drain_cpu(int cpu)
37026 unsigned long flags;
37027
37028 /* No harm done if a racing interrupt already did this */
37029- local_irq_save(flags);
37030+#ifdef CONFIG_PREEMPT_RT_BASE
37031+ local_lock_irqsave_on(rotate_lock, flags, cpu);
37032 pagevec_move_tail(pvec);
37033- local_irq_restore(flags);
37034+ local_unlock_irqrestore_on(rotate_lock, flags, cpu);
37035+#else
37036+ local_lock_irqsave(rotate_lock, flags);
37037+ pagevec_move_tail(pvec);
37038+ local_unlock_irqrestore(rotate_lock, flags);
37039+#endif
37040 }
37041
37042 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
37043@@ -647,11 +657,12 @@ void deactivate_file_page(struct page *page)
37044 return;
37045
37046 if (likely(get_page_unless_zero(page))) {
37047- struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
37048+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
37049+ lru_deactivate_file_pvecs);
37050
37051 if (!pagevec_add(pvec, page) || PageCompound(page))
37052 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
37053- put_cpu_var(lru_deactivate_file_pvecs);
37054+ put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
37055 }
37056 }
37057
37058@@ -666,21 +677,32 @@ void mark_page_lazyfree(struct page *page)
37059 {
37060 if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
37061 !PageSwapCache(page) && !PageUnevictable(page)) {
37062- struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
37063+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
37064+ lru_lazyfree_pvecs);
37065
37066 get_page(page);
37067 if (!pagevec_add(pvec, page) || PageCompound(page))
37068 pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
37069- put_cpu_var(lru_lazyfree_pvecs);
37070+ put_locked_var(swapvec_lock, lru_lazyfree_pvecs);
37071 }
37072 }
37073
37074 void lru_add_drain(void)
37075 {
37076- lru_add_drain_cpu(get_cpu());
37077- put_cpu();
37078+ lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
37079+ local_unlock_cpu(swapvec_lock);
37080+}
37081+
37082+#ifdef CONFIG_PREEMPT_RT_BASE
37083+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
37084+{
37085+ local_lock_on(swapvec_lock, cpu);
37086+ lru_add_drain_cpu(cpu);
37087+ local_unlock_on(swapvec_lock, cpu);
37088 }
37089
37090+#else
37091+
37092 static void lru_add_drain_per_cpu(struct work_struct *dummy)
37093 {
37094 lru_add_drain();
37095@@ -688,6 +710,16 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
37096
37097 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
37098
37099+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
37100+{
37101+ struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
37102+
37103+ INIT_WORK(work, lru_add_drain_per_cpu);
37104+ queue_work_on(cpu, mm_percpu_wq, work);
37105+ cpumask_set_cpu(cpu, has_work);
37106+}
37107+#endif
37108+
37109 void lru_add_drain_all_cpuslocked(void)
37110 {
37111 static DEFINE_MUTEX(lock);
37112@@ -705,21 +737,19 @@ void lru_add_drain_all_cpuslocked(void)
37113 cpumask_clear(&has_work);
37114
37115 for_each_online_cpu(cpu) {
37116- struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
37117
37118 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
37119 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
37120 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
37121 pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
37122- need_activate_page_drain(cpu)) {
37123- INIT_WORK(work, lru_add_drain_per_cpu);
37124- queue_work_on(cpu, mm_percpu_wq, work);
37125- cpumask_set_cpu(cpu, &has_work);
37126- }
37127+ need_activate_page_drain(cpu))
37128+ remote_lru_add_drain(cpu, &has_work);
37129 }
37130
37131+#ifndef CONFIG_PREEMPT_RT_BASE
37132 for_each_cpu(cpu, &has_work)
37133 flush_work(&per_cpu(lru_add_drain_work, cpu));
37134+#endif
37135
37136 mutex_unlock(&lock);
37137 }
37138diff --git a/mm/truncate.c b/mm/truncate.c
37139index 2330223841fb..d0c8e6c8fef5 100644
37140--- a/mm/truncate.c
37141+++ b/mm/truncate.c
37142@@ -41,8 +41,10 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
37143 goto unlock;
37144 if (*slot != entry)
37145 goto unlock;
37146+ local_lock(shadow_nodes_lock);
37147 __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
37148- workingset_update_node, mapping);
37149+ __workingset_update_node, mapping);
37150+ local_unlock(shadow_nodes_lock);
37151 mapping->nrexceptional--;
37152 unlock:
37153 spin_unlock_irq(&mapping->tree_lock);
37154diff --git a/mm/vmalloc.c b/mm/vmalloc.c
37155index 9ff21a12ea00..95c83b291548 100644
37156--- a/mm/vmalloc.c
37157+++ b/mm/vmalloc.c
37158@@ -865,7 +865,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
37159 struct vmap_block *vb;
37160 struct vmap_area *va;
37161 unsigned long vb_idx;
37162- int node, err;
37163+ int node, err, cpu;
37164 void *vaddr;
37165
37166 node = numa_node_id();
37167@@ -908,11 +908,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
37168 BUG_ON(err);
37169 radix_tree_preload_end();
37170
37171- vbq = &get_cpu_var(vmap_block_queue);
37172+ cpu = get_cpu_light();
37173+ vbq = this_cpu_ptr(&vmap_block_queue);
37174 spin_lock(&vbq->lock);
37175 list_add_tail_rcu(&vb->free_list, &vbq->free);
37176 spin_unlock(&vbq->lock);
37177- put_cpu_var(vmap_block_queue);
37178+ put_cpu_light();
37179
37180 return vaddr;
37181 }
37182@@ -981,6 +982,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
37183 struct vmap_block *vb;
37184 void *vaddr = NULL;
37185 unsigned int order;
37186+ int cpu;
37187
37188 BUG_ON(offset_in_page(size));
37189 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
37190@@ -995,7 +997,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
37191 order = get_order(size);
37192
37193 rcu_read_lock();
37194- vbq = &get_cpu_var(vmap_block_queue);
37195+ cpu = get_cpu_light();
37196+ vbq = this_cpu_ptr(&vmap_block_queue);
37197 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
37198 unsigned long pages_off;
37199
37200@@ -1018,7 +1021,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
37201 break;
37202 }
37203
37204- put_cpu_var(vmap_block_queue);
37205+ put_cpu_light();
37206 rcu_read_unlock();
37207
37208 /* Allocate new block if nothing was found */
37209diff --git a/mm/vmstat.c b/mm/vmstat.c
37210index 527ae727d547..ae6446b054d3 100644
37211--- a/mm/vmstat.c
37212+++ b/mm/vmstat.c
37213@@ -249,6 +249,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
37214 long x;
37215 long t;
37216
37217+ preempt_disable_rt();
37218 x = delta + __this_cpu_read(*p);
37219
37220 t = __this_cpu_read(pcp->stat_threshold);
37221@@ -258,6 +259,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
37222 x = 0;
37223 }
37224 __this_cpu_write(*p, x);
37225+ preempt_enable_rt();
37226 }
37227 EXPORT_SYMBOL(__mod_zone_page_state);
37228
37229@@ -269,6 +271,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
37230 long x;
37231 long t;
37232
37233+ preempt_disable_rt();
37234 x = delta + __this_cpu_read(*p);
37235
37236 t = __this_cpu_read(pcp->stat_threshold);
37237@@ -278,6 +281,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
37238 x = 0;
37239 }
37240 __this_cpu_write(*p, x);
37241+ preempt_enable_rt();
37242 }
37243 EXPORT_SYMBOL(__mod_node_page_state);
37244
37245@@ -310,6 +314,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
37246 s8 __percpu *p = pcp->vm_stat_diff + item;
37247 s8 v, t;
37248
37249+ preempt_disable_rt();
37250 v = __this_cpu_inc_return(*p);
37251 t = __this_cpu_read(pcp->stat_threshold);
37252 if (unlikely(v > t)) {
37253@@ -318,6 +323,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
37254 zone_page_state_add(v + overstep, zone, item);
37255 __this_cpu_write(*p, -overstep);
37256 }
37257+ preempt_enable_rt();
37258 }
37259
37260 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37261@@ -326,6 +332,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37262 s8 __percpu *p = pcp->vm_node_stat_diff + item;
37263 s8 v, t;
37264
37265+ preempt_disable_rt();
37266 v = __this_cpu_inc_return(*p);
37267 t = __this_cpu_read(pcp->stat_threshold);
37268 if (unlikely(v > t)) {
37269@@ -334,6 +341,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37270 node_page_state_add(v + overstep, pgdat, item);
37271 __this_cpu_write(*p, -overstep);
37272 }
37273+ preempt_enable_rt();
37274 }
37275
37276 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
37277@@ -354,6 +362,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
37278 s8 __percpu *p = pcp->vm_stat_diff + item;
37279 s8 v, t;
37280
37281+ preempt_disable_rt();
37282 v = __this_cpu_dec_return(*p);
37283 t = __this_cpu_read(pcp->stat_threshold);
37284 if (unlikely(v < - t)) {
37285@@ -362,6 +371,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
37286 zone_page_state_add(v - overstep, zone, item);
37287 __this_cpu_write(*p, overstep);
37288 }
37289+ preempt_enable_rt();
37290 }
37291
37292 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37293@@ -370,6 +380,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37294 s8 __percpu *p = pcp->vm_node_stat_diff + item;
37295 s8 v, t;
37296
37297+ preempt_disable_rt();
37298 v = __this_cpu_dec_return(*p);
37299 t = __this_cpu_read(pcp->stat_threshold);
37300 if (unlikely(v < - t)) {
37301@@ -378,6 +389,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
37302 node_page_state_add(v - overstep, pgdat, item);
37303 __this_cpu_write(*p, overstep);
37304 }
37305+ preempt_enable_rt();
37306 }
37307
37308 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
37309diff --git a/mm/workingset.c b/mm/workingset.c
37310index b997c9de28f6..e252cc69a3d4 100644
37311--- a/mm/workingset.c
37312+++ b/mm/workingset.c
37313@@ -338,9 +338,10 @@ void workingset_activation(struct page *page)
37314 * point where they would still be useful.
37315 */
37316
37317-static struct list_lru shadow_nodes;
37318+static struct list_lru __shadow_nodes;
37319+DEFINE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
37320
37321-void workingset_update_node(struct radix_tree_node *node, void *private)
37322+void __workingset_update_node(struct radix_tree_node *node, void *private)
37323 {
37324 struct address_space *mapping = private;
37325
37326@@ -358,10 +359,10 @@ void workingset_update_node(struct radix_tree_node *node, void *private)
37327 */
37328 if (node->count && node->count == node->exceptional) {
37329 if (list_empty(&node->private_list))
37330- list_lru_add(&shadow_nodes, &node->private_list);
37331+ list_lru_add(&__shadow_nodes, &node->private_list);
37332 } else {
37333 if (!list_empty(&node->private_list))
37334- list_lru_del(&shadow_nodes, &node->private_list);
37335+ list_lru_del(&__shadow_nodes, &node->private_list);
37336 }
37337 }
37338
37339@@ -373,9 +374,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
37340 unsigned long cache;
37341
37342 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
37343- local_irq_disable();
37344- nodes = list_lru_shrink_count(&shadow_nodes, sc);
37345- local_irq_enable();
37346+ local_lock_irq(shadow_nodes_lock);
37347+ nodes = list_lru_shrink_count(&__shadow_nodes, sc);
37348+ local_unlock_irq(shadow_nodes_lock);
37349
37350 /*
37351 * Approximate a reasonable limit for the radix tree nodes
37352@@ -475,15 +476,15 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
37353 goto out_invalid;
37354 inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
37355 __radix_tree_delete_node(&mapping->page_tree, node,
37356- workingset_update_node, mapping);
37357+ __workingset_update_node, mapping);
37358
37359 out_invalid:
37360 spin_unlock(&mapping->tree_lock);
37361 ret = LRU_REMOVED_RETRY;
37362 out:
37363- local_irq_enable();
37364+ local_unlock_irq(shadow_nodes_lock);
37365 cond_resched();
37366- local_irq_disable();
37367+ local_lock_irq(shadow_nodes_lock);
37368 spin_lock(lru_lock);
37369 return ret;
37370 }
37371@@ -494,9 +495,9 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
37372 unsigned long ret;
37373
37374 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
37375- local_irq_disable();
37376- ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
37377- local_irq_enable();
37378+ local_lock_irq(shadow_nodes_lock);
37379+ ret = list_lru_shrink_walk(&__shadow_nodes, sc, shadow_lru_isolate, NULL);
37380+ local_unlock_irq(shadow_nodes_lock);
37381 return ret;
37382 }
37383
37384@@ -534,7 +535,7 @@ static int __init workingset_init(void)
37385 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
37386 timestamp_bits, max_order, bucket_order);
37387
37388- ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
37389+ ret = __list_lru_init(&__shadow_nodes, true, &shadow_nodes_key);
37390 if (ret)
37391 goto err;
37392 ret = register_shrinker(&workingset_shadow_shrinker);
37393@@ -542,7 +543,7 @@ static int __init workingset_init(void)
37394 goto err_list_lru;
37395 return 0;
37396 err_list_lru:
37397- list_lru_destroy(&shadow_nodes);
37398+ list_lru_destroy(&__shadow_nodes);
37399 err:
37400 return ret;
37401 }
37402diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
37403index 685049a9048d..8d1489fd1dbc 100644
37404--- a/mm/zsmalloc.c
37405+++ b/mm/zsmalloc.c
37406@@ -53,6 +53,7 @@
37407 #include <linux/mount.h>
37408 #include <linux/migrate.h>
37409 #include <linux/pagemap.h>
37410+#include <linux/locallock.h>
37411
37412 #define ZSPAGE_MAGIC 0x58
37413
37414@@ -70,9 +71,22 @@
37415 */
37416 #define ZS_MAX_ZSPAGE_ORDER 2
37417 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
37418-
37419 #define ZS_HANDLE_SIZE (sizeof(unsigned long))
37420
37421+#ifdef CONFIG_PREEMPT_RT_FULL
37422+
37423+struct zsmalloc_handle {
37424+ unsigned long addr;
37425+ struct mutex lock;
37426+};
37427+
37428+#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
37429+
37430+#else
37431+
37432+#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
37433+#endif
37434+
37435 /*
37436 * Object location (<PFN>, <obj_idx>) is encoded as
37437 * as single (unsigned long) handle value.
37438@@ -320,7 +334,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
37439
37440 static int create_cache(struct zs_pool *pool)
37441 {
37442- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
37443+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
37444 0, 0, NULL);
37445 if (!pool->handle_cachep)
37446 return 1;
37447@@ -344,10 +358,27 @@ static void destroy_cache(struct zs_pool *pool)
37448
37449 static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
37450 {
37451- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
37452- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
37453+ void *p;
37454+
37455+ p = kmem_cache_alloc(pool->handle_cachep,
37456+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
37457+#ifdef CONFIG_PREEMPT_RT_FULL
37458+ if (p) {
37459+ struct zsmalloc_handle *zh = p;
37460+
37461+ mutex_init(&zh->lock);
37462+ }
37463+#endif
37464+ return (unsigned long)p;
37465 }
37466
37467+#ifdef CONFIG_PREEMPT_RT_FULL
37468+static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
37469+{
37470+ return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
37471+}
37472+#endif
37473+
37474 static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
37475 {
37476 kmem_cache_free(pool->handle_cachep, (void *)handle);
37477@@ -366,12 +397,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
37478
37479 static void record_obj(unsigned long handle, unsigned long obj)
37480 {
37481+#ifdef CONFIG_PREEMPT_RT_FULL
37482+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37483+
37484+ WRITE_ONCE(zh->addr, obj);
37485+#else
37486 /*
37487 * lsb of @obj represents handle lock while other bits
37488 * represent object value the handle is pointing so
37489 * updating shouldn't do store tearing.
37490 */
37491 WRITE_ONCE(*(unsigned long *)handle, obj);
37492+#endif
37493 }
37494
37495 /* zpool driver */
37496@@ -460,6 +497,7 @@ MODULE_ALIAS("zpool-zsmalloc");
37497
37498 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
37499 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
37500+static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
37501
37502 static bool is_zspage_isolated(struct zspage *zspage)
37503 {
37504@@ -898,7 +936,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
37505
37506 static unsigned long handle_to_obj(unsigned long handle)
37507 {
37508+#ifdef CONFIG_PREEMPT_RT_FULL
37509+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37510+
37511+ return zh->addr;
37512+#else
37513 return *(unsigned long *)handle;
37514+#endif
37515 }
37516
37517 static unsigned long obj_to_head(struct page *page, void *obj)
37518@@ -912,22 +956,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
37519
37520 static inline int testpin_tag(unsigned long handle)
37521 {
37522+#ifdef CONFIG_PREEMPT_RT_FULL
37523+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37524+
37525+ return mutex_is_locked(&zh->lock);
37526+#else
37527 return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
37528+#endif
37529 }
37530
37531 static inline int trypin_tag(unsigned long handle)
37532 {
37533+#ifdef CONFIG_PREEMPT_RT_FULL
37534+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37535+
37536+ return mutex_trylock(&zh->lock);
37537+#else
37538 return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
37539+#endif
37540 }
37541
37542 static void pin_tag(unsigned long handle)
37543 {
37544+#ifdef CONFIG_PREEMPT_RT_FULL
37545+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37546+
37547+ return mutex_lock(&zh->lock);
37548+#else
37549 bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
37550+#endif
37551 }
37552
37553 static void unpin_tag(unsigned long handle)
37554 {
37555+#ifdef CONFIG_PREEMPT_RT_FULL
37556+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
37557+
37558+ return mutex_unlock(&zh->lock);
37559+#else
37560 bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
37561+#endif
37562 }
37563
37564 static void reset_page(struct page *page)
37565@@ -1365,7 +1433,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
37566 class = pool->size_class[class_idx];
37567 off = (class->size * obj_idx) & ~PAGE_MASK;
37568
37569- area = &get_cpu_var(zs_map_area);
37570+ area = &get_locked_var(zs_map_area_lock, zs_map_area);
37571 area->vm_mm = mm;
37572 if (off + class->size <= PAGE_SIZE) {
37573 /* this object is contained entirely within a page */
37574@@ -1419,7 +1487,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
37575
37576 __zs_unmap_object(area, pages, off, class->size);
37577 }
37578- put_cpu_var(zs_map_area);
37579+ put_locked_var(zs_map_area_lock, zs_map_area);
37580
37581 migrate_read_unlock(zspage);
37582 unpin_tag(handle);
37583diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
37584index c10bdf63eae7..84a49f2bcfbc 100644
37585--- a/net/9p/trans_xen.c
37586+++ b/net/9p/trans_xen.c
37587@@ -38,7 +38,6 @@
37588
37589 #include <linux/module.h>
37590 #include <linux/spinlock.h>
37591-#include <linux/rwlock.h>
37592 #include <net/9p/9p.h>
37593 #include <net/9p/client.h>
37594 #include <net/9p/transport.h>
37595diff --git a/net/Kconfig b/net/Kconfig
37596index 9dba2715919d..9c7b38379c09 100644
37597--- a/net/Kconfig
37598+++ b/net/Kconfig
37599@@ -272,7 +272,7 @@ config CGROUP_NET_CLASSID
37600
37601 config NET_RX_BUSY_POLL
37602 bool
37603- default y
37604+ default y if !PREEMPT_RT_FULL
37605
37606 config BQL
37607 bool
37608diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
37609index 65d734c165bd..923e9a271872 100644
37610--- a/net/bluetooth/hci_sock.c
37611+++ b/net/bluetooth/hci_sock.c
37612@@ -251,15 +251,13 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
37613 }
37614
37615 /* Send frame to sockets with specific channel */
37616-void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37617- int flag, struct sock *skip_sk)
37618+static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37619+ int flag, struct sock *skip_sk)
37620 {
37621 struct sock *sk;
37622
37623 BT_DBG("channel %u len %d", channel, skb->len);
37624
37625- read_lock(&hci_sk_list.lock);
37626-
37627 sk_for_each(sk, &hci_sk_list.head) {
37628 struct sk_buff *nskb;
37629
37630@@ -285,6 +283,13 @@ void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37631 kfree_skb(nskb);
37632 }
37633
37634+}
37635+
37636+void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
37637+ int flag, struct sock *skip_sk)
37638+{
37639+ read_lock(&hci_sk_list.lock);
37640+ __hci_send_to_channel(channel, skb, flag, skip_sk);
37641 read_unlock(&hci_sk_list.lock);
37642 }
37643
37644@@ -388,8 +393,8 @@ void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
37645 hdr->index = index;
37646 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
37647
37648- hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
37649- HCI_SOCK_TRUSTED, NULL);
37650+ __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
37651+ HCI_SOCK_TRUSTED, NULL);
37652 kfree_skb(skb);
37653 }
37654
37655diff --git a/net/can/bcm.c b/net/can/bcm.c
37656index 13690334efa3..9cc67ac257f1 100644
37657--- a/net/can/bcm.c
37658+++ b/net/can/bcm.c
37659@@ -102,7 +102,6 @@ struct bcm_op {
37660 unsigned long frames_abs, frames_filtered;
37661 struct bcm_timeval ival1, ival2;
37662 struct hrtimer timer, thrtimer;
37663- struct tasklet_struct tsklet, thrtsklet;
37664 ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
37665 int rx_ifindex;
37666 int cfsiz;
37667@@ -364,25 +363,34 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
37668 }
37669 }
37670
37671-static void bcm_tx_start_timer(struct bcm_op *op)
37672+static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt)
37673 {
37674+ ktime_t ival;
37675+
37676 if (op->kt_ival1 && op->count)
37677- hrtimer_start(&op->timer,
37678- ktime_add(ktime_get(), op->kt_ival1),
37679- HRTIMER_MODE_ABS);
37680+ ival = op->kt_ival1;
37681 else if (op->kt_ival2)
37682- hrtimer_start(&op->timer,
37683- ktime_add(ktime_get(), op->kt_ival2),
37684- HRTIMER_MODE_ABS);
37685+ ival = op->kt_ival2;
37686+ else
37687+ return false;
37688+
37689+ hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival));
37690+ return true;
37691 }
37692
37693-static void bcm_tx_timeout_tsklet(unsigned long data)
37694+static void bcm_tx_start_timer(struct bcm_op *op)
37695 {
37696- struct bcm_op *op = (struct bcm_op *)data;
37697+ if (bcm_tx_set_expiry(op, &op->timer))
37698+ hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT);
37699+}
37700+
37701+/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */
37702+static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
37703+{
37704+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37705 struct bcm_msg_head msg_head;
37706
37707 if (op->kt_ival1 && (op->count > 0)) {
37708-
37709 op->count--;
37710 if (!op->count && (op->flags & TX_COUNTEVT)) {
37711
37712@@ -399,22 +407,12 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
37713 }
37714 bcm_can_tx(op);
37715
37716- } else if (op->kt_ival2)
37717+ } else if (op->kt_ival2) {
37718 bcm_can_tx(op);
37719+ }
37720
37721- bcm_tx_start_timer(op);
37722-}
37723-
37724-/*
37725- * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions
37726- */
37727-static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
37728-{
37729- struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37730-
37731- tasklet_schedule(&op->tsklet);
37732-
37733- return HRTIMER_NORESTART;
37734+ return bcm_tx_set_expiry(op, &op->timer) ?
37735+ HRTIMER_RESTART : HRTIMER_NORESTART;
37736 }
37737
37738 /*
37739@@ -480,7 +478,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op,
37740 /* do not send the saved data - only start throttle timer */
37741 hrtimer_start(&op->thrtimer,
37742 ktime_add(op->kt_lastmsg, op->kt_ival2),
37743- HRTIMER_MODE_ABS);
37744+ HRTIMER_MODE_ABS_SOFT);
37745 return;
37746 }
37747
37748@@ -539,14 +537,21 @@ static void bcm_rx_starttimer(struct bcm_op *op)
37749 return;
37750
37751 if (op->kt_ival1)
37752- hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
37753+ hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT);
37754 }
37755
37756-static void bcm_rx_timeout_tsklet(unsigned long data)
37757+/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */
37758+static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
37759 {
37760- struct bcm_op *op = (struct bcm_op *)data;
37761+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37762 struct bcm_msg_head msg_head;
37763
37764+ /* if user wants to be informed, when cyclic CAN-Messages come back */
37765+ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
37766+ /* clear received CAN frames to indicate 'nothing received' */
37767+ memset(op->last_frames, 0, op->nframes * op->cfsiz);
37768+ }
37769+
37770 /* create notification to user */
37771 msg_head.opcode = RX_TIMEOUT;
37772 msg_head.flags = op->flags;
37773@@ -557,25 +562,6 @@ static void bcm_rx_timeout_tsklet(unsigned long data)
37774 msg_head.nframes = 0;
37775
37776 bcm_send_to_user(op, &msg_head, NULL, 0);
37777-}
37778-
37779-/*
37780- * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out
37781- */
37782-static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
37783-{
37784- struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
37785-
37786- /* schedule before NET_RX_SOFTIRQ */
37787- tasklet_hi_schedule(&op->tsklet);
37788-
37789- /* no restart of the timer is done here! */
37790-
37791- /* if user wants to be informed, when cyclic CAN-Messages come back */
37792- if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
37793- /* clear received CAN frames to indicate 'nothing received' */
37794- memset(op->last_frames, 0, op->nframes * op->cfsiz);
37795- }
37796
37797 return HRTIMER_NORESTART;
37798 }
37799@@ -583,14 +569,12 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
37800 /*
37801 * bcm_rx_do_flush - helper for bcm_rx_thr_flush
37802 */
37803-static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
37804- unsigned int index)
37805+static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index)
37806 {
37807 struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
37808
37809 if ((op->last_frames) && (lcf->flags & RX_THR)) {
37810- if (update)
37811- bcm_rx_changed(op, lcf);
37812+ bcm_rx_changed(op, lcf);
37813 return 1;
37814 }
37815 return 0;
37816@@ -598,11 +582,8 @@ static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
37817
37818 /*
37819 * bcm_rx_thr_flush - Check for throttled data and send it to the userspace
37820- *
37821- * update == 0 : just check if throttled data is available (any irq context)
37822- * update == 1 : check and send throttled data to userspace (soft_irq context)
37823 */
37824-static int bcm_rx_thr_flush(struct bcm_op *op, int update)
37825+static int bcm_rx_thr_flush(struct bcm_op *op)
37826 {
37827 int updated = 0;
37828
37829@@ -611,24 +592,16 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update)
37830
37831 /* for MUX filter we start at index 1 */
37832 for (i = 1; i < op->nframes; i++)
37833- updated += bcm_rx_do_flush(op, update, i);
37834+ updated += bcm_rx_do_flush(op, i);
37835
37836 } else {
37837 /* for RX_FILTER_ID and simple filter */
37838- updated += bcm_rx_do_flush(op, update, 0);
37839+ updated += bcm_rx_do_flush(op, 0);
37840 }
37841
37842 return updated;
37843 }
37844
37845-static void bcm_rx_thr_tsklet(unsigned long data)
37846-{
37847- struct bcm_op *op = (struct bcm_op *)data;
37848-
37849- /* push the changed data to the userspace */
37850- bcm_rx_thr_flush(op, 1);
37851-}
37852-
37853 /*
37854 * bcm_rx_thr_handler - the time for blocked content updates is over now:
37855 * Check for throttled data and send it to the userspace
37856@@ -637,9 +610,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
37857 {
37858 struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);
37859
37860- tasklet_schedule(&op->thrtsklet);
37861-
37862- if (bcm_rx_thr_flush(op, 0)) {
37863+ if (bcm_rx_thr_flush(op)) {
37864 hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2);
37865 return HRTIMER_RESTART;
37866 } else {
37867@@ -735,23 +706,8 @@ static struct bcm_op *bcm_find_op(struct list_head *ops,
37868
37869 static void bcm_remove_op(struct bcm_op *op)
37870 {
37871- if (op->tsklet.func) {
37872- while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
37873- test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
37874- hrtimer_active(&op->timer)) {
37875- hrtimer_cancel(&op->timer);
37876- tasklet_kill(&op->tsklet);
37877- }
37878- }
37879-
37880- if (op->thrtsklet.func) {
37881- while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
37882- test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
37883- hrtimer_active(&op->thrtimer)) {
37884- hrtimer_cancel(&op->thrtimer);
37885- tasklet_kill(&op->thrtsklet);
37886- }
37887- }
37888+ hrtimer_cancel(&op->timer);
37889+ hrtimer_cancel(&op->thrtimer);
37890
37891 if ((op->frames) && (op->frames != &op->sframe))
37892 kfree(op->frames);
37893@@ -979,15 +935,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
37894 op->ifindex = ifindex;
37895
37896 /* initialize uninitialized (kzalloc) structure */
37897- hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37898+ hrtimer_init(&op->timer, CLOCK_MONOTONIC,
37899+ HRTIMER_MODE_REL_SOFT);
37900 op->timer.function = bcm_tx_timeout_handler;
37901
37902- /* initialize tasklet for tx countevent notification */
37903- tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet,
37904- (unsigned long) op);
37905-
37906 /* currently unused in tx_ops */
37907- hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37908+ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
37909+ HRTIMER_MODE_REL_SOFT);
37910
37911 /* add this bcm_op to the list of the tx_ops */
37912 list_add(&op->list, &bo->tx_ops);
37913@@ -1150,20 +1104,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
37914 op->rx_ifindex = ifindex;
37915
37916 /* initialize uninitialized (kzalloc) structure */
37917- hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37918+ hrtimer_init(&op->timer, CLOCK_MONOTONIC,
37919+ HRTIMER_MODE_REL_SOFT);
37920 op->timer.function = bcm_rx_timeout_handler;
37921
37922- /* initialize tasklet for rx timeout notification */
37923- tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet,
37924- (unsigned long) op);
37925-
37926- hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
37927+ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
37928+ HRTIMER_MODE_REL_SOFT);
37929 op->thrtimer.function = bcm_rx_thr_handler;
37930
37931- /* initialize tasklet for rx throttle handling */
37932- tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet,
37933- (unsigned long) op);
37934-
37935 /* add this bcm_op to the list of the rx_ops */
37936 list_add(&op->list, &bo->rx_ops);
37937
37938@@ -1209,12 +1157,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
37939 */
37940 op->kt_lastmsg = 0;
37941 hrtimer_cancel(&op->thrtimer);
37942- bcm_rx_thr_flush(op, 1);
37943+ bcm_rx_thr_flush(op);
37944 }
37945
37946 if ((op->flags & STARTTIMER) && op->kt_ival1)
37947 hrtimer_start(&op->timer, op->kt_ival1,
37948- HRTIMER_MODE_REL);
37949+ HRTIMER_MODE_REL_SOFT);
37950 }
37951
37952 /* now we can register for can_ids, if we added a new bcm_op */
37953diff --git a/net/core/dev.c b/net/core/dev.c
37954index e8a66ad6d07c..fa9642bb0482 100644
37955--- a/net/core/dev.c
37956+++ b/net/core/dev.c
37957@@ -195,6 +195,7 @@ static unsigned int napi_gen_id = NR_CPUS;
37958 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
37959
37960 static seqcount_t devnet_rename_seq;
37961+static DEFINE_MUTEX(devnet_rename_mutex);
37962
37963 static inline void dev_base_seq_inc(struct net *net)
37964 {
37965@@ -217,14 +218,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
37966 static inline void rps_lock(struct softnet_data *sd)
37967 {
37968 #ifdef CONFIG_RPS
37969- spin_lock(&sd->input_pkt_queue.lock);
37970+ raw_spin_lock(&sd->input_pkt_queue.raw_lock);
37971 #endif
37972 }
37973
37974 static inline void rps_unlock(struct softnet_data *sd)
37975 {
37976 #ifdef CONFIG_RPS
37977- spin_unlock(&sd->input_pkt_queue.lock);
37978+ raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
37979 #endif
37980 }
37981
37982@@ -920,7 +921,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
37983 strcpy(name, dev->name);
37984 rcu_read_unlock();
37985 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
37986- cond_resched();
37987+ mutex_lock(&devnet_rename_mutex);
37988+ mutex_unlock(&devnet_rename_mutex);
37989 goto retry;
37990 }
37991
37992@@ -1189,20 +1191,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
37993 if (dev->flags & IFF_UP)
37994 return -EBUSY;
37995
37996- write_seqcount_begin(&devnet_rename_seq);
37997+ mutex_lock(&devnet_rename_mutex);
37998+ __raw_write_seqcount_begin(&devnet_rename_seq);
37999
38000- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
38001- write_seqcount_end(&devnet_rename_seq);
38002- return 0;
38003- }
38004+ if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
38005+ goto outunlock;
38006
38007 memcpy(oldname, dev->name, IFNAMSIZ);
38008
38009 err = dev_get_valid_name(net, dev, newname);
38010- if (err < 0) {
38011- write_seqcount_end(&devnet_rename_seq);
38012- return err;
38013- }
38014+ if (err < 0)
38015+ goto outunlock;
38016
38017 if (oldname[0] && !strchr(oldname, '%'))
38018 netdev_info(dev, "renamed from %s\n", oldname);
38019@@ -1215,11 +1214,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
38020 if (ret) {
38021 memcpy(dev->name, oldname, IFNAMSIZ);
38022 dev->name_assign_type = old_assign_type;
38023- write_seqcount_end(&devnet_rename_seq);
38024- return ret;
38025+ err = ret;
38026+ goto outunlock;
38027 }
38028
38029- write_seqcount_end(&devnet_rename_seq);
38030+ __raw_write_seqcount_end(&devnet_rename_seq);
38031+ mutex_unlock(&devnet_rename_mutex);
38032
38033 netdev_adjacent_rename_links(dev, oldname);
38034
38035@@ -1240,7 +1240,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
38036 /* err >= 0 after dev_alloc_name() or stores the first errno */
38037 if (err >= 0) {
38038 err = ret;
38039- write_seqcount_begin(&devnet_rename_seq);
38040+ mutex_lock(&devnet_rename_mutex);
38041+ __raw_write_seqcount_begin(&devnet_rename_seq);
38042 memcpy(dev->name, oldname, IFNAMSIZ);
38043 memcpy(oldname, newname, IFNAMSIZ);
38044 dev->name_assign_type = old_assign_type;
38045@@ -1253,6 +1254,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
38046 }
38047
38048 return err;
38049+
38050+outunlock:
38051+ __raw_write_seqcount_end(&devnet_rename_seq);
38052+ mutex_unlock(&devnet_rename_mutex);
38053+ return err;
38054 }
38055
38056 /**
38057@@ -2460,6 +2466,7 @@ static void __netif_reschedule(struct Qdisc *q)
38058 sd->output_queue_tailp = &q->next_sched;
38059 raise_softirq_irqoff(NET_TX_SOFTIRQ);
38060 local_irq_restore(flags);
38061+ preempt_check_resched_rt();
38062 }
38063
38064 void __netif_schedule(struct Qdisc *q)
38065@@ -2522,6 +2529,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
38066 __this_cpu_write(softnet_data.completion_queue, skb);
38067 raise_softirq_irqoff(NET_TX_SOFTIRQ);
38068 local_irq_restore(flags);
38069+ preempt_check_resched_rt();
38070 }
38071 EXPORT_SYMBOL(__dev_kfree_skb_irq);
38072
38073@@ -3197,7 +3205,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
38074 * This permits qdisc->running owner to get the lock more
38075 * often and dequeue packets faster.
38076 */
38077+#ifdef CONFIG_PREEMPT_RT_FULL
38078+ contended = true;
38079+#else
38080 contended = qdisc_is_running(q);
38081+#endif
38082 if (unlikely(contended))
38083 spin_lock(&q->busylock);
38084
38085@@ -3268,8 +3280,10 @@ static void skb_update_prio(struct sk_buff *skb)
38086 #define skb_update_prio(skb)
38087 #endif
38088
38089+#ifndef CONFIG_PREEMPT_RT_FULL
38090 DEFINE_PER_CPU(int, xmit_recursion);
38091 EXPORT_SYMBOL(xmit_recursion);
38092+#endif
38093
38094 /**
38095 * dev_loopback_xmit - loop back @skb
38096@@ -3509,9 +3523,12 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
38097 if (dev->flags & IFF_UP) {
38098 int cpu = smp_processor_id(); /* ok because BHs are off */
38099
38100+#ifdef CONFIG_PREEMPT_RT_FULL
38101+ if (txq->xmit_lock_owner != current) {
38102+#else
38103 if (txq->xmit_lock_owner != cpu) {
38104- if (unlikely(__this_cpu_read(xmit_recursion) >
38105- XMIT_RECURSION_LIMIT))
38106+#endif
38107+ if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
38108 goto recursion_alert;
38109
38110 skb = validate_xmit_skb(skb, dev);
38111@@ -3521,9 +3538,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
38112 HARD_TX_LOCK(dev, txq, cpu);
38113
38114 if (!netif_xmit_stopped(txq)) {
38115- __this_cpu_inc(xmit_recursion);
38116+ xmit_rec_inc();
38117 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
38118- __this_cpu_dec(xmit_recursion);
38119+ xmit_rec_dec();
38120 if (dev_xmit_complete(rc)) {
38121 HARD_TX_UNLOCK(dev, txq);
38122 goto out;
38123@@ -3904,6 +3921,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
38124 rps_unlock(sd);
38125
38126 local_irq_restore(flags);
38127+ preempt_check_resched_rt();
38128
38129 atomic_long_inc(&skb->dev->rx_dropped);
38130 kfree_skb(skb);
38131@@ -4056,7 +4074,7 @@ static int netif_rx_internal(struct sk_buff *skb)
38132 struct rps_dev_flow voidflow, *rflow = &voidflow;
38133 int cpu;
38134
38135- preempt_disable();
38136+ migrate_disable();
38137 rcu_read_lock();
38138
38139 cpu = get_rps_cpu(skb->dev, skb, &rflow);
38140@@ -4066,14 +4084,14 @@ static int netif_rx_internal(struct sk_buff *skb)
38141 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
38142
38143 rcu_read_unlock();
38144- preempt_enable();
38145+ migrate_enable();
38146 } else
38147 #endif
38148 {
38149 unsigned int qtail;
38150
38151- ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
38152- put_cpu();
38153+ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
38154+ put_cpu_light();
38155 }
38156 return ret;
38157 }
38158@@ -4107,11 +4125,9 @@ int netif_rx_ni(struct sk_buff *skb)
38159
38160 trace_netif_rx_ni_entry(skb);
38161
38162- preempt_disable();
38163+ local_bh_disable();
38164 err = netif_rx_internal(skb);
38165- if (local_softirq_pending())
38166- do_softirq();
38167- preempt_enable();
38168+ local_bh_enable();
38169
38170 return err;
38171 }
38172@@ -4629,7 +4645,7 @@ static void flush_backlog(struct work_struct *work)
38173 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
38174 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
38175 __skb_unlink(skb, &sd->input_pkt_queue);
38176- kfree_skb(skb);
38177+ __skb_queue_tail(&sd->tofree_queue, skb);
38178 input_queue_head_incr(sd);
38179 }
38180 }
38181@@ -4639,11 +4655,14 @@ static void flush_backlog(struct work_struct *work)
38182 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
38183 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
38184 __skb_unlink(skb, &sd->process_queue);
38185- kfree_skb(skb);
38186+ __skb_queue_tail(&sd->tofree_queue, skb);
38187 input_queue_head_incr(sd);
38188 }
38189 }
38190+ if (!skb_queue_empty(&sd->tofree_queue))
38191+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
38192 local_bh_enable();
38193+
38194 }
38195
38196 static void flush_all_backlogs(void)
38197@@ -5153,12 +5172,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
38198 sd->rps_ipi_list = NULL;
38199
38200 local_irq_enable();
38201+ preempt_check_resched_rt();
38202
38203 /* Send pending IPI's to kick RPS processing on remote cpus. */
38204 net_rps_send_ipi(remsd);
38205 } else
38206 #endif
38207 local_irq_enable();
38208+ preempt_check_resched_rt();
38209 }
38210
38211 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
38212@@ -5188,7 +5209,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
38213 while (again) {
38214 struct sk_buff *skb;
38215
38216+ local_irq_disable();
38217 while ((skb = __skb_dequeue(&sd->process_queue))) {
38218+ local_irq_enable();
38219 rcu_read_lock();
38220 __netif_receive_skb(skb);
38221 rcu_read_unlock();
38222@@ -5196,9 +5219,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
38223 if (++work >= quota)
38224 return work;
38225
38226+ local_irq_disable();
38227 }
38228
38229- local_irq_disable();
38230 rps_lock(sd);
38231 if (skb_queue_empty(&sd->input_pkt_queue)) {
38232 /*
38233@@ -5236,6 +5259,7 @@ void __napi_schedule(struct napi_struct *n)
38234 local_irq_save(flags);
38235 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
38236 local_irq_restore(flags);
38237+ preempt_check_resched_rt();
38238 }
38239 EXPORT_SYMBOL(__napi_schedule);
38240
38241@@ -5272,6 +5296,7 @@ bool napi_schedule_prep(struct napi_struct *n)
38242 }
38243 EXPORT_SYMBOL(napi_schedule_prep);
38244
38245+#ifndef CONFIG_PREEMPT_RT_FULL
38246 /**
38247 * __napi_schedule_irqoff - schedule for receive
38248 * @n: entry to schedule
38249@@ -5283,6 +5308,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
38250 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
38251 }
38252 EXPORT_SYMBOL(__napi_schedule_irqoff);
38253+#endif
38254
38255 bool napi_complete_done(struct napi_struct *n, int work_done)
38256 {
38257@@ -5637,13 +5663,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
38258 unsigned long time_limit = jiffies +
38259 usecs_to_jiffies(netdev_budget_usecs);
38260 int budget = netdev_budget;
38261+ struct sk_buff_head tofree_q;
38262+ struct sk_buff *skb;
38263 LIST_HEAD(list);
38264 LIST_HEAD(repoll);
38265
38266+ __skb_queue_head_init(&tofree_q);
38267+
38268 local_irq_disable();
38269+ skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
38270 list_splice_init(&sd->poll_list, &list);
38271 local_irq_enable();
38272
38273+ while ((skb = __skb_dequeue(&tofree_q)))
38274+ kfree_skb(skb);
38275+
38276 for (;;) {
38277 struct napi_struct *n;
38278
38279@@ -5673,7 +5707,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
38280 list_splice_tail(&repoll, &list);
38281 list_splice(&list, &sd->poll_list);
38282 if (!list_empty(&sd->poll_list))
38283- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
38284+ __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
38285
38286 net_rps_action_and_irq_enable(sd);
38287 out:
38288@@ -7502,7 +7536,7 @@ static void netdev_init_one_queue(struct net_device *dev,
38289 /* Initialize queue lock */
38290 spin_lock_init(&queue->_xmit_lock);
38291 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
38292- queue->xmit_lock_owner = -1;
38293+ netdev_queue_clear_owner(queue);
38294 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
38295 queue->dev = dev;
38296 #ifdef CONFIG_BQL
38297@@ -8442,6 +8476,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
38298
38299 raise_softirq_irqoff(NET_TX_SOFTIRQ);
38300 local_irq_enable();
38301+ preempt_check_resched_rt();
38302
38303 #ifdef CONFIG_RPS
38304 remsd = oldsd->rps_ipi_list;
38305@@ -8455,10 +8490,13 @@ static int dev_cpu_dead(unsigned int oldcpu)
38306 netif_rx_ni(skb);
38307 input_queue_head_incr(oldsd);
38308 }
38309- while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
38310+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
38311 netif_rx_ni(skb);
38312 input_queue_head_incr(oldsd);
38313 }
38314+ while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
38315+ kfree_skb(skb);
38316+ }
38317
38318 return 0;
38319 }
38320@@ -8762,8 +8800,9 @@ static int __init net_dev_init(void)
38321
38322 INIT_WORK(flush, flush_backlog);
38323
38324- skb_queue_head_init(&sd->input_pkt_queue);
38325- skb_queue_head_init(&sd->process_queue);
38326+ skb_queue_head_init_raw(&sd->input_pkt_queue);
38327+ skb_queue_head_init_raw(&sd->process_queue);
38328+ skb_queue_head_init_raw(&sd->tofree_queue);
38329 INIT_LIST_HEAD(&sd->poll_list);
38330 sd->output_queue_tailp = &sd->output_queue;
38331 #ifdef CONFIG_RPS
38332diff --git a/net/core/filter.c b/net/core/filter.c
38333index d5158a10ac8f..ad96ec78f7b8 100644
38334--- a/net/core/filter.c
38335+++ b/net/core/filter.c
38336@@ -1696,7 +1696,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
38337 {
38338 int ret;
38339
38340- if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
38341+ if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
38342 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
38343 kfree_skb(skb);
38344 return -ENETDOWN;
38345@@ -1704,9 +1704,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
38346
38347 skb->dev = dev;
38348
38349- __this_cpu_inc(xmit_recursion);
38350+ xmit_rec_inc();
38351 ret = dev_queue_xmit(skb);
38352- __this_cpu_dec(xmit_recursion);
38353+ xmit_rec_dec();
38354
38355 return ret;
38356 }
38357diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
38358index 7f980bd7426e..7250106015ef 100644
38359--- a/net/core/gen_estimator.c
38360+++ b/net/core/gen_estimator.c
e4b2b4a8
JK
38361@@ -46,7 +46,7 @@
38362 struct net_rate_estimator {
1a6e0f06 38363 struct gnet_stats_basic_packed *bstats;
1a6e0f06
JK
38364 spinlock_t *stats_lock;
38365- seqcount_t *running;
38366+ net_seqlock_t *running;
38367 struct gnet_stats_basic_cpu __percpu *cpu_bstats;
38368 u8 ewma_log;
38369 u8 intvl_log; /* period : (250ms << intvl_log) */
38370@@ -129,7 +129,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
38371 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
38372 struct net_rate_estimator __rcu **rate_est,
38373 spinlock_t *stats_lock,
38374- seqcount_t *running,
38375+ net_seqlock_t *running,
38376 struct nlattr *opt)
38377 {
38378 struct gnet_estimator *parm = nla_data(opt);
38379@@ -222,7 +222,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
38380 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
38381 struct net_rate_estimator __rcu **rate_est,
38382 spinlock_t *stats_lock,
38383- seqcount_t *running, struct nlattr *opt)
38384+ net_seqlock_t *running, struct nlattr *opt)
38385 {
38386 return gen_new_estimator(bstats, cpu_bstats, rate_est,
38387 stats_lock, running, opt);
38388diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
38389index 441c04adedba..07f9a6a1f8e4 100644
38390--- a/net/core/gen_stats.c
38391+++ b/net/core/gen_stats.c
38392@@ -142,7 +142,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
38393 }
38394
38395 void
38396-__gnet_stats_copy_basic(const seqcount_t *running,
38397+__gnet_stats_copy_basic(net_seqlock_t *running,
38398 struct gnet_stats_basic_packed *bstats,
38399 struct gnet_stats_basic_cpu __percpu *cpu,
38400 struct gnet_stats_basic_packed *b)
38401@@ -155,10 +155,10 @@ __gnet_stats_copy_basic(const seqcount_t *running,
38402 }
38403 do {
38404 if (running)
38405- seq = read_seqcount_begin(running);
38406+ seq = net_seq_begin(running);
38407 bstats->bytes = b->bytes;
38408 bstats->packets = b->packets;
38409- } while (running && read_seqcount_retry(running, seq));
38410+ } while (running && net_seq_retry(running, seq));
38411 }
38412 EXPORT_SYMBOL(__gnet_stats_copy_basic);
38413
38414@@ -176,7 +176,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
38415 * if the room in the socket buffer was not sufficient.
38416 */
38417 int
38418-gnet_stats_copy_basic(const seqcount_t *running,
38419+gnet_stats_copy_basic(net_seqlock_t *running,
38420 struct gnet_dump *d,
38421 struct gnet_stats_basic_cpu __percpu *cpu,
38422 struct gnet_stats_basic_packed *b)
38423diff --git a/net/core/pktgen.c b/net/core/pktgen.c
38424index 6e1e10ff433a..c1ae4075e0ed 100644
38425--- a/net/core/pktgen.c
38426+++ b/net/core/pktgen.c
38427@@ -2252,7 +2252,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
38428 s64 remaining;
38429 struct hrtimer_sleeper t;
38430
38431- hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
38432+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS,
38433+ current);
38434 hrtimer_set_expires(&t.timer, spin_until);
38435
38436 remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
38437@@ -2267,7 +2268,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
38438 } while (ktime_compare(end_time, spin_until) < 0);
38439 } else {
38440 /* see do_nanosleep */
38441- hrtimer_init_sleeper(&t, current);
38442 do {
38443 set_current_state(TASK_INTERRUPTIBLE);
38444 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
38445diff --git a/net/core/skbuff.c b/net/core/skbuff.c
38446index 9f80b947f53b..c0f23b8dcfc6 100644
38447--- a/net/core/skbuff.c
38448+++ b/net/core/skbuff.c
38449@@ -63,6 +63,7 @@
38450 #include <linux/errqueue.h>
38451 #include <linux/prefetch.h>
38452 #include <linux/if_vlan.h>
38453+#include <linux/locallock.h>
38454
38455 #include <net/protocol.h>
38456 #include <net/dst.h>
38457@@ -330,6 +331,8 @@ struct napi_alloc_cache {
38458
38459 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
38460 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
38461+static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
38462+static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
38463
38464 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
38465 {
38466@@ -337,10 +340,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
38467 unsigned long flags;
38468 void *data;
38469
38470- local_irq_save(flags);
38471+ local_lock_irqsave(netdev_alloc_lock, flags);
38472 nc = this_cpu_ptr(&netdev_alloc_cache);
38473 data = page_frag_alloc(nc, fragsz, gfp_mask);
38474- local_irq_restore(flags);
38475+ local_unlock_irqrestore(netdev_alloc_lock, flags);
38476 return data;
38477 }
38478
38479@@ -359,9 +362,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
38480
38481 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
38482 {
38483- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38484+ struct napi_alloc_cache *nc;
38485+ void *data;
38486
38487- return page_frag_alloc(&nc->page, fragsz, gfp_mask);
38488+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38489+ data = page_frag_alloc(&nc->page, fragsz, gfp_mask);
38490+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38491+ return data;
38492 }
38493
38494 void *napi_alloc_frag(unsigned int fragsz)
38495@@ -408,13 +415,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
38496 if (sk_memalloc_socks())
38497 gfp_mask |= __GFP_MEMALLOC;
38498
38499- local_irq_save(flags);
38500+ local_lock_irqsave(netdev_alloc_lock, flags);
38501
38502 nc = this_cpu_ptr(&netdev_alloc_cache);
38503 data = page_frag_alloc(nc, len, gfp_mask);
38504 pfmemalloc = nc->pfmemalloc;
38505
38506- local_irq_restore(flags);
38507+ local_unlock_irqrestore(netdev_alloc_lock, flags);
38508
38509 if (unlikely(!data))
38510 return NULL;
@@ -455,9 +462,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
38512 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
38513 gfp_t gfp_mask)
38514 {
38515- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38516+ struct napi_alloc_cache *nc;
38517 struct sk_buff *skb;
38518 void *data;
38519+ bool pfmemalloc;
38520
38521 len += NET_SKB_PAD + NET_IP_ALIGN;
38522
@@ -475,7 +483,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
38524 if (sk_memalloc_socks())
38525 gfp_mask |= __GFP_MEMALLOC;
38526
38527+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
 data = page_frag_alloc(&nc->page, len, gfp_mask);
38529+ pfmemalloc = nc->page.pfmemalloc;
38530+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38531 if (unlikely(!data))
38532 return NULL;
38533
@@ -486,7 +497,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
38535 }
38536
38537 /* use OR instead of assignment to avoid clearing of bits in mask */
38538- if (nc->page.pfmemalloc)
38539+ if (pfmemalloc)
38540 skb->pfmemalloc = 1;
38541 skb->head_frag = 1;
38542
@@ -718,23 +729,26 @@ void __consume_stateless_skb(struct sk_buff *skb)
38544
38545 void __kfree_skb_flush(void)
38546 {
38547- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38548+ struct napi_alloc_cache *nc;
38549
38550+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38551 /* flush skb_cache if containing objects */
38552 if (nc->skb_count) {
38553 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
38554 nc->skb_cache);
38555 nc->skb_count = 0;
38556 }
38557+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38558 }
38559
38560 static inline void _kfree_skb_defer(struct sk_buff *skb)
38561 {
38562- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
38563+ struct napi_alloc_cache *nc;
38564
38565 /* drop skb->head and call any destructors for packet */
38566 skb_release_all(skb);
38567
38568+ nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38569 /* record skb to CPU local list */
38570 nc->skb_cache[nc->skb_count++] = skb;
38571
@@ -749,6 +763,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
38573 nc->skb_cache);
38574 nc->skb_count = 0;
38575 }
38576+ put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
38577 }
38578 void __kfree_skb_defer(struct sk_buff *skb)
38579 {
diff --git a/net/core/sock.c b/net/core/sock.c
index 68d08ed5521e..ee242ff5d4b1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2757,12 +2757,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
38585 if (sk->sk_lock.owned)
38586 __lock_sock(sk);
38587 sk->sk_lock.owned = 1;
38588- spin_unlock(&sk->sk_lock.slock);
38589+ spin_unlock_bh(&sk->sk_lock.slock);
38590 /*
38591 * The sk_lock has mutex_lock() semantics here:
38592 */
38593 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
38594- local_bh_enable();
38595 }
38596 EXPORT_SYMBOL(lock_sock_nested);
38597
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 3c1570d3e22f..0310ea93f877 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -77,6 +77,7 @@
38603 #include <linux/string.h>
38604 #include <linux/netfilter_ipv4.h>
38605 #include <linux/slab.h>
38606+#include <linux/locallock.h>
38607 #include <net/snmp.h>
38608 #include <net/ip.h>
38609 #include <net/route.h>
@@ -204,6 +205,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
38611 *
38612 * On SMP we have one ICMP socket per-cpu.
38613 */
38614+static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
38615+
38616 static struct sock *icmp_sk(struct net *net)
38617 {
38618 return *this_cpu_ptr(net->ipv4.icmp_sk);
@@ -214,12 +217,16 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
38620 {
38621 struct sock *sk;
 
38623+ if (!local_trylock(icmp_sk_lock))
38624+ return NULL;
38625+
38626 sk = icmp_sk(net);
38627
38628 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
38629 /* This can happen if the output path signals a
38630 * dst_link_failure() for an outgoing ICMP packet.
38631 */
38632+ local_unlock(icmp_sk_lock);
38633 return NULL;
38634 }
 return sk;
@@ -228,6 +235,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
38637 static inline void icmp_xmit_unlock(struct sock *sk)
38638 {
 spin_unlock(&sk->sk_lock.slock);
38640+ local_unlock(icmp_sk_lock);
38641 }
38642
38643 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 31b34c0c2d5f..851f241e70b5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -62,6 +62,7 @@
38649 #include <linux/init.h>
38650 #include <linux/times.h>
38651 #include <linux/slab.h>
38652+#include <linux/locallock.h>
38653
38654 #include <net/net_namespace.h>
38655 #include <net/icmp.h>
@@ -580,6 +581,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
38657 }
38658 EXPORT_SYMBOL(tcp_v4_send_check);
38659
38660+static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
38661 /*
38662 * This routine will send an RST to the other tcp.
38663 *
@@ -710,6 +712,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 arg.tos = ip_hdr(skb)->tos;
 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 local_bh_disable();
+ local_lock(tcp_sk_lock);
38669 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
38670 skb, &TCP_SKB_CB(skb)->header.h4.opt,
 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
@@ -717,6 +720,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 
38674 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
38675 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+ local_unlock(tcp_sk_lock);
 local_bh_enable();
38678
38679 #ifdef CONFIG_TCP_MD5SIG
@@ -796,12 +800,14 @@ static void tcp_v4_send_ack(const struct sock *sk,
 arg.tos = tos;
 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 local_bh_disable();
+ local_lock(tcp_sk_lock);
38685 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
38686 skb, &TCP_SKB_CB(skb)->header.h4.opt,
38687 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
38688 &arg, arg.iov[0].iov_len);
38689
38690 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ local_unlock(tcp_sk_lock);
 local_bh_enable();
38693 }
38694
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index dddd498e1338..8f39b8162df8 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4252,7 +4252,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
38700 struct ieee80211_supported_band *sband;
38701 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
38702
38703- WARN_ON_ONCE(softirq_count() == 0);
38704+ WARN_ON_ONCE_NONRT(softirq_count() == 0);
38705
38706 if (WARN_ON(status->band >= NUM_NL80211_BANDS))
38707 goto drop;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 52cd2901a097..c63e937b6676 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -21,6 +21,7 @@
 #include <linux/inetdevice.h>
38714 #include <linux/proc_fs.h>
38715 #include <linux/mutex.h>
+#include <linux/locallock.h>
 #include <linux/mm.h>
 #include <linux/rcupdate.h>
 #include <net/net_namespace.h>
@@ -28,6 +29,11 @@
38721
38722 #include "nf_internals.h"
38723
38724+#ifdef CONFIG_PREEMPT_RT_BASE
38725+DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
38726+EXPORT_PER_CPU_SYMBOL(xt_write_lock);
38727+#endif
38728+
38729 static DEFINE_MUTEX(afinfo_mutex);
38730
38731 const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8d1a7c900393..f1f56be3b061 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -63,6 +63,7 @@
38737 #include <linux/if_packet.h>
38738 #include <linux/wireless.h>
38739 #include <linux/kernel.h>
38740+#include <linux/delay.h>
38741 #include <linux/kmod.h>
38742 #include <linux/slab.h>
38743 #include <linux/vmalloc.h>
@@ -707,7 +708,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
38745 if (BLOCK_NUM_PKTS(pbd)) {
38746 while (atomic_read(&pkc->blk_fill_in_prog)) {
38747 /* Waiting for skb_copy_bits to finish... */
38748- cpu_relax();
38749+ cpu_chill();
38750 }
38751 }
38752
@@ -969,7 +970,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
38754 if (!(status & TP_STATUS_BLK_TMO)) {
38755 while (atomic_read(&pkc->blk_fill_in_prog)) {
38756 /* Waiting for skb_copy_bits to finish... */
38757- cpu_relax();
38758+ cpu_chill();
38759 }
38760 }
38761 prb_close_block(pkc, pbd, po, status);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 9a3c54e659e9..2a95f1d587ac 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -34,6 +34,7 @@
38767 #include <linux/slab.h>
38768 #include <linux/rculist.h>
38769 #include <linux/llist.h>
38770+#include <linux/delay.h>
38771
38772 #include "rds_single_path.h"
38773 #include "ib_mr.h"
@@ -210,7 +211,7 @@ static inline void wait_clean_list_grace(void)
38775 for_each_online_cpu(cpu) {
38776 flag = &per_cpu(clean_list_grace, cpu);
38777 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
38778- cpu_relax();
38779+ cpu_chill();
38780 }
38781 }
38782
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index e9f428351293..c4479afe8ae7 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -19,9 +19,6 @@
38788 #include <keys/rxrpc-type.h>
38789 #include "ar-internal.h"
38790
38791-static LIST_HEAD(rxrpc_security_methods);
38792-static DECLARE_RWSEM(rxrpc_security_sem);
38793-
38794 static const struct rxrpc_security *rxrpc_security_types[] = {
38795 [RXRPC_SECURITY_NONE] = &rxrpc_no_security,
38796 #ifdef CONFIG_RXKAD
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index cd69aa067543..73348ac5019f 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1081,7 +1081,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
38802 rcu_assign_pointer(sch->stab, stab);
38803 }
38804 if (tca[TCA_RATE]) {
38805- seqcount_t *running;
38806+ net_seqlock_t *running;
38807
38808 err = -EOPNOTSUPP;
38809 if (sch->flags & TCQ_F_MQROOT)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 79549baf5804..341f7895659c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -429,7 +429,11 @@ struct Qdisc noop_qdisc = {
 .ops = &noop_qdisc_ops,
38816 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
38817 .dev_queue = &noop_netdev_queue,
38818+#ifdef CONFIG_PREEMPT_RT_BASE
38819+ .running = __SEQLOCK_UNLOCKED(noop_qdisc.running),
38820+#else
38821 .running = SEQCNT_ZERO(noop_qdisc.running),
38822+#endif
38823 .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
38824 };
38825 EXPORT_SYMBOL(noop_qdisc);
@@ -628,9 +632,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
38827 lockdep_set_class(&sch->busylock,
38828 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
38829
38830+#ifdef CONFIG_PREEMPT_RT_BASE
38831+ seqlock_init(&sch->running);
38832+ lockdep_set_class(&sch->running.seqcount,
38833+ dev->qdisc_running_key ?: &qdisc_running_key);
38834+ lockdep_set_class(&sch->running.lock,
38835+ dev->qdisc_running_key ?: &qdisc_running_key);
38836+#else
38837 seqcount_init(&sch->running);
38838 lockdep_set_class(&sch->running,
38839 dev->qdisc_running_key ?: &qdisc_running_key);
38840+#endif
38841
38842 sch->ops = ops;
38843 sch->enqueue = ops->enqueue;
@@ -933,7 +945,7 @@ void dev_deactivate_many(struct list_head *head)
 /* Wait for outstanding qdisc_run calls. */
 list_for_each_entry(dev, head, close_list) {
38847 while (some_qdisc_is_busy(dev))
38848- yield();
38849+ msleep(1);
38850 /* The new qdisc is assigned at this point so we can safely
38851 * unwind stale skb lists and qdisc statistics
38852 */
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index d16a8b423c20..cedaf909eb97 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -396,7 +396,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
38858 goto out;
38859 }
38860
38861- cpu = get_cpu();
38862+ cpu = get_cpu_light();
38863 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
38864
38865 atomic_long_inc(&pool->sp_stats.packets);
@@ -432,7 +432,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
38867
38868 atomic_long_inc(&pool->sp_stats.threads_woken);
38869 wake_up_process(rqstp->rq_task);
38870- put_cpu();
38871+ put_cpu_light();
38872 goto out;
38873 }
38874 rcu_read_unlock();
@@ -453,7 +453,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
38876 goto redo_search;
38877 }
38878 rqstp = NULL;
38879- put_cpu();
38880+ put_cpu_light();
38881 out:
38882 trace_svc_xprt_do_enqueue(xprt, rqstp);
38883 }
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 6c4ec69e11a0..77f52dc790ec 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -427,7 +427,7 @@ static void xfrm_put_mode(struct xfrm_mode *mode)
38889
38890 static void xfrm_state_gc_destroy(struct xfrm_state *x)
38891 {
38892- tasklet_hrtimer_cancel(&x->mtimer);
38893+ hrtimer_cancel(&x->mtimer);
38894 del_timer_sync(&x->rtimer);
38895 kfree(x->aead);
38896 kfree(x->aalg);
@@ -472,8 +472,8 @@ static void xfrm_state_gc_task(struct work_struct *work)
38898
38899 static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
38900 {
38901- struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
38902- struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
38903+ struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer);
38904+ enum hrtimer_restart ret = HRTIMER_NORESTART;
38905 unsigned long now = get_seconds();
38906 long next = LONG_MAX;
38907 int warn = 0;
@@ -537,7 +537,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
38909 km_state_expired(x, 0, 0);
38910 resched:
38911 if (next != LONG_MAX) {
38912- tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
38913+ hrtimer_forward_now(&x->mtimer, ktime_set(next, 0));
38914+ ret = HRTIMER_RESTART;
38915 }
38916
38917 goto out;
@@ -554,7 +555,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
38919
38920 out:
38921 spin_unlock(&x->lock);
38922- return HRTIMER_NORESTART;
38923+ return ret;
38924 }
38925
38926 static void xfrm_replay_timer_handler(unsigned long data);
@@ -573,8 +574,8 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
38928 INIT_HLIST_NODE(&x->bydst);
38929 INIT_HLIST_NODE(&x->bysrc);
38930 INIT_HLIST_NODE(&x->byspi);
38931- tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
38932- CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
38933+ hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT);
38934+ x->mtimer.function = xfrm_timer_handler;
38935 setup_timer(&x->rtimer, xfrm_replay_timer_handler,
38936 (unsigned long)x);
38937 x->curlft.add_time = get_seconds();
@@ -1031,7 +1032,9 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
38939 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
38940 }
38941 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
38942- tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
38943+ hrtimer_start(&x->mtimer,
38944+ ktime_set(net->xfrm.sysctl_acq_expires, 0),
38945+ HRTIMER_MODE_REL_SOFT);
38946 net->xfrm.state_num++;
38947 xfrm_hash_grow_check(net, x->bydst.next != NULL);
38948 spin_unlock_bh(&net->xfrm.xfrm_state_lock);
@@ -1142,7 +1145,7 @@ static void __xfrm_state_insert(struct xfrm_state *x)
38950 hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
38951 }
38952
38953- tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
38954+ hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
38955 if (x->replay_maxage)
38956 mod_timer(&x->rtimer, jiffies + x->replay_maxage);
38957
@@ -1246,7 +1249,9 @@ static struct xfrm_state *__find_acq_core(struct net *net,
38959 x->mark.m = m->m;
38960 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
38961 xfrm_state_hold(x);
38962- tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
38963+ hrtimer_start(&x->mtimer,
38964+ ktime_set(net->xfrm.sysctl_acq_expires, 0),
38965+ HRTIMER_MODE_REL_SOFT);
38966 list_add(&x->km.all, &net->xfrm.state_all);
38967 hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
38968 h = xfrm_src_hash(net, daddr, saddr, family);
@@ -1546,7 +1551,8 @@ int xfrm_state_update(struct xfrm_state *x)
38970 memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
38971 x1->km.dying = 0;
38972
38973- tasklet_hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
38974+ hrtimer_start(&x1->mtimer, ktime_set(1, 0),
38975+ HRTIMER_MODE_REL_SOFT);
38976 if (x1->curlft.use_time)
38977 xfrm_state_check_expire(x1);
38978
@@ -1570,7 +1576,7 @@ int xfrm_state_check_expire(struct xfrm_state *x)
38980 if (x->curlft.bytes >= x->lft.hard_byte_limit ||
38981 x->curlft.packets >= x->lft.hard_packet_limit) {
38982 x->km.state = XFRM_STATE_EXPIRED;
38983- tasklet_hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL);
38984+ hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT);
38985 return -EINVAL;
38986 }
38987
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index 5522692100ba..8b4be8e1802a 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -33,7 +33,7 @@ static void simple_thread_func(int cnt)
38993
38994 /* Silly tracepoints */
38995 trace_foo_bar("hello", cnt, array, random_strings[len],
38996- &current->cpus_allowed);
38997+ current->cpus_ptr);
38998
38999 trace_foo_with_template_simple("HELLO", cnt);
39000
diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
index 959199c3147e..3e68004ed345 100755
--- a/scripts/mkcompile_h
+++ b/scripts/mkcompile_h
@@ -5,7 +5,8 @@ TARGET=$1
39006 ARCH=$2
39007 SMP=$3
39008 PREEMPT=$4
39009-CC=$5
39010+RT=$5
39011+CC=$6
39012
39013 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
39014
@@ -58,6 +59,7 @@ UTS_VERSION="#$VERSION"
39016 CONFIG_FLAGS=""
39017 if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
39018 if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
39019+if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
39020 UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
39021
39022 # Truncate to maximum length
diff --git a/security/apparmor/include/path.h b/security/apparmor/include/path.h
index 05fb3305671e..b26c16b02662 100644
--- a/security/apparmor/include/path.h
+++ b/security/apparmor/include/path.h
@@ -39,9 +39,10 @@ struct aa_buffers {
39028 };
39029
39030 #include <linux/percpu.h>
39031-#include <linux/preempt.h>
39032+#include <linux/locallock.h>
39033
39034 DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
39035+DECLARE_LOCAL_IRQ_LOCK(aa_buffers_lock);
39036
39037 #define COUNT_ARGS(X...) COUNT_ARGS_HELPER(, ##X, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
39038 #define COUNT_ARGS_HELPER(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, X...) n
@@ -55,12 +56,24 @@ DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
39040
39041 #define for_each_cpu_buffer(I) for ((I) = 0; (I) < MAX_PATH_BUFFERS; (I)++)
39042
39043-#ifdef CONFIG_DEBUG_PREEMPT
39044+#ifdef CONFIG_PREEMPT_RT_BASE
39045+
39046+static inline void AA_BUG_PREEMPT_ENABLED(const char *s)
39047+{
39048+ struct local_irq_lock *lv;
39049+
39050+ lv = this_cpu_ptr(&aa_buffers_lock);
39051+ WARN_ONCE(lv->owner != current,
39052+ "__get_buffer without aa_buffers_lock\n");
39053+}
39054+
39055+#elif defined(CONFIG_DEBUG_PREEMPT)
39056 #define AA_BUG_PREEMPT_ENABLED(X) AA_BUG(preempt_count() <= 0, X)
39057 #else
39058 #define AA_BUG_PREEMPT_ENABLED(X) /* nop */
39059 #endif
39060
39061+
39062 #define __get_buffer(N) ({ \
39063 struct aa_buffers *__cpu_var; \
39064 AA_BUG_PREEMPT_ENABLED("__get_buffer without preempt disabled"); \
@@ -73,14 +86,14 @@ DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
39066
39067 #define get_buffers(X...) \
39068 do { \
39069- preempt_disable(); \
39070+ local_lock(aa_buffers_lock); \
39071 __get_buffers(X); \
39072 } while (0)
39073
39074 #define put_buffers(X, Y...) \
39075 do { \
39076 __put_buffers(X, Y); \
39077- preempt_enable(); \
39078+ local_unlock(aa_buffers_lock); \
39079 } while (0)
39080
39081 #endif /* __AA_PATH_H */
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 1346ee5be04f..aa7e4dee107b 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -44,7 +44,7 @@
39087 int apparmor_initialized;
39088
39089 DEFINE_PER_CPU(struct aa_buffers, aa_buffers);
39090-
39091+DEFINE_LOCAL_IRQ_LOCK(aa_buffers_lock);
39092
39093 /*
39094 * LSM hook functions
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index ab3bf36786b6..f0bb7c9aa4be 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
39100 void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
39101 {
39102 if (!substream->pcm->nonatomic)
39103- local_irq_disable();
39104+ local_irq_disable_nort();
39105 snd_pcm_stream_lock(substream);
39106 }
39107 EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
@@ -163,7 +163,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
39109 {
39110 snd_pcm_stream_unlock(substream);
39111 if (!substream->pcm->nonatomic)
39112- local_irq_enable();
39113+ local_irq_enable_nort();
39114 }
39115 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
39116
@@ -171,7 +171,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
39118 {
39119 unsigned long flags = 0;
39120 if (!substream->pcm->nonatomic)
39121- local_irq_save(flags);
39122+ local_irq_save_nort(flags);
39123 snd_pcm_stream_lock(substream);
39124 return flags;
39125 }
@@ -189,7 +189,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
39127 {
39128 snd_pcm_stream_unlock(substream);
39129 if (!substream->pcm->nonatomic)
39130- local_irq_restore(flags);
39131+ local_irq_restore_nort(flags);
39132 }
39133 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
39134
diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c
index c0939a0164a6..549e014ecc0d 100644
--- a/sound/drivers/dummy.c
+++ b/sound/drivers/dummy.c
@@ -376,17 +376,9 @@ struct dummy_hrtimer_pcm {
39140 ktime_t period_time;
39141 atomic_t running;
39142 struct hrtimer timer;
39143- struct tasklet_struct tasklet;
39144 struct snd_pcm_substream *substream;
39145 };
39146
39147-static void dummy_hrtimer_pcm_elapsed(unsigned long priv)
39148-{
39149- struct dummy_hrtimer_pcm *dpcm = (struct dummy_hrtimer_pcm *)priv;
39150- if (atomic_read(&dpcm->running))
39151- snd_pcm_period_elapsed(dpcm->substream);
39152-}
39153-
39154 static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer)
39155 {
39156 struct dummy_hrtimer_pcm *dpcm;
@@ -394,7 +386,14 @@ static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer)
39158 dpcm = container_of(timer, struct dummy_hrtimer_pcm, timer);
39159 if (!atomic_read(&dpcm->running))
39160 return HRTIMER_NORESTART;
39161- tasklet_schedule(&dpcm->tasklet);
39162+ /*
39163+ * In cases of XRUN and draining, this calls .trigger to stop PCM
39164+ * substream.
39165+ */
39166+ snd_pcm_period_elapsed(dpcm->substream);
39167+ if (!atomic_read(&dpcm->running))
39168+ return HRTIMER_NORESTART;
39169+
39170 hrtimer_forward_now(timer, dpcm->period_time);
39171 return HRTIMER_RESTART;
39172 }
@@ -404,7 +403,7 @@ static int dummy_hrtimer_start(struct snd_pcm_substream *substream)
39174 struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
39175
39176 dpcm->base_time = hrtimer_cb_get_time(&dpcm->timer);
39177- hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL);
39178+ hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL_SOFT);
39179 atomic_set(&dpcm->running, 1);
39180 return 0;
39181 }
@@ -414,14 +413,14 @@ static int dummy_hrtimer_stop(struct snd_pcm_substream *substream)
39183 struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
39184
39185 atomic_set(&dpcm->running, 0);
39186- hrtimer_cancel(&dpcm->timer);
39187+ if (!hrtimer_callback_running(&dpcm->timer))
39188+ hrtimer_cancel(&dpcm->timer);
39189 return 0;
39190 }
39191
39192 static inline void dummy_hrtimer_sync(struct dummy_hrtimer_pcm *dpcm)
39193 {
39194 hrtimer_cancel(&dpcm->timer);
39195- tasklet_kill(&dpcm->tasklet);
39196 }
39197
39198 static snd_pcm_uframes_t
@@ -466,12 +465,10 @@ static int dummy_hrtimer_create(struct snd_pcm_substream *substream)
39200 if (!dpcm)
39201 return -ENOMEM;
39202 substream->runtime->private_data = dpcm;
39203- hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
39204+ hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
39205 dpcm->timer.function = dummy_hrtimer_callback;
39206 dpcm->substream = substream;
39207 atomic_set(&dpcm->running, 0);
39208- tasklet_init(&dpcm->tasklet, dummy_hrtimer_pcm_elapsed,
39209- (unsigned long)dpcm);
39210 return 0;
39211 }
39212
diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
index 6a4982d029bf..843c2b0d948e 100644
--- a/tools/testing/selftests/ftrace/test.d/functions
+++ b/tools/testing/selftests/ftrace/test.d/functions
@@ -70,6 +70,13 @@ disable_events() {
39218 echo 0 > events/enable
39219 }
39220
39221+clear_synthetic_events() { # reset all current synthetic events
39222+ grep -v ^# synthetic_events |
39223+ while read line; do
39224+ echo "!$line" >> synthetic_events
39225+ done
39226+}
39227+
39228 initialize_ftrace() { # Reset ftrace to initial-state
39229 # As the initial state, ftrace will be set to nop tracer,
39230 # no events, no triggers, no filters, no function filters,
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc
new file mode 100644
index 000000000000..786dce7e48be
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc
@@ -0,0 +1,39 @@
39237+#!/bin/sh
39238+# description: event trigger - test extended error support
39239+
39240+
39241+do_reset() {
39242+ reset_trigger
39243+ echo > set_event
39244+ clear_trace
39245+}
39246+
39247+fail() { #msg
39248+ do_reset
39249+ echo $1
39250+ exit_fail
39251+}
39252+
39253+if [ ! -f set_event ]; then
39254+ echo "event tracing is not supported"
39255+ exit_unsupported
39256+fi
39257+
39258+if [ ! -f synthetic_events ]; then
39259+ echo "synthetic event is not supported"
39260+ exit_unsupported
39261+fi
39262+
39263+reset_tracer
39264+do_reset
39265+
39266+echo "Test extended error support"
39267+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
39268+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger &>/dev/null
39269+if ! grep -q "ERROR:" events/sched/sched_wakeup/hist; then
39270+ fail "Failed to generate extended error in histogram"
39271+fi
39272+
39273+do_reset
39274+
39275+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
new file mode 100644
index 000000000000..7fd5b4a8f060
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
@@ -0,0 +1,54 @@
39282+#!/bin/sh
39283+# description: event trigger - test field variable support
39284+
39285+do_reset() {
39286+ reset_trigger
39287+ echo > set_event
39288+ clear_trace
39289+}
39290+
39291+fail() { #msg
39292+ do_reset
39293+ echo $1
39294+ exit_fail
39295+}
39296+
39297+if [ ! -f set_event ]; then
39298+ echo "event tracing is not supported"
39299+ exit_unsupported
39300+fi
39301+
39302+if [ ! -f synthetic_events ]; then
39303+ echo "synthetic event is not supported"
39304+ exit_unsupported
39305+fi
39306+
39307+clear_synthetic_events
39308+reset_tracer
39309+do_reset
39310+
39311+echo "Test field variable support"
39312+
39313+echo 'wakeup_latency u64 lat; pid_t pid; int prio; char comm[16]' > synthetic_events
39314+echo 'hist:keys=comm:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
39315+echo 'hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
39316+echo 'hist:keys=pid,prio,comm:vals=lat:sort=pid,prio' > events/synthetic/wakeup_latency/trigger
39317+
39318+ping localhost -c 3
39319+if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
39320+ fail "Failed to create inter-event histogram"
39321+fi
39322+
39323+if ! grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
39324+ fail "Failed to create histogram with field variable"
39325+fi
39326+
39327+echo '!hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
39328+
39329+if grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
39330+ fail "Failed to remove histogram with field variable"
39331+fi
39332+
39333+do_reset
39334+
39335+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
new file mode 100644
index 000000000000..c93dbe38b5df
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
@@ -0,0 +1,58 @@
39342+#!/bin/sh
39343+# description: event trigger - test inter-event combined histogram trigger
39344+
39345+do_reset() {
39346+ reset_trigger
39347+ echo > set_event
39348+ clear_trace
39349+}
39350+
39351+fail() { #msg
39352+ do_reset
39353+ echo $1
39354+ exit_fail
39355+}
39356+
39357+if [ ! -f set_event ]; then
39358+ echo "event tracing is not supported"
39359+ exit_unsupported
39360+fi
39361+
39362+if [ ! -f synthetic_events ]; then
39363+ echo "synthetic event is not supported"
39364+ exit_unsupported
39365+fi
39366+
39367+reset_tracer
39368+do_reset
39369+clear_synthetic_events
39370+
39371+echo "Test create synthetic event"
39372+
39373+echo 'waking_latency u64 lat pid_t pid' > synthetic_events
39374+if [ ! -d events/synthetic/waking_latency ]; then
39375+ fail "Failed to create waking_latency synthetic event"
39376+fi
39377+
39378+echo "Test combined histogram"
39379+
39380+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
39381+echo 'hist:keys=pid:waking_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).waking_latency($waking_lat,pid) if comm=="ping"' > events/sched/sched_wakeup/trigger
39382+echo 'hist:keys=pid,lat:sort=pid,lat' > events/synthetic/waking_latency/trigger
39383+
39384+echo 'wakeup_latency u64 lat pid_t pid' >> synthetic_events
39385+echo 'hist:keys=pid:ts1=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger
39386+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts1:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid) if next_comm=="ping"' > events/sched/sched_switch/trigger
39387+
39388+echo 'waking+wakeup_latency u64 lat; pid_t pid' >> synthetic_events
39389+echo 'hist:keys=pid,lat:sort=pid,lat:ww_lat=$waking_lat+$wakeup_lat:onmatch(synthetic.wakeup_latency).waking+wakeup_latency($ww_lat,pid)' >> events/synthetic/wakeup_latency/trigger
39390+echo 'hist:keys=pid,lat:sort=pid,lat' >> events/synthetic/waking+wakeup_latency/trigger
39391+
39392+ping localhost -c 3
39393+if ! grep -q "pid:" events/synthetic/waking+wakeup_latency/hist; then
39394+ fail "Failed to create combined histogram"
39395+fi
39396+
39397+do_reset
39398+
39399+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
new file mode 100644
index 000000000000..e84e7d048566
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
@@ -0,0 +1,50 @@
39406+#!/bin/sh
39407+# description: event trigger - test inter-event histogram trigger onmatch action
39408+
39409+do_reset() {
39410+ reset_trigger
39411+ echo > set_event
39412+ clear_trace
39413+}
39414+
39415+fail() { #msg
39416+ do_reset
39417+ echo $1
39418+ exit_fail
39419+}
39420+
39421+if [ ! -f set_event ]; then
39422+ echo "event tracing is not supported"
39423+ exit_unsupported
39424+fi
39425+
39426+if [ ! -f synthetic_events ]; then
39427+ echo "synthetic event is not supported"
39428+ exit_unsupported
39429+fi
39430+
39431+clear_synthetic_events
39432+reset_tracer
39433+do_reset
39434+
39435+echo "Test create synthetic event"
39436+
39437+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39438+if [ ! -d events/synthetic/wakeup_latency ]; then
39439+ fail "Failed to create wakeup_latency synthetic event"
39440+fi
39441+
39442+echo "Test create histogram for synthetic event"
39443+echo "Test histogram variables,simple expression support and onmatch action"
39444+
39445+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
39446+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
39447+echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
39448+ping localhost -c 5
39449+if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
39450+ fail "Failed to create onmatch action inter-event histogram"
39451+fi
39452+
39453+do_reset
39454+
39455+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
new file mode 100644
index 000000000000..7907d8aacde3
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
@@ -0,0 +1,50 @@
39462+#!/bin/sh
39463+# description: event trigger - test inter-event histogram trigger onmatch-onmax action
39464+
39465+do_reset() {
39466+ reset_trigger
39467+ echo > set_event
39468+ clear_trace
39469+}
39470+
39471+fail() { #msg
39472+ do_reset
39473+ echo $1
39474+ exit_fail
39475+}
39476+
39477+if [ ! -f set_event ]; then
39478+ echo "event tracing is not supported"
39479+ exit_unsupported
39480+fi
39481+
39482+if [ ! -f synthetic_events ]; then
39483+ echo "synthetic event is not supported"
39484+ exit_unsupported
39485+fi
39486+
39487+clear_synthetic_events
39488+reset_tracer
39489+do_reset
39490+
39491+echo "Test create synthetic event"
39492+
39493+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39494+if [ ! -d events/synthetic/wakeup_latency ]; then
39495+ fail "Failed to create wakeup_latency synthetic event"
39496+fi
39497+
39498+echo "Test create histogram for synthetic event"
39499+echo "Test histogram variables,simple expression support and onmatch-onmax action"
39500+
39501+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
39502+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm):onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
39503+echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
39504+ping localhost -c 5
+if ! grep -q "ping" events/synthetic/wakeup_latency/hist || ! grep -q "max:" events/sched/sched_switch/hist; then
39506+ fail "Failed to create onmatch-onmax action inter-event histogram"
39507+fi
39508+
39509+do_reset
39510+
39511+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
new file mode 100644
index 000000000000..38b7ed6242b2
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
@@ -0,0 +1,48 @@
39518+#!/bin/sh
39519+# description: event trigger - test inter-event histogram trigger onmax action
39520+
39521+do_reset() {
39522+ reset_trigger
39523+ echo > set_event
39524+ clear_trace
39525+}
39526+
39527+fail() { #msg
39528+ do_reset
39529+ echo $1
39530+ exit_fail
39531+}
39532+
39533+if [ ! -f set_event ]; then
39534+ echo "event tracing is not supported"
39535+ exit_unsupported
39536+fi
39537+
39538+if [ ! -f synthetic_events ]; then
39539+ echo "synthetic event is not supported"
39540+ exit_unsupported
39541+fi
39542+
39543+clear_synthetic_events
39544+reset_tracer
39545+do_reset
39546+
39547+echo "Test create synthetic event"
39548+
39549+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39550+if [ ! -d events/synthetic/wakeup_latency ]; then
39551+ fail "Failed to create wakeup_latency synthetic event"
39552+fi
39553+
39554+echo "Test onmax action"
39555+
39556+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_waking/trigger
39557+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
39558+ping localhost -c 3
39559+if ! grep -q "max:" events/sched/sched_switch/hist; then
39560+ fail "Failed to create onmax action inter-event histogram"
39561+fi
39562+
39563+do_reset
39564+
39565+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
new file mode 100644
index 000000000000..cef11377dcbd
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
@@ -0,0 +1,54 @@
39572+#!/bin/sh
39573+# description: event trigger - test synthetic event create remove
39574+do_reset() {
39575+ reset_trigger
39576+ echo > set_event
39577+ clear_trace
39578+}
39579+
39580+fail() { #msg
39581+ do_reset
39582+ echo $1
39583+ exit_fail
39584+}
39585+
39586+if [ ! -f set_event ]; then
39587+ echo "event tracing is not supported"
39588+ exit_unsupported
39589+fi
39590+
39591+if [ ! -f synthetic_events ]; then
39592+ echo "synthetic event is not supported"
39593+ exit_unsupported
39594+fi
39595+
39596+clear_synthetic_events
39597+reset_tracer
39598+do_reset
39599+
39600+echo "Test create synthetic event"
39601+
39602+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39603+if [ ! -d events/synthetic/wakeup_latency ]; then
39604+ fail "Failed to create wakeup_latency synthetic event"
39605+fi
39606+
39607+reset_trigger
39608+
39609+echo "Test create synthetic event with an error"
39610+echo 'wakeup_latency u64 lat pid_t pid char' > synthetic_events > /dev/null
39611+if [ -d events/synthetic/wakeup_latency ]; then
39612+ fail "Created wakeup_latency synthetic event with an invalid format"
39613+fi
39614+
39615+reset_trigger
39616+
39617+echo "Test remove synthetic event"
39618+echo '!wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
39619+if [ -d events/synthetic/wakeup_latency ]; then
39620+ fail "Failed to delete wakeup_latency synthetic event"
39621+fi
39622+
39623+do_reset
39624+
39625+exit 0
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index d5f1d8364571..c09e04130bfe 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -69,7 +69,6 @@ static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
39631
39632 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
39633 {
39634- BUG_ON(preemptible());
39635 __this_cpu_write(kvm_arm_running_vcpu, vcpu);
39636 }
39637
@@ -79,7 +78,6 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
39639 */
39640 struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
39641 {
39642- BUG_ON(preemptible());
39643 return __this_cpu_read(kvm_arm_running_vcpu);
39644 }
39645
@@ -653,7 +651,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
39647 * involves poking the GIC, which must be done in a
39648 * non-preemptible context.
39649 */
39650- preempt_disable();
39651+ migrate_disable();
39652
39653 kvm_pmu_flush_hwstate(vcpu);
39654
@@ -690,7 +688,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
39656 kvm_pmu_sync_hwstate(vcpu);
39657 kvm_timer_sync_hwstate(vcpu);
39658 kvm_vgic_sync_hwstate(vcpu);
39659- preempt_enable();
39660+ migrate_enable();
39661 continue;
39662 }
39663
@@ -745,7 +743,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
39665
39666 kvm_vgic_sync_hwstate(vcpu);
39667
39668- preempt_enable();
39669+ migrate_enable();
39670
39671 ret = handle_exit(vcpu, run, ret);
39672 }